1451 files changed, 140156 insertions, 120026 deletions
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..58c855615
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+* text=auto
+
+Makefile* text whitespace=-tab-in-indent
+*.sh text eol=lf
diff --git a/.gitignore b/.gitignore
index 074874a51..efdbc9bf0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,26 +1,11 @@
-*.pyc
-*.pyo
-*.class
-*~
-*.DS_Store
-wine-py2exe/
-py2exe.log
-*.kate-swp
-build/
-dist/
-MANIFEST
-README.txt
-youtube-dl.1
-youtube-dl.bash-completion
-youtube-dl.fish
-youtube_dl/extractor/lazy_extractors.py
-youtube-dl
-youtube-dl.exe
-youtube-dl.tar.gz
-.coverage
-cover/
-updates_key.pem
-*.egg-info
+# Config
+*.conf
+*.spec
+cookies
+*cookies.txt
+.netrc
+
+# Downloaded
 *.srt
 *.ttml
 *.sbv
@@ -31,23 +16,93 @@ updates_key.pem
 *.m4v
 *.mp3
 *.3gp
+*.webm
 *.wav
 *.ape
 *.mkv
+*.flac
+*.avi
 *.swf
 *.part
+*.part-*
 *.ytdl
+*.dump
+*.frag
+*.frag.urls
+*.aria2
 *.swp
+*.ogg
+*.opus
+*.info.json
+*.live_chat.json
+*.jpg
+*.jpeg
+*.png
+*.webp
+*.annotations.xml
+*.description
+
+# Allow config/media files in testdata
+!test/**
+
+# Python
+*.pyc
+*.pyo
+.pytest_cache
+wine-py2exe/
+py2exe.log
+build/
+dist/
+zip/
+tmp/
+venv/
+completions/
+
+# Misc
+*~
+*.DS_Store
+*.kate-swp
+MANIFEST
 test/local_parameters.json
+.coverage
+cover/
+secrets/
+updates_key.pem
+*.egg-info
 .tox
-youtube-dl.zsh
+*.class
+
+# Generated
+AUTHORS
+README.txt
+.mailmap
+*.1
+*.bash-completion
+*.fish
+*.exe
+*.tar.gz
+*.zsh
+*.spec
+test/testdata/player-*.js
+
+# Binary
+/ytdlp
+yt-dlp.zip
+*.exe
-# IntelliJ related files
 .idea
 *.iml
+.vscode
+*.sublime-*
-tmp/
-venv/
+# Lazy extractors
+*/extractor/lazy_extractors.py
+
+# Plugins
+ytdlp_plugins/extractor/*
+!ytdlp_plugins/extractor/__init__.py
+!ytdlp_plugins/extractor/sample.py
 # VS Code related files
 .vscode
@@ -96,8 +151,5 @@ test/test_iqiyi_sdk_interpreter.py
 test/test_swfinterp.py
 test/test_update.py
 test/versions.json
-youtube_dl/swfinterp.py
-youtube_dl/update.py
-
-# flycheck
-flycheck_*.py
+yt_dlp/swfinterp.py
+yt_dlp/update.py
diff --git a/AUTHORS b/AUTHORS
deleted file mode 100644
index 4a6d7dacd..000000000
--- a/AUTHORS
+++ /dev/null
@@ -1,249 +0,0 @@
-Ricardo Garcia Gonzalez
-Danny Colligan
-Benjamin Johnson
-Vasyl' Vavrychuk
-Witold Baryluk
-Paweł Paprota
-Gergely Imreh
-Rogério Brito
-Philipp Hagemeister
-Sören Schulze
-Kevin Ngo
-Ori Avtalion
-shizeeg
-Filippo Valsorda
-Christian Albrecht
-Dave Vasilevsky
-Jaime Marquínez Ferrándiz
-Jeff Crouse
-Osama Khalid
-Michael Walter
-M. Yasoob Ullah Khalid
-Julien Fraichard
-Johny Mo Swag
-Axel Noack
-Albert Kim
-Pierre Rudloff
-Huarong Huo
-Ismael Mejía
-Steffan Donal
-Andras Elso
-Jelle van der Waa
-Marcin Cieślak
-Anton Larionov
-Takuya Tsuchida
-Sergey M.
-Michael Orlitzky
-Chris Gahan
-Saimadhav Heblikar
-Mike Col
-Oleg Prutz
-pulpe
-Andreas Schmitz
-Michael Kaiser
-Niklas Laxström
-David Triendl
-Anthony Weems
-David Wagner
-Juan C. Olivares
-Mattias Harrysson
-phaer
-Sainyam Kapoor
-Nicolas Évrard
-Jason Normore
-Hoje Lee
-Adam Thalhammer
-Georg Jähnig
-Ralf Haring
-Koki Takahashi
-Ariset Llerena
-Adam Malcontenti-Wilson
-Tobias Bell
-Naglis Jonaitis
-Charles Chen
-Hassaan Ali
-Dobrosław Żybort
-David Fabijan
-Sebastian Haas
-Alexander Kirk
-Erik Johnson
-Keith Beckman
-Ole Ernst
-Aaron McDaniel (mcd1992)
-Magnus Kolstad
-Hari Padmanaban
-Carlos Ramos
-5moufl
-lenaten
-Dennis Scheiba
-Damon Timm
-winwon
-Xavier Beynon
-Gabriel Schubiner
-xantares
-Jan Matějka
-Mauroy Sébastien
-William Sewell
-Dao Hoang Son
-Oskar Jauch
-Matthew Rayfield
-t0mm0
-Tithen-Firion
-Zack Fernandes
-cryptonaut
-Adrian Kretz
-Mathias Rav
-Petr Kutalek
-Will Glynn
-Max Reimann
-Cédric Luthi
-Thijs Vermeir
-Joel Leclerc
-Christopher Krooss
-Ondřej Caletka
-Dinesh S
-Johan K. Jensen
-Yen Chi Hsuan
-Enam Mijbah Noor
-David Luhmer
-Shaya Goldberg
-Paul Hartmann
-Frans de Jonge
-Robin de Rooij
-Ryan Schmidt
-Leslie P. Polzer
-Duncan Keall
-Alexander Mamay
-Devin J. Pohly
-Eduardo Ferro Aldama
-Jeff Buchbinder
-Amish Bhadeshia
-Joram Schrijver
-Will W.
-Mohammad Teimori Pabandi
-Roman Le Négrate
-Matthias Küch
-Julian Richen
-Ping O.
-Mister Hat
-Peter Ding
-jackyzy823
-George Brighton
-Remita Amine
-Aurélio A. Heckert
-Bernhard Minks
-sceext
-Zach Bruggeman
-Tjark Saul
-slangangular
-Behrouz Abbasi
-ngld
-nyuszika7h
-Shaun Walbridge
-Lee Jenkins
-Anssi Hannula
-Lukáš Lalinský
-Qijiang Fan
-Rémy Léone
-Marco Ferragina
-reiv
-Muratcan Simsek
-Evan Lu
-flatgreen
-Brian Foley
-Vignesh Venkat
-Tom Gijselinck
-Founder Fang
-Andrew Alexeyew
-Saso Bezlaj
-Erwin de Haan
-Jens Wille
-Robin Houtevelts
-Patrick Griffis
-Aidan Rowe
-mutantmonkey
-Ben Congdon
-Kacper Michajłow
-José Joaquín Atria
-Viťas Strádal
-Kagami Hiiragi
-Philip Huppert
-blahgeek
-Kevin Deldycke
-inondle
-Tomáš Čech
-Déstin Reed
-Roman Tsiupa
-Artur Krysiak
-Jakub Adam Wieczorek
-Aleksandar Topuzović
-Nehal Patel
-Rob van Bekkum
-Petr Zvoníček
-Pratyush Singh
-Aleksander Nitecki
-Sebastian Blunt
-Matěj Cepl
-Xie Yanbo
-Philip Xu
-John Hawkinson
-Rich Leeper
-Zhong Jianxin
-Thor77
-Mattias Wadman
-Arjan Verwer
-Costy Petrisor
-Logan B
-Alex Seiler
-Vijay Singh
-Paul Hartmann
-Stephen Chen
-Fabian Stahl
-Bagira
-Odd Stråbø
-Philip Herzog
-Thomas Christlieb
-Marek Rusinowski
-Tobias Gruetzmacher
-Olivier Bilodeau
-Lars Vierbergen
-Juanjo Benages
-Xiao Di Guan
-Thomas Winant
-Daniel Twardowski
-Jeremie Jarosh
-Gerard Rovira
-Marvin Ewald
-Frédéric Bournival
-Timendum
-gritstub
-Adam Voss
-Mike Fährmann
-Jan Kundrát
-Giuseppe Fabiano
-Örn Guðjónsson
-Parmjit Virk
-Genki Sky
-Ľuboš Katrinec
-Corey Nicholson
-Ashutosh Chaudhary
-John Dong
-Tatsuyuki Ishi
-Daniel Weber
-Kay Bouché
-Yang Hongbo
-Lei Wang
-Petr Novák
-Leonardo Taccari
-Martin Weinelt
-Surya Oktafendri
-TingPing
-Alexandre Macabies
-Bastian de Groot
-Niklas Haas
-András Veres-Szentkirályi
-Enes Solak
-Nathan Rossi
-Thomas van der Berg
-Luca Cherubin
-Adrian Heine
\ No newline at end of file
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 000000000..048d98852
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,127 @@
+pukkandan (owner)
+shirt-dev (collaborator)
+coletdjnz/colethedj (collaborator)
+Ashish0804 (collaborator)
+h-h-h-h
+pauldubois98
+nixxo
+GreyAlien502
+kyuyeunk
+siikamiika
+jbruchon
+alexmerkel
+glenn-slayden
+Unrud
+wporr
+mariuszskon
+ohnonot
+samiksome
+alxnull
+FelixFrog
+Zocker1999NET
+nao20010128nao
+kurumigi
+bbepis
+animelover1984/horahoradev
+Pccode66
+RobinD42
+hseg
+DennyDai
+codeasashu
+teesid
+kevinoconnor7
+damianoamatruda
+2ShedsJackson
+CXwudi
+xtkoba
+llacb47
+hheimbuerger
+B0pol
+lkho
+fstirlitz
+Lamieur
+tsukumijima
+Hadi0609
+b5eff52
+craftingmod
+tpikonen
+tripulse
+king-millez
+alex-gedeon
+hhirtz
+louie-github
+MinePlayersPE
+olifre
+rhsmachine/zenerdi0de
+nihil-admirari
+krichbanana
+ohmybahgosh
+nyuszika7h
+blackjack4494
+pyx
+TpmKranz
+mzbaulhaque
+zackmark29
+mbway
+zerodytrash
+wesnm
+pento
+rigstot
+dirkf
+funniray
+Jessecar96
+jhwgh1968
+kikuyan
+max-te
+nchilada
+pgaig
+PSlava
+stdedos
+u-spec-png
+Sipherdrakon
+kidonng
+smege1001
+tandy1000
+IONECarter
+capntrips
+mrfade
+ParadoxGBB
+wlritchi
+NeroBurner
+mahanstreamer
+alerikaisattera
+Derkades
+BunnyHelp
+i6t
+std-move
+Chocobozzz
+ouwou
+korli
+octotherp
+CeruleanSky
+zootedb0t
+chao813
+ChillingPepper
+ConquerorDopy
+dalanmiller
+DigitalDJ
+f4pp3rk1ng
+gesa
+Jules-A
+makeworld-the-better-one
+MKSherbini
+mrx23dot
+poschi3
+raphaeldore
+renalid
+sleaux-meaux
+sulyi
+tmarki
+Vangelis66
+AjaxGb
+ajj8
+jakubadamw
+jfogelman
+timethrow
+sarnoud
+Bojidarist
diff --git a/ChangeLog b/ChangeLog
deleted file mode 100644
index 680fffdf8..000000000
--- a/ChangeLog
+++ /dev/null
@@ -1,6142 +0,0 @@
-version 2021.06.06
-
-Extractors
-* [facebook] Improve login required detection
-* [youporn] Fix formats and view count extraction (#29216)
-* [orf:tvthek] Fix thumbnails extraction (#29217)
-* [formula1] Fix extraction (#29206)
-* [ard] Relax URL regular expression and fix video ids (#22724, #29091)
-+ [ustream] Detect https embeds (#29133)
-* [ted] Prefer own formats over external sources (#29142)
-* [twitch:clips] Improve extraction (#29149)
-+ [twitch:clips] Add access token query to download URLs (#29136)
-* [youtube] Fix get_video_info request (#29086, #29165)
-* [vimeo] Fix vimeo pro embed extraction (#29126)
-* [redbulltv] Fix embed data extraction (#28770)
-* [shahid] Relax URL regular expression (#28772, #28930)
-
-
-version 2021.05.16
-
-Core
-* [options] Fix thumbnail option group name (#29042)
-* [YoutubeDL] Improve extract_info doc (#28946)
-
-Extractors
-+ [playstuff] Add support for play.stuff.co.nz (#28901, #28931)
-* [eroprofile] Fix extraction (#23200, #23626, #29008)
-+ [vivo] Add support for vivo.st (#29009)
-+ [generic] Add support for og:audio (#28311, #29015)
-* [phoenix] Fix extraction (#29057)
-+ [generic] Add support for sibnet embeds
-+ [vk] Add support for sibnet embeds (#9500)
-+ [generic] Add Referer header for direct videojs download URLs (#2879, #20217, #29053)
-* [orf:radio] Switch download URLs to HTTPS (#29012, #29046)
-- [blinkx] Remove extractor (#28941)
-* [medaltv] Relax URL regular expression (#28884)
-+ [funimation] Add support for optional lang code in URLs (#28950)
-+ [gdcvault] Add support for HTML5 videos
-* [dispeak] Improve FLV extraction (#13513, #28970)
-* [kaltura] Improve iframe extraction (#28969)
-* [kaltura] Make embed code alternatives actually work
-* [cda] Improve extraction (#28709, #28937)
-* [twitter] Improve formats extraction from vmap URL (#28909)
-* [xtube] Fix formats extraction (#28870)
-* [svtplay] Improve extraction (#28507, #28876)
-* [tv2dk] Fix extraction (#28888)
-
-
-version 2021.04.26
-
-Extractors
-+ [xfileshare] Add support for wolfstream.tv (#28858)
-* [francetvinfo] Improve video id extraction (#28792)
-* [medaltv] Fix extraction (#28807)
-* [tver] Redirect all downloads to Brightcove (#28849)
-* [go] Improve video id extraction (#25207, #25216, #26058)
-* [youtube] Fix lazy extractors (#28780)
-+ [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774)
-* [cbsnews] Fix extraction for python <3.6 (#23359)
-
-
-version 2021.04.17
-
-Core
-+ [utils] Add support for experimental HTTP response status code 308 Permanent Redirect (#27877, #28768)
-
-Extractors
-+ [lbry] Add support for HLS videos (#27877, #28768)
-* [youtube] Fix stretched ratio calculation
-* [youtube] Improve stretch extraction (#28769)
-* [youtube:tab] Improve grid extraction (#28725)
-+ [youtube:tab] Detect series playlist on playlists page (#28723)
-+ [youtube] Add more invidious instances (#28706)
-* [pluralsight] Extend anti-throttling timeout (#28712)
-* [youtube] Improve URL to extractor routing (#27572, #28335, #28742)
-+ [maoritv] Add support for maoritelevision.com (#24552)
-+ [youtube:tab] Pass innertube context and x-goog-visitor-id header along with continuation requests (#28702)
-* [mtv] Fix Viacom A/B Testing Video Player extraction (#28703)
-+ [pornhub] Extract DASH and HLS formats from get_media end point (#28698)
-* [cbssports] Fix extraction (#28682)
-* [jamendo] Fix track extraction (#28686)
-* [curiositystream] Fix format extraction (#26845, #28668)
-
-
-version 2021.04.07
-
-Core
-* [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies
-+ [compat] Introduce compat_cookies_SimpleCookie
-* [extractor/common] Improve JSON-LD author extraction
-* [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, #28640)
-
-Extractors
-* [youtube] Fix extraction of videos with restricted location (#28685)
-+ [line] Add support for live.line.me (#17205, #28658)
-* [vimeo] Improve extraction (#28591)
-* [youku] Update ccode (#17852, #28447, #28460, #28648)
-* [youtube] Prefer direct entry metadata over entry metadata from playlist (#28619, #28636)
-* [screencastomatic] Fix extraction (#11976, #24489)
-+ [palcomp3] Add support for palcomp3.com (#13120)
-+ [arnes] Add support for video.arnes.si (#28483)
-+ [youtube:tab] Add support for hashtags (#28308)
-
-
-version 2021.04.01
-
-Extractors
-* [youtube] Setup CONSENT cookie when needed (#28604)
-* [vimeo] Fix password protected review extraction (#27591)
-* [youtube] Improve age-restricted video extraction (#28578)
-
-
-version 2021.03.31
-
-Extractors
-* [vlive] Fix inkey request (#28589)
-* [francetvinfo] Improve video id extraction (#28584)
-+ [instagram] Extract duration (#28469)
-* [instagram] Improve title extraction (#28469)
-+ [sbs] Add support for ondemand watch URLs (#28566)
-* [youtube] Fix video's channel extraction (#28562)
-* [picarto] Fix live stream extraction (#28532)
-* [vimeo] Fix unlisted video extraction (#28414)
-* [youtube:tab] Fix playlist/community continuation items extraction (#28266)
-* [ard] Improve clip id extraction (#22724, #28528)
-
-
-version 2021.03.25
-
-Extractors
-+ [zoom] Add support for zoom.us (#16597, #27002, #28531)
-* [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360)
-* [youtube] Fix default value for youtube_include_dash_manifest (#28523)
-* [zingmp3] Fix extraction (#11589, #16409, #16968, #27205)
-+ [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514)
-+ [tiktok] Detect private videos (#28453)
-* [vimeo:album] Fix extraction for albums with number of videos multiple to page size (#28486)
-* [vvvvid] Fix kenc format extraction (#28473)
-* [mlb] Fix video extraction (#21241)
-* [svtplay] Improve extraction (#28448)
-* [applepodcasts] Fix extraction (#28445)
-* [rtve] Improve extraction
-    + Extract all formats
-    * Fix RTVE Infantil extraction (#24851)
-    + Extract is_live and series
-
-
-version 2021.03.14
-
-Core
-+ Introduce release_timestamp meta field (#28386)
-
-Extractors
-+ [southpark] Add support for southparkstudios.com (#28413)
-* [southpark] Fix extraction (#26763, #28413)
-* [sportdeutschland] Fix extraction (#21856, #28425)
-* [pinterest] Reduce the number of HLS format requests
-* [peertube] Improve thumbnail extraction (#28419)
-* [tver] Improve title extraction (#28418)
-* [fujitv] Fix HLS formats extension (#28416)
-* [shahid] Fix format extraction (#28383)
-+ [lbry] Add support for channel filters (#28385)
-+ [bandcamp] Extract release timestamp
-+ [lbry] Extract release timestamp (#28386)
-* [pornhub] Detect flagged videos
-+ [pornhub] Extract formats from get_media end point (#28395)
-* [bilibili] Fix video info extraction (#28341)
-+ [cbs] Add support for Paramount+ (#28342)
-+ [trovo] Add Origin header to VOD formats (#28346)
-* [voxmedia] Fix volume embed extraction (#28338)
-
-
-version 2021.03.03
-
-Extractors
-* [youtube:tab] Switch continuation to browse API (#28289, #28327)
-* [9c9media] Fix extraction for videos with multiple ContentPackages (#28309)
-+ [bbc] Add support for BBC Reel videos (#21870, #23660, #28268)
-
-
-version 2021.03.02
-
-Extractors
-* [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, #27930, #28198, #28199, #28274)
-    * Generalize cross-extractor video ids for zdf based extractors
-    * Improve extraction
-    * Fix 3sat and phoenix
-* [stretchinternet] Fix extraction (#28297)
-* [urplay] Fix episode data extraction (#28292)
-+ [bandaichannel] Add support for b-ch.com (#21404)
-* [srgssr] Improve extraction (#14717, #14725, #27231, #28238)
-    + Extract subtitle
-    * Fix extraction for new videos
-    * Update srf download domains
-* [vvvvid] Reduce season request payload size
-+ [vvvvid] Extract series sublists playlist title (#27601, #27618)
-+ [dplay] Extract Ad-Free uplynk URLs (#28160)
-+ [wat] Detect DRM protected videos (#27958)
-* [tf1] Improve extraction (#27980, #28040)
-* [tmz] Fix and improve extraction (#24603, #24687, #28211)
-+ [gedidigital] Add support for Gedi group sites (#7347, #26946)
-* [youtube] Fix get_video_info request
-
-
-version 2021.02.22
-
-Core
-+ [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase (#28112)
-
-Extractors
-* [apa] Fix and improve extraction (#27750)
-+ [youporn] Extract duration (#28019)
-+ [peertube] Add support for canard.tube (#28190)
-* [youtube] Fixup m4a_dash formats (#28165)
-+ [samplefocus] Add support for samplefocus.com (#27763)
-+ [vimeo] Add support for unlisted video source format extraction
-* [viki] Improve extraction (#26522, #28203)
-    * Extract uploader URL and episode number
-    * Report login required error
-    + Extract 480p formats
-    * Fix API v4 calls
-* [ninegag] Unescape title (#28201)
-* [youtube] Improve URL regular expression (#28193)
-+ [youtube] Add support for redirect.invidious.io (#28193)
-+ [dplay] Add support for de.hgtv.com (#28182)
-+ [dplay] Add support for discoveryplus.com (#24698)
-+ [simplecast] Add support for simplecast.com (#24107)
-* [youtube] Fix uploader extraction in flat playlist mode (#28045)
-* [yandexmusic:playlist] Request missing tracks in chunks (#27355, #28184)
-+ [storyfire] Add support for storyfire.com (#25628, #26349)
-+ [zhihu] Add support for zhihu.com (#28177)
-* [youtube] Fix controversial videos when authenticated with cookies (#28174)
-* [ccma] Fix timestamp parsing in python 2
-+ [videopress] Add support for video.wordpress.com
-* [kakao] Improve info extraction and detect geo restriction (#26577)
-* [xboxclips] Fix extraction (#27151)
-* [ard] Improve formats extraction (#28155)
-+ [canvas] Add support for dagelijksekost.een.be (#28119)
-
-
-version 2021.02.10
-
-Extractors
-* [youtube:tab] Improve grid continuation extraction (#28130)
-* [ign] Fix extraction (#24771)
-+ [xhamster] Extract format filesize
-+ [xhamster] Extract formats from xplayer settings (#28114)
-+ [youtube] Add support for phone/tablet JS player (#26424)
-* [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, #27109, #27236, #28063)
-+ [cda] Detect geo restricted videos (#28106)
-* [urplay] Fix extraction (#28073, #28074)
-* [youtube] Fix release date extraction (#28094)
-+ [youtube] Extract abr and vbr (#28100)
-* [youtube] Skip OTF formats (#28070)
-
-
-version 2021.02.04.1
-
-Extractors
-* [youtube] Prefer DASH formats (#28070)
-* [azmedien] Fix extraction (#28064)
-
-
-version 2021.02.04
-
-Extractors
-* [pornhub] Implement lazy playlist extraction
-* [svtplay] Fix video id extraction (#28058)
-+ [pornhub] Add support for authentication (#18797, #21416, #24294)
-* [pornhub:user] Improve paging
-+ [pornhub:user] Add support for URLs unavailable via /videos page (#27853)
-+ [bravotv] Add support for oxygen.com (#13357, #22500)
-+ [youtube] Pass embed URL to get_video_info request
-* [ccma] Improve metadata extraction (#27994)
-    + Extract age limit, alt title, categories, series and episode number
-    * Fix timestamp multiple subtitles extraction
-* [egghead] Update API domain (#28038)
-- [vidzi] Remove extractor (#12629)
-* [vidio] Improve metadata extraction
-* [youtube] Improve subtitles extraction
-* [youtube] Fix chapter extraction fallback
-* [youtube] Rewrite extractor
-    * Improve format sorting
-    * Remove unused code
-    * Fix series metadata extraction
-    * Fix trailer video extraction
-    * Improve error reporting
-    + Extract video location
-+ [vvvvid] Add support for youtube embeds (#27825)
-* [googledrive] Report download page errors (#28005)
-* [vlive] Fix error message decoding for python 2 (#28004)
-* [youtube] Improve DASH formats file size extraction
-* [cda] Improve birth validation detection (#14022, #27929)
-+ [awaan] Extract uploader id (#27963)
-+ [medialaan] Add support for DPG Media MyChannels based websites (#14871, #15597, #16106, #16489)
-* [abcnews] Fix extraction (#12394, #27920)
-* [AMP] Fix upload date and timestamp extraction (#27970)
-* [tv4] Relax URL regular expression (#27964)
-+ [tv2] Add support for mtvuutiset.fi (#27744)
-* [adn] Improve login warning reporting
-* [zype] Fix uplynk id extraction (#27956)
-+ [adn] Add support for authentication (#17091, #27841, #27937)
-
-
-version 2021.01.24.1
-
-Core
-* Introduce --output-na-placeholder (#27896)
-
-Extractors
-* [franceculture] Make thumbnail optional (#18807)
-* [franceculture] Fix extraction (#27891, #27903)
-* [njpwworld] Fix extraction (#27890)
-* [comedycentral] Fix extraction (#27905)
-* [wat] Fix format extraction (#27901)
-+ [americastestkitchen:season] Add support for seasons (#27861)
-+ [trovo] Add support for trovo.live (#26125)
-+ [aol] Add support for yahoo videos (#26650)
-* [yahoo] Fix single video extraction
-* [lbry] Unescape lbry URI (#27872)
-* [9gag] Fix and improve extraction (#23022)
-* [americastestkitchen] Improve metadata extraction for ATK episodes (#27860)
-* [aljazeera] Fix extraction (#20911, #27779)
-+ [minds] Add support for minds.com (#17934)
-* [ard] Fix title and description extraction (#27761)
-+ [spotify] Add support for Spotify Podcasts (#27443)
-
-
-version 2021.01.16
-
-Core
-* [YoutubeDL] Protect from infinite recursion due to recursively nested playlists (#27833)
-* [YoutubeDL] Ignore failure to create existing directory (#27811)
-* [YoutubeDL] Raise syntax error for format selection expressions with multiple + operators (#27803)
-
-Extractors
-+ [animeondemand] Add support for lazy playlist extraction (#27829)
-* [youporn] Restrict fallback download URL (#27822)
-* [youporn] Improve height and tbr extraction (#20425, #23659)
-* [youporn] Fix extraction (#27822)
-+ [twitter] Add support for unified cards (#27826)
-+ [twitch] Add Authorization header with OAuth token for GraphQL requests (#27790)
-* [mixcloud:playlist:base] Extract video id in flat playlist mode (#27787)
-* [cspan] Improve info extraction (#27791)
-* [adn] Improve info extraction
-* [adn] Fix extraction (#26963, #27732)
-* [youtube:search] Extract from all sections (#27604)
-* [youtube:search] Fix view count and try to extract all video sections (#27604)
-* [twitch] Improve login error extraction
-* [twitch] Fix authentication (#27743)
-* [3qsdn] Improve extraction (#21058)
-* [peertube] Extract formats from streamingPlaylists (#26002, #27586, #27728)
-* [khanacademy] Fix extraction (#2887, #26803)
-* [spike] Update Paramount Network feed URL (#27715)
-
-
-version 2021.01.08
-
-Core
-* [downloader/hls] Disable decryption in tests (#27660)
-+ [utils] Add a function to clean podcast URLs
-
-Extractors
-* [rai] Improve subtitles extraction (#27698, #27705)
-* [canvas] Match only supported VRT NU URLs (#27707)
-+ [bibeltv] Add support for bibeltv.de (#14361)
-+ [bfmtv] Add support for bfmtv.com (#16053, #26615)
-+ [sbs] Add support for ondemand play and news embed URLs (#17650, #27629)
-* [twitch] Drop legacy kraken API v5 code altogether and refactor
-* [twitch:vod] Switch to GraphQL for video metadata
-* [canvas] Fix VRT NU extraction (#26957, #27053)
-* [twitch] Switch access token to GraphQL and refactor (#27646)
-+ [rai] Detect ContentItem in iframe (#12652, #27673)
-* [ketnet] Fix extraction (#27662)
-+ [dplay] Add support for Discovery+ domains (#27680)
-* [motherless] Improve extraction (#26495, #27450)
-* [motherless] Fix recent videos upload date extraction (#27661)
-* [nrk] Fix extraction for videos without a legalAge rating
-- [googleplus] Remove extractor (#4955, #7400)
-+ [applepodcasts] Add support for podcasts.apple.com (#25918)
-+ [googlepodcasts] Add support for podcasts.google.com
-+ [iheart] Add support for iheart.com (#27037)
-* [acast] Clean podcast URLs
-* [stitcher] Clean podcast URLs
-+ [xfileshare] Add support for aparat.cam (#27651)
-+ [twitter] Add support for summary card (#25121)
-* [twitter] Try to use a Generic fallback for unknown twitter cards (#25982)
-+ [stitcher] Add support for shows and show metadata extraction (#20510)
-* [stv] Improve episode id extraction (#23083)
-
-
-version 2021.01.03
-
-Extractors
-* [nrk] Improve series metadata extraction (#27473)
-+ [nrk] Extract subtitles
-* [nrk] Fix age limit extraction
-* [nrk] Improve video id extraction
-+ [nrk] Add support for podcasts (#27634, #27635)
-* [nrk] Generalize and delegate all item extractors to nrk
-+ [nrk] Add support for mp3 formats
-* [nrktv] Switch to playback endpoint
-* [vvvvid] Fix season metadata extraction (#18130)
-* [stitcher] Fix extraction (#20811, #27606)
-* [acast] Fix extraction (#21444, #27612, #27613)
-+ [arcpublishing] Add support for arcpublishing.com (#2298, #9340, #17200)
-+ [sky] Add support for Sports News articles and Brightcove videos (#13054)
-+ [vvvvid] Extract akamai formats
-* [vvvvid] Skip unplayable episodes (#27599)
-* [yandexvideo] Fix extraction for Python 3.4
-
-
-version 2020.12.31
-
-Core
-* [utils] Accept only supported protocols in url_or_none
-* [YoutubeDL] Allow format filtering using audio language (#16209)
-
-Extractors
-+ [redditr] Extract all thumbnails (#27503)
-* [vvvvid] Improve info extraction
-+ [vvvvid] Add support for playlists (#18130, #27574)
-+ [yandexdisk] Extract info from webpage
-* [yandexdisk] Fix extraction (#17861, #27131)
-* [yandexvideo] Use old API call as fallback
-* [yandexvideo] Fix extraction (#25000)
-- [nbc] Remove CSNNE extractor
-* [nbc] Fix NBCSport VPlayer URL extraction (#16640)
-+ [aenetworks] Add support for biography.com (#3863)
-* [uktvplay] Match new video URLs (#17909)
-* [sevenplay] Detect API errors
-* [tenplay] Fix format extraction (#26653)
-* [brightcove] Raise error for DRM protected videos (#23467, #27568)
-
-
-version 2020.12.29
-
-Extractors
-* [youtube] Improve yt initial data extraction (#27524)
-* [youtube:tab] Improve URL matching (#27559)
-* [youtube:tab] Restore retry on browse requests (#27313, #27564)
-* [aparat] Fix extraction (#22285, #22611, #23348, #24354, #24591, #24904, #25418, #26070, #26350, #26738, #27563)
-- [brightcove] Remove sonyliv specific code
-* [piksel] Improve format extraction
-+ [zype] Add support for uplynk videos
-+ [toggle] Add support for live.mewatch.sg (#27555)
-+ [go] Add support for fxnow.fxnetworks.com (#13972, #22467, #23754, #26826)
-* [teachable] Improve embed detection (#26923)
-* [mitele] Fix free video extraction (#24624, #25827, #26757)
-* [telecinco] Fix extraction
-* [youtube] Update invidious.snopyta.org (#22667)
-* [amcnetworks] Improve auth only video detection (#27548)
-+ [generic] Add support for VHX Embeds (#27546)
-
-
-version 2020.12.26
-
-Extractors
-* [instagram] Fix comment count extraction
-+ [instagram] Add support for reel URLs (#26234, #26250)
-* [bbc] Switch to media selector v6 (#23232, #23933, #26303, #26432, #26821, #27538)
-* [instagram] Improve thumbnail extraction
-* [instagram] Fix extraction when authenticated (#22880, #26377, #26981, #27422)
-* [spankbang:playlist] Fix extraction (#24087)
-+ [spankbang] Add support for playlist videos
-* [pornhub] Improve like and dislike count extraction (#27356)
-* [pornhub] Fix lq formats extraction (#27386, #27393)
-+ [bongacams] Add support for bongacams.com (#27440)
-* [youtube:tab] Extend URL regular expression (#27501)
-* [theweatherchannel] Fix extraction (#25930, #26051)
-+ [sprout] Add support for Universal Kids (#22518)
-* [theplatform] Allow passing geo bypass countries from other extractors
-+ [wistia] Add support for playlists (#27533)
-+ [ctv] Add support for ctv.ca (#27525)
-* [9c9media] Improve info extraction
-* [youtube] Fix automatic captions extraction (#27162, #27388)
-* [sonyliv] Fix title for movies
-* [sonyliv] Fix extraction (#25667)
-* [streetvoice] Fix extraction (#27455, #27492)
-+ [facebook] Add support for watchparty pages (#27507)
-* [cbslocal] Fix video extraction
-+ [brightcove] Add another method to extract policyKey
-* [mewatch] Relax URL regular expression (#27506)
-
-
-version 2020.12.22
-
-Core
-* [common] Remove unwanted query params from unsigned akamai manifest URLs
-
-Extractors
-- [tastytrade] Remove extractor (#25716)
-* [niconico] Fix playlist extraction (#27428)
-- [everyonesmixtape] Remove extractor
-- [kanalplay] Remove extractor
-* [arkena] Fix extraction
-* [nba] Rewrite extractor
-* [turner] Improve info extraction
-* [youtube] Improve xsrf token extraction (#27442)
-* [generic] Improve RSS age limit extraction
-* [generic] Fix RSS itunes thumbnail extraction (#27405)
-+ [redditr] Extract duration (#27426)
-- [zaq1] Remove extractor
-+ [asiancrush] Add support for retrocrush.tv
-* [asiancrush] Fix extraction
-- [noco] Remove extractor (#10864)
-* [nfl] Fix extraction (#22245)
-* [skysports] Relax URL regular expression (#27435)
-+ [tv5unis] Add support for tv5unis.ca (#22399, #24890)
-+ [videomore] Add support for more.tv (#27088)
-+ [yandexmusic] Add support for music.yandex.com (#27425)
-+ [nhk:program] Add support for audio programs and program clips
-+ [nhk] Add support for NHK video programs (#27230)
-
-
-version 2020.12.14
-
-Core
-* [extractor/common] Improve JSON-LD interaction statistic extraction (#23306)
-* [downloader/hls] Delegate manifests with media initialization to ffmpeg
-+ [extractor/common] Document duration meta field for playlists
-
-Extractors
-* [mdr] Bypass geo restriction
-* [mdr] Improve extraction (#24346, #26873)
-* [yandexmusic:album] Improve album title extraction (#27418)
-* [eporner] Fix view count extraction and make optional (#23306)
-+ [eporner] Extend URL regular expression
-* [eporner] Fix hash extraction and extend _VALID_URL (#27396)
-* [slideslive] Use m3u8 entry protocol for m3u8 formats (#27400)
-* [twitcasting] Fix format extraction and improve info extraction (#24868)
-* [linuxacademy] Fix authentication and extraction (#21129, #26223, #27402)
-* [itv] Clean description from HTML tags (#27399)
-* [vlive] Sort live formats (#27404)
-* [hotstar] Fix and improve extraction
-    * Fix format extraction (#26690)
-    + Extract thumbnail URL (#16079, #20412)
-    + Add support for country specific playlist URLs (#23496)
-    * Select the last id in video URL (#26412)
-+ [youtube] Add some invidious instances (#27373)
-
-
-version 2020.12.12
-
-Core
-* [YoutubeDL] Improve thumbnail filename deducing (#26010, #27244)
-
-Extractors
-+ [ruutu] Extract more metadata
-+ [ruutu] Detect non-free videos (#21154)
-* [ruutu] Authenticate format URLs (#21031, #26782)
-+ [ruutu] Add support for static.nelonenmedia.fi (#25412)
-+ [ruutu] Extend URL regular expression (#24839)
-+ [facebook] Add support for archived live video URLs (#15859)
-* [wdr] Improve overall extraction
-+ [wdr] Extend subtitles extraction (#22672, #22723)
-+ [facebook] Add support for videos attached to Relay based story pages (#10795)
-+ [wdr:page] Add support for kinder.wdr.de (#27350)
-+ [facebook] Add another regular expression for handleServerJS
-* [facebook] Fix embed page extraction
-+ [facebook] Add support for Relay post pages (#26935)
-+ [facebook] Add support for watch videos (#22795, #27062)
-+ [facebook] Add support for group posts with multiple videos (#19131)
-* [itv] Fix series metadata extraction (#26897)
-- [itv] Remove old extraction method (#23177)
-* [facebook] Redirect mobile URLs to desktop URLs (#24831, #25624)
-+ [facebook] Add support for Relay based pages (#26823)
-* [facebook] Try to reduce unnecessary tahoe requests
-- [facebook] Remove hardcoded Chrome User-Agent (#18974, #25411, #26958, #27329)
-- [smotri] Remove extractor (#27358)
-- [beampro] Remove extractor (#17290, #22871, #23020, #23061, #26099)
-
-
-version 2020.12.09
-
-Core
-* [extractor/common] Fix inline HTML5 media tags processing (#27345)
-
-Extractors
-* [youtube:tab] Improve identity token extraction (#27197)
-* [youtube:tab] Make click tracking params on continuation optional
-* [youtube:tab] Delegate inline playlists to tab-based playlists (#27298)
-+ [tubitv] Extract release year (#27317)
-* [amcnetworks] Fix free content extraction (#20354)
-+ [lbry:channel] Add support for channels (#25584)
-+ [lbry] Add support for short and embed URLs
-* [lbry] Fix channel metadata extraction
-+ [telequebec] Add support for video.telequebec.tv (#27339)
-* [telequebec] Fix extraction (#25733, #26883)
-+ [youtube:tab] Capture and output alerts (#27340)
-* [tvplay:home] Fix extraction (#21153)
-* [americastestkitchen] Fix extraction and add support for Cook's Country and Cook's Illustrated (#17234, #27322)
-+ [slideslive] Add support for yoda service videos and extract subtitles (#27323)
-
-
-version 2020.12.07
-
-Core
-* [extractor/common] Extract timestamp from Last-Modified header
-+ [extractor/common] Add support for dl8-* media tags (#27283)
-* [extractor/common] Fix media type extraction for HTML5 media tags in start/end form
-
-Extractors
-* [aenetworks] Fix extraction (#23363, #23390, #26795, #26985)
-    * Fix Fastly format extraction
-    + Add support for play and watch subdomains
-    + Extract series metadata
-* [youtube] Improve youtu.be extraction in non-existing playlists (#27324)
-+ [generic] Extract RSS video description, timestamp and itunes metadata (#27177)
-* [nrk] Reduce the number of instalments and episodes requests
-* [nrk] Improve extraction
-    * Improve format extraction for old akamai formats
-    + Add is_live value to entry info dict
-    * Request instalments only when available
-    * Fix skole extraction
-+ [peertube] Extract fps
-+ [peertube] Recognize audio-only formats (#27295)
-
-
-version 2020.12.05
-
-Core
-* [extractor/common] Improve Akamai HTTP format extraction
-    * Allow m3u8 manifest without an additional audio format
-    * Fix extraction for qualities starting with a number
-
-Extractors
-* [teachable:course] Improve extraction (#24507, #27286)
-* [nrk] Improve error extraction
-* [nrktv:series] Improve extraction (#21926)
-* [nrktv:season] Improve extraction
-* [nrk] Improve format extraction and geo-restriction detection (#24221)
-* [pornhub] Handle HTTP errors gracefully (#26414)
-* [nrktv] Relax URL regular expression (#27299, #26185)
-+ [zdf] Extract webm formats (#26659)
-+ [gamespot] Extract DASH and HTTP formats
-+ [tver] Add support for tver.jp (#26662, #27284)
-+ [pornhub] Add support for pornhub.org (#27276)
-
-
-version 2020.12.02
-
-Extractors
-+ [tva] Add support for qub.ca (#27235)
-+ [toggle] Detect DRM protected videos (#16479, #20805)
-+ [toggle] Add support for new MeWatch URLs (#27256)
-* [youtube:tab] Extract channels only from channels tab (#27266)
-+ [cspan] Extract info from jwplayer data (#3672, #3734, #10638, #13030, #18806, #23148, #24461, #26171, #26800, #27263)
-* [cspan] Pass Referer header with format's video URL (#26032, #25729)
-* [youtube] Improve age-gated videos extraction (#27259)
-+ [mediaset] Add support for movie URLs (#27240)
-* [yandexmusic] Refactor
-+ [yandexmusic] Add support for artist's tracks and albums (#11887, #22284)
-* [yandexmusic:track] Fix extraction (#26449, #26669, #26747, #26748, #26762)
-
-
-version 2020.11.29
-
-Core
-* [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug (#14579, #22593)
-
-Extractors
-* [drtv] Extend URL regular expression (#27243)
-* [tiktok] Fix extraction (#20809, #22838, #22850, #25987, #26281, #26411, #26639, #26776, #27237)
-+ [ina] Add support for mobile URLs (#27229)
-* [pornhub] Fix like and dislike count extraction (#27227, #27234)
-* [youtube] Improve yt initial player response extraction (#27216)
-* [videa] Fix extraction (#25650, #25973, #26301)
-
-
-version 2020.11.26
-
-Core
-* [downloader/fragment] Set final file's mtime according to last fragment's Last-Modified header (#11718, #18384, #27138)
-
-Extractors
-+ [spreaker] Add support for spreaker.com (#13480, #13877)
-* [vlive] Improve extraction for geo-restricted videos
-+ [vlive] Add support for post URLs (#27122, #27123)
-* [viki] Fix video API request (#27184)
-* [bbc] Fix BBC Three clip extraction
-* [bbc] Fix BBC News videos extraction
-+ [medaltv] Add support for medal.tv (#27149)
-* [youtube] Improve music metadata and license extraction (#26013)
-* [nrk] Fix extraction
-* [cda] Fix extraction (#17803, #24458, #24518, #26381)
-
-
-version 2020.11.24
-
-Core
-+ [extractor/common] Add generic support for akamai HTTP format extraction
-
-Extractors
-* [youtube:tab] Fix feeds extraction (#25695, #26452)
-* [youtube:favorites] Restore extractor
-* [youtube:tab] Fix some weird typo (#27157)
-+ [pinterest] Add support for large collections (more than 25 pins)
-+ [franceinter] Extract thumbnail (#27153)
-+ [box] Add support for box.com (#5949)
-+ [nytimes] Add support for cooking.nytimes.com (#27112, #27143)
-* [lbry] Relax URL regular expression (#27144)
-+ [rumble] Add support for embed pages (#10785)
-+ [skyit] Add support for multiple Sky Italia websites (#26629)
-+ [pinterest] Add support for pinterest.com (#25747)
-
-
-version 2020.11.21.1
-
-Core
-* [downloader/http] Fix crash during urlopen caused by missing reason of URLError
-* [YoutubeDL] Fix --ignore-errors for playlists with generator-based entries of url_transparent (#27064)
-
-Extractors
-+ [svtplay] Add support for svt.se/barnkanalen (#24817)
-+ [svt] Extract timestamp (#27130)
-* [svtplay] Improve thumbnail extraction (#27130)
-* [youtube] Fix error reason extraction (#27081)
-* [youtube] Fix like and dislike count extraction (#25977)
-+ [youtube:tab] Add support for current video and fix lives extraction (#27126)
-* [infoq] Fix format extraction (#25984)
-* [francetv] Update to fix thumbnail URL issue (#27120)
-* [youtube] Improve yt initial data extraction (#27093)
-+ [discoverynetworks] Add support for new TLC/DMAX URLs (#27100)
-* [rai] Fix protocol relative relinker URLs (#22766)
-* [rai] Fix unavailable video format detection
-* [rai] Improve extraction
-* [rai] Fix extraction (#27077)
-* [viki] Improve format extraction
-* [viki] Fix stream extraction from MPD (#27092)
-* [googledrive] Fix format extraction (#26979)
-+ [amara] Add support for amara.org (#20618)
-* [vimeo:album] Fix extraction (#27079)
-* [mtv] Fix mgid extraction (#26841)
-
-
-version 2020.11.19
-
-Core
-* [extractor/common] Output error for invalid URLs in _is_valid_url (#21400, #24151, #25617, #25618, #25586, #26068, #27072)
-
-Extractors
-* [youporn] Fix upload date extraction
-* [youporn] Make comment count optional (#26986)
-* [arte] Rework extractors
-    * Reimplement embed and playlist extractors to delegate to the single entrypoint artetv extractor
-    * Improve embeds detection (#27057)
-+ [arte] Extract m3u8 formats (#27061)
-* [mgtv] Fix format extraction (#26415)
-+ [lbry] Add support for odysee.com (#26806)
-* [francetv] Improve info extraction
-+ [francetv] Add fallback video URL extraction (#27047)
-
-
-version 2020.11.18
-
-Extractors
-* [spiegel] Fix extraction (#24206, #24767)
-* [youtube] Improve extraction
-    + Add support for --no-playlist (#27009)
-    * Improve playlist and mix extraction (#26390, #26509, #26534, #27011)
-    + Extract playlist uploader data
-* [youtube:tab] Fix view count extraction (#27051)
-* [malltv] Fix extraction (#27035)
-+ [bandcamp] Extract playlist description (#22684)
-* [urplay] Fix extraction (#26828)
-* [youtube:tab] Fix playlist title extraction (#27015)
-* [youtube] Fix chapters extraction (#26005)
-
-
-version 2020.11.17
-
-Core
-* [utils] Skip ! prefixed code in js_to_json
-
-Extractors
-* [youtube:tab] Fix extraction with cookies provided (#27005)
-* [lrt] Fix extraction with empty tags (#20264)
-+ [ndr:embed:base] Extract subtitles (#25447, #26106)
-+ [servus] Add support for pm-wissen.com (#25869)
-* [servus] Fix extraction (#26872, #26967, #26983, #27000)
-* [xtube] Fix extraction (#26996)
-* [lrt] Fix extraction
-+ [lbry] Add support for lbry.tv
-+ [condenast] Extract subtitles
-* [condenast] Fix extraction
-* [bandcamp] Fix extraction (#26681, #26684)
-* [rai] Fix RaiPlay extraction (#26064, #26096)
-* [vlive] Fix extraction
-* [usanetwork] Fix extraction
-* [nbc] Fix NBCNews/Today/MSNBC extraction
-* [cnbc] Fix extraction
-
-
-version 2020.11.12
-
-Extractors
-* [youtube] Rework extractors
-
-
-version 2020.11.01
-
-Core
-* [utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851)
-* [downloader/http] Properly handle missing message in SSLError (#26646)
-* [downloader/http] Fix access to not yet opened stream in retry
-
-Extractors
-* [youtube] Fix JS player URL extraction
-* [ytsearch] Fix extraction (#26920)
-* [afreecatv] Fix typo (#26970)
-* [23video] Relax URL regular expression (#26870)
-+ [ustream] Add support for video.ibm.com (#26894)
-* [iqiyi] Fix typo (#26884)
-+ [expressen] Add support for di.se (#26670)
-* [iprima] Improve video id extraction (#26507, #26494)
-
-
-version 2020.09.20
-
-Core
-* [extractor/common] Relax interaction count extraction in _json_ld
-+ [extractor/common] Extract author as uploader for VideoObject in _json_ld
-* [downloader/hls] Fix incorrect end byte in Range HTTP header for media segments with EXT-X-BYTERANGE (#14748, #24512)
-* [extractor/common] Handle ssl.CertificateError in _request_webpage (#26601)
-* [downloader/http] Improve timeout detection when reading block of data (#10935)
-* [downloader/http] Retry download when urlopen times out (#10935, #26603)
-
-Extractors
-* [redtube] Extend URL regular expression (#26506)
-* [twitch] Refactor
-* [twitch:stream] Switch to GraphQL and fix reruns (#26535)
-+ [telequebec] Add support for brightcove videos (#25833)
-* [pornhub] Extract metadata from JSON-LD (#26614)
-* [pornhub] Fix view count extraction (#26621, #26614)
-
-
-version 2020.09.14
-
-Core
-+ [postprocessor/embedthumbnail] Add support for non jpg/png thumbnails (#25687, #25717)
-
-Extractors
-* [rtlnl] Extend URL regular expression (#26549, #25821)
-* [youtube] Fix empty description extraction (#26575, #26006)
-* [srgssr] Extend URL regular expression (#26555, #26556, #26578)
-* [googledrive] Use redirect URLs for source format (#18877, #23919, #24689, #26565)
-* [svtplay] Fix id extraction (#26576)
-* [redbulltv] Improve support for redbull.com TV localized URLs (#22063)
-+ [redbulltv] Add support for new redbull.com TV URLs (#22037, #22063)
-* [soundcloud:pagedplaylist] Reduce pagination limit (#26557)
-
-
-version 2020.09.06
-
-Core
-+ [utils] Recognize wav mimetype (#26463)
-
-Extractors
-* [nrktv:episode] Improve video id extraction (#25594, #26369, #26409)
-* [youtube] Fix age gate content detection (#26100, #26152, #26311, #26384)
-* [youtube:user] Extend URL regular expression (#26443)
-* [xhamster] Improve initials regular expression (#26526, #26353)
-* [svtplay] Fix video id extraction (#26425, #26428, #26438)
-* [twitch] Rework extractors (#12297, #20414, #20604, #21811, #21812, #22979, #24263, #25010, #25553, #25606)
-    * Switch to GraphQL
-    + Add support for collections
-    + Add support for clips and collections playlists
-* [biqle] Improve video ext extraction
-* [xhamster] Fix extraction (#26157, #26254)
-* [xhamster] Extend URL regular expression (#25789, #25804, #25927)
-
-
-version 2020.07.28
-
-Extractors
-* [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137)
-* [youtube] Improve description extraction (#25937, #25980)
-* [wistia] Restrict embed regular expression (#25969)
-* [youtube] Prevent excess HTTP 301 (#25786)
-+ [youtube:playlists] Extend URL regular expression (#25810)
-+ [bellmedia] Add support for cp24.com clip URLs (#25764)
-* [brightcove] Improve embed detection (#25674)
-
-
-version 2020.06.16.1
-
-Extractors
-* [youtube] Force old layout (#25682, #25683, #25680, #25686)
-* [youtube] Fix categories and improve tags extraction
-
-
-version 2020.06.16
-
-Extractors
-* [youtube] Fix uploader id and uploader URL extraction
-* [youtube] Improve view count extraction
-* [youtube] Fix upload date extraction (#25677)
-* [youtube] Fix thumbnails extraction (#25676)
-* [youtube] Fix playlist and feed extraction (#25675)
-+ [facebook] Add support for single-video ID links
-+ [youtube] Extract chapters from JSON (#24819)
-+ [kaltura] Add support for multiple embeds on a webpage (#25523)
-
-
-version 2020.06.06
-
-Extractors
-* [tele5] Bypass geo restriction
-+ [jwplatform] Add support for bypass geo restriction
-* [tele5] Prefer jwplatform over nexx (#25533)
-* [twitch:stream] Expect 400 and 410 HTTP errors from API
-* [twitch:stream] Fix extraction (#25528)
-* [twitch] Fix thumbnails extraction (#25531)
-+ [twitch] Pass v5 Accept HTTP header (#25531)
-* [brightcove] Fix subtitles extraction (#25540)
-+ [malltv] Add support for sk.mall.tv (#25445)
-* [periscope] Fix untitled broadcasts (#25482)
-* [jwplatform] Improve embeds extraction (#25467)
-
-
-version 2020.05.29
-
-Core
-* [postprocessor/ffmpeg] Embed series metadata with --add-metadata
-* [utils] Fix file permissions in write_json_file (#12471, #25122)
-
-Extractors
-* [ard:beta] Extend URL regular expression (#25405)
-+ [youtube] Add support for more invidious instances (#25417)
-* [giantbomb] Extend URL regular expression (#25222)
-* [ard] Improve URL regular expression (#25134, #25198)
-* [redtube] Improve formats extraction and extract m3u8 formats (#25311, #25321)
-* [indavideo] Switch to HTTPS for API request (#25191)
-* [redtube] Improve title extraction (#25208)
-* [vimeo] Improve format extraction and sorting (#25285)
-* [soundcloud] Reduce API playlist page limit (#25274)
-+ [youtube] Add support for yewtu.be (#25226)
-* [mailru] Fix extraction (#24530, #25239)
-* [bellator] Fix mgid extraction (#25195)
-
-
-version 2020.05.08
-
-Core
-* [downloader/http] Request last data block of exact remaining size
-* [downloader/http] Finish downloading once received data length matches expected
-* [extractor/common] Use compat_cookiejar_Cookie for _set_cookie to always ensure cookie name and value are bytestrings on python 2 (#23256, #24776)
-+ [compat] Introduce compat_cookiejar_Cookie
-* [utils] Improve cookie files support
-    + Add support for UTF-8 in cookie files
-    * Skip malformed cookie file entries instead of crashing (invalid entry length, invalid expires at)
-
-Extractors
-* [youtube] Improve signature cipher extraction (#25187, #25188)
-* [iprima] Improve extraction (#25138)
-* [uol] Fix extraction (#22007)
-+ [orf] Add support for more radio stations (#24938, #24968)
-* [dailymotion] Fix typo
-- [puhutv] Remove no longer available HTTP formats (#25124)
-
-
-version 2020.05.03
-
-Core
-+ [extractor/common] Extract multiple JSON-LD entries
-* [options] Clarify doc on --exec command (#19087, #24883)
-* [extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667)
-
-Extractors
-* [crunchyroll] Fix and improve extraction (#25096, #25060)
-* [youtube] Improve player id extraction
-* [youtube] Use redirected video id if any (#25063)
-* [yahoo] Fix GYAO Player extraction and relax URL regular expression (#24178, #24778)
-* [tvplay] Fix Viafree extraction (#15189, #24473, #24789)
-* [tenplay] Relax URL regular expression (#25001)
-+ [prosiebensat1] Extract series metadata
-* [prosiebensat1] Improve extraction and remove 7tv.de support (#24948)
-- [prosiebensat1] Remove 7tv.de support (#24948)
-* [youtube] Fix DRM videos detection (#24736)
-* [thisoldhouse] Fix video id extraction (#24548, #24549)
-+ [soundcloud] Extract AAC format (#19173, #24708)
-* [youtube] Skip broken multifeed videos (#24711)
-* [nova:embed] Fix extraction (#24700)
-* [motherless] Fix extraction (#24699)
-* [twitch:clips] Extend URL regular expression (#24290, #24642)
-* [tv4] Fix ISM formats extraction (#24667)
-* [tele5] Fix extraction (#24553)
-+ [mofosex] Add support for generic embeds (#24633)
-+ [youporn] Add support for generic embeds
-+ [spankwire] Add support for generic embeds (#24633)
-* [spankwire] Fix extraction (#18924, #20648)
-
-
-version 2020.03.24
-
-Core
-- [utils] Revert support for cookie files with spaces used instead of tabs
-
-Extractors
-* [teachable] Update upskillcourses and gns3 domains
-* [generic] Look for teachable embeds before wistia
-+ [teachable] Extract chapter metadata (#24421)
-+ [bilibili] Add support for player.bilibili.com (#24402)
-+ [bilibili] Add support for new URL schema with BV ids (#24439, #24442)
-* [limelight] Remove disabled API requests (#24255)
-* [soundcloud] Fix download URL extraction (#24394)
-+ [cbc:watch] Add support for authentication (#19160)
-* [hellporno] Fix extraction (#24399)
-* [xtube] Fix formats extraction (#24348)
-* [ndr] Fix extraction (#24326)
-* [nhk] Update m3u8 URL and use native HLS downloader (#24329)
-- [nhk] Remove obsolete rtmp formats (#24329)
-* [nhk] Relax URL regular expression (#24329)
-- [vimeo] Revert fix showcase password protected video extraction (#24224)
-
-
-version 2020.03.08
-
-Core
-+ [utils] Add support for cookie files with spaces used instead of tabs
-
-Extractors
-+ [pornhub] Add support for pornhubpremium.com (#24288)
-- [youtube] Remove outdated code and unnecessary requests
-* [youtube] Improve extraction in 429 HTTP error conditions (#24283)
-* [nhk] Update API version (#24270)
-
-
-version 2020.03.06
-
-Extractors
-* [youtube] Fix age-gated videos support without login (#24248)
-* [vimeo] Fix showcase password protected video extraction (#24224)
-* [pornhub] Improve title extraction (#24184)
-* [peertube] Improve extraction (#23657)
-+ [servus] Add support for new URL schema (#23475, #23583, #24142)
-* [vimeo] Fix subtitles URLs (#24209)
-
-
-version 2020.03.01
-
-Core
-* [YoutubeDL] Force redirect URL to unicode on python 2
-- [options] Remove duplicate short option -v for --version (#24162)
-
-Extractors
-* [xhamster] Fix extraction (#24205)
-* [franceculture] Fix extraction (#24204)
-+ [telecinco] Add support for article opening videos
-* [telecinco] Fix extraction (#24195)
-* [xtube] Fix metadata extraction (#21073, #22455)
-* [youjizz] Fix extraction (#24181)
-- Remove no longer needed compat_str around geturl
-* [pornhd] Fix extraction (#24128)
-+ [teachable] Add support for multiple videos per lecture (#24101)
-+ [wistia] Add support for multiple generic embeds (#8347, #11385)
-* [imdb] Fix extraction (#23443)
-* [tv2dk:bornholm:play] Fix extraction (#24076)
-
-
-version 2020.02.16
-
-Core
-* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, #10622)
-* [update] Fix updating via symlinks (#23991)
-+ [compat] Introduce compat_realpath (#23991)
-
-Extractors
-+ [npr] Add support for streams (#24042)
-+ [24video] Add support for porn.24video.net (#23779, #23784)
-- [jpopsuki] Remove extractor (#23858)
-* [nova] Improve extraction (#23690)
-* [nova:embed] Improve (#23690)
-* [nova:embed] Fix extraction (#23672)
-+ [abc:iview] Add support for 720p (#22907, #22921)
-* [nytimes] Improve format sorting (#24010)
-+ [toggle] Add support for mewatch.sg (#23895, #23930)
-* [thisoldhouse] Fix extraction (#23951)
-+ [popcorntimes] Add support for popcorntimes.tv (#23949)
-* [sportdeutschland] Update to new API
-* [twitch:stream] Lowercase channel id for stream request (#23917)
-* [tv5mondeplus] Fix extraction (#23907, #23911)
-* [tva] Relax URL regular expression (#23903)
-* [vimeo] Fix album extraction (#23864)
-* [viewlift] Improve extraction
-    * Fix extraction (#23851)
-    + Add support for authentication
-    + Add support for more domains
-* [svt] Fix series extraction (#22297)
-* [svt] Fix article extraction (#22897, #22919)
-* [soundcloud] Improve private playlist/set tracks extraction (#3707)
-
-
-version 2020.01.24
-
-Extractors
-* [youtube] Fix sigfunc name extraction (#23819)
-* [stretchinternet] Fix extraction (#4319)
-* [voicerepublic] Fix extraction
-* [azmedien] Fix extraction (#23783)
-* [businessinsider] Fix jwplatform id extraction (#22929, #22954)
-+ [24video] Add support for 24video.vip (#23753)
-* [ivi:compilation] Fix entries extraction (#23770)
-* [ard] Improve extraction (#23761)
-    * Simplify extraction
-    + Extract age limit and series
-    * Bypass geo-restriction
-+ [nbc] Add support for nbc multi network URLs (#23049)
-* [americastestkitchen] Fix extraction
-* [zype] Improve extraction
-    + Extract subtitles (#21258)
-    + Support URLs with alternative keys/tokens (#21258)
-    + Extract more metadata
-* [orf:tvthek] Improve geo restricted videos detection (#23741)
-* [soundcloud] Restore previews extraction (#23739)
-
-
-version 2020.01.15
-
-Extractors
-* [yourporn] Fix extraction (#21645, #22255, #23459)
-+ [canvas] Add support for new API endpoint (#17680, #18629)
-* [ndr:base:embed] Improve thumbnails extraction (#23731)
-+ [vodplatform] Add support for embed.kwikmotion.com domain
-+ [twitter] Add support for promo_video_website cards (#23711)
-* [orf:radio] Clean description and improve extraction
-* [orf:fm4] Fix extraction (#23599)
-* [safari] Fix kaltura session extraction (#23679, #23670)
-* [lego] Fix extraction and extract subtitle (#23687)
-* [cloudflarestream] Improve extraction
-    + Add support for bytehighway.net domain
-    + Add support for signed URLs
-    + Extract thumbnail
-* [naver] Improve extraction
-    * Improve geo-restriction handling
-    + Extract automatic captions
-    + Extract uploader metadata
-    + Extract VLive HLS formats
-    * Improve metadata extraction
-- [pandatv] Remove extractor (#23630)
-* [dctp] Fix format extraction (#23656)
-+ [scrippsnetworks] Add support for www.discovery.com videos
-* [discovery] Fix anonymous token extraction (#23650)
-* [nrktv:seriebase] Fix extraction (#23625, #23537)
-* [wistia] Improve format extraction and extract subtitles (#22590)
-* [vice] Improve extraction (#23631)
-* [redtube] Detect private videos (#23518)
-
-
-version 2020.01.01
-
-Extractors
-* [brightcove] Invalidate policy key cache on failing requests
-* [pornhub] Improve locked videos detection (#22449, #22780)
-+ [pornhub] Add support for m3u8 formats
-* [pornhub] Fix extraction (#22749, #23082)
-* [brightcove] Update policy key on failing requests
-* [spankbang] Improve removed video detection (#23423)
-* [spankbang] Fix extraction (#23307, #23423, #23444)
-* [soundcloud] Automatically update client id on failing requests
-* [prosiebensat1] Improve geo restriction handling (#23571)
-* [brightcove] Cache brightcove player policy keys
-* [teachable] Fail with error message if no video URL found
-* [teachable] Improve locked lessons detection (#23528)
-+ [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981)
-* [mitele] Fix extraction (#21354, #23456)
-* [soundcloud] Update client id (#23516)
-* [mailru] Relax URL regular expressions (#23509)
-
-
-version 2019.12.25
-
-Core
-* [utils] Improve str_to_int
-+ [downloader/hls] Add ability to override AES decryption key URL (#17521)
-
-Extractors
-* [mediaset] Fix parse formats (#23508)
-+ [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291)
-+ [slideslive] Add support for url and vimeo service names (#23414)
-* [slideslive] Fix extraction (#23413)
-* [twitch:clips] Fix extraction (#23375)
-+ [soundcloud] Add support for token protected embeds (#18954)
-* [vk] Improve extraction
-    * Fix User Videos extraction (#23356)
-    * Extract all videos for lists with more than 1000 videos (#23356)
-    + Add support for video albums (#14327, #14492)
-- [kontrtube] Remove extractor
-- [videopremium] Remove extractor
-- [musicplayon] Remove extractor (#9225)
-+ [ufctv] Add support for ufcfightpass.imgdge.com and ufcfightpass.imggaming.com (#23343)
-+ [twitch] Extract m3u8 formats frame rate (#23333)
-+ [imggaming] Add support for playlists and extract subtitles
-+ [ufcarabia] Add support for UFC Arabia (#23312)
-* [ufctv] Fix extraction
-* [yahoo] Fix gyao brightcove player id (#23303)
-* [vzaar] Override AES decryption key URL (#17521)
-+ [vzaar] Add support for AES HLS manifests (#17521, #23299)
-* [nrl] Fix extraction
-* [teachingchannel] Fix extraction
-* [nintendo] Fix extraction and partially add support for Nintendo Direct videos (#4592)
-+ [ooyala] Add better fallback values for domain and streams variables
-+ [youtube] Add support for youtubekids.com (#23272)
-* [tv2] Detect DRM protection
-+ [tv2] Add support for katsomo.fi and mtv.fi (#10543)
-* [tv2] Fix tv2.no article extraction
-* [msn] Improve extraction
-    + Add support for YouTube and NBCSports embeds
-    + Add support for articles with multiple videos
-    * Improve AOL embed support
-    * Improve format extraction
-* [abcotvs] Relax URL regular expression and improve metadata extraction (#18014)
-* [channel9] Reduce response size
-* [adobetv] Improve extraction
-    * Use OnDemandPagedList for list extractors
-    * Reduce show extraction requests
-    * Extract original video format and subtitles
-    + Add support for adobe tv embeds
-
-
-version 2019.11.28
-
-Core
-+ [utils] Add generic caesar cipher and rot47
-* [utils] Handle rd-suffixed day parts in unified_strdate (#23199)
-
-Extractors
-* [vimeo] Improve extraction
-    * Fix review extraction
-    * Fix ondemand extraction
-    * Make password protected player case as an expected error (#22896)
-    * Simplify channel based extractors code
-- [openload] Remove extractor (#11999)
-- [verystream] Remove extractor
-- [streamango] Remove extractor (#15406)
-* [dailymotion] Improve extraction
-    * Extract http formats included in m3u8 manifest
-    * Fix user extraction (#3553, #21415)
-    + Add support for User Authentication (#11491)
-    * Fix password protected videos extraction (#23176)
-    * Respect age limit option and family filter cookie value (#18437)
-    * Handle video url playlist query param
-    * Report allowed countries for geo-restricted videos
-* [corus] Improve extraction
-    + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com and disneylachaine.ca (#20861)
-    + Add support for self hosted videos (#22075)
-    * Detect DRM protection (#14910, #9164)
-* [vivo] Fix extraction (#22328, #22279)
-+ [bitchute] Extract upload date (#22990, #23193)
-* [soundcloud] Update client id (#23214)
-
-
-version 2019.11.22
-
-Core
-+ [extractor/common] Clean jwplayer description HTML tags
-+ [extractor/common] Add data, headers and query to all major extract formats methods
-
-Extractors
-* [chaturbate] Fix extraction (#23010, #23012)
-+ [ntvru] Add support for non relative file URLs (#23140)
-* [vk] Fix wall audio thumbnails extraction (#23135)
-* [ivi] Fix format extraction (#21991)
-- [comcarcoff] Remove extractor
-+ [drtv] Add support for new URL schema (#23059)
-+ [nexx] Add support for Multi Player JS Setup (#23052)
-+ [teamcoco] Add support for new videos (#23054)
-* [soundcloud] Check if the soundtrack has downloads left (#23045)
-* [facebook] Fix posts video data extraction (#22473)
-- [addanime] Remove extractor
-- [minhateca] Remove extractor
-- [daisuki] Remove extractor
-* [seeker] Fix extraction
-- [revision3] Remove extractors
-* [twitch] Fix video comments URL (#18593, #15828)
-* [twitter] Improve extraction
-    + Add support for generic embeds (#22168)
-    * Always extract http formats for native videos (#14934)
-    + Add support for Twitter Broadcasts (#21369)
-    + Extract more metadata
-    * Improve VMap format extraction
-    * Unify extraction code for both twitter statuses and cards
-+ [twitch] Add support for Clip embed URLs
-* [lnkgo] Fix extraction (#16834)
-* [mixcloud] Improve extraction
-    * Improve metadata extraction (#11721)
-    * Fix playlist extraction (#22378)
-    * Fix user mixes extraction (#15197, #17865)
-+ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384)
-* [onionstudios] Fix extraction
-+ [hotstar] Pass Referer header to format requests (#22836)
-* [dplay] Minimize response size
-+ [patreon] Extract uploader_id and filesize
-* [patreon] Minimize response size
-* [roosterteeth] Fix login request (#16094, #22689)
-
-
-version 2019.11.05
-
-Extractors
support for learning.scte.org (#22975) -+ [msn] Add support for Vidible and AOL embeds (#22195, #22227) -* [myspass] Fix video URL extraction and improve metadata extraction (#22448) -* [jamendo] Improve extraction - * Fix album extraction (#18564) - * Improve metadata extraction (#18565, #21379) -* [mediaset] Relax URL guid matching (#18352) -+ [mediaset] Extract unprotected M3U and MPD manifests (#17204) -* [telegraaf] Fix extraction -+ [bellmedia] Add support for marilyn.ca videos (#22193) -* [stv] Fix extraction (#22928) -- [iconosquare] Remove extractor -- [keek] Remove extractor -- [gameone] Remove extractor (#21778) -- [flipagram] Remove extractor -- [bambuser] Remove extractor -* [wistia] Reduce embed extraction false positives -+ [wistia] Add support for inline embeds (#22931) -- [go90] Remove extractor -* [kakao] Remove raw request -+ [kakao] Extract format total bitrate -* [daum] Fix VOD and Clip extraction (#15015) -* [kakao] Improve extraction - + Add support for embed URLs - + Add support for Kakao Legacy vid based embed URLs - * Only extract fields used for extraction - * Strip description and extract tags -* [mixcloud] Fix cloudcast data extraction (#22821) -* [yahoo] Improve extraction - + Add support for live streams (#3597, #3779, #22178) - * Bypass cookie consent page for european domains (#16948, #22576) - + Add generic support for embeds (#20332) -* [tv2] Fix and improve extraction (#22787) -+ [tv2dk] Add support for TV2 DK sites -* [onet] Improve extraction … - + Add support for onet100.vod.pl - + Extract m3u8 formats - * Correct audio only format info -* [fox9] Fix extraction - - -version 2019.10.29 - -Core -* [utils] Actualize major IPv4 address blocks per country - -Extractors -+ [go] Add support for abc.com and freeform.com (#22823, #22864) -+ [mtv] Add support for mtvjapan.com -* [mtv] Fix extraction for mtv.de (#22113) -* [videodetective] Fix extraction -* [internetvideoarchive] Fix extraction -* [nbcnews] Fix extraction (#12569, #12576, #21703, #21923) -- [hark] Remove extractor -- [tutv] Remove extractor -- [learnr] Remove extractor -- [macgamestore] Remove extractor -* [la7] Update Kaltura service URL (#22358) -* [thesun] Fix extraction (#16966) -- [makertv] Remove extractor -+ [tenplay] Add support for 10play.com.au (#21446) -* [soundcloud] Improve extraction - * Improve format extraction (#22123) - + Extract uploader_id and uploader_url (#21916) - + Extract all known thumbnails (#19071, #20659) - * Fix extraction for private playlists (#20976) - + Add support for playlist embeds (#20976) - * Skip preview formats (#22806) -* [dplay] Improve extraction - + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969) - * Fix it.dplay.com extraction (#22826) - + Extract creator, tags and thumbnails - * Handle playback API call errors -+ [discoverynetworks] Add support for dplay.co.uk -* [vk] Improve extraction - + Add support for Odnoklassniki embeds - + Extract more videos from user lists (#4470) - + Fix wall post audio extraction (#18332) - * Improve error detection (#22568) -+ [odnoklassniki] Add support for embeds -* [puhutv] Improve extraction - * Fix subtitles extraction - * Transform HLS URLs to HTTP URLs - * Improve metadata extraction -* [ceskatelevize] Skip DRM media -+ [facebook] Extract subtitles (#22777) -* [globo] Handle alternative hash signing method - - -version 2019.10.22 - -Core -* [utils] Improve subtitles_filename (#22753) - -Extractors -* [facebook] Bypass download rate limits (#21018) -+ [contv] Add support for contv.com -- [viewster] 
Remove extractor
-* [xfileshare] Improve extractor (#17032, #17906, #18237, #18239)
- * Update the list of domains
- + Add support for aa-encoded video data
- * Improve jwplayer format extraction
- + Add support for Clappr sources
-* [mangomolo] Fix video format extraction and add support for player URLs
-* [audioboom] Improve metadata extraction
-* [twitch] Update VOD URL matching (#22395, #22727)
-- [mit] Remove support for video.mit.edu (#22403)
-- [servingsys] Remove extractor (#22639)
-* [dumpert] Fix extraction (#22428, #22564)
-* [atresplayer] Fix extraction (#16277, #16716)
-
-
-version 2019.10.16
-
-Core
-* [extractor/common] Make _is_valid_url more relaxed
-
-Extractors
-* [vimeo] Improve album videos id extraction (#22599)
-+ [globo] Extract subtitles (#22713)
-* [bokecc] Improve player params extraction (#22638)
-* [nexx] Handle result list (#22666)
-* [vimeo] Fix VHX embed extraction
-* [nbc] Switch to graphql API (#18581, #22693, #22701)
-- [vessel] Remove extractor
-- [promptfile] Remove extractor (#6239)
-* [kaltura] Fix service URL extraction (#22658)
-* [kaltura] Fix embed info strip (#22658)
-* [globo] Fix format extraction (#20319)
-* [redtube] Improve metadata extraction (#22492, #22615)
-* [pornhub:uservideos:upload] Fix extraction (#22619)
-+ [telequebec:squat] Add support for squat.telequebec.tv (#18503)
-- [wimp] Remove extractor (#22088, #22091)
-+ [gfycat] Extend URL regular expression (#22225)
-+ [chaturbate] Extend URL regular expression (#22309)
-* [peertube] Update instances (#22414)
-+ [telequebec] Add support for coucou.telequebec.tv (#22482)
-+ [xvideos] Extend URL regular expression (#22471)
-- [youtube] Remove support for invidious.enkirton.net (#22543)
-+ [openload] Add support for oload.monster (#22592)
-* [nrktv:seriebase] Fix extraction (#22596)
-+ [youtube] Add support for yt.lelux.fi (#22597)
-* [orf:tvthek] Make manifest requests non fatal (#22578)
-* [teachable] Skip login when already logged in (#22572)
-* [viewlift] Improve extraction (#22545)
-* [nonktube] Fix extraction (#22544)
-
-
-version 2019.09.28
-
-Core
-* [YoutubeDL] Honour all --get-* options with --flat-playlist (#22493)
-
-Extractors
-* [vk] Fix extraction (#22522)
-* [heise] Fix kaltura embeds extraction (#22514)
-* [ted] Check for resources validity and extract subtitled downloads (#22513)
-+ [youtube] Add support for
- owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya.b32.i2p (#22292)
-+ [nhk] Add support for clips
-* [nhk] Fix video extraction (#22249, #22353)
-* [byutv] Fix extraction (#22070)
-+ [openload] Add support for oload.online (#22304)
-+ [youtube] Add support for invidious.drycat.fr (#22451)
-* [jwplatform] Do not match video URLs (#20596, #22148)
-* [youtube:playlist] Unescape playlist uploader (#22483)
-+ [bilibili] Add support for audio albums and songs (#21094)
-+ [instagram] Add support for tv URLs
-+ [mixcloud] Allow uppercase letters in format URLs (#19280)
-* [brightcove] Delegate all supported legacy URLs to new extractor (#11523,
- #12842, #13912, #15669, #16303)
-* [hotstar] Use native HLS downloader by default
-+ [hotstar] Extract more formats (#22323)
-* [9now] Fix extraction (#22361)
-* [zdf] Bypass geo restriction
-+ [tv4] Extract series metadata
-* [tv4] Fix extraction (#22443)
-
-
-version 2019.09.12.1
-
-Extractors
-* [youtube] Remove quality and tbr for itag 43 (#22372)
-
-
-version 2019.09.12
-
-Extractors
-* [youtube] Quick extraction tempfix (#22367, #22163)
-
-
-version 2019.09.01
-
-Core
-+ [extractor/generic] Add support for squarespace embeds 
(#21294, #21802,
- #21859)
-+ [downloader/external] Respect mtime option for aria2c (#22242)
-
-Extractors
-+ [xhamster:user] Add support for user pages (#16330, #18454)
-+ [xhamster] Add support for more domains
-+ [verystream] Add support for woof.tube (#22217)
-+ [dailymotion] Add support for lequipe.fr (#21328, #22152)
-+ [openload] Add support for oload.vip (#22205)
-+ [bbccouk] Extend URL regular expression (#19200)
-+ [youtube] Add support for invidious.nixnet.xyz and yt.elukerio.org (#22223)
-* [safari] Fix authentication (#22161, #22184)
-* [usanetwork] Fix extraction (#22105)
-+ [einthusan] Add support for einthusan.ca (#22171)
-* [youtube] Improve unavailable message extraction (#22117)
-+ [piksel] Extract subtitles (#20506)
-
-
-version 2019.08.13
-
-Core
-* [downloader/fragment] Fix ETA calculation of resumed download (#21992)
-* [YoutubeDL] Check annotations availability (#18582)
-
-Extractors
-* [youtube:playlist] Improve flat extraction (#21927)
-* [youtube] Fix annotations extraction (#22045)
-+ [discovery] Extract series meta field (#21808)
-* [youtube] Improve error detection (#16445)
-* [vimeo] Fix album extraction (#1933, #15704, #15855, #18967, #21986)
-+ [roosterteeth] Add support for watch URLs
-* [discovery] Limit video data by show slug (#21980)
-
-
-version 2019.08.02
-
-Extractors
-+ [tvigle] Add support for HLS and DASH formats (#21967)
-* [tvigle] Fix extraction (#21967)
-+ [yandexvideo] Add support for DASH formats (#21971)
-* [discovery] Use API call for video data extraction (#21808)
-+ [mgtv] Extract format_note (#21881)
-* [tvn24] Fix metadata extraction (#21833, #21834)
-* [dlive] Relax URL regular expression (#21909)
-+ [openload] Add support for oload.best (#21913)
-* [youtube] Improve metadata extraction for age gate content (#21943)
-
-
-version 2019.07.30
-
-Extractors
-* [youtube] Fix and improve title and description extraction (#21934)
-
-
-version 2019.07.27
-
-Extractors
-+ [yahoo:japannews] Add support for yahoo.co.jp (#21698, #21265)
-+ [discovery] Add support for go.discovery.com URLs
-* [youtube:playlist] Relax video regular expression (#21844)
-* [generic] Restrict --default-search schemeless URLs detection pattern
- (#21842)
-* [vrv] Fix CMS signing query extraction (#21809)
-
-
-version 2019.07.16
-
-Extractors
-+ [asiancrush] Add support for yuyutv.com, midnightpulp.com and cocoro.tv
- (#21281, #21290)
-* [kaltura] Check source format URL (#21290)
-* [ctsnews] Fix YouTube embeds extraction (#21678)
-+ [einthusan] Add support for einthusan.com (#21748, #21775)
-+ [youtube] Add support for invidious.mastodon.host (#21777)
-+ [gfycat] Extend URL regular expression (#21779, #21780)
-* [youtube] Restrict is_live extraction (#21782)
-
-
-version 2019.07.14
-
-Extractors
-* [porn91] Fix extraction (#21312)
-+ [yandexmusic] Extract track number and disk number (#21421)
-+ [yandexmusic] Add support for multi disk albums (#21420, #21421)
-* [lynda] Handle missing subtitles (#20490, #20513)
-+ [youtube] Add more invidious instances to URL regular expression (#21694)
-* [twitter] Improve uploader id extraction (#21705)
-* [spankbang] Fix and improve metadata extraction
-* [spankbang] Fix extraction (#21763, #21764)
-+ [dlive] Add support for dlive.tv (#18080)
-+ [livejournal] Add support for livejournal.com (#21526)
-* [roosterteeth] Fix free episode extraction (#16094)
-* [dbtv] Fix extraction
-* [bellator] Fix extraction
-- [rudo] Remove extractor (#18430, #18474)
-* [facebook] Fallback to twitter:image meta for thumbnail extraction (#21224)
-* 
[bleacherreport] Fix Bleacher Report CMS extraction
-* [espn] Fix fivethirtyeight.com extraction
-* [5tv] Relax video URL regular expression and support https URLs
-* [youtube] Fix is_live extraction (#21734)
-* [youtube] Fix authentication (#11270)
-
-
-version 2019.07.12
-
-Core
-+ [adobepass] Add support for AT&T U-verse (mso ATT) (#13938, #21016)
-
-Extractors
-+ [mgtv] Pass Referer HTTP header for format URLs (#21726)
-+ [beeg] Add support for api/v6 v2 URLs without t argument (#21701)
-* [voxmedia:volume] Improve vox embed extraction (#16846)
-* [funnyordie] Move extraction to VoxMedia extractor (#16846)
-* [gameinformer] Fix extraction (#8895, #15363, #17206)
-* [funk] Fix extraction (#17915)
-* [packtpub] Relax lesson URL regular expression (#21695)
-* [packtpub] Fix extraction (#21268)
-* [philharmoniedeparis] Relax URL regular expression (#21672)
-* [peertube] Detect embed URLs in generic extraction (#21666)
-* [mixer:vod] Relax URL regular expression (#21657, #21658)
-+ [lecturio] Add support for id based URLs (#21630)
-+ [go] Add site info for disneynow (#21613)
-* [ted] Restrict info regular expression (#21631)
-* [twitch:vod] Actualize m3u8 URL (#21538, #21607)
-* [vzaar] Fix videos with empty title (#21606)
-* [tvland] Fix extraction (#21384)
-* [arte] Clean extractor (#15583, #21614)
-
-
-version 2019.07.02
-
-Core
-+ [utils] Introduce random_user_agent and use as default User-Agent (#21546)
-
-Extractors
-+ [vevo] Add support for embed.vevo.com URLs (#21565)
-+ [openload] Add support for oload.biz (#21574)
-* [xiami] Update API base URL (#21575)
-* [yourporn] Fix extraction (#21585)
-+ [acast] Add support for URLs with episode id (#21444)
-+ [dailymotion] Add support for DM.player embeds
-* [soundcloud] Update client id
-
-
-version 2019.06.27
-
-Extractors
-+ [go] Add support for disneynow.com (#21528)
-* [mixer:vod] Relax URL regular expression (#21531, #21536)
-* [drtv] Relax URL regular expression
-* [fusion] Fix extraction (#17775, #21269)
-- [nfb] Remove extractor (#21518)
-+ [beeg] Add support for api/v6 v2 URLs (#21511)
-+ [brightcove:new] Add support for playlists (#21331)
-+ [openload] Add support for oload.life (#21495)
-* [vimeo:channel,group] Make title extraction non fatal
-* [vimeo:likes] Implement extractor in terms of channel extractor (#21493)
-+ [pornhub] Add support for more paged video sources
-+ [pornhub] Add support for downloading single pages and search pages (#15570)
-* [pornhub] Rework extractors (#11922, #16078, #17454, #17936)
-+ [youtube] Add another signature function pattern
-* [tf1] Fix extraction (#21365, #21372)
-* [crunchyroll] Move Accept-Language workaround to video extractor since
- it causes playlists not to list any videos
-* [crunchyroll:playlist] Fix and relax title extraction (#21291, #21443)
-
-
-version 2019.06.21
-
-Core
-* [utils] Restrict parse_codecs and add theora as known vcodec (#21381)
-
-Extractors
-* [youtube] Update signature function patterns (#21469, #21476)
-* [youtube] Make --write-annotations non fatal (#21452)
-+ [sixplay] Add support for rtlmost.hu (#21405)
-* [youtube] Hardcode codec metadata for av01 video only formats (#21381)
-* [toutv] Update client key (#21370)
-+ [biqle] Add support for new embed domain
-* [cbs] Improve DRM protected videos detection (#21339)
-
-
-version 2019.06.08
-
-Core
-* [downloader/common] Improve rate limit (#21301)
-* [utils] Improve strip_or_none
-* [extractor/common] Strip src attribute for HTML5 entries code (#18485,
- #21169)
-
-Extractors
-* [ted] Fix playlist extraction 
(#20844, #21032) -* [vlive:playlist] Fix video extraction when no playlist is found (#20590) -+ [vlive] Add CH+ support (#16887, #21209) -+ [openload] Add support for oload.website (#21329) -+ [tvnow] Extract HD formats (#21201) -+ [redbulltv] Add support for rrn:content URLs (#21297) -* [youtube] Fix average rating extraction (#21304) -+ [bitchute] Extract HTML5 formats (#21306) -* [cbsnews] Fix extraction (#9659, #15397) -* [vvvvid] Relax URL regular expression (#21299) -+ [prosiebensat1] Add support for new API (#21272) -+ [vrv] Extract adaptive_hls formats (#21243) -* [viki] Switch to HTTPS (#21001) -* [LiveLeak] Check if the original videos exist (#21206, #21208) -* [rtp] Fix extraction (#15099) -* [youtube] Improve DRM protected videos detection (#1774) -+ [srgssrplay] Add support for popupvideoplayer URLs (#21155) -+ [24video] Add support for porno.24video.net (#21194) -+ [24video] Add support for 24video.site (#21193) -- [pornflip] Remove extractor -- [criterion] Remove extractor (#21195) -* [pornhub] Use HTTPS (#21061) -* [bitchute] Fix uploader extraction (#21076) -* [streamcloud] Reduce waiting time to 6 seconds (#21092) -- [novamov] Remove extractors (#21077) -+ [openload] Add support for oload.press (#21135) -* [vivo] Fix extraction (#18906, #19217) - - -version 2019.05.20 - -Core -+ [extractor/common] Move workaround for applying first Set-Cookie header - into a separate _apply_first_set_cookie_header method - -Extractors -* [safari] Fix authentication (#21090) -* [vk] Use _apply_first_set_cookie_header -* [vrt] Fix extraction (#20527) -+ [canvas] Add support for vrtnieuws and sporza site ids and extract - AES HLS formats -+ [vrv] Extract captions (#19238) -* [tele5] Improve video id extraction -* [tele5] Relax URL regular expression (#21020, #21063) -* [svtplay] Update API URL (#21075) -+ [yahoo:gyao] Add X-User-Agent header to dam proxy requests (#21071) - - -version 2019.05.11 - -Core -* [utils] Transliterate "þ" as "th" (#20897) - -Extractors -+ [cloudflarestream] Add support for videodelivery.net (#21049) -+ [byutv] Add support for DVR videos (#20574, #20676) -+ [gfycat] Add support for URLs with tags (#20696, #20731) -+ [openload] Add support for verystream.com (#20701, #20967) -* [youtube] Use sp field value for signature field name (#18841, #18927, - #21028) -+ [yahoo:gyao] Extend URL regular expression (#21008) -* [youtube] Fix channel id extraction (#20982, #21003) -+ [sky] Add support for news.sky.com (#13055) -+ [youtube:entrylistbase] Retry on 5xx HTTP errors (#20965) -+ [francetvinfo] Extend video id extraction (#20619, #20740) -* [4tube] Update token hosts (#20918) -* [hotstar] Move to API v2 (#20931) -* [fox] Fix API error handling under python 2 (#20925) -+ [redbulltv] Extend URL regular expression (#20922) - - -version 2019.04.30 - -Extractors -* [openload] Use real Chrome versions (#20902) -- [youtube] Remove info el for get_video_info request -* [youtube] Improve extraction robustness -- [dramafever] Remove extractor (#20868) -* [adn] Fix subtitle extraction (#12724) -+ [ccc] Extract creator (#20355) -+ [ccc:playlist] Add support for media.ccc.de playlists (#14601, #20355) -+ [sverigesradio] Add support for sverigesradio.se (#18635) -+ [cinemax] Add support for cinemax.com -* [sixplay] Try extracting non-DRM protected manifests (#20849) -+ [youtube] Extract Youtube Music Auto-generated metadata (#20599, #20742) -- [wrzuta] Remove extractor (#20684, #20801) -* [twitch] Prefer source format (#20850) -+ [twitcasting] Add support for private videos (#20843) -* 
[reddit] Validate thumbnail URL (#20030)
-* [yandexmusic] Fix track URL extraction (#20820)
-
-
-version 2019.04.24
-
-Extractors
-* [youtube] Fix extraction (#20758, #20759, #20761, #20762, #20764, #20766,
- #20767, #20769, #20771, #20768, #20770)
-* [toutv] Fix extraction and extract series info (#20757)
-+ [vrv] Add support for movie listings (#19229)
-+ [youtube] Print error when no data is available (#20737)
-+ [soundcloud] Add support for new rendition and improve extraction (#20699)
-+ [ooyala] Add support for geo verification proxy
-+ [nrl] Add support for nrl.com (#15991)
-+ [vimeo] Extract live archive source format (#19144)
-+ [vimeo] Add support for live streams and improve info extraction (#19144)
-+ [ntvcojp] Add support for cu.ntv.co.jp
-+ [nhk] Extract RTMPT format
-+ [nhk] Add support for audio URLs
-+ [udemy] Add another course id extraction pattern (#20491)
-+ [openload] Add support for oload.services (#20691)
-+ [openload] Add support for openloed.co (#20691, #20693)
-* [bravotv] Fix extraction (#19213)
-
-
-version 2019.04.17
-
-Extractors
-* [openload] Randomize User-Agent (#20688)
-+ [openload] Add support for oladblock domains (#20471)
-* [adn] Fix subtitle extraction (#12724)
-+ [aol] Add support for localized websites
-+ [yahoo] Add support for GYAO episode URLs
-+ [yahoo] Add support for streaming.yahoo.co.jp (#5811, #7098)
-+ [yahoo] Add support for gyao.yahoo.co.jp
-* [aenetworks] Fix history topic extraction and extract more formats
-+ [cbs] Extract smpte and vtt subtitles
-+ [streamango] Add support for streamcherry.com (#20592)
-+ [yourporn] Add support for sxyprn.com (#20646)
-* [mgtv] Fix extraction (#20650)
-* [linkedin:learning] Use urljoin for form action URL (#20431)
-+ [gdc] Add support for kaltura embeds (#20575)
-* [dispeak] Improve mp4 bitrate extraction
-* [kaltura] Sanitize embed URLs
-* [jwplatform] Do not match manifest URLs (#20596)
-* [aol] Restrict URL regular expression and improve format extraction
-+ [tiktok] Add support for new URL schema (#20573)
-+ [stv:player] Add support for player.stv.tv (#20586)
-
-
-version 2019.04.07
-
-Core
-+ [downloader/external] Pass rtmp_conn to ffmpeg
-
-Extractors
-+ [ruutu] Add support for audio podcasts (#20473, #20545)
-+ [xvideos] Extract all thumbnails (#20432)
-+ [platzi] Add support for platzi.com (#20562)
-* [dvtv] Fix extraction (#18514, #19174)
-+ [vrv] Add basic support for individual movie links (#19229)
-+ [bfi:player] Add support for player.bfi.org.uk (#19235)
-* [hbo] Fix extraction and extract subtitles (#14629, #13709)
-* [youtube] Extract srv[1-3] subtitle formats (#20566)
-* [adultswim] Fix extraction (#18025)
-* [teamcoco] Fix extraction and add support for subdomains (#17099, #20339)
-* [adn] Fix subtitle compatibility with ffmpeg
-* [adn] Fix extraction and add support for positioning styles (#20549)
-* [vk] Use unique video id (#17848)
-* [newstube] Fix extraction
-* [rtl2] Actualize extraction
-+ [adobeconnect] Add support for adobeconnect.com (#20283)
-+ [gaia] Add support for authentication (#14605)
-+ [mediasite] Add support for dashed ids and named catalogs (#20531)
-
-
-version 2019.04.01
-
-Core
-* [utils] Improve int_or_none and float_or_none (#20403)
-* Check for valid --min-sleep-interval when --max-sleep-interval is specified
- (#20435)
-
-Extractors
-+ [weibo] Extend URL regular expression (#20496)
-+ [xhamster] Add support for xhamster.one (#20508)
-+ [mediasite] Add support for catalogs (#20507)
-+ [teamtreehouse] Add support for teamtreehouse.com (#9836)
-+ [ina] Add 
support for audio URLs -* [ina] Improve extraction -* [cwtv] Fix episode number extraction (#20461) -* [npo] Improve DRM detection -+ [pornhub] Add support for DASH formats (#20403) -* [svtplay] Update API endpoint (#20430) - - -version 2019.03.18 - -Core -* [extractor/common] Improve HTML5 entries extraction -+ [utils] Introduce parse_bitrate -* [update] Hide update URLs behind redirect -* [extractor/common] Fix url meta field for unfragmented DASH formats (#20346) - -Extractors -+ [yandexvideo] Add extractor -* [openload] Improve embed detection -+ [corus] Add support for bigbrothercanada.ca (#20357) -+ [orf:radio] Extract series (#20012) -+ [cbc:watch] Add support for gem.cbc.ca (#20251, #20359) -- [anysex] Remove extractor (#19279) -+ [ciscolive] Add support for new URL schema (#20320, #20351) -+ [youtube] Add support for invidiou.sh (#20309) -- [anitube] Remove extractor (#20334) -- [ruleporn] Remove extractor (#15344, #20324) -* [npr] Fix extraction (#10793, #13440) -* [biqle] Fix extraction (#11471, #15313) -* [viddler] Modernize -* [moevideo] Fix extraction -* [primesharetv] Remove extractor -* [hypem] Modernize and extract more metadata (#15320) -* [veoh] Fix extraction -* [escapist] Modernize -- [videomega] Remove extractor (#10108) -+ [beeg] Add support for beeg.porn (#20306) -* [vimeo:review] Improve config url extraction and extract original format - (#20305) -* [fox] Detect geo restriction and authentication errors (#20208) - - -version 2019.03.09 - -Core -* [extractor/common] Use compat_etree_Element -+ [compat] Introduce compat_etree_Element -* [extractor/common] Fallback url to base URL for DASH formats -* [extractor/common] Do not fail on invalid data while parsing F4M manifest - in non fatal mode -* [extractor/common] Return MPD manifest as format's url meta field (#20242) -* [utils] Strip #HttpOnly_ prefix from cookies files (#20219) - -Extractors -* [francetv:site] Relax video id regular expression (#20268) -* [toutv] Detect invalid login error -* [toutv] Fix authentication (#20261) -+ [urplay] Extract timestamp (#20235) -+ [openload] Add support for oload.space (#20246) -* [facebook] Improve uploader extraction (#20250) -* [bbc] Use compat_etree_Element -* [crunchyroll] Use compat_etree_Element -* [npo] Improve ISM extraction -* [rai] Improve extraction (#20253) -* [paramountnetwork] Fix mgid extraction (#20241) -* [libsyn] Improve extraction (#20229) -+ [youtube] Add more invidious instances to URL regular expression (#20228) -* [spankbang] Fix extraction (#20023) -* [espn] Extend URL regular expression (#20013) -* [sixplay] Handle videos with empty assets (#20016) -+ [vimeo] Add support for Vimeo Pro portfolio protected videos (#20070) - - -version 2019.03.01 - -Core -+ [downloader/external] Add support for rate limit and retries for wget -* [downloader/external] Fix infinite retries for curl (#19303) - -Extractors -* [npo] Fix extraction (#20084) -* [francetv:site] Extend video id regex (#20029, #20071) -+ [periscope] Extract width and height (#20015) -* [servus] Fix extraction (#19297) -* [bbccouk] Make subtitles non fatal (#19651) -* [metacafe] Fix family filter bypass (#19287) - - -version 2019.02.18 - -Extractors -* [tvp:website] Fix and improve extraction -+ [tvp] Detect unavailable videos -* [tvp] Fix description extraction and make thumbnail optional -+ [linuxacademy] Add support for linuxacademy.com (#12207) -* [bilibili] Update keys (#19233) -* [udemy] Extend URL regular expressions (#14330, #15883) -* [udemy] Update User-Agent and detect captcha (#14713, 
#15839, #18126)
-* [noovo] Fix extraction (#19230)
-* [rai] Relax URL regular expression (#19232)
-+ [vshare] Pass Referer to download request (#19205, #19221)
-+ [openload] Add support for oload.live (#19222)
-* [imgur] Use video id as title fallback (#18590)
-+ [twitch] Add new source format detection approach (#19193)
-* [tvplayhome] Fix video id extraction (#19190)
-* [tvplayhome] Fix episode metadata extraction (#19190)
-* [rutube:embed] Fix extraction (#19163)
-+ [rutube:embed] Add support for private videos (#19163)
-+ [soundcloud] Extract more metadata
-+ [trunews] Add support for trunews.com (#19153)
-+ [linkedin:learning] Extract chapter_number and chapter_id (#19162)
-
-
-version 2019.02.08
-
-Core
-* [utils] Improve JSON-LD regular expression (#18058)
-* [YoutubeDL] Fallback to ie_key of matching extractor while making
- download archive id when no explicit ie_key is provided (#19022)
-
-Extractors
-+ [malltv] Add support for mall.tv (#18058, #17856)
-+ [spankbang:playlist] Add support for playlists (#19145)
-* [spankbang] Extend URL regular expression
-* [trutv] Fix extraction (#17336)
-* [toutv] Fix authentication (#16398, #18700)
-* [pornhub] Fix tags and categories extraction (#13720, #19135)
-* [pornhd] Fix formats extraction
-+ [pornhd] Extract like count (#19123, #19125)
-* [radiocanada] Switch to the new media requests (#19115)
-+ [teachable] Add support for courses.workitdaily.com (#18871)
-- [vporn] Remove extractor (#16276)
-+ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086)
-+ [drtuber] Extract duration (#19078)
-* [soundcloud] Fix paged playlists extraction, add support for albums and update client id
-* [soundcloud] Update client id
-* [drtv] Improve preference (#19079)
-+ [openload] Add support for openload.pw and oload.pw (#18930)
-+ [openload] Add support for oload.info (#19073)
-* [crackle] Authorize media detail request (#16931)
-
-
-version 2019.01.30.1
-
-Core
-* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067)
-
-
-version 2019.01.30
-
-Core
-* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding
- subtitles (#19024, #19042)
-* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025)
-
-Extractors
-* [yourporn] Fix extraction and extract duration (#18815, #18852, #19061)
-* [drtv] Improve extraction (#19039)
- + Add support for EncryptedUri videos
- + Extract more metadata
- * Fix subtitles extraction
-+ [fox] Add support for locked videos using cookies (#19060)
-* [fox] Fix extraction for free videos (#19060)
-+ [zattoo] Add support for tv.salt.ch (#19059)
-
-
-version 2019.01.27
-
-Core
-+ [extractor/common] Extract season in _json_ld
-* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection
- (#681)
-
-Extractors
-* [vice] Fix extraction for locked videos (#16248)
-+ [wakanim] Detect DRM protected videos
-+ [wakanim] Add support for wakanim.tv (#14374)
-* [usatoday] Fix extraction for videos with custom brightcove partner id
- (#18990)
-* [drtv] Fix extraction (#18989)
-* [nhk] Extend URL regular expression (#18968)
-* [go] Fix Adobe Pass requests for Disney Now (#18901)
-+ [openload] Add support for oload.club (#18969)
-
-
-version 2019.01.24
-
-Core
-* [YoutubeDL] Fix negation for string operators in format selection (#18961)
-
-
-version 2019.01.23
-
-Core
-* [utils] Fix urljoin for paths with non-http(s) schemes
-* [extractor/common] Improve jwplayer relative URL handling (#18892)
-+ [YoutubeDL] Add negation support for string comparisons in format 
selection
- expressions (#18600, #18805)
-* [extractor/common] Improve HLS video-only format detection (#18923)
-
-Extractors
-* [crunchyroll] Extend URL regular expression (#18955)
-* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722,
- #17197, #18338, #18842, #18899)
-+ [vrv] Add support for authentication (#14307)
-* [videomore:season] Fix extraction
-* [videomore] Improve extraction (#18908)
-+ [tnaflix] Pass Referer in metadata request (#18925)
-* [radiocanada] Relax DRM check (#18608, #18609)
-* [vimeo] Fix video password verification for videos protected by
- Referer HTTP header
-+ [hketv] Add support for hkedcity.net (#18696)
-+ [streamango] Add support for fruithosts.net (#18710)
-+ [instagram] Add support for tags (#18757)
-+ [odnoklassniki] Detect paid videos (#18876)
-* [ted] Correct acodec for HTTP formats (#18923)
-* [cartoonnetwork] Fix extraction (#15664, #17224)
-* [vimeo] Fix extraction for password protected player URLs (#18889)
-
-
-version 2019.01.17
-
-Extractors
-* [youtube] Extend JS player signature function name regular expressions
- (#18890, #18891, #18893)
-
-
-version 2019.01.16
-
-Core
-+ [test/helper] Add support for maxcount and count collection len checkers
-* [downloader/hls] Fix uplynk ad skipping (#18824)
-* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813)
-
-Extractors
-* [youtube] Skip unsupported adaptive stream type (#18804)
-+ [youtube] Extract DASH formats from player response (#18804)
-* [funimation] Fix extraction (#14089)
-* [skylinewebcams] Fix extraction (#18853)
-+ [curiositystream] Add support for non app URLs
-+ [bitchute] Check formats (#18833)
-* [wistia] Extend URL regular expression (#18823)
-+ [playplustv] Add support for playplus.com (#18789)
-
-
-version 2019.01.10
-
-Core
-* [extractor/common] Use episode name as title in _json_ld
-+ [extractor/common] Add support for movies in _json_ld
-* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes
- (#18765)
-+ [utils] Add language codes replaced in 1989 revision of ISO 639
- to ISO639Utils (#18765)
-
-Extractors
-* [youtube] Extract live HLS URL from player response (#18799)
-+ [outsidetv] Add support for outsidetv.com (#18774)
-* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs
-+ [fox] Add support for National Geographic (#17985, #15333, #14698)
-+ [playplustv] Add support for playplus.tv (#18789)
-* [globo] Set GLBID cookie manually (#17346)
-+ [gaia] Add support for gaia.com (#14605)
-* [youporn] Fix title and description extraction (#18748)
-+ [hungama] Add support for hungama.com (#17402, #18771)
-* [dtube] Fix extraction (#18741)
-* [tvnow] Fix and rework extractors and prepare for a switch to the new API
- (#17245, #18499)
-* [carambatv:page] Fix extraction (#18739)
-
-
-version 2019.01.02
-
-Extractors
-* [discovery] Use geo verification headers (#17838)
-+ [packtpub] Add support for subscription.packtpub.com (#18718)
-* [yourporn] Fix extraction (#18583)
-+ [acast:channel] Add support for play.acast.com (#18587)
-+ [extractors] Add missing age limits (#18621)
-+ [rmcdecouverte] Add support for live stream
-* [rmcdecouverte] Bypass geo restriction
-* [rmcdecouverte] Update URL regular expression (#18595, #18697)
-* [manyvids] Fix extraction (#18604, #18614)
-* [bitchute] Fix extraction (#18567)
-
-
-version 2018.12.31
-
-Extractors
-+ [bbc] Add support for another embed pattern (#18643)
-+ [npo:live] Add support for npostart.nl (#18644)
-* [beeg] Fix extraction (#18610, #18626)
-* [youtube] Unescape HTML 
for series (#18641) -+ [youtube] Extract more format metadata -* [youtube] Detect DRM protected videos (#1774) -* [youtube] Relax HTML5 player regular expressions (#18465, #18466) -* [youtube] Extend HTML5 player regular expression (#17516) -+ [liveleak] Add support for another embed type and restore original - format extraction -+ [crackle] Extract ISM and HTTP formats -+ [twitter] Pass Referer with card request (#18579) -* [mediasite] Extend URL regular expression (#18558) -+ [lecturio] Add support for lecturio.de (#18562) -+ [discovery] Add support for Scripps Networks watch domains (#17947) - - -version 2018.12.17 - -Extractors -* [ard:beta] Improve geo restricted videos extraction -* [ard:beta] Fix subtitles extraction -* [ard:beta] Improve extraction robustness -* [ard:beta] Relax URL regular expression (#18441) -* [acast] Add support for embed.acast.com and play.acast.com (#18483) -* [iprima] Relax URL regular expression (#18515, #18540) -* [vrv] Fix initial state extraction (#18553) -* [youtube] Fix mark watched (#18546) -+ [safari] Add support for learning.oreilly.com (#18510) -* [youtube] Fix multifeed extraction (#18531) -* [lecturio] Improve subtitles extraction (#18488) -* [uol] Fix format URL extraction (#18480) -+ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) - - -version 2018.12.09 - -Core -* [YoutubeDL] Keep session cookies in cookie file between runs -* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) - -Extractors -+ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) -+ [aenetworks] Add support for historyvault.com (#18460) -* [imgur] Improve gallery and album detection and extraction (#9133, #16577, - #17223, #18404) -* [iprima] Relax URL regular expression (#18453) -* [hotstar] Fix video data extraction (#18386) -* [ard:mediathek] Fix title and description extraction (#18349, #18371) -* [xvideos] Switch to HTTPS (#18422, #18427) -+ [lecturio] Add support for lecturio.com (#18405) -+ [nrktv:series] Add support for extra materials -* [nrktv:season,series] Fix extraction (#17159, #17258) -* [nrktv] Relax URL regular expression (#18304, #18387) -* [yourporn] Fix extraction (#18424, #18425) -* [tbs] Fix info extraction (#18403) -+ [gamespot] Add support for review URLs - - -version 2018.12.03 - -Core -* [utils] Fix random_birthday to generate existing dates only (#18284) - -Extractors -+ [tiktok] Add support for tiktok.com (#18108, #18135) -* [pornhub] Use actual URL host for requests (#18359) -* [lynda] Fix authentication (#18158, #18217) -* [gfycat] Update API endpoint (#18333, #18343) -+ [hotstar] Add support for alternative app state layout (#18320) -* [azmedien] Fix extraction (#18334, #18336) -+ [vimeo] Add support for VHX (Vimeo OTT) (#14835) -* [joj] Fix extraction (#18280, #18281) -+ [wistia] Add support for fast.wistia.com (#18287) - - -version 2018.11.23 - -Core -+ [setup.py] Add more relevant classifiers - -Extractors -* [mixcloud] Fallback to hardcoded decryption key (#18016) -* [nbc:news] Fix article extraction (#16194) -* [foxsports] Fix extraction (#17543) -* [loc] Relax regular expression and improve formats extraction -+ [ciscolive] Add support for ciscolive.cisco.com (#17984) -* [nzz] Relax kaltura regex (#18228) -* [sixplay] Fix formats extraction -* [bitchute] Improve title extraction -* [kaltura] Limit requested MediaEntry fields -+ [americastestkitchen] Add support for zype embeds (#18225) -+ [pornhub] Add pornhub.net alias -* [nova:embed] Fix extraction (#18222) - - -version 2018.11.18 
-
-Extractors
-+ [wwe] Extract subtitles
-+ [wwe] Add support for playlists (#14781)
-+ [wwe] Add support for wwe.com (#14781, #17450)
-* [vk] Detect geo restriction (#17767)
-* [openload] Use original host during extraction (#18211)
-* [atvat] Fix extraction (#18041)
-+ [rte] Add support for new API endpoint (#18206)
-* [tnaflixnetwork:embed] Fix extraction (#18205)
-* [picarto] Use API and add token support (#16518)
-+ [zype] Add support for player.zype.com (#18143)
-* [vivo] Fix extraction (#18139)
-* [ruutu] Update API endpoint (#18138)
-
-
-version 2018.11.07
-
-Extractors
-+ [youtube] Add another JS signature function name regex (#18091, #18093,
- #18094)
-* [facebook] Fix tahoe request (#17171)
-* [cliphunter] Fix extraction (#18083)
-+ [youtube:playlist] Add support for invidio.us (#18077)
-* [zattoo] Arrange API hosts for derived extractors (#18035)
-+ [youtube] Add fallback metadata extraction from videoDetails (#18052)
-
-
-version 2018.11.03
-
-Core
-* [extractor/common] Ensure response handle is not prematurely closed before
- it can be read if it matches expected_status (#17195, #17846, #17447)
-
-Extractors
-* [laola1tv:embed] Set correct stream access URL scheme (#16341)
-+ [ehftv] Add support for ehftv.com (#15408)
-* [azmedien] Adapt to major site redesign (#17745, #17746)
-+ [twitcasting] Add support for twitcasting.tv (#17981)
-* [orf:tvthek] Fix extraction (#17737, #17956, #18024)
-+ [openload] Add support for oload.fun (#18045)
-* [njpwworld] Fix authentication (#17427)
-+ [linkedin:learning] Add support for linkedin.com/learning (#13545)
-* [theplatform] Improve error detection (#13222)
-* [cnbc] Simplify extraction (#14280, #17110)
-+ [cnbc] Add support for new URL schema (#14193)
-* [aparat] Improve extraction and extract more metadata (#17445, #18008)
-* [aparat] Fix extraction
-
-
-version 2018.10.29
-
-Core
-+ [extractor/common] Add validation for JSON-LD URLs
-
-Extractors
-+ [sportbox] Add support for matchtv.ru
-* [sportbox] Fix extraction (#17978)
-* [screencast] Fix extraction (#14590, #14617, #17990)
-+ [openload] Add support for oload.icu
-+ [ivi] Add support for ivi.tv
-* [crunchyroll] Improve extraction failsafeness (#17991)
-* [dailymail] Fix formats extraction (#17976)
-* [viewster] Reduce format requests
-* [cwtv] Handle API errors (#17905)
-+ [rutube] Use geo verification headers (#17897)
-+ [brightcove:legacy] Add fallbacks to brightcove:new (#13912)
-- [tv3] Remove extractor (#10461, #15339)
-* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894)
-+ [openload] Add support for oload.cc (#17823)
-+ [patreon] Extract post_file URL (#17792)
-* [patreon] Fix extraction (#14502, #10471)
-
-
-version 2018.10.05
-
-Extractors
-* [pluralsight] Improve authentication (#17762)
-* [dailymotion] Fix extraction (#17699)
-* [crunchyroll] Switch to HTTPS for RpcApi (#17749)
-+ [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705)
-* [philharmoniedeparis] Fix extraction (#17705)
-+ [jamendo] Add support for licensing.jamendo.com (#17724)
-+ [openload] Add support for oload.cloud (#17710)
-* [pluralsight] Fix subtitles extraction (#17726, #17728)
-+ [vimeo] Add another config regular expression (#17690)
-* [spike] Fix Paramount Network extraction (#17677)
-* [hotstar] Fix extraction (#14694, #14931, #17637)
-
-
-version 2018.09.26
-
-Extractors
-* [pluralsight] Fix subtitles extraction (#17671)
-* [mediaset] Improve embed support (#17668)
-+ [youtube] Add support for invidio.us (#17613)
-+ [zattoo] Add support for more zattoo 
platform sites
-* [zattoo] Fix extraction (#17175, #17542)
-
-
-version 2018.09.18
-
-Core
-+ [extractor/common] Introduce channel meta fields
-
-Extractors
-* [adobepass] Don't pollute default headers dict
-* [udemy] Don't pollute default headers dict
-* [twitch] Don't pollute default headers dict
-* [youtube] Don't pollute default query dict (#17593)
-* [crunchyroll] Prefer hardsubless formats and formats in locale language
-* [vrv] Make format ids deterministic
-* [vimeo] Fix ondemand playlist extraction (#14591)
-+ [pornhub] Extract upload date (#17574)
-+ [porntube] Extract channel meta fields
-+ [vimeo] Extract channel meta fields
-+ [youtube] Extract channel meta fields (#9676, #12939)
-* [porntube] Fix extraction (#17541)
-* [asiancrush] Fix extraction (#15630)
-+ [twitch:clips] Extend URL regular expression (#17559)
-+ [vzaar] Add support for HLS
-* [tube8] Fix metadata extraction (#17520)
-* [eporner] Extract JSON-LD (#17519)
-
-
-version 2018.09.10
-
-Core
-+ [utils] Properly recognize AV1 codec (#17506)
-
-Extractors
-+ [iprima] Add support for prima.iprima.cz (#17514)
-+ [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414)
-* [nbc] Fix extraction of percent encoded URLs (#17374)
-
-
-version 2018.09.08
-
-Extractors
-* [youtube] Fix extraction (#17457, #17464)
-+ [pornhub:uservideos] Add support for new URLs (#17388)
-* [iprima] Confirm adult check (#17437)
-* [slideslive] Make check for video service name case-insensitive (#17429)
-* [radiojavan] Fix extraction (#17151)
-* [generic] Skip unsuccessful jwplayer extraction (#16735)
-
-
-version 2018.09.01
-
-Core
-* [utils] Skip remote IP addresses not matching the source address' IP version
- when creating a connection (#13422, #17362)
-
-Extractors
-+ [ard] Add support for one.ard.de (#17397)
-* [niconico] Fix extraction on python3 (#17393, #17407)
-* [ard] Extract f4m formats
-* [crunchyroll] Parse vilos media data (#17343)
-+ [ard] Add support for Beta ARD Mediathek
-+ [bandcamp] Extract more metadata (#13197)
-* [internazionale] Fix extraction of non-available-abroad videos (#17386)
-
-
-version 2018.08.28
-
-Extractors
-+ [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix)
- (#17361)
-* [bitchute] Fix extraction by passing custom User-Agent (#17360)
-* [webofstories:playlist] Fix extraction (#16914)
-+ [tvplayhome] Add support for new tvplay URLs (#17344)
-+ [generic] Allow relative src for videojs embeds (#17324)
-+ [xfileshare] Add support for vidto.se (#17317)
-+ [vidzi] Add support for vidzi.nu (#17316)
-+ [nova:embed] Add support for media.cms.nova.cz (#17282)
-
-
-version 2018.08.22
-
-Core
-* [utils] Use pure browser header for User-Agent (#17236)
-
-Extractors
-+ [kinopoisk] Add support for kinopoisk.ru (#17283)
-+ [yourporn] Add support for yourporn.sexy (#17298)
-+ [go] Add support for disneynow.go.com (#16299, #17264)
-+ [6play] Add support for play.rtl.hr (#17249)
-* [anvato] Fallback to generic API key for access-key-to-API-key lookup
- (#16788, #17254)
-* [lci] Fix extraction (#17274)
-* [bbccouk] Extend id URL regular expression (#17270)
-* [cwtv] Fix extraction (#17256)
-* [nova] Fix extraction (#17241)
-+ [generic] Add support for expressen embeds
-* [raywenderlich] Adapt to site redesign (#17225)
-+ [redbulltv] Add support for redbull.com tv URLs (#17218)
-+ [bitchute] Add support for bitchute.com (#14052)
-+ [clyp] Add support for token protected media (#17184)
-* [imdb] Fix extension extraction (#17167)
-
-
-version 2018.08.04
-
-Extractors
-* [funk:channel] Improve 
byChannelAlias extraction (#17142)
-* [twitch] Fix authentication (#17024, #17126)
-* [twitch:vod] Improve URL regular expression (#17135)
-* [watchbox] Fix extraction (#17107)
-* [pbs] Fix extraction (#17109)
-* [theplatform] Relax URL regular expression (#16181, #17097)
-+ [viqeo] Add support for viqeo.tv (#17066)
-
-
-version 2018.07.29
-
-Extractors
-* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076)
-+ [pornhub] Add support for subtitles (#16924, #17088)
-* [ceskatelevize] Use https for API call (#16997, #16999)
-* [dailymotion:playlist] Fix extraction (#16894)
-* [ted] Improve extraction
-* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085)
-* [telecinco] Fix extraction (#17080)
-* [mitele] Reduce number of requests
-* [rai] Return non HTTP relinker URL intact (#17055)
-* [vk] Fix extraction for inline only videos (#16923)
-* [streamcloud] Fix extraction (#17054)
-* [facebook] Fix tahoe player extraction with authentication (#16655)
-+ [puhutv] Add support for puhutv.com (#12712, #16010, #16269)
-
-
-version 2018.07.21
-
-Core
-+ [utils] Introduce url_or_none
-* [utils] Allow JSONP without function name (#17028)
-+ [extractor/common] Extract DASH and MSS formats from SMIL manifests
-
-Extractors
-+ [bbc] Add support for BBC Radio Play pages (#17022)
-* [iwara] Fix download URLs (#17026)
-* [vrtnu] Relax title extraction and extract JSON-LD (#17018)
-+ [viu] Pass Referer and Origin headers and area id (#16992)
-+ [vimeo] Add another config regular expression (#17013)
-+ [facebook] Extract view count (#16942)
-* [dailymotion] Improve description extraction (#16984)
-* [slutload] Fix and improve extraction (#17001)
-* [mediaset] Fix extraction (#16977)
-+ [theplatform] Add support for theplatform TLD customization (#16977)
-* [imgur] Relax URL regular expression (#16987)
-* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262,
- #16959)
-
-
-version 2018.07.10
-
-Core
-* [utils] Share JSON-LD regular expression
-* [downloader/dash] Improve error handling (#16927)
-
-Extractors
-+ [nrktv] Add support for new season and serie URL schema
-+ [nrktv] Add support for new episode URL schema (#16909)
-+ [frontendmasters] Add support for frontendmasters.com (#3661, #16328)
-* [funk] Fix extraction (#16918)
-* [watchbox] Fix extraction (#16904)
-* [dplayit] Sort formats
-* [dplayit] Fix extraction (#16901)
-* [youtube] Improve login error handling (#13822)
-
-
-version 2018.07.04
-
-Core
-* [extractor/common] Properly escape % in MPD templates (#16867)
-* [extractor/common] Use source URL as Referer for HTML5 entries (#16849)
-* Prefer ffmpeg over avconv by default (#8622)
-
-Extractors
-* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899)
-* [lynda] Simplify login and improve error capturing (#16891)
-+ [go90] Add support for embed URLs (#16873)
-* [go90] Detect geo restriction error and pass geo verification headers
- (#16874)
-* [vlive] Fix live streams extraction (#16871)
-* [npo] Fix typo (#16872)
-+ [mediaset] Add support for new videos and extract all formats (#16568)
-* [dctptv] Restore extraction based on REST API (#16850)
-* [svt] Improve extraction and add support for pages (#16802)
-* [porncom] Fix extraction (#16808)
-
-
-version 2018.06.25
-
-Extractors
-* [joj] Relax URL regular expression (#16771)
-* [brightcove] Workaround sonyliv DRM protected videos (#16807)
-* [motherless] Fix extraction (#16786)
-* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780)
-- [foxnews:insider] 
Remove extractor (#15810)
-+ [foxnews] Add support for iframe embeds (#15810, #16711)
-
-
-version 2018.06.19
-
-Core
-+ [extractor/common] Introduce expected_status in _download_* methods
- for convenient handling of HTTP requests that fail with non-2xx status codes
-+ [compat] Introduce compat_integer_types
-
-Extractors
-* [peertube] Improve generic support (#16733)
-+ [6play] Use geo verification headers
-* [rtbf] Fix extraction for Python 3.2
-* [vgtv] Improve HLS formats extraction
-+ [vgtv] Add support for www.aftonbladet.se/tv URLs
-* [bbccouk] Use expected_status
-* [markiza] Expect 500 HTTP status code
-* [tvnow] Try all clear manifest URLs (#15361)
-
-
-version 2018.06.18
-
-Core
-* [downloader/rtmp] Fix downloading in verbose mode (#16736)
-
-Extractors
-+ [markiza] Add support for markiza.sk (#16750)
-* [wat] Try all supported adaptive URLs
-+ [6play] Add support for rtlplay.be and extract hd usp formats
-+ [rtbf] Add support for audio and live streams (#9638, #11923)
-+ [rtbf] Extract HLS, DASH and all HTTP formats
-+ [rtbf] Extract subtitles
-+ [rtbf] Fixup specific HTTP URLs (#16101)
-+ [expressen] Add support for expressen.se
-* [vidzi] Fix extraction (#16678)
-* [pbs] Improve extraction (#16623, #16684)
-* [bilibili] Restrict cid regular expression (#16638, #16734)
-
-
-version 2018.06.14
-
-Core
-* [downloader/http] Fix retry on error when streaming to stdout (#16699)
-
-Extractors
-+ [discoverynetworks] Add support for disco-api videos (#16724)
-+ [dailymotion] Add support for password protected videos (#9789)
-+ [abc:iview] Add support for livestreams (#12354)
-* [abc:iview] Fix extraction (#16704)
-+ [crackle] Add support for sonycrackle.com (#16698)
-+ [tvnet] Add support for tvnet.gov.vn (#15462)
-* [nrk] Update API hosts and try all previously known ones (#16690)
-* [wimp] Fix YouTube embeds extraction
-
-
-version 2018.06.11
-
-Extractors
-* [npo] Extend URL regular expression and add support for npostart.nl (#16682)
-+ [inc] Add support for another embed schema (#16666)
-* [tv4] Fix format extraction (#16650)
-+ [nexx] Add support for free cdn (#16538)
-+ [pbs] Add another cove id pattern (#15373)
-+ [rbmaradio] Add support for 192k format (#16631)
-
-
-version 2018.06.04
-
-Extractors
-+ [camtube] Add support for camtube.co
-+ [twitter:card] Extract guest token (#16609)
-+ [chaturbate] Use geo verification headers
-+ [bbc] Add support for bbcthree (#16612)
-* [youtube] Move metadata extraction after video availability check
-+ [youtube] Extract track and artist
-+ [safari] Add support for new URL schema (#16614)
-* [adn] Fix extraction
-
-
-version 2018.06.02
-
-Core
-* [utils] Improve determine_ext
-
-Extractors
-+ [facebook] Add support for tahoe player videos (#15441, #16554)
-* [cbc] Improve extraction (#16583, #16593)
-* [openload] Improve ext extraction (#16595)
-+ [twitter:card] Add support for another endpoint (#16586)
-+ [openload] Add support for oload.win and oload.download (#16592)
-* [audimedia] Fix extraction (#15309)
-+ [francetv] Add support for sport.francetvinfo.fr (#15645)
-* [mlb] Improve extraction (#16587)
-- [nhl] Remove old extractors
-* [rbmaradio] Check formats availability (#16585)
-
-
-version 2018.05.30
-
-Core
-* [downloader/rtmp] Generalize download messages and report time elapsed
- on finish
-* [downloader/rtmp] Gracefully handle live streams interrupted by user
-
-Extractors
-* [teamcoco] Fix extraction for full episodes (#16573)
-* [spiegel] Fix info extraction (#16538)
-+ [apa] Add support for apa.at (#15041, #15672)
-+ [bellmedia] 
Add support for bnnbloomberg.ca (#16560)
-+ [9c9media] Extract MPD formats and subtitles
-* [cammodels] Use geo verification headers
-+ [ufctv] Add support for authentication (#16542)
-+ [cammodels] Add support for cammodels.com (#14499)
-* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt
- (#16551)
-* [soundcloud] Detect format extension (#16549)
-* [cbc] Fix playlist title extraction (#16502)
-+ [tumblr] Detect and report sensitive media (#13829)
-+ [tumblr] Add support for authentication (#15133)
-
-
-version 2018.05.26
-
-Core
-* [utils] Improve parse_age_limit
-
-Extractors
-* [audiomack] Stringify video id (#15310)
-* [izlesene] Fix extraction (#16233, #16271, #16407)
-+ [indavideo] Add support for generic embeds (#11989)
-* [indavideo] Fix extraction (#11221)
-* [indavideo] Sign download URLs (#16174)
-+ [peertube] Add support for PeerTube based sites (#16301, #16329)
-* [imgur] Fix extraction (#16537)
-+ [hidive] Add support for authentication (#16534)
-+ [nbc] Add support for stream.nbcsports.com (#13911)
-+ [viewlift] Add support for hoichoi.tv (#16536)
-* [go90] Extract age limit and detect DRM protection (#10127)
-* [viewlift] Fix extraction for snagfilms.com (#15766)
-* [globo] Improve extraction (#4189)
- * Add support for authentication
- * Simplify URL signing
- * Extract DASH and MSS formats
-* [leeco] Fix extraction (#16464)
-* [teamcoco] Add fallback for format extraction (#16484)
-* [teamcoco] Improve URL regular expression (#16484)
-* [imdb] Improve extraction (#4085, #14557)
-
-
-version 2018.05.18
-
-Extractors
-* [vimeo:likes] Relax URL regular expression and fix single page likes
- extraction (#16475)
-* [pluralsight] Fix clip id extraction (#16460)
-+ [mychannels] Add support for mychannels.com (#15334)
-- [moniker] Remove extractor (#15336)
-* [pbs] Fix embed data extraction (#16474)
-+ [mtv] Add support for paramountnetwork.com and bellator.com (#15418)
-* [youtube] Fix hd720 format position
-* [dailymotion] Remove fragment part from m3u8 URLs (#8915)
-* [3sat] Improve extraction (#15350)
- * Extract all formats
- * Extract more format metadata
- * Improve format sorting
- * Use hls native downloader
- * Detect and bypass geo-restriction
-+ [dtube] Add support for d.tube (#15201)
-* [options] Fix typo (#16450)
-* [youtube] Improve format filesize extraction (#16453)
-* [youtube] Make uploader extraction non fatal (#16444)
-* [youtube] Fix extraction for embed restricted live streams (#16433)
-* [nbc] Improve info extraction (#16440)
-* [twitch:clips] Fix extraction (#16429)
-* [redditr] Relax URL regular expression (#16426, #16427)
-* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424)
-+ [nick] Add support for nickjr.de (#13230)
-* [teamcoco] Fix extraction (#16374)
-
-
-version 2018.05.09
-
-Core
-* [YoutubeDL] Ensure ext exists for automatic captions
-* Introduce --geo-bypass-ip-block
-
-Extractors
-+ [udemy] Extract asset captions
-+ [udemy] Extract stream URLs (#16372)
-+ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389)
-+ [cloudflarestream] Add support for cloudflarestream.com (#16375)
-* [watchbox] Fix extraction (#16356)
-* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954)
-+ [itv:btcc] Add support for itv.com/btcc (#16139)
-* [tunein] Use live title for live streams (#16347)
-* [itv] Improve extraction (#16253)
-
-
-version 2018.05.01
-
-Core
-* [downloader/fragment] Restart download if .ytdl file is corrupt (#16312)
-+ [extractor/common] Extract interaction statistic 
-+ [utils] Add merge_dicts -+ [extractor/common] Add _download_json_handle - -Extractors -* [kaltura] Improve iframe embeds detection (#16337) -+ [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, - #16335) -+ [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) -* [yandexmusic] Convert release_year to int -* [udemy] Override _download_webpage_handle instead of _download_webpage -* [xiami] Override _download_webpage_handle instead of _download_webpage -* [yandexmusic] Override _download_webpage_handle instead of _download_webpage -* [youtube] Correctly disable polymer on all requests (#16323, #16326) -* [generic] Prefer enclosures over links in RSS feeds (#16189) -+ [redditr] Add support for old.reddit.com URLs (#16274) -* [nrktv] Update API host (#16324) -+ [imdb] Extract all formats (#16249) -+ [vimeo] Extract JSON-LD (#16295) -* [funk:channel] Improve extraction (#16285) - - -version 2018.04.25 - -Core -* [utils] Fix match_str for boolean meta fields -+ [Makefile] Add support for pandoc 2 and disable smart extension (#16251) -* [YoutubeDL] Fix typo in media extension compatibility checker (#16215) - -Extractors -+ [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, - #16250) -+ [twitch] Extract is_live according to status (#16259) -* [pornflip] Relax URL regular expression (#16258) -- [etonline] Remove extractor (#16256) -* [breakcom] Fix extraction (#16254) -+ [youtube] Add ability to authenticate with cookies -* [youtube:feed] Implement lazy playlist extraction (#10184) -+ [svt] Add support for TV channel live streams (#15279, #15809) -* [ccma] Fix video extraction (#15931) -* [rentv] Fix extraction (#15227) -+ [nick] Add support for nickjr.nl (#16230) -* [extremetube] Fix metadata extraction -+ [keezmovies] Add support for generic embeds (#16134, #16154) -* [nexx] Extract new azure URLs (#16223) -* [cbssports] Fix extraction (#16217) -* [kaltura] Improve embeds detection (#16201) -* [instagram:user] Fix extraction (#16119) -* [cbs] Skip DRM asset types (#16104) - - -version 2018.04.16 - -Extractors -* [smotri:broadcast] Fix extraction (#16180) -+ [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) -* [vine:user] Fix extraction (#15514, #16190) -* [pornhub] Relax URL regular expression (#16165) -* [cbc:watch] Re-acquire device token when expired (#16160) -+ [fxnetworks] Add support for https theplatform URLs (#16125, #16157) -+ [instagram:user] Add request signing (#16119) -+ [twitch] Add support for mobile URLs (#16146) - - -version 2018.04.09 - -Core -* [YoutubeDL] Do not save/restore console title while simulate (#16103) -* [extractor/common] Relax JSON-LD context check (#16006) - -Extractors -+ [generic] Add support for tube8 embeds -+ [generic] Add support for share-videos.se embeds (#16089, #16115) -* [odnoklassniki] Extend URL regular expression (#16081) -* [steam] Bypass mature content check (#16113) -+ [acast] Extract more metadata -* [acast] Fix extraction (#16118) -* [instagram:user] Fix extraction (#16119) -* [drtuber] Fix title extraction (#16107, #16108) -* [liveleak] Extend URL regular expression (#16117) -+ [openload] Add support for oload.xyz -* [openload] Relax stream URL regular expression -* [openload] Fix extraction (#16099) -+ [svtplay:series] Add support for season URLs -+ [svtplay:series] Add support for series (#11130, #16059) - - -version 2018.04.03 - -Extractors -+ [tvnow] Add support for shows (#15837) -* [dramafever] Fix authentication (#16067) -* [afreecatv] Use partial 
view only when necessary (#14450) -+ [afreecatv] Add support for authentication (#14450) -+ [nationalgeographic] Add support for new URL schema (#16001, #16054) -* [xvideos] Fix thumbnail extraction (#15978, #15979) -* [medialaan] Fix vod id (#16038) -+ [openload] Add support for oload.site (#16039) -* [naver] Fix extraction (#16029) -* [dramafever] Partially switch to API v5 (#16026) -* [abc:iview] Unescape title and series meta fields (#15994) -* [videa] Extend URL regular expression (#16003) - - -version 2018.03.26.1 - -Core -+ [downloader/external] Add elapsed time to progress hook (#10876) -* [downloader/external,fragment] Fix download finalization when writing file - to stdout (#10809, #10876, #15799) - -Extractors -* [vrv] Fix extraction on python2 (#15928) -* [afreecatv] Update referrer (#15947) -+ [24video] Add support for 24video.sexy (#15973) -* [crackle] Bypass geo restriction -* [crackle] Fix extraction (#15969) -+ [lenta] Add support for lenta.ru (#15953) -+ [instagram:user] Add pagination (#15934) -* [youku] Update ccode (#15939) -* [libsyn] Adapt to new page structure - - -version 2018.03.20 - -Core -* [extractor/common] Improve thumbnail extraction for HTML5 entries -* Generalize XML manifest processing code and improve XSPF parsing -+ [extractor/common] Add _download_xml_handle -+ [extractor/common] Add support for relative URIs in _parse_xspf (#15794) - -Extractors -+ [7plus] Extract series metadata (#15862, #15906) -* [9now] Bypass geo restriction (#15920) -* [cbs] Skip unavailable assets (#13490, #13506, #15776) -+ [canalc2] Add support for HTML5 videos (#15916, #15919) -+ [ceskatelevize] Add support for iframe embeds (#15918) -+ [prosiebensat1] Add support for galileo.tv (#15894) -+ [generic] Add support for xfileshare embeds (#15879) -* [bilibili] Switch to v2 playurl API -* [bilibili] Fix and improve extraction (#15048, #15430, #15622, #15863) -* [heise] Improve extraction (#15496, #15784, #15026) -* [instagram] Fix user videos extraction (#15858) - - -version 2018.03.14 - -Extractors -* [soundcloud] Update client id (#15866) -+ [tennistv] Add support for tennistv.com -+ [line] Add support for tv.line.me (#9427) -* [xnxx] Fix extraction (#15817) -* [njpwworld] Fix authentication (#15815) - - -version 2018.03.10 - -Core -* [downloader/hls] Skip uplynk ad fragments (#15748) - -Extractors -* [pornhub] Don't override session cookies (#15697) -+ [raywenderlich] Add support for videos.raywenderlich.com (#15251) -* [funk] Fix extraction and rework extractors (#15792) -* [nexx] Restore reverse engineered approach -+ [heise] Add support for kaltura embeds (#14961, #15728) -+ [tvnow] Extract series metadata (#15774) -* [ruutu] Continue formats extraction on NOT-USED URLs (#15775) -* [vrtnu] Use redirect URL for building video JSON URL (#15767, #15769) -* [vimeo] Modernize login code and improve error messaging -* [archiveorg] Fix extraction (#15770, #15772) -+ [hidive] Add support for hidive.com (#15494) -* [afreecatv] Detect deleted videos -* [afreecatv] Fix extraction (#15755) -* [vice] Fix extraction and rework extractors (#11101, #13019, #13622, #13778) -+ [vidzi] Add support for vidzi.si (#15751) -* [npo] Fix typo - - -version 2018.03.03 - -Core -+ [utils] Add parse_resolution -Revert respect --prefer-insecure while updating - -Extractors -+ [yapfiles] Add support for yapfiles.ru (#15726, #11085) -* [spankbang] Fix formats extraction (#15727) -* [adn] Fix extraction (#15716) -+ [toggle] Extract DASH and ISM formats (#15721) -+ [nickelodeon] Add support for 
nickelodeon.com.tr (#15706)
-* [npo] Validate and filter format URLs (#15709)
-
-
-version 2018.02.26
-
-Extractors
-* [udemy] Use custom User-Agent (#15571)
-
-
-version 2018.02.25
-
-Core
-* [postprocessor/embedthumbnail] Skip embedding when there aren't any
-  thumbnails (#12573)
-* [extractor/common] Improve jwplayer subtitles extraction (#15695)
-
-Extractors
-+ [vidlii] Add support for vidlii.com (#14472, #14512, #14779)
-+ [streamango] Capture and output error messages
-* [streamango] Fix extraction (#14160, #14256)
-+ [telequebec] Add support for emissions (#14649, #14655)
-+ [telequebec:live] Add support for live streams (#15688)
-+ [mailru:music] Add support for mail.ru/music (#15618)
-* [aenetworks] Switch to akamai HLS formats (#15612)
-* [ytsearch] Fix flat title extraction (#11260, #15681)
-
-
-version 2018.02.22
-
-Core
-+ [utils] Fix up some common URL typos in sanitize_url (#15649)
-* Respect --prefer-insecure while updating (#15497)
-
-Extractors
-* [vidio] Fix HLS URL extraction (#15675)
-+ [nexx] Add support for arc.nexx.cloud URLs
-* [nexx] Switch to arc API (#15652)
-* [redtube] Fix duration extraction (#15659)
-+ [sonyliv] Respect referrer (#15648)
-+ [brightcove:new] Use referrer for formats' HTTP headers
-+ [cbc] Add support for olympics.cbc.ca (#15535)
-+ [fusion] Add support for fusion.tv (#15628)
-* [npo] Improve quality metadata extraction
-* [npo] Relax URL regular expression (#14987, #14994)
-+ [npo] Capture and output error message
-+ [pornhub] Add support for channels (#15613)
-* [youtube] Handle shared URLs with generic extractor (#14303)
-
-
-version 2018.02.11
-
-Core
-+ [YoutubeDL] Add support for filesize_approx in format selector (#15550)
-
-Extractors
-+ [francetv] Add support for live streams (#13689)
-+ [francetv] Add support for zouzous.fr and ludo.fr (#10454, #13087, #13103,
-  #15012)
-* [francetv] Separate main extractor and rework others to delegate to it
-* [francetv] Improve manifest URL signing (#15536)
-+ [francetv] Sign m3u8 manifest URLs (#15565)
-+ [veoh] Add support for embed URLs (#15561)
-* [afreecatv] Fix extraction (#15556)
-* [periscope] Use accessVideoPublic endpoint (#15554)
-* [discovery] Fix auth request (#15542)
-+ [6play] Extract subtitles (#15541)
-* [newgrounds] Fix metadata extraction (#15531)
-+ [nbc] Add support for stream.nbcolympics.com (#10295)
-* [dvtv] Fix live streams extraction (#15442)
-
-
-version 2018.02.08
-
-Extractors
-+ [myvi] Extend URL regular expression
-+ [myvi:embed] Add support for myvi.tv embeds (#15521)
-+ [prosiebensat1] Extend URL regular expression (#15520)
-* [pokemon] Relax URL regular expression and extend title extraction (#15518)
-+ [gameinformer] Use geo verification headers
-* [la7] Fix extraction (#15501, #15502)
-* [gameinformer] Fix brightcove id extraction (#15416)
-+ [afreecatv] Pass referrer to video info request (#15507)
-+ [telebruxelles] Add support for live streams
-* [telebruxelles] Relax URL regular expression
-* [telebruxelles] Fix extraction (#15504)
-* [extractor/common] Respect secure schemes in _extract_wowza_formats
-
-
-version 2018.02.04
-
-Core
-* [downloader/http] Randomize HTTP chunk size
-+ [downloader/http] Add ability to pass downloader options via info dict
-* [downloader/http] Fix 302 infinite loops by not reusing requests
-+ Document http_chunk_size
-
-Extractors
-+ [brightcove] Pass embed page URL as referrer (#15486)
-+ [youtube] Enforce using chunked HTTP downloading for DASH formats
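
The downloader/http entries above describe the chunk-based downloading that the 2018.02.03 release below exposes as --http-chunk-size. A minimal sketch of the idea, assuming a server that honors Range requests; the helper name and the randomization range are illustrative, not youtube-dl's actual downloader code:

```python
import random
import urllib.error
import urllib.request

def download_chunked(url, outfile, chunk_size=10 * 1024 * 1024):
    """Fetch url in bounded Range slices, randomizing each slice size."""
    pos = 0
    with open(outfile, 'wb') as f:
        while True:
            # Randomizing the chunk size makes the request pattern less
            # uniform (illustrative range, not the real implementation's)
            size = random.randint(chunk_size // 2, chunk_size)
            req = urllib.request.Request(
                url, headers={'Range': 'bytes=%d-%d' % (pos, pos + size - 1)})
            try:
                with urllib.request.urlopen(req) as resp:
                    data = resp.read()
            except urllib.error.HTTPError as e:
                if e.code == 416:  # requested range starts past EOF: done
                    break
                raise
            f.write(data)
            pos += len(data)
            if len(data) < size:  # short read: end of file
                break
    return pos
```

Issuing a fresh request per chunk also sidesteps the 302-loop problem noted above, since no connection state is reused between chunks.
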
-
-
-version 2018.02.03
-
-Core
-+ Introduce --http-chunk-size for chunk-based HTTP downloading
-+ Add support for IronPython
-* [downloader/ism] Fix Python 3.2 support
-
-Extractors
-* [redbulltv] Fix extraction (#15481)
-* [redtube] Fix metadata extraction (#15472)
-* [pladform] Respect platform id and extract HLS formats (#15468)
-- [rtlnl] Remove progressive formats (#15459)
-* [6play] Do not modify asset URLs with a token (#15248)
-* [nationalgeographic] Relax URL regular expression
-* [dplay] Relax URL regular expression (#15458)
-* [cbsinteractive] Fix data extraction (#15451)
-+ [amcnetworks] Add support for sundancetv.com (#9260)
-
-
-version 2018.01.27
-
-Core
-* [extractor/common] Improve _json_ld for articles
-* Switch codebase to use compat_b64decode
-+ [compat] Add compat_b64decode
-
-Extractors
-+ [seznamzpravy] Add support for seznam.cz and seznamzpravy.cz (#14102, #14616)
-* [dplay] Bypass geo restriction
-+ [dplay] Add support for disco-api videos (#15396)
-* [youtube] Extract precise error messages (#15284)
-* [teachertube] Capture and output error message
-* [teachertube] Fix and relax thumbnail extraction (#15403)
-+ [prosiebensat1] Add another clip id regular expression (#15378)
-* [tbs] Update tokenizer URL (#15395)
-* [mixcloud] Use compat_b64decode (#15394)
-- [thesixtyone] Remove extractor (#15341)
-
-
-version 2018.01.21
-
-Core
-* [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187)
-* [utils] Improve scientific notation handling in js_to_json (#14789)
-
-Extractors
-+ [southparkdk] Add support for southparkstudios.nu
-+ [southpark] Add support for collections (#14803)
-* [franceinter] Fix upload date extraction (#14996)
-+ [rtvs] Add support for rtvs.sk (#9242, #15187)
-* [restudy] Fix extraction and extend URL regular expression (#15347)
-* [youtube:live] Improve live detection (#15365)
-+ [springboardplatform] Add support for springboardplatform.com
-* [prosiebensat1] Add another clip id regular expression (#15290)
-- [ringtv] Remove extractor (#15345)
-
-
-version 2018.01.18
-
-Extractors
-* [soundcloud] Update client id (#15306)
-- [kamcord] Remove extractor (#15322)
-+ [spiegel] Add support for nexx videos (#15285)
-* [twitch] Fix authentication and error capture (#14090, #15264)
-* [vk] Detect more errors due to copyright complaints (#15259)
-
-
-version 2018.01.14
-
-Extractors
-* [youtube] Fix live streams extraction (#15202)
-* [wdr] Bypass geo restriction
-* [wdr] Rework extractors (#14598)
-+ [wdr] Add support for wdrmaus.de/elefantenseite (#14598)
-+ [gamestar] Add support for gamepro.de (#3384)
-* [viafree] Skip rtmp formats (#15232)
-+ [pandoratv] Add support for mobile URLs (#12441)
-+ [pandoratv] Add support for new URL format (#15131)
-+ [ximalaya] Add support for ximalaya.com (#14687)
-+ [digg] Add support for digg.com (#15214)
-* [limelight] Tolerate empty pc formats (#15150, #15151, #15207)
-* [ndr:embed:base] Make separate formats extraction non fatal (#15203)
-+ [weibo] Add extractor (#15079)
-+ [ok] Add support for live streams
-* [canalplus] Fix extraction (#15072)
-* [bilibili] Fix extraction (#15188)
-
-
-version 2018.01.07
-
-Core
-* [utils] Fix youtube-dl under PyPy3 on Windows
-* [YoutubeDL] Output Python implementation in debug header
-
-Extractors
-+ [jwplatform] Add support for multiple embeds (#15192)
-* [mitele] Fix extraction (#15186)
-+ [motherless] Add support for groups (#15124)
-* [lynda] Relax URL regular expression (#15185)
-* [soundcloud] Fall back to avatar picture for thumbnail (#12878)
-* [youku] Fix list extraction (#15135)
-* [openload] Fix extraction (#15166)
-* [lynda] Skip invalid subtitles (#15159) -* [twitch] Pass video id to url_result when extracting playlist (#15139) -* [rtve.es:alacarta] Fix extraction of some new URLs -* [acast] Fix extraction (#15147) - - -version 2017.12.31 - -Core -+ [extractor/common] Add container meta field for formats extracted - in _parse_mpd_formats (#13616) -+ [downloader/hls] Use HTTP headers for key request -* [common] Use AACL as the default fourcc when AudioTag is 255 -* [extractor/common] Fix extraction of DASH formats with the same - representation id (#15111) - -Extractors -+ [slutload] Add support for mobile URLs (#14806) -* [abc:iview] Bypass geo restriction -* [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985, - #15035, #15057, #15061, #15071, #15095, #15106) -* [openload] Fix extraction (#15118) -- [sandia] Remove extractor -- [collegerama] Remove extractor -+ [mediasite] Add support for sites based on Mediasite Video Platform (#5428, - #11185, #14343) -+ [ufctv] Add support for ufc.tv (#14520) -* [pluralsight] Fix missing first line of subtitles (#11118) -* [openload] Fallback on f-page extraction (#14665, #14879) -* [vimeo] Improve password protected videos extraction (#15114) -* [aws] Fix canonical/signed headers generation on python 2 (#15102) - - -version 2017.12.28 - -Extractors -+ [internazionale] Add support for internazionale.it (#14973) -* [playtvak] Relax video regular expression and make description optional - (#15037) -+ [filmweb] Add support for filmweb.no (#8773, #10368) -+ [23video] Add support for 23video.com -+ [espn] Add support for fivethirtyeight.com (#6864) -+ [umg:de] Add support for universal-music.de (#11582, #11584) -+ [espn] Add support for espnfc and extract more formats (#8053) -* [youku] Update ccode (#14880) -+ [openload] Add support for oload.stream (#15070) -* [youku] Fix list extraction (#15065) - - -version 2017.12.23 - -Core -* [extractor/common] Move X-Forwarded-For setup code into _request_webpage -+ [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in - output template (#11427, #15018) -+ [extractor/common] Introduce uploader, uploader_id and uploader_url - meta fields for playlists (#11427, #15018) -* [downloader/fragment] Encode filename of fragment being removed (#15020) -+ [utils] Add another date format pattern (#14999) - -Extractors -+ [kaltura] Add another embed pattern for entry_id -+ [7plus] Add support for 7plus.com.au (#15043) -* [animeondemand] Relax login error regular expression -+ [shahid] Add support for show pages (#7401) -+ [youtube] Extract uploader, uploader_id and uploader_url for playlists - (#11427, #15018) -* [afreecatv] Improve format extraction (#15019) -+ [cspan] Add support for audio only pages and catch page errors (#14995) -+ [mailru] Add support for embed URLs (#14904) -* [crunchyroll] Future-proof XML element checks (#15013) -* [cbslocal] Fix timestamp extraction (#14999, #15000) -* [discoverygo] Correct TTML subtitle extension -* [vk] Make view count optional (#14979) -* [disney] Skip Apple FairPlay formats (#14982) -* [voot] Fix format extraction (#14758) - - -version 2017.12.14 - -Core -* [postprocessor/xattr] Clarify NO_SPACE message (#14970) -* [downloader/http] Return actual download result from real_download (#14971) - -Extractors -+ [itv] Extract more subtitles and duration -* [itv] Improve extraction (#14944) -+ [byutv] Add support for geo restricted videos -* [byutv] Fix extraction (#14966, #14967) -+ [bbccouk] Fix extraction for 320k HLS streams -+ [toutv] Add support for 
special video URLs (#14179) -* [discovery] Fix free videos extraction (#14157, #14954) -* [tvnow] Fix extraction (#7831) -+ [nickelodeon:br] Add support for nickelodeon brazil websites (#14893) -* [nick] Improve extraction (#14876) -* [tbs] Fix extraction (#13658) - - -version 2017.12.10 - -Core -+ [utils] Add sami mimetype to mimetype2ext - -Extractors -* [culturebox] Improve video id extraction (#14947) -* [twitter] Improve extraction (#14197) -+ [udemy] Extract more HLS formats -* [udemy] Improve course id extraction (#14938) -+ [stretchinternet] Add support for portal.stretchinternet.com (#14576) -* [ellentube] Fix extraction (#14407, #14570) -+ [raiplay:playlist] Add support for playlists (#14563) -* [sonyliv] Bypass geo restriction -* [sonyliv] Extract higher quality formats (#14922) -* [fox] Extract subtitles -+ [fox] Add support for Adobe Pass authentication (#14205, #14489) -- [dailymotion:cloud] Remove extractor (#6794) -* [xhamster] Fix thumbnail extraction (#14780) -+ [xhamster] Add support for mobile URLs (#14780) -* [generic] Don't pass video id as mpd id while extracting DASH (#14902) -* [ard] Skip invalid stream URLs (#14906) -* [porncom] Fix metadata extraction (#14911) -* [pluralsight] Detect agreement request (#14913) -* [toutv] Fix login (#14614) - - -version 2017.12.02 - -Core -+ [downloader/fragment] Commit part file after each fragment -+ [extractor/common] Add durations for DASH fragments with bare SegmentURLs -+ [extractor/common] Add support for DASH manifests with SegmentLists with - bare SegmentURLs (#14844) -+ [utils] Add hvc1 codec code to parse_codecs - -Extractors -* [xhamster] Fix extraction (#14884) -* [youku] Update ccode (#14872) -* [mnet] Fix format extraction (#14883) -+ [xiami] Add Referer header to API request -* [mtv] Correct scc extension in extracted subtitles (#13730) -* [vvvvid] Fix extraction for kenc videos (#13406) -+ [br] Add support for BR Mediathek videos (#14560, #14788) -+ [daisuki] Add support for motto.daisuki.com (#14681) -* [odnoklassniki] Fix API metadata request (#14862) -* [itv] Fix HLS formats extraction -+ [pbs] Add another media id regular expression - - -version 2017.11.26 - -Core -* [extractor/common] Use final URL when dumping request (#14769) - -Extractors -* [fczenit] Fix extraction -- [firstpost] Remove extractor -* [freespeech] Fix extraction -* [nexx] Extract more formats -+ [openload] Add support for openload.link (#14763) -* [empflix] Relax URL regular expression -* [empflix] Fix extraction -* [tnaflix] Don't modify download URLs (#14811) -- [gamersyde] Remove extractor -* [francetv:generationwhat] Fix extraction -+ [massengeschmacktv] Add support for Massengeschmack TV -* [fox9] Fix extraction -* [faz] Fix extraction and add support for Perform Group embeds (#14714) -+ [performgroup] Add support for performgroup.com -+ [jwplatform] Add support for iframes (#14828) -* [culturebox] Fix extraction (#14827) -* [youku] Fix extraction; update ccode (#14815) -* [livestream] Make SMIL extraction non fatal (#14792) -+ [drtuber] Add support for mobile URLs (#14772) -+ [spankbang] Add support for mobile URLs (#14771) -* [instagram] Fix description, timestamp and counters extraction (#14755) - - -version 2017.11.15 - -Core -* [common] Skip Apple FairPlay m3u8 manifests (#14741) -* [YoutubeDL] Fix playlist range optimization for --playlist-items (#14740) - -Extractors -* [vshare] Capture and output error message -* [vshare] Fix extraction (#14473) -* [crunchyroll] Extract old RTMP formats -* [tva] Fix extraction (#14736) -* 
[gamespot] Lower preference for HTTP formats (#14652)
-* [instagram:user] Fix extraction (#14699)
-* [ccma] Fix typo (#14730)
-- Remove sensitive data from logging in messages
-+ [gamespot] Add support for article URLs (#14652)
-* [gamespot] Skip Brightcove Once HTTP formats (#14652)
-* [cartoonnetwork] Update tokenizer_src (#14666)
-+ [wsj] Recognize another URL pattern (#14704)
-* [pandatv] Update API URL and sign format URLs (#14693)
-* [crunchyroll] Use old login method (#11572)
-
-
-version 2017.11.06
-
-Core
-+ [extractor/common] Add protocol for f4m formats
-* [f4m] Prefer baseURL for relative URLs (#14660)
-* [extractor/common] Respect URL query in _extract_wowza_formats (#14645)
-
-Extractors
-+ [hotstar:playlist] Add support for playlists (#12465)
-* [hotstar] Bypass geo restriction (#14672)
-- [22tracks] Remove extractor (#11024, #14628)
-+ [skysport] Add support for ooyala videos protected with embed_token (#14641)
-* [gamespot] Extract formats referenced with new data fields (#14652)
-* [spankbang] Detect unavailable videos (#14644)
-
-
-version 2017.10.29
-
-Core
-* [extractor/common] Prefix format id for audio only HLS formats
-+ [utils] Add support for zero years and months in parse_duration
-
-Extractors
-* [egghead] Fix extraction (#14388)
-+ [fxnetworks] Extract series metadata (#14603)
-+ [younow] Add support for younow.com (#9255, #9432, #12436)
-* [dctptv] Fix extraction (#14599)
-* [youtube] Restrict embed regular expression (#14600)
-* [vimeo] Restrict iframe embed regular expression (#14600)
-* [soundgasm] Improve extraction (#14588)
-- [myvideo] Remove extractor (#8557)
-+ [nbc] Add support for classic-tv videos (#14575)
-+ [vrtnu] Add support for cookies authentication and simplify (#11873)
-+ [canvas] Add support for vrt.be/vrtnu (#11873)
-* [twitch:clips] Fix title extraction (#14566)
-+ [ndtv] Add support for sub-sites (#14534)
-* [dramafever] Fix login error message extraction
-+ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt,
-  ro, hu) (#14553)
-
-
-version 2017.10.20
-
-Core
-* [downloader/fragment] Report warning instead of error on inconsistent
-  download state
-* [downloader/hls] Fix total fragments count when ad fragments exist
-
-Extractors
-* [parliamentliveuk] Fix extraction (#14524)
-* [soundcloud] Update client id (#14546)
-+ [servus] Add support for servus.com (#14362)
-+ [unity] Add support for unity3d.com (#14528)
-* [youtube] Replace youtube redirect URLs in description (#14517)
-* [pbs] Restrict direct video URL regular expression (#14519)
-* [drtv] Respect preference for direct HTTP formats (#14509)
-+ [eporner] Add support for embed URLs (#14507)
-* [arte] Capture and output error message
-* [niconico] Improve uploader metadata extraction robustness (#14135)
-
-
-version 2017.10.15.1
-
-Core
-* [downloader/hls] Ignore anvato ad fragments (#14496)
-* [downloader/fragment] Output ad fragment count
-
-Extractors
-* [scrippsnetworks:watch] Bypass geo restriction
-+ [anvato] Add ability to bypass geo restriction
-* [redditr] Fix extraction for URLs with query (#14495)
-
-
-version 2017.10.15
-
-Core
-+ [common] Add support for jwplayer youtube embeds
-
-Extractors
-* [scrippsnetworks:watch] Fix extraction (#14389)
-* [anvato] Process master m3u8 manifests
-* [youtube] Fix relative URLs in description
-* [spike] Bypass geo restriction
-+ [howstuffworks] Add support for more domains
-* [infoq] Fix HTTP format downloading
-+ [rtlnl] Add support for another type of embeds
-+ [onionstudios] Add support for bulbs-video embeds
-* [udn] Fix extraction
-* [shahid] Fix extraction (#14448)
-* [kaltura] Ignore Widevine encrypted video (.wvm) (#14471)
-* [vh1] Fix extraction (#9613)
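
The "jwplayer youtube embeds" entry above boils down to spotting a YouTube URL inside a jwplayer().setup(...) call. An illustrative regex-based sketch of that detection; the real _find_jwplayer_data helper in the codebase is considerably more thorough:

```python
import re

def find_jwplayer_youtube_embed(webpage):
    # Grab the object literal passed to jwplayer(...).setup({...})
    m = re.search(
        r'jwplayer\s*\([^)]*\)\s*\.setup\s*\(\s*(\{.+?\})\s*\)',
        webpage, re.DOTALL)
    if not m:
        return None
    # Look for a YouTube watch URL among the configured sources
    url = re.search(
        r'["\'](https?://(?:www\.)?youtube\.com/watch\?v=[\w-]{11})["\']',
        m.group(1))
    return url.group(1) if url else None
```
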
-
-
-version 2017.10.12
-
-Core
-* [YoutubeDL] Improve _default_format_spec (#14461)
-
-Extractors
-* [steam] Fix extraction (#14067)
-+ [funk] Add support for funk.net (#14464)
-+ [nexx] Add support for shortcuts and relax domain id extraction
-+ [voxmedia] Add support for recode.net (#14173)
-+ [once] Add support for vmap URLs
-+ [generic] Add support for channel9 embeds (#14469)
-* [tva] Fix extraction (#14328)
-+ [tubitv] Add support for new URL format (#14460)
-- [afreecatv:global] Remove extractor
-- [youtube:shared] Remove extractor (#14420)
-+ [slideslive] Add support for slideslive.com (#2680)
-+ [facebook] Support thumbnails (#14416)
-* [vvvvid] Fix episode number extraction (#14456)
-* [hrti:playlist] Relax URL regular expression
-* [wdr] Relax media link regular expression (#14447)
-* [hrti] Relax URL regular expression (#14443)
-* [fox] Delegate extraction to uplynk:preplay (#14147)
-+ [youtube] Add support for hooktube.com (#14437)
-
-
-version 2017.10.07
-
-Core
-* [YoutubeDL] Ignore duplicates in --playlist-items
-* [YoutubeDL] Fix out of range --playlist-items for iterable playlists and
-  reduce code duplication (#14425)
-+ [utils] Use cache in OnDemandPagedList by default
-* [postprocessor/ffmpeg] Convert to opus using libopus (#14381)
-
-Extractors
-* [reddit] Sort formats (#14430)
-* [lnkgo] Relax URL regular expression (#14423)
-* [pornflip] Extend URL regular expression (#14405, #14406)
-+ [xtube] Add support for embed URLs (#14417)
-+ [xvideos] Add support for embed URLs and improve extraction (#14409)
-* [beeg] Fix extraction (#14403)
-* [tvn24] Relax URL regular expression (#14395)
-* [nbc] Fix extraction (#13651, #13715, #14137, #14198, #14312, #14314, #14378,
-  #14392, #14414, #14419, #14431)
-+ [ketnet] Add support for videos without direct sources (#14377)
-* [canvas] Generalize mediazone.vrt.be extractor and rework canvas and een
-+ [afreecatv] Add support for adult videos (#14376)
-
-
-version 2017.10.01
-
-Core
-* [YoutubeDL] Document youtube_include_dash_manifest
-
-Extractors
-+ [tvp] Add support for new URL schema (#14368)
-+ [generic] Add support for single format Video.js embeds (#14371)
-* [yahoo] Bypass geo restriction for brightcove (#14210)
-* [yahoo] Use extracted brightcove account id (#14210)
-* [rtve:alacarta] Fix extraction (#14290)
-+ [yahoo] Add support for custom brightcove embeds (#14210)
-+ [generic] Add support for Video.js embeds
-+ [gfycat] Add support for /gifs/detail URLs (#14322)
-* [generic] Fix infinite recursion for twitter:player URLs (#14339)
-* [xhamsterembed] Fix extraction (#14308)
-
-
-version 2017.09.24
-
-Core
-+ [options] Accept lrc as a subtitle conversion target format (#14292)
-* [utils] Fix handling raw TTML subtitles (#14191)
-
-Extractors
-* [24video] Fix timestamp extraction and make non fatal (#14295)
-+ [24video] Add support for 24video.adult (#14295)
-+ [kakao] Add support for tv.kakao.com (#12298, #14007)
-+ [twitter] Add support for URLs without user id (#14270)
-+ [americastestkitchen] Add support for americastestkitchen.com (#10764,
-  #13996)
-* [generic] Fix support for multiple HTML5 videos on one page (#14080)
-* [mixcloud] Fix extraction (#14088, #14132)
-+ [lynda] Add support for educourse.ga (#14286)
-* [beeg] Fix extraction (#14275)
-* [nbcsports:vplayer] Correct theplatform URL (#13873)
-* [twitter] 
Fix duration extraction (#14141) -* [tvplay] Bypass geo restriction -+ [heise] Add support for YouTube embeds (#14109) -+ [popcorntv] Add support for popcorntv.it (#5914, #14211) -* [viki] Update app data (#14181) -* [morningstar] Relax URL regular expression (#14222) -* [openload] Fix extraction (#14225, #14257) -* [noovo] Fix extraction (#14214) -* [dailymotion:playlist] Relax URL regular expression (#14219) -+ [twitch] Add support for go.twitch.tv URLs (#14215) -* [vgtv] Relax URL regular expression (#14223) - - -version 2017.09.15 - -Core -* [downloader/fragment] Restart inconsistent incomplete fragment downloads - (#13731) -* [YoutubeDL] Download raw subtitles files (#12909, #14191) - -Extractors -* [condenast] Fix extraction (#14196, #14207) -+ [orf] Add support for f4m stories -* [tv4] Relax URL regular expression (#14206) -* [animeondemand] Bypass geo restriction -+ [animeondemand] Add support for flash videos (#9944) - - -version 2017.09.11 - -Extractors -* [rutube:playlist] Fix suitable (#14166) - - -version 2017.09.10 - -Core -+ [utils] Introduce bool_or_none -* [YoutubeDL] Ensure dir existence for each requested format (#14116) - -Extractors -* [fox] Fix extraction (#14147) -* [rutube] Use bool_or_none -* [rutube] Rework and generalize playlist extractors (#13565) -+ [rutube:playlist] Add support for playlists (#13534, #13565) -+ [radiocanada] Add fallback for title extraction (#14145) -* [vk] Use dedicated YouTube embeds extraction routine -* [vice] Use dedicated YouTube embeds extraction routine -* [cracked] Use dedicated YouTube embeds extraction routine -* [chilloutzone] Use dedicated YouTube embeds extraction routine -* [abcnews] Use dedicated YouTube embeds extraction routine -* [youtube] Separate methods for embeds extraction -* [redtube] Fix formats extraction (#14122) -* [arte] Relax unavailability check (#14112) -+ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) -* [vidme:user] Relax URL regular expression (#14054) -* [bpb] Fix extraction (#14043, #14086) -* [soundcloud] Fix download URL with private tracks (#14093) -* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) -* [viidea] Capture and output lecture error message (#14099) -* [radiocanada] Skip unsupported platforms (#14100) - - -version 2017.09.02 - -Extractors -* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, - #14077, #14079, #14082, #14083, #14094, #14095, #14096) -* [youtube] Fix upload date extraction (#14065) -+ [charlierose] Add support for episodes (#14062) -+ [bbccouk] Add support for w-prefixed ids (#14056) -* [googledrive] Extend URL regular expression (#9785) -+ [googledrive] Add support for source format (#14046) -* [pornhd] Fix extraction (#14005) - - -version 2017.08.27.1 - -Extractors - -* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037) - - -version 2017.08.27 - -Core -+ [extractor/common] Extract height and format id for HTML5 videos (#14034) -* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, - #8625, #9483) - * Simplify code and split into separate routines to facilitate maintaining - * Make retry mechanism work on errors during actual download not only - during connection establishment phase - * Retry on ECONNRESET and ETIMEDOUT during reading data from network - * Retry on content too short - * Show error description on retry - -Extractors -* [generic] Lower preference for extraction from LD-JSON -* [rai] Fix audio formats extraction (#14024) -* [youtube] Fix 
controversy videos extraction (#14027, #14029)
-* [mixcloud] Fix extraction (#14015, #14020)
-
-
-version 2017.08.23
-
-Core
-+ [extractor/common] Introduce _parse_xml
-* [extractor/common] Make HLS and DASH extraction in _parse_html5_media_entries
-  non fatal (#13970)
-* [utils] Fix unescapeHTML for malformed string like "&a"" (#13935)
-
-Extractors
-* [cbc:watch] Bypass geo restriction (#13993)
-* [toutv] Relax DRM check (#13994)
-+ [googledrive] Add support for subtitles (#13619, #13638)
-* [pornhub] Relax uploader regular expression (#13906, #13975)
-* [bandcamp:album] Extract track titles (#13962)
-+ [bbccouk] Add support for events URLs (#13893)
-+ [liveleak] Support multi-video pages (#6542)
-+ [liveleak] Support another liveleak embedding pattern (#13336)
-* [cda] Fix extraction (#13935)
-+ [laola1tv] Add support for tv.ittf.com (#13965)
-* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003)
-
-
-version 2017.08.18
-
-Core
-* [YoutubeDL] Sanitize byte string format URLs (#13951)
-+ [extractor/common] Add support for float durations in _parse_mpd_formats
-  (#13919)
-
-Extractors
-* [arte] Detect unavailable videos (#13945)
-* [generic] Convert redirect URLs to unicode strings (#13951)
-* [udemy] Fix paid course detection (#13943)
-* [pluralsight] Use RPC API for course extraction (#13937)
-+ [clippit] Add support for clippituser.tv
-+ [qqmusic] Support new URL schemes (#13805)
-* [periscope] Renew HLS extraction (#13917)
-* [mixcloud] Extract decrypt key
-
-
-version 2017.08.13
-
-Core
-* [YoutubeDL] Make sure format id is not empty
-* [extractor/common] Make _family_friendly_search optional
-* [extractor/common] Respect source's type attribute for HTML5 media (#13892)
-
-Extractors
-* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902)
-+ [fourtube] Add support for pornerbros.com (#6022)
-+ [fourtube] Add support for porntube.com (#7859, #13901)
-+ [fourtube] Add support for fux.com
-* [limelight] Improve embeds detection (#13895)
-+ [reddit] Add support for v.redd.it and reddit.com (#13847)
-* [aparat] Extract all formats (#13887)
-* [mixcloud] Fix play info decryption (#13885)
-+ [generic] Add support for vzaar embeds (#13876)
-
-
-version 2017.08.09
-
-Core
-* [utils] Skip missing params in cli_bool_option (#13865)
-
-Extractors
-* [xxxymovies] Fix title extraction (#13868)
-+ [nick] Add support for nick.com.pl (#13860)
-* [mixcloud] Fix play info decryption (#13867)
-* [20min] Fix embeds extraction (#13852)
-* [dplayit] Fix extraction (#13851)
-+ [niconico] Support videos with multiple formats (#13522)
-+ [niconico] Support HTML5-only videos (#13806)
-
-
-version 2017.08.06
-
-Core
-* Use relative paths for DASH fragments (#12990)
-
-Extractors
-* [pluralsight] Fix format selection
-- [mpora] Remove extractor (#13826)
-+ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218)
-* [vlive:channel] Limit number of videos per page to 100 (#13830)
-* [podomatic] Extend URL regular expression (#13827)
-* [cinchcast] Extend URL regular expression
-* [yandexdisk] Relax URL regular expression (#13824)
-* [vidme] Extract DASH and HLS formats
-- [teamfour] Remove extractor (#13782)
-* [pornhd] Fix extraction (#13783)
-* [udemy] Fix subtitles extraction (#13812)
-* [mlb] Extend URL regular expression (#13740, #13773)
-+ [pbs] Add support for new URL schema (#13801)
-* [nrktv] Update API host (#13796)
-
-
-version 2017.07.30.1
-
-Core
-* [downloader/hls] Use redirect URL as manifest base (#13755)
-* [options] Correctly hide login info from debug outputs 
(#13696) - -Extractors -+ [watchbox] Add support for watchbox.de (#13739) -- [clipfish] Remove extractor -+ [youjizz] Fix extraction (#13744) -+ [generic] Add support for another ooyala embed pattern (#13727) -+ [ard] Add support for lives (#13771) -* [soundcloud] Update client id -+ [soundcloud:trackstation] Add support for track stations (#13733) -* [svtplay] Use geo verification proxy for API request -* [svtplay] Update API URL (#13767) -+ [yandexdisk] Add support for yadi.sk (#13755) -+ [megaphone] Add support for megaphone.fm -* [amcnetworks] Make rating optional (#12453) -* [cloudy] Fix extraction (#13737) -+ [nickru] Add support for nickelodeon.ru -* [mtv] Improve thumbnail extraction -* [nick] Automate geo-restriction bypass (#13711) -* [niconico] Improve error reporting (#13696) - - -version 2017.07.23 - -Core -* [YoutubeDL] Improve default format specification (#13704) -* [YoutubeDL] Do not override id, extractor and extractor_key for - url_transparent entities -* [extractor/common] Fix playlist_from_matches - -Extractors -* [itv] Fix production id extraction (#13671, #13703) -* [vidio] Make duration non fatal and fix typo -* [mtv] Skip missing video parts (#13690) -* [sportbox:embed] Fix extraction -+ [npo] Add support for npo3.nl URLs (#13695) -* [dramafever] Remove video id from title (#13699) -+ [egghead:lesson] Add support for lessons (#6635) -* [funnyordie] Extract more metadata (#13677) -* [youku:show] Fix playlist extraction (#13248) -+ [dispeak] Recognize sevt subdomain (#13276) -* [adn] Improve error reporting (#13663) -* [crunchyroll] Relax series and season regular expression (#13659) -+ [spiegel:article] Add support for nexx iframe embeds (#13029) -+ [nexx:embed] Add support for iframe embeds -* [nexx] Improve JS embed extraction -+ [pearvideo] Add support for pearvideo.com (#13031) - - -version 2017.07.15 - -Core -* [YoutubeDL] Don't expand environment variables in meta fields (#13637) - -Extractors -* [spiegeltv] Delegate extraction to nexx extractor (#13159) -+ [nexx] Add support for nexx.cloud (#10807, #13465) -* [generic] Fix rutube embeds extraction (#13641) -* [karrierevideos] Fix title extraction (#13641) -* [youtube] Don't capture YouTube Red ad for creator meta field (#13621) -* [slideshare] Fix extraction (#13617) -+ [5tv] Add another video URL pattern (#13354, #13606) -* [drtv] Make HLS and HDS extraction non fatal -* [ted] Fix subtitles extraction (#13628, #13629) -* [vine] Make sure the title won't be empty -+ [twitter] Support HLS streams in vmap URLs -+ [periscope] Support pscp.tv URLs in embedded frames -* [twitter] Extract mp4 urls via mobile API (#12726) -* [niconico] Fix authentication error handling (#12486) -* [giantbomb] Extract m3u8 formats (#13626) -+ [vlive:playlist] Add support for playlists (#13613) - - -version 2017.07.09 - -Core -+ [extractor/common] Add support for AMP tags in _parse_html5_media_entries -+ [utils] Support attributes with no values in get_elements_by_attribute - -Extractors -+ [dailymail] Add support for embeds -+ [joj] Add support for joj.sk (#13268) -* [abc.net.au:iview] Extract more formats (#13492, #13489) -* [egghead:course] Fix extraction (#6635, #13370) -+ [cjsw] Add support for cjsw.com (#13525) -+ [eagleplatform] Add support for referrer protected videos (#13557) -+ [eagleplatform] Add support for another embed pattern (#13557) -* [veoh] Extend URL regular expression (#13601) -* [npo:live] Fix live stream id extraction (#13568, #13605) -* [googledrive] Fix height extraction (#13603) -+ [dailymotion] Add support 
for new layout (#13580) -- [yam] Remove extractor -* [xhamster] Extract all formats and fix duration extraction (#13593) -+ [xhamster] Add support for new URL schema (#13593) -* [espn] Extend URL regular expression (#13244, #13549) -* [kaltura] Fix typo in subtitles extraction (#13569) -* [vier] Adapt extraction to redesign (#13575) - - -version 2017.07.02 - -Core -* [extractor/common] Improve _json_ld - -Extractors -+ [thisoldhouse] Add more fallbacks for video id -* [thisoldhouse] Fix video id extraction (#13540, #13541) -* [xfileshare] Extend format regular expression (#13536) -* [ted] Fix extraction (#13535) -+ [tastytrade] Add support for tastytrade.com (#13521) -* [dplayit] Relax video id regular expression (#13524) -+ [generic] Extract more generic metadata (#13527) -+ [bbccouk] Capture and output error message (#13501, #13518) -* [cbsnews] Relax video info regular expression (#13284, #13503) -+ [facebook] Add support for plugin video embeds and multiple embeds (#13493) -* [soundcloud] Switch to https for API requests (#13502) -* [pandatv] Switch to https for API and download URLs -+ [pandatv] Add support for https URLs (#13491) -+ [niconico] Support sp subdomain (#13494) - - -version 2017.06.25 - -Core -+ [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472) -* [YoutubeDL] Skip malformed formats for better extraction robustness - -Extractors -+ [wsj] Add support for barrons.com (#13470) -+ [ign] Add another video id pattern (#13328) -+ [raiplay:live] Add support for live streams (#13414) -+ [redbulltv] Add support for live videos and segments (#13486) -+ [onetpl] Add support for videos embedded via pulsembed (#13482) -* [ooyala] Make more robust -* [ooyala] Skip empty format URLs (#13471, #13476) -* [hgtv.com:show] Fix typo - - -version 2017.06.23 - -Core -* [adobepass] Fix extraction on older python 2.6 - -Extractors -* [youtube] Adapt to new automatic captions rendition (#13467) -* [hgtv.com:show] Relax video config regular expression (#13279, #13461) -* [drtuber] Fix formats extraction (#12058) -* [youporn] Fix upload date extraction -* [youporn] Improve formats extraction -* [youporn] Fix title extraction (#13456) -* [googledrive] Fix formats sorting (#13443) -* [watchindianporn] Fix extraction (#13411, #13415) -+ [vimeo] Add fallback mp4 extension for original format -+ [ruv] Add support for ruv.is (#13396) -* [viu] Fix extraction on older python 2.6 -* [pandora.tv] Fix upload_date extraction (#12846) -+ [asiancrush] Add support for asiancrush.com (#13420) - - -version 2017.06.18 - -Core -* [downloader/common] Use utils.shell_quote for debug command line -* [utils] Use compat_shlex_quote in shell_quote -* [postprocessor/execafterdownload] Encode command line (#13407) -* [compat] Fix compat_shlex_quote on Windows (#5889, #10254) -* [postprocessor/metadatafromtitle] Fix missing optional meta fields processing - in --metadata-from-title (#13408) -* [extractor/common] Fix json dumping with --geo-bypass -+ [extractor/common] Improve jwplayer subtitles extraction -+ [extractor/common] Improve jwplayer formats extraction (#13379) - -Extractors -* [polskieradio] Fix extraction (#13392) -+ [xfileshare] Add support for fastvideo.me (#13385) -* [bilibili] Fix extraction of videos with double quotes in titles (#13387) -* [4tube] Fix extraction (#13381, #13382) -+ [disney] Add support for disneychannel.de (#13383) -* [npo] Improve URL regular expression (#13376) -+ [corus] Add support for showcase.ca -+ [corus] Add support for history.ca (#13359) - - -version 2017.06.12 - -Core -* 
[utils] Handle compat_HTMLParseError in extract_attributes (#13349)
-+ [compat] Introduce compat_HTMLParseError
-* [utils] Improve unified_timestamp
-* [extractor/generic] Ensure format id is unicode string
-* [extractor/common] Return unicode string from _match_id
-+ [YoutubeDL] Sanitize more fields (#13313)
-
-Extractors
-+ [xfileshare] Add support for rapidvideo.tv (#13348)
-* [xfileshare] Modernize and pass Referer
-+ [rutv] Add support for testplayer.vgtrk.com (#13347)
-+ [newgrounds] Extract more metadata (#13232)
-+ [newgrounds:playlist] Add support for playlists (#10611)
-* [newgrounds] Improve formats and uploader extraction (#13346)
-* [msn] Fix formats extraction
-* [turbo] Ensure format id is string
-* [sexu] Ensure height is int
-* [jove] Ensure comment count is int
-* [golem] Ensure format id is string
-* [gfycat] Ensure filesize is int
-* [foxgay] Ensure height is int
-* [flickr] Ensure format id is string
-* [sohu] Fix numeric fields
-* [safari] Improve authentication detection (#13319)
-* [liveleak] Ensure height is int (#13313)
-* [streamango] Make title optional (#13292)
-* [rtlnl] Improve URL regular expression (#13295)
-* [tvplayer] Fix extraction (#13291)
-
-
-version 2017.06.05
-
-Core
-* [YoutubeDL] Don't emit ANSI escape codes on Windows (#13270)
-
-Extractors
-+ [bandcamp:weekly] Add support for bandcamp weekly (#12758)
-* [pornhub:playlist] Fix extraction (#13281)
-- [godtv] Remove extractor (#13175)
-* [safari] Fix typo (#13252)
-* [youtube] Improve chapters extraction (#13247)
-* [1tv] Lower preference for HTTP formats (#13246)
-* [francetv] Relax URL regular expression
-* [drbonanza] Fix extraction (#13231)
-* [packtpub] Fix authentication (#13240)
-
-
-version 2017.05.29
-
-Extractors
-* [youtube] Fix DASH MPD extraction for videos with non-encrypted format URLs
-  (#13211)
-* [xhamster] Fix uploader and like/dislike count extraction (#13216)
-+ [xhamster] Extract categories (#11728)
-+ [abcnews] Add support for embed URLs (#12851)
-* [gaskrank] Fix extraction (#12493)
-* [medialaan] Fix videos with missing videoUrl (#12774)
-* [dvtv] Fix playlist support
-+ [dvtv] Add support for DASH and HLS formats (#3063)
-+ [beam:vod] Add support for beam.pro/mixer.com VODs (#13032)
-* [cbsinteractive] Relax URL regular expression (#13213)
-* [adn] Fix formats extraction
-+ [youku] Extract more metadata (#10433)
-* [cbsnews] Fix extraction (#13205)
-
-
-version 2017.05.26
-
-Core
-+ [utils] Make strip_jsonp recognize more patterns
-* [postprocessor/ffmpeg] Fix metadata filename handling on Python 2 (#13182)
-
-Extractors
-+ [youtube] Recognize DASH MPDs with cipher signatures (#11381)
-+ [bbc] Add support for authentication
-* [tudou] Merge into youku extractor (#12214)
-* [youku:show] Fix extraction
-* [youku] Fix extraction (#13191)
-* [udemy] Fix extraction for outputs' format entries without URL (#13192)
-* [vimeo] Fix formats' sorting (#13189)
-* [cbsnews] Fix extraction for 60 Minutes videos (#12861)
-
-
-version 2017.05.23
-
-Core
-+ [downloader/external] Pass -loglevel to ffmpeg downloader (#13183)
-+ [adobepass] Add support for Bright House Networks (#13149)
-
-Extractors
-+ [streamcz] Add support for subtitles (#13174)
-* [youtube] Fix DASH manifest signature decryption (#8944, #13156)
-* [toggle] Relax URL regular expression (#13172)
-* [toypics] Fix extraction (#13077)
-* [njpwworld] Fix extraction (#13162, #13169)
-+ [hitbox] Add support for smashcast.tv (#13154)
-* [mitele] Update app key regular expression (#13158)
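
For the downloader/external entry above, forwarding -loglevel means ffmpeg's own chatter follows the caller's verbosity. A hedged sketch of the general shape; this is not the project's FFmpegFD class, and the helper name and flag mapping are illustrative:

```python
import subprocess

def ffmpeg_download(url, outfile, verbose=False):
    # Map the caller's verbosity onto ffmpeg's -loglevel so the external
    # process is no noisier than the host program
    args = [
        'ffmpeg',
        '-loglevel', 'verbose' if verbose else 'warning',
        '-i', url,        # e.g. an HLS manifest URL
        '-c', 'copy',     # remux the stream without re-encoding
        outfile,
    ]
    subprocess.run(args, check=True)
```
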
-
-
-version 2017.05.18.1
-
-Core
-* [jsinterp] Fix typo and clean up regular expressions (#13134)
-
-
-version 2017.05.18
-
-Core
-+ [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125,
-  #13126, #13128, #13129, #13130, #13131, #13132)
-+ [extractor/common] Add support for schemeless URLs in _extract_wowza_formats
-  (#13088, #13092)
-+ [utils] Recognize more audio codecs (#13081)
-
-Extractors
-+ [vier] Extract more metadata (#12539)
-* [vier] Improve extraction (#12801)
-  + Add support for authentication
-  * Bypass authentication when no credentials provided
-  * Improve extraction robustness
-* [dailymail] Fix sources extraction (#13057)
-* [dailymotion] Extend URL regular expression (#13079)
-
-
-version 2017.05.14
-
-Core
-+ [extractor/common] Respect Width and Height attributes in ISM manifests
-+ [postprocessor/metadatafromtitle] Add support for regular expression syntax
-  in --metadata-from-title (#13065)
-
-Extractors
-+ [mediaset] Add support for video.mediaset.it (#12708, #12964)
-* [orf:radio] Fix extraction (#11643, #12926)
-* [aljazeera] Extend URL regular expression (#13053)
-* [imdb] Relax URL regular expression (#13056)
-+ [francetv] Add support for mobile.france.tv (#13068)
-+ [upskill] Add support for upskillcourses.com (#13043)
-* [thescene] Fix extraction (#13061)
-* [condenast] Improve embed support
-* [liveleak] Fix extraction (#12053)
-+ [douyu] Support Douyu shows (#12228)
-* [myspace] Improve URL regular expression (#13040)
-* [adultswim] Use desktop platform in assets URL (#13041)
-
-
-version 2017.05.09
-
-Core
-* [YoutubeDL] Force --restrict-filenames when no locale is set on all python
-  versions (#13027)
-
-Extractors
-* [francetv] Adapt to site redesign (#13034)
-+ [packtpub] Add support for authentication (#12622)
-* [drtv] Lower preference for SignLanguage formats (#13013, #13016)
-+ [cspan] Add support for brightcove live embeds (#13028)
-* [vrv] Extract DASH formats and subtitles
-* [funimation] Fix authentication (#13021)
-* [adultswim] Fix extraction (#8640, #10950, #11042, #12121)
-  + Add support for Adobe Pass authentication
-  + Add support for live streams
-  + Add support for show pages
-* [turner] Extract thumbnail, is_live and strip description
-+ [nonktube] Add support for nonktube.com (#8647, #13024)
-+ [nuevo] Pass headers to _extract_nuevo
-* [nbc] Improve extraction (#12364)
-
-
-version 2017.05.07
-
-Core
-* [extractor/common] Fix typo in _extract_akamai_formats
-+ [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata
-+ [extractor/common] Introduce chapters meta field
-
-Extractors
-* [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995,
-  #13003)
-* [bilibili] Fix video downloading (#13001)
-* [rmcdecouverte] Fix extraction (#12937)
-* [theplatform] Extract chapters
-* [bandcamp] Fix thumbnail extraction (#12980)
-* [pornhub] Extend URL regular expression (#12996)
-+ [youtube] Extract chapters
-+ [nrk] Extract chapters
-+ [vice] Add support for ooyala embeds in article pages
-+ [vice] Support vice articles (#12968)
-* [vice] Fix extraction for non en_us videos (#12967)
-* [gdcvault] Fix extraction for some videos (#12733)
-* [pbs] Improve multipart video support (#12981)
-* [laola1tv] Fix extraction (#12880)
-+ [cda] Support birthday verification (#12789)
-* [leeco] Fix extraction (#12974)
-+ [pbs] Extract chapters
-* [amp] Improve thumbnail and subtitles extraction
-* [foxsports] Fix extraction (#12945)
-- [coub] Remove comment count extraction (#12941)
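
The chapters meta field introduced above is a list of dicts with start_time, end_time and an optional title; embedding them with --add-metadata rests on ffmpeg's FFMETADATA file format. A simplified sketch of the conversion, not the project's exact postprocessor code:

```python
def render_ffmetadata(chapters):
    """Render a chapters list into ffmpeg's FFMETADATA1 syntax."""
    lines = [';FFMETADATA1']
    for chapter in chapters:
        lines.append('[CHAPTER]')
        lines.append('TIMEBASE=1/1000')  # timestamps below are milliseconds
        lines.append('START=%d' % (chapter['start_time'] * 1000))
        lines.append('END=%d' % (chapter['end_time'] * 1000))
        if chapter.get('title'):
            lines.append('title=%s' % chapter['title'])
    return '\n'.join(lines) + '\n'

# Typical use: write the result to a file and merge it back in, e.g.
#   ffmpeg -i in.mp4 -i meta.txt -map_metadata 1 -codec copy out.mp4
```
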
-
-
-version 2017.05.01
-
-Core
-+ [extractor/common] Extract view count from JSON-LD
-* [utils] Improve unified_timestamp
-+ [utils] Add video/mp2t to mimetype2ext
-* [downloader/external] Properly handle live stream downloading cancellation
-  (#8932)
-+ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906)
-
-Extractors
-* [infoq] Make audio format extraction non fatal (#12938)
-* [brightcove] Allow whitespace around attribute names in embedded code
-+ [zaq1] Add support for zaq1.pl (#12693)
-+ [xvideos] Extract duration (#12828)
-* [vevo] Fix extraction (#12879)
-+ [noovo] Add support for noovo.ca (#12792)
-+ [washingtonpost] Add support for embeds (#12699)
-* [yandexmusic:playlist] Fix extraction for python 3 (#12888)
-* [anvato] Improve extraction (#12913)
-  * Promote to regular shortcut based extractor
-  * Add mcp to access key mapping table
-  * Add support for embeds extraction
-  * Add support for anvato embeds in generic extractor
-* [xtube] Fix extraction for older FLV videos (#12734)
-* [tvplayer] Fix extraction (#12908)
-
-
-version 2017.04.28
-
-Core
-+ [adobepass] Use geo verification headers for all requests
-- [downloader/fragment] Remove assert for resume_len when no fragments
-  downloaded
-+ [extractor/common] Add manifest_url for explicit group rendition formats
-* [extractor/common] Fix manifest_url for m3u8 formats
-- [extractor/common] Don't list master m3u8 playlists in format list (#12832)
-
-Extractors
-* [aenetworks] Fix extraction for shows with single season
-+ [go] Add support for Disney, DisneyJunior and DisneyXD show pages
-* [youtube] Recognize new locale-based player URLs (#12885)
-+ [streamable] Add support for new embedded URL schema (#12844)
-* [arte:+7] Relax URL regular expression (#12837)
-
-
-version 2017.04.26
-
-Core
-* Introduce --keep-fragments for keeping fragments of fragmented download
-  on disk after download is finished
-* [YoutubeDL] Fix output template for missing timestamp (#12796)
-* [socks] Handle cases where credentials are required but missing
-* [extractor/common] Improve HLS extraction (#12211)
-  * Extract m3u8 parsing to separate method
-  * Improve rendition groups extraction
-  * Build stream name according to stream GROUP-ID
-  * Ignore reference to AUDIO group without URI when stream has no CODECS
-  * Use float for scaled tbr in _parse_m3u8_formats
-* [utils] Add support for TTML styles in dfxp2srt
-* [downloader/hls] No need to download keys for fragments that have been
-  already downloaded
-* [downloader/fragment] Improve fragment downloading
-  * Resume immediately
-  * Don't concatenate fragments and decrypt them on every resume
-  * Optimize disk storage usage, don't store intermediate fragments on disk
-  * Store bookkeeping download state file
-+ [extractor/common] Add support for multiple getters in try_get
-+ [extractor/common] Add support for video of WebPage context in _json_ld
-  (#12778)
-+ [extractor/common] Relax JWPlayer regular expression and remove
-  duplicate URLs (#12768)
-
-Extractors
-* [iqiyi] Fix extraction of Yule videos
-* [vidio] Improve extraction and sort formats
-+ [brightcove] Match only video elements with data-video-id attribute
-* [iqiyi] Fix playlist detection (#12504)
-- [azubu] Remove extractor (#12813)
-* [porn91] Fix extraction (#12814)
-* [vidzi] Fix extraction (#12793)
-+ [amp] Extract error message (#12795)
-+ [xfileshare] Add support for gorillavid.com and daclips.com (#12776)
-* [instagram] Fix extraction (#12777)
-+ [generic] Support Brightcove videos in <iframe> (#12482)
-+ [brightcove] Support URLs with bcpid instead of playerID (#12482)
-* [brightcove] Fix _extract_url (#12782)
-+ [odnoklassniki] Extract HLS formats
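
try_get, extended above to accept multiple getters, walks a list of accessor callables and returns the first value of the expected type, swallowing lookup errors along the way. A condensed sketch of the helper, simplified from the real utils function:

```python
def try_get(src, getter, expected_type=None):
    if not isinstance(getter, (list, tuple)):
        getter = [getter]
    for get in getter:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue  # this accessor doesn't fit the data; try the next one
        if expected_type is None or isinstance(v, expected_type):
            return v

# e.g. pulling a view count out of loosely structured JSON-LD:
#   views = try_get(ld, lambda x: x['interactionCount'], int)
```
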
-
-
-version 2017.04.17
-
-Extractors
-* [limelight] Improve extraction of LimelightEmbeddedPlayerFlash media embeds
-  and add support for channel and channelList embeds
-* [generic] Extract multiple Limelight embeds (#12761)
-+ [itv] Extract series metadata
-* [itv] Fix RTMP formats downloading (#12759)
-* [itv] Use native HLS downloader by default
-+ [go90] Extract subtitles (#12752)
-+ [go90] Extract series metadata (#12752)
-
-
-version 2017.04.16
-
-Core
-* [YoutubeDL] Apply expand_path after output template substitution
-+ [YoutubeDL] Propagate overridden meta fields to extraction results of type
-  url (#11163)
-
-Extractors
-+ [generic] Extract RSS entries as url_transparent (#11163)
-+ [streamango] Add support for streamango.com (#12643)
-+ [wsj:article] Add support for articles (#12558)
-* [brightcove] Relax video tag embeds extraction and validate ambiguous embeds'
-  URLs (#9163, #12005, #12178, #12480)
-+ [udemy] Add support for react rendition (#12744)
-
-
-version 2017.04.15
-
-Extractors
-* [youku] Fix fileid extraction (#12741, #12743)
-
-
-version 2017.04.14
-
-Core
-+ [downloader/hls] Add basic support for EXT-X-BYTERANGE tag (#10955)
-+ [adobepass] Improve Comcast and Verizon login code (#10803)
-+ [adobepass] Add support for Verizon (#10803)
-
-Extractors
-+ [aenetworks] Add support for specials (#12723)
-+ [hbo] Extract HLS formats
-+ [go90] Add support for go90.com (#10127)
-+ [tv2hu] Add support for tv2.hu (#10509)
-+ [generic] Exclude URLs with xml ext from valid video URLs (#10768, #11654)
-* [youtube] Improve HLS formats extraction
-* [afreecatv] Fix extraction for videos with different key layout (#12718)
-- [youtube] Remove explicit preference for audio-only and video-only formats in
-  order not to break sorting when new formats appear
-* [canalplus] Bypass geo restriction
-
-
-version 2017.04.11
-
-Extractors
-* [afreecatv] Fix extraction (#12706)
-+ [generic] Add support for <object> YouTube embeds (#12637)
-* [bbccouk] Treat bitrate as audio+video bitrate in media selector
-+ [bbccouk] Skip unrecognized formats in media selector (#12701)
-+ [bbccouk] Add support for https protocol in media selector (#12701)
-* [curiositystream] Fix extraction (#12638)
-* [adn] Update subtitle decryption key
-* [chaturbate] Fix extraction (#12665, #12688, #12690)
-
-
-version 2017.04.09
-
-Extractors
-+ [medici] Add support for medici.tv (#3406)
-+ [rbmaradio] Add support for redbullradio.com URLs (#12687)
-+ [npo:live] Add support for default URL (#12555)
-* [mixcloud:playlist] Fix title, description and view count extraction (#12582)
-+ [thesun] Add support for thesun.co.uk (#11298, #12674)
-+ [ceskatelevize:porady] Add support for porady (#7411, #12645)
-* [ceskatelevize] Improve extraction and remove URL replacement hacks
-+ [kaltura] Add support for iframe embeds (#12679)
-* [airmozilla] Fix extraction (#12670)
-* [wshh] Extract html5 entries and delegate to generic extractor (#12676)
-+ [raiplay] Extract subtitles
-+ [xfileshare] Add support for vidlo.us (#12660)
-+ [xfileshare] Add support for vidbom.com (#12661)
-+ [aenetworks] Add more video URL regular expressions (#12657)
-+ [odnoklassniki] Fix format sorting for 1080p quality
-+ [rtl2] Add support for you.rtl2.de (#10257)
-+ [vshare] Add support for vshare.io (#12278)
-
-
-version 2017.04.03
-
-Core
-+ [extractor/common] Add censorship check for TransTelekom ISP
-* [extractor/common] Move censorship 
checks to a separate method
-
-Extractors
-+ [discoveryvr] Add support for discoveryvr.com (#12578)
-+ [tv5mondeplus] Add support for tv5mondeplus.com (#11386)
-+ [periscope] Add support for pscp.tv URLs (#12618, #12625)
-
-
-version 2017.04.02
-
-Core
-* [YoutubeDL] Return early when extraction of url_transparent fails
-
-Extractors
-* [rai] Fix and improve extraction (#11790)
-+ [vrv] Add support for series pages
-* [limelight] Improve extraction for audio only formats
-* [funimation] Fix extraction (#10696, #11773)
-+ [xfileshare] Add support for vidabc.com (#12589)
-+ [xfileshare] Improve extraction and extract HLS formats
-+ [crunchyroll] Pass geo verification proxy
-+ [cwtv] Extract ISM formats
-+ [tvplay] Bypass geo restriction
-+ [vrv] Add support for vrv.co
-+ [packtpub] Add support for packtpub.com (#12610)
-+ [generic] Pass base_url to _parse_jwplayer_data
-+ [adn] Add support for animedigitalnetwork.fr (#4866)
-+ [allocine] Extract more metadata
-* [allocine] Fix extraction (#12592)
-* [openload] Fix extraction
-
-
-version 2017.03.26
-
-Core
-* Don't raise an error if JWPlayer config data is not a JavaScript object
-  literal. _find_jwplayer_data now returns a dict rather than a str. (#12307)
-* Expand environment variables for options representing paths (#12556)
-+ [utils] Introduce expand_path
-* [downloader/hls] Delegate downloading to ffmpeg immediately for live streams
-
-Extractors
-* [afreecatv] Fix extraction (#12179)
-+ [atvat] Add support for atv.at (#5325)
-+ [fox] Add metadata extraction (#12391)
-+ [atresplayer] Extract DASH formats
-+ [atresplayer] Extract HD manifest (#12548)
-* [atresplayer] Fix login error detection (#12548)
-* [franceculture] Fix extraction (#12547)
-* [youtube] Improve URL regular expression (#12538)
-* [generic] Do not follow redirects to the same URL
-
-
-version 2017.03.24
-
-Extractors
-- [9c9media] Remove mp4 URL extraction request
-+ [bellmedia] Add support for etalk.ca and space.ca (#12447)
-* [channel9] Fix extraction (#11323)
-* [cloudy] Fix extraction (#12525)
-+ [hbo] Add support for free episode URLs and new formats extraction (#12519)
-* [condenast] Fix extraction and style (#12526)
-* [viu] Relax URL regular expression (#12529)
-
-
-version 2017.03.22
-
-Extractors
-- [pluralsight] Omit module title from video title (#12506)
-* [pornhub] Decode obfuscated video URL (#12470, #12515)
-* [senateisvp] Allow https URL scheme for embeds (#12512)
-
-
-version 2017.03.20
-
-Core
-+ [YoutubeDL] Allow multiple input URLs to be used with stdout (-) as
-  output template
-+ [adobepass] Detect and output error on authz token extraction (#12472)
-
-Extractors
-+ [bostonglobe] Add extractor for bostonglobe.com (#12099)
-+ [toongoggles] Add support for toongoggles.com (#12171)
-+ [medialaan] Add support for Medialaan sites (#9974, #11912)
-+ [discoverynetworks] Add support for more domains and bypass geo restriction
-* [openload] Fix extraction (#10408)
-
-
-version 2017.03.16
-
-Core
-+ [postprocessor/ffmpeg] Add support for flac
-+ [extractor/common] Extract SMIL formats from jwplayer
-
-Extractors
-+ [generic] Add forgotten return for jwplayer formats
-* [redbulltv] Improve extraction
-
-
-version 2017.03.15
-
-Core
-* Fix missing subtitles if --add-metadata is used (#12423)
-
-Extractors
-* [facebook] Make title optional (#12443)
-+ [mitele] Add support for ooyala videos (#12430)
-* [openload] Fix extraction (#12435, #12446)
-* [streamable] Update API URL (#12433)
-+ [crunchyroll] Extract season name (#12428)
-* [discoverygo] Bypass geo 
restriction -+ [discoverygo:playlist] Add support for playlists (#12424) - - -version 2017.03.10 - -Extractors -* [generic] Make title optional for jwplayer embeds (#12410) -* [wdr:maus] Fix extraction (#12373) -* [prosiebensat1] Improve title extraction (#12318, #12327) -* [dplayit] Separate and rewrite extractor and bypass geo restriction (#12393) -* [miomio] Fix extraction (#12291, #12388, #12402) -* [telequebec] Fix description extraction (#12399) -* [openload] Fix extraction (#12357) -* [brightcove:legacy] Relax videoPlayer validation check (#12381) - - -version 2017.03.07 - -Core -* Metadata are now added after conversion (#5594) - -Extractors -* [soundcloud] Update client id (#12376) -* [openload] Fix extraction (#10408, #12357) - - -version 2017.03.06 - -Core -+ [utils] Process bytestrings in urljoin (#12369) -* [extractor/common] Improve height extraction and extract bitrate -* [extractor/common] Move jwplayer formats extraction in separate method -+ [external:ffmpeg] Limit test download size to 10KiB (#12362) - -Extractors -+ [drtv] Add geo countries to GeoRestrictedError -+ [drtv:live] Bypass geo restriction -+ [tunepk] Add extractor (#12197, #12243) - - -version 2017.03.05 - -Extractors -+ [twitch] Add basic support for two-factor authentication (#11974) -+ [vier] Add support for vijf.be (#12304) -+ [redbulltv] Add support for redbull.tv (#3919, #11948) -* [douyutv] Switch to the PC API to escape the 5-min limitation (#12316) -+ [generic] Add support for rutube embeds -+ [rutube] Relax URL regular expression -+ [vrak] Add support for vrak.tv (#11452) -+ [brightcove:new] Add ability to smuggle geo_countries into URL -+ [brightcove:new] Raise GeoRestrictedError -* [go] Relax URL regular expression (#12341) -* [24video] Use original host for requests (#12339) -* [ruutu] Disable DASH formats (#12322) - - -version 2017.03.02 - -Core -+ [adobepass] Add support for Charter Spectrum (#11465) -* [YoutubeDL] Don't sanitize identifiers in output template (#12317) - -Extractors -* [facebook] Fix extraction (#12323, #12330) -* [youtube] Mark errors about rental videos as expected (#12324) -+ [npo] Add support for audio -* [npo] Adapt to app.php API (#12311, #12320) - - -version 2017.02.28 - -Core -+ [utils] Add bytes_to_long and long_to_bytes -+ [utils] Add pkcs1pad -+ [aes] Add aes_cbc_encrypt - -Extractors -+ [azmedien:showplaylist] Add support for show playlists (#12160) -+ [youtube:playlist] Recognize another playlist pattern (#11928, #12286) -+ [daisuki] Add support for daisuki.net (#2486, #3186, #4738, #6175, #7776, - #10060) -* [douyu] Fix extraction (#12301) - - -version 2017.02.27 - -Core -* [downloader/common] Limit displaying 2 digits after decimal point in sleep - interval message (#12183) -+ [extractor/common] Add preference to _parse_html5_media_entries - -Extractors -+ [npo] Add support for zapp.nl -+ [npo] Add support for hetklokhuis.nl (#12293) -- [scivee] Remove extractor (#9315) -+ [cda] Decode download URL (#12255) -+ [crunchyroll] Improve uploader extraction (#12267) -+ [youtube] Raise GeoRestrictedError -+ [dailymotion] Raise GeoRestrictedError -+ [mdr] Recognize more URL patterns (#12169) -+ [tvigle] Raise GeoRestrictedError -* [vevo] Fix extraction for videos with the new streams/streamsV3 format - (#11719) -+ [freshlive] Add support for freshlive.tv (#12175) -+ [xhamster] Capture and output videoClosed error (#12263) -+ [etonline] Add support for etonline.com (#12236) -+ [njpwworld] Add support for njpwworld.com (#11561) -* [amcnetworks] Relax URL regular expression 
(#12127) - - -version 2017.02.24.1 - -Extractors -* [noco] Modernize -* [noco] Switch login URL to https (#12246) -+ [thescene] Extract more metadata -* [thescene] Fix extraction (#12235) -+ [tubitv] Use geo bypass mechanism -* [openload] Fix extraction (#10408) -+ [ivi] Raise GeoRestrictedError - - -version 2017.02.24 - -Core -* [options] Hide deprecated options from --help -* [options] Deprecate --autonumber-size -+ [YoutubeDL] Add support for string formatting operations in output template - (#5185, #5748, #6841, #9929, #9966 #9978, #12189) - -Extractors -+ [lynda:course] Add webpage extraction fallback (#12238) -* [go] Sign all uplynk URLs and use geo bypass only for free videos - (#12087, #12210) -+ [skylinewebcams] Add support for skylinewebcams.com (#12221) -+ [instagram] Add support for multi video posts (#12226) -+ [crunchyroll] Extract playlist entries ids -* [mgtv] Fix extraction -+ [sohu] Raise GeoRestrictedError -+ [leeco] Raise GeoRestrictedError and use geo bypass mechanism - - -version 2017.02.22 - -Extractors -* [crunchyroll] Fix descriptions with double quotes (#12124) -* [dailymotion] Make comment count optional (#12209) -+ [vidzi] Add support for vidzi.cc (#12213) -+ [24video] Add support for 24video.tube (#12217) -+ [crackle] Use geo bypass mechanism -+ [viewster] Use geo verification headers -+ [tfo] Improve geo restriction detection and use geo bypass mechanism -+ [telequebec] Use geo bypass mechanism -+ [limelight] Extract PlaylistService errors and improve geo restriction - detection - - -version 2017.02.21 - -Core -* [extractor/common] Allow calling _initialize_geo_bypass from extractors - (#11970) -+ [adobepass] Add support for Time Warner Cable (#12191) -+ [travis] Run tests in parallel -+ [downloader/ism] Honor HTTP headers when downloading fragments -+ [downloader/dash] Honor HTTP headers when downloading fragments -+ [utils] Add GeoUtils class for working with geo tools and GeoUtils.random_ipv4 -+ Add option --geo-bypass-country for explicit geo bypass on behalf of - specified country -+ Add options to control geo bypass mechanism --geo-bypass and --no-geo-bypass -+ Add experimental geo restriction bypass mechanism based on faking - X-Forwarded-For HTTP header -+ [utils] Introduce GeoRestrictedError for geo restricted videos -+ [utils] Introduce YoutubeDLError base class for all youtube-dl exceptions - -Extractors -+ [ninecninemedia] Use geo bypass mechanism -* [spankbang] Make uploader optional (#12193) -+ [iprima] Improve geo restriction detection and disable geo bypass -* [iprima] Modernize -* [commonmistakes] Disable UnicodeBOM extractor test for python 3.2 -+ [prosiebensat1] Throw ExtractionError on unsupported page type (#12180) -* [nrk] Update _API_HOST and relax _VALID_URL -+ [tv4] Bypass geo restriction and improve detection -* [tv4] Switch to hls3 protocol (#12177) -+ [viki] Improve geo restriction detection -+ [vgtv] Improve geo restriction detection -+ [srgssr] Improve geo restriction detection -+ [vbox7] Improve geo restriction detection and use geo bypass mechanism -+ [svt] Improve geo restriction detection and use geo bypass mechanism -+ [pbs] Improve geo restriction detection and use geo bypass mechanism -+ [ondemandkorea] Improve geo restriction detection and use geo bypass mechanism -+ [nrk] Improve geo restriction detection and use geo bypass mechanism -+ [itv] Improve geo restriction detection and use geo bypass mechanism -+ [go] Improve geo restriction detection and use geo bypass mechanism -+ [dramafever] Improve geo restriction 
detection and use geo bypass mechanism -* [brightcove:legacy] Restrict videoPlayer value (#12040) -+ [tvn24] Add support for tvn24.pl and tvn24bis.pl (#11679) -+ [thisav] Add support for HTML5 media (#11771) -* [metacafe] Bypass family filter (#10371) -* [viceland] Improve info extraction - - -version 2017.02.17 - -Extractors -* [heise] Improve extraction (#9725) -* [ellentv] Improve (#11653) -* [openload] Fix extraction (#10408, #12002) -+ [theplatform] Recognize URLs with whitespaces (#12044) -* [einthusan] Relax URL regular expression (#12141, #12159) -+ [generic] Support complex JWPlayer embedded videos (#12030) -* [elpais] Improve extraction (#12139) - - -version 2017.02.16 - -Core -+ [utils] Add support for quoted string literals in --match-filter (#8050, - #12142, #12144) - -Extractors -* [ceskatelevize] Lower priority for audio description sources (#12119) -* [amcnetworks] Fix extraction (#12127) -* [pinkbike] Fix uploader extraction (#12054) -+ [onetpl] Add support for businessinsider.com.pl and plejada.pl -+ [onetpl] Add support for onet.pl (#10507) -+ [onetmvp] Add shortcut extractor -+ [vodpl] Add support for vod.pl (#12122) -+ [pornhub] Extract video URL from tv platform site (#12007, #12129) -+ [ceskatelevize] Extract DASH formats (#12119, #12133) - - -version 2017.02.14 - -Core -* TypeError is fixed with Python 2.7.13 on Windows (#11540, #12085) - -Extractor -* [zdf] Fix extraction (#12117) -* [xtube] Fix extraction for both kinds of video id (#12088) -* [xtube] Improve title extraction (#12088) -+ [lemonde] Fallback delegate extraction to generic extractor (#12115, #12116) -* [bellmedia] Allow video id longer than 6 characters (#12114) -+ [limelight] Add support for referer protected videos -* [disney] Improve extraction (#4975, #11000, #11882, #11936) -* [hotstar] Improve extraction (#12096) -* [einthusan] Fix extraction (#11416) -+ [aenetworks] Add support for lifetimemovieclub.com (#12097) -* [youtube] Fix parsing codecs (#12091) - - -version 2017.02.11 - -Core -+ [utils] Introduce get_elements_by_class and get_elements_by_attribute - utility functions -+ [extractor/common] Skip m3u8 manifests protected with Adobe Flash Access - -Extractor -* [pluralsight:course] Fix extraction (#12075) -+ [bbc] Extract m3u8 formats with 320k audio -* [facebook] Relax video id matching (#11017, #12055, #12056) -+ [corus] Add support for Corus Entertainment sites (#12060, #9164) -+ [pluralsight] Detect blocked account error message (#12070) -+ [bloomberg] Add another video id pattern (#12062) -* [extractor/commonmistakes] Restrict URL regular expression (#12050) -+ [tvplayer] Add support for tvplayer.com - - -version 2017.02.10 - -Extractors -* [xtube] Fix extraction (#12023) -* [pornhub] Fix extraction (#12007, #12018) -* [facebook] Improve JS data regular expression (#12042) -* [kaltura] Improve embed partner id extraction (#12041) -+ [sprout] Add support for sproutonline.com -* [6play] Improve extraction -+ [scrippsnetworks:watch] Add support for Scripps Networks sites (#10765) -+ [go] Add support for Adobe Pass authentication (#11468, #10831) -* [6play] Fix extraction (#12011) -+ [nbc] Add support for Adobe Pass authentication (#12006) - - -version 2017.02.07 - -Core -* [extractor/common] Fix audio only with audio group in m3u8 (#11995) -+ [downloader/fragment] Respect --no-part -* [extractor/common] Speed-up HTML5 media entries extraction (#11979) - -Extractors -* [pornhub] Fix extraction (#11997) -+ [canalplus] Add support for cstar.fr (#11990) -+ [extractor/generic] Improve RTMP 
support (#11993) -+ [gaskrank] Add support for gaskrank.tv (#11685) -* [bandcamp] Fix extraction for incomplete albums (#11727) -* [iwara] Fix extraction (#11781) -* [googledrive] Fix extraction on Python 3.6 -+ [videopress] Add support for videopress.com -+ [afreecatv] Extract RTMP formats - - -version 2017.02.04.1 - -Extractors -+ [twitch:stream] Add support for player.twitch.tv (#11971) -* [radiocanada] Fix extraction for toutv rtmp formats - - -version 2017.02.04 - -Core -+ Add --playlist-random to shuffle playlists (#11889, #11901) -* [utils] Improve comments processing in js_to_json (#11947) -* [utils] Handle single-line comments in js_to_json -* [downloader/external:ffmpeg] Minimize the use of aac_adtstoasc filter - -Extractors -+ [piksel] Add another app token pattern (#11969) -+ [vk] Capture and output author blocked error message (#11965) -+ [turner] Fix secure HLS formats downloading with ffmpeg (#11358, #11373, - #11800) -+ [drtv] Add support for live and radio sections (#1827, #3427) -* [myspace] Fix extraction and extract HLS and HTTP formats -+ [youtube] Add format info for itag 325 and 328 -* [vine] Fix extraction (#11955) -- [sportbox] Remove extractor (#11954) -+ [filmon] Add support for filmon.com (#11187) -+ [infoq] Add audio only formats (#11565) -* [douyutv] Improve room id regular expression (#11931) -* [iprima] Fix extraction (#11920, #11896) -* [youtube] Fix ytsearch when cookies are provided (#11924) -* [go] Relax video id regular expression (#11937) -* [facebook] Fix title extraction (#11941) -+ [youtube:playlist] Recognize TL playlists (#11945) -+ [bilibili] Support new Bangumi URLs (#11845) -+ [cbc:watch] Extract audio codec for audio only formats (#11893) -+ [elpais] Fix extraction for some URLs (#11765) - - -version 2017.02.01 - -Extractors -+ [facebook] Add another fallback extraction scenario (#11926) -* [prosiebensat1] Fix extraction of descriptions (#11810, #11929) -- [crunchyroll] Remove ScaledBorderAndShadow settings (#9028) -+ [vimeo] Extract upload timestamp -+ [vimeo] Extract license (#8726, #11880) -+ [nrk:series] Add support for series (#11571, #11711) - - -version 2017.01.31 - -Core -+ [compat] Add compat_etree_register_namespace - -Extractors -* [youtube] Fix extraction for domainless player URLs (#11890, #11891, #11892, - #11894, #11895, #11897, #11900, #11903, #11904, #11906, #11907, #11909, - #11913, #11914, #11915, #11916, #11917, #11918, #11919) -+ [vimeo] Extract both mixed and separated DASH formats -+ [ruutu] Extract DASH formats -* [itv] Fix extraction for python 2.6 - - -version 2017.01.29 - -Core -* [extractor/common] Fix initialization template (#11605, #11825) -+ [extractor/common] Document fragment_base_url and fragment's path fields -* [extractor/common] Fix duration per DASH segment (#11868) -+ Introduce --autonumber-start option for initial value of %(autonumber)s - template (#727, #2702, #9362, #10457, #10529, #11862) - -Extractors -+ [azmedien:playlist] Add support for topic and themen playlists (#11817) -* [npo] Fix subtitles extraction -+ [itv] Extract subtitles -+ [itv] Add support for itv.com (#9240) -+ [mtv81] Add support for mtv81.com (#7619) -+ [vlive] Add support for channels (#11826) -+ [kaltura] Add fallback for fileExt -+ [kaltura] Improve uploader_id extraction -+ [konserthusetplay] Add support for rspoplay.se (#11828) - - -version 2017.01.28 - -Core -* [utils] Improve parse_duration - -Extractors -* [crunchyroll] Improve series and season metadata extraction (#11832) -* [soundcloud] Improve formats extraction and 
extract audio bitrate -+ [soundcloud] Extract HLS formats -* [soundcloud] Fix track URL extraction (#11852) -+ [twitch:vod] Expand URL regular expressions (#11846) -* [aenetworks] Fix season episodes extraction (#11669) -+ [tva] Add support for videos.tva.ca (#11842) -* [jamendo] Improve and extract more metadata (#11836) -+ [disney] Add support for Disney sites (#7409, #11801, #4975, #11000) -* [vevo] Remove request to old API and catch API v2 errors -+ [cmt,mtv,southpark] Add support for episode URLs (#11837) -+ [youtube] Add fallback for duration extraction (#11841) - - -version 2017.01.25 - -Extractors -+ [openload] Fallback video extension to mp4 -+ [extractor/generic] Add support for Openload embeds (#11536, #11812) -* [srgssr] Fix rts video extraction (#11831) -+ [afreecatv:global] Add support for afreeca.tv (#11807) -+ [crackle] Extract vtt subtitles -+ [crackle] Extract multiple resolutions for thumbnails -+ [crackle] Add support for mobile URLs -+ [konserthusetplay] Extract subtitles (#11823) -+ [konserthusetplay] Add support for HLS videos (#11823) -* [vimeo:review] Fix config URL extraction (#11821) - - -version 2017.01.24 - -Extractors -* [pluralsight] Fix extraction (#11820) -+ [nextmedia] Add support for NextTV (壹電視) -* [24video] Fix extraction (#11811) -* [youtube:playlist] Fix nonexistent and private playlist detection (#11604) -+ [chirbit] Extract uploader (#11809) - - -version 2017.01.22 - -Extractors -+ [pornflip] Add support for pornflip.com (#11556, #11795) -* [chaturbate] Fix extraction (#11797, #11802) -+ [azmedien] Add support for AZ Medien sites (#11784, #11785) -+ [nextmedia] Support redirected URLs -+ [vimeo:channel] Extract videos' titles for playlist entries (#11796) -+ [youtube] Extract episode metadata (#9695, #11774) -+ [cspan] Support Ustream embedded videos (#11547) -+ [1tv] Add support for HLS videos (#11786) -* [uol] Fix extraction (#11770) -* [mtv] Relax triforce feed regular expression (#11766) - - -version 2017.01.18 - -Extractors -* [bilibili] Fix extraction (#11077) -+ [canalplus] Add fallback for video id (#11764) -* [20min] Fix extraction (#11683, #11751) -* [imdb] Extend URL regular expression (#11744) -+ [naver] Add support for tv.naver.com links (#11743) - - -version 2017.01.16 - -Core -* [options] Apply custom config to final composite configuration (#11741) -* [YoutubeDL] Improve protocol auto determining (#11720) - -Extractors -* [xiami] Relax URL regular expressions -* [xiami] Improve track metadata extraction (#11699) -+ [limelight] Check hand-make direct HTTP links -+ [limelight] Add support for direct HTTP links at video.llnw.net (#11737) -+ [brightcove] Recognize another player ID pattern (#11688) -+ [niconico] Support login via cookies (#7968) -* [yourupload] Fix extraction (#11601) -+ [beam:live] Add support for beam.pro live streams (#10702, #11596) -* [vevo] Improve geo restriction detection -+ [dramafever] Add support for URLs with language code (#11714) -* [cbc] Improve playlist support (#11704) - - -version 2017.01.14 - -Core -+ [common] Add ability to customize akamai manifest host -+ [utils] Add more date formats - -Extractors -- [mtv] Eliminate _transform_rtmp_url -* [mtv] Generalize triforce mgid extraction -+ [cmt] Add support for full episodes and video clips (#11623) -+ [mitele] Extract DASH formats -+ [ooyala] Add support for videos with embedToken (#11684) -* [mixcloud] Fix extraction (#11674) -* [openload] Fix extraction (#10408) -* [tv4] Improve extraction (#11698) -* [freesound] Fix and improve extraction (#11602) -+ 
[nick] Add support for beta.nick.com (#11655) -* [mtv,cc] Use HLS by default with native HLS downloader (#11641) -* [mtv] Fix non-HLS extraction - - -version 2017.01.10 - -Extractors -* [youtube] Fix extraction (#11663, #11664) -+ [inc] Add support for inc.com (#11277, #11647) -+ [youtube] Add itag 212 (#11575) -+ [egghead:course] Add support for egghead.io courses - - -version 2017.01.08 - -Core -* Fix "invalid escape sequence" errors under Python 3.6 (#11581) - -Extractors -+ [hitrecord] Add support for hitrecord.org (#10867, #11626) -- [videott] Remove extractor -* [swrmediathek] Improve extraction -- [sharesix] Remove extractor -- [aol:features] Remove extractor -* [sendtonews] Improve info extraction -* [3sat,phoenix] Fix extraction (#11619) -* [comedycentral/mtv] Add support for HLS videos (#11600) -* [discoverygo] Fix JSON data parsing (#11219, #11522) - - -version 2017.01.05 - -Extractors -+ [zdf] Fix extraction (#11055, #11063) -* [pornhub:playlist] Improve extraction (#11594) -+ [cctv] Add support for ncpa-classic.com (#11591) -+ [tunein] Add support for embeds (#11579) - - -version 2017.01.02 - -Extractors -* [cctv] Improve extraction (#879, #6753, #8541) -+ [nrktv:episodes] Add support for episodes (#11571) -+ [arkena] Add support for video.arkena.com (#11568) - - -version 2016.12.31 - -Core -+ Introduce --config-location option for custom configuration files (#6745, - #10648) - -Extractors -+ [twitch] Add support for player.twitch.tv (#11535, #11537) -+ [videa] Add support for videa.hu (#8181, #11133) -* [vk] Fix postlive videos extraction -* [vk] Extract from playerParams (#11555) -- [freevideo] Remove extractor (#11515) -+ [showroomlive] Add support for showroom-live.com (#11458) -* [xhamster] Fix duration extraction (#11549) -* [rtve:live] Fix extraction (#11529) -* [brightcove:legacy] Improve embeds detection (#11523) -+ [twitch] Add support for rechat messages (#11524) -* [acast] Fix audio and timestamp extraction (#11521) - - -version 2016.12.22 - -Core -* [extractor/common] Improve detection of video-only formats in m3u8 - manifests (#11507) - -Extractors -+ [theplatform] Pass geo verification headers to SMIL request (#10146) -+ [viu] Pass geo verification headers to auth request -* [rtl2] Extract more formats and metadata -* [vbox7] Skip malformed JSON-LD (#11501) -* [uplynk] Force downloading using native HLS downloader (#11496) -+ [laola1] Add support for another extraction scenario (#11460) - - -version 2016.12.20 - -Core -* [extractor/common] Improve fragment URL construction for DASH media -* [extractor/common] Fix codec information extraction for mixed audio/video - DASH media (#11490) - -Extractors -* [vbox7] Fix extraction (#11494) -+ [uktvplay] Add support for uktvplay.uktv.co.uk (#11027) -+ [piksel] Add support for player.piksel.com (#11246) -+ [vimeo] Add support for DASH formats -* [vimeo] Fix extraction for HLS formats (#11490) -* [kaltura] Fix wrong widget ID in some cases (#11480) -+ [nrktv:direkte] Add support for live streams (#11488) -* [pbs] Fix extraction for geo restricted videos (#7095) -* [brightcove:new] Skip widevine classic videos -+ [viu] Add support for viu.com (#10607, #11329) - - -version 2016.12.18 - -Core -+ [extractor/common] Recognize DASH formats in html5 media entries - -Extractors -+ [ccma] Add support for ccma.cat (#11359) -* [laola1tv] Improve extraction -+ [laola1tv] Add support embed URLs (#11460) -* [nbc] Fix extraction for MSNBC videos (#11466) -* [twitch] Adapt to new videos pages URL schema (#11469) -+ [meipai] Add support 
for meipai.com (#10718) -* [jwplatform] Improve subtitles and duration extraction -+ [ondemandkorea] Add support for ondemandkorea.com (#10772) -+ [vvvvid] Add support for vvvvid.it (#5915) - - -version 2016.12.15 - -Core -+ [utils] Add convenience urljoin - -Extractors -+ [openload] Recognize oload.tv URLs (#10408) -+ [facebook] Recognize .onion URLs (#11443) -* [vlive] Fix extraction (#11375, #11383) -+ [canvas] Extract DASH formats -+ [melonvod] Add support for vod.melon.com (#11419) - - -version 2016.12.12 - -Core -+ [utils] Add common user agents map -+ [common] Recognize HLS manifests that contain video only formats (#11394) - -Extractors -+ [dplay] Use Safari user agent for HLS (#11418) -+ [facebook] Detect login required error message -* [facebook] Improve video selection (#11390) -+ [canalplus] Add another video id pattern (#11399) -* [mixcloud] Relax URL regular expression (#11406) -* [ctvnews] Relax URL regular expression (#11394) -+ [rte] Capture and output error message (#7746, #10498) -+ [prosiebensat1] Add support for DASH formats -* [srgssr] Improve extraction for geo restricted videos (#11089) -* [rts] Improve extraction for geo restricted videos (#4989) - - -version 2016.12.09 - -Core -* [socks] Fix error reporting (#11355) - -Extractors -* [openload] Fix extraction (#10408) -* [pandoratv] Fix extraction (#11023) -+ [telebruxelles] Add support for emission URLs -* [telebruxelles] Extract all formats -+ [bloomberg] Add another video id regular expression (#11371) -* [fusion] Update ooyala id regular expression (#11364) -+ [1tv] Add support for playlists (#11335) -* [1tv] Improve extraction (#11335) -+ [aenetworks] Extract more formats (#11321) -+ [thisoldhouse] Recognize /tv-episode/ URLs (#11271) - - -version 2016.12.01 - -Extractors -* [soundcloud] Update client id (#11327) -* [ruutu] Detect DRM protected videos -+ [liveleak] Add support for youtube embeds (#10688) -* [spike] Fix full episodes support (#11312) -* [comedycentral] Fix full episodes support -* [normalboots] Rewrite in terms of JWPlatform (#11184) -* [teamfourstar] Rewrite in terms of JWPlatform (#11184) -- [screenwavemedia] Remove extractor (#11184) - - -version 2016.11.27 - -Extractors -+ [webcaster] Add support for webcaster.pro -+ [azubu] Add support for azubu.uol.com.br (#11305) -* [viki] Prefer hls formats -* [viki] Fix rtmp formats extraction (#11255) -* [puls4] Relax URL regular expression (#11267) -* [vevo] Improve artist extraction (#10911) -* [mitele] Relax URL regular expression and extract more metadata (#11244) -+ [cbslocal] Recognize New York site (#11285) -+ [youtube:playlist] Pass disable_polymer in URL query (#11193) - - -version 2016.11.22 - -Extractors -* [hellporno] Fix video extension extraction (#11247) -+ [hellporno] Add support for hellporno.net (#11247) -+ [amcnetworks] Recognize more BBC America URLs (#11263) -* [funnyordie] Improve extraction (#11208) -* [extractor/generic] Improve limelight embeds support -- [crunchyroll] Remove ScaledBorderAndShadow from ASS subtitles (#8207, #9028) -* [bandcamp] Fix free downloads extraction and extract all formats (#11067) -* [twitter:card] Relax URL regular expression (#11225) -+ [tvanouvelles] Add support for tvanouvelles.ca (#10616) - - -version 2016.11.18 - -Extractors -* [youtube:live] Relax URL regular expression (#11164) -* [openload] Fix extraction (#10408, #11122) -* [vlive] Prefer locale over language for subtitles id (#11203) - - -version 2016.11.14.1 - -Core -+ [downoader/fragment,f4m,hls] Respect HTTP headers from info dict -* 
[extractor/common] Fix media templates with Bandwidth substitution pattern in - MPD manifests (#11175) -* [extractor/common] Improve thumbnail extraction from JSON-LD - -Extractors -+ [nrk] Workaround geo restriction -+ [nrk] Improve error detection and messages -+ [afreecatv] Add support for vod.afreecatv.com (#11174) -* [cda] Fix and improve extraction (#10929, #10936) -* [plays] Fix extraction (#11165) -* [eagleplatform] Fix extraction (#11160) -+ [audioboom] Recognize /posts/ URLs (#11149) - - -version 2016.11.08.1 - -Extractors -* [espn:article] Fix support for espn.com articles -* [franceculture] Fix extraction (#11140) - - -version 2016.11.08 - -Extractors -* [tmz:article] Fix extraction (#11052) -* [espn] Fix extraction (#11041) -* [mitele] Fix extraction after website redesign (#10824) -- [ard] Remove age restriction check (#11129) -* [generic] Improve support for pornhub.com embeds (#11100) -+ [generic] Add support for redtube.com embeds (#11099) -+ [generic] Add support for drtuber.com embeds (#11098) -+ [redtube] Add support for embed URLs -+ [drtuber] Add support for embed URLs -+ [yahoo] Improve content id extraction (#11088) -* [toutv] Relax URL regular expression (#11121) - - -version 2016.11.04 - -Core -* [extractor/common] Tolerate malformed RESOLUTION attribute in m3u8 - manifests (#11113) -* [downloader/ism] Fix AVC Decoder Configuration Record - -Extractors -+ [fox9] Add support for fox9.com (#11110) -+ [anvato] Extract more metadata and improve formats extraction -* [vodlocker] Improve removed videos detection (#11106) -+ [vzaar] Add support for vzaar.com (#11093) -+ [vice] Add support for uplynk preplay videos (#11101) -* [tubitv] Fix extraction (#11061) -+ [shahid] Add support for authentication (#11091) -+ [radiocanada] Add subtitles support (#11096) -+ [generic] Add support for ISM manifests - - -version 2016.11.02 - -Core -+ Add basic support for Smooth Streaming protocol (#8118, #10969) -* Improve MPD manifest base URL extraction (#10909, #11079) -* Fix --match-filter for int-like strings (#11082) - -Extractors -+ [mva] Add support for ISM formats -+ [msn] Add support for ISM formats -+ [onet] Add support for ISM formats -+ [tvp] Add support for ISM formats -+ [nicknight] Add support for nicknight sites (#10769) - - -version 2016.10.30 - -Extractors -* [facebook] Improve 1080P video detection (#11073) -* [imgur] Recognize /r/ URLs (#11071) -* [beeg] Fix extraction (#11069) -* [openload] Fix extraction (#10408) -* [gvsearch] Modernize and fix search request (#11051) -* [adultswim] Fix extraction (#10979) -+ [nobelprize] Add support for nobelprize.org (#9999) -* [hornbunny] Fix extraction (#10981) -* [tvp] Improve video id extraction (#10585) - - -version 2016.10.26 - -Extractors -+ [rentv] Add support for ren.tv (#10620) -+ [ard] Detect unavailable videos (#11018) -* [vk] Fix extraction (#11022) - - -version 2016.10.25 - -Core -* Running youtube-dl in the background is fixed (#10996, #10706, #955) - -Extractors -+ [jamendo] Add support for jamendo.com (#10132, #10736) -+ [pandatv] Add support for panda.tv (#10736) -+ [dotsub] Support Vimeo embed (#10964) -* [litv] Fix extraction -+ [vimeo] Delegate ondemand redirects to ondemand extractor (#10994) -* [vivo] Fix extraction (#11003) -+ [twitch:stream] Add support for rebroadcasts (#10995) -* [pluralsight] Fix subtitles conversion (#10990) - - -version 2016.10.21.1 - -Extractors -+ [pluralsight] Process all clip URLs (#10984) - - -version 2016.10.21 - -Core -- Disable thumbnails embedding in mkv -+ Add support for 
Comcast multiple-system operator (#10819) - -Extractors -* [pluralsight] Adapt to new API (#10972) -* [openload] Fix extraction (#10408, #10971) -+ [natgeo] Extract m3u8 formats (#10959) - - -version 2016.10.19 - -Core -+ [utils] Expose PACKED_CODES_RE -+ [extractor/common] Extract non smil wowza mpd manifests -+ [extractor/common] Detect f4m audio-only formats - -Extractors -* [vidzi] Fix extraction (#10908, #10952) -* [urplay] Fix subtitles extraction -+ [urplay] Add support for urskola.se (#10915) -+ [orf] Add subtitles support (#10939) -* [youtube] Fix --no-playlist behavior for youtu.be/id URLs (#10896) -* [nrk] Relax URL regular expression (#10928) -+ [nytimes] Add support for podcasts (#10926) -* [pluralsight] Relax URL regular expression (#10941) - - -version 2016.10.16 - -Core -* [postprocessor/ffmpeg] Return correct filepath and ext in updated information - in FFmpegExtractAudioPP (#10879) - -Extractors -+ [ruutu] Add support for supla.fi (#10849) -+ [theoperaplatform] Add support for theoperaplatform.eu (#10914) -* [lynda] Fix height for prioritized streams -+ [lynda] Add fallback extraction scenario -* [lynda] Switch to https (#10916) -+ [huajiao] New extractor (#10917) -* [cmt] Fix mgid extraction (#10813) -+ [safari:course] Add support for techbus.safaribooksonline.com -* [orf:tvthek] Fix extraction and modernize (#10898) -* [chirbit] Fix extraction of user profile pages -* [carambatv] Fix extraction -* [canalplus] Fix extraction for some videos -* [cbsinteractive] Fix extraction for cnet.com -* [parliamentliveuk] Lower case URLs are now recognized (#10912) - - -version 2016.10.12 - -Core -+ Support HTML media elements without child nodes -* [Makefile] Support for GNU make < 4 is fixed; BSD make dropped (#9387) - -Extractors -* [dailymotion] Fix extraction (#10901) -* [vimeo:review] Fix extraction (#10900) -* [nhl] Correctly handle invalid formats (#10713) -* [footyroom] Fix extraction (#10810) -* [abc.net.au:iview] Fix for standalone (non series) videos (#10895) -+ [hbo] Add support for episode pages (#10892) -* [allocine] Fix extraction (#10860) -+ [nextmedia] Recognize action news on AppleDaily -* [lego] Improve info extraction and bypass geo restriction (#10872) - - -version 2016.10.07 - -Extractors -+ [iprima] Detect geo restriction -* [facebook] Fix video extraction (#10846) -+ [commonprotocols] Support direct MMS links (#10838) -+ [generic] Add support for multiple vimeo embeds (#10862) -+ [nzz] Add support for nzz.ch (#4407) -+ [npo] Detect geo restriction -+ [npo] Add support for 2doc.nl (#10842) -+ [lego] Add support for lego.com (#10369) -+ [tonline] Add support for t-online.de (#10376) -* [techtalks] Relax URL regular expression (#10840) -* [youtube:live] Extend URL regular expression (#10839) -+ [theweatherchannel] Add support for weather.com (#7188) -+ [thisoldhouse] Add support for thisoldhouse.com (#10837) -+ [nhl] Add support for wch2016.com (#10833) -* [pornoxo] Use JWPlatform to improve metadata extraction - - -version 2016.10.02 - -Core -* Fix possibly lost extended attributes during post-processing -+ Support pyxattr as well as python-xattr for --xattrs and - --xattr-set-filesize (#9054) - -Extractors -+ [jwplatform] Support DASH streams in JWPlayer -+ [jwplatform] Support old-style JWPlayer playlists -+ [byutv:event] Add extractor -* [periscope:user] Fix extraction (#10820) -* [dctp] Fix extraction (#10734) -+ [instagram] Extract video dimensions (#10790) -+ [tvland] Extend URL regular expression (#10812) -+ [vgtv] Add support for tv.aftonbladet.se 
(#10800) -- [aftonbladet] Remove extractor -* [vk] Fix timestamp and view count extraction (#10760) -+ [vk] Add support for running and finished live streams (#10799) -+ [leeco] Recognize more Le Sports URLs (#10794) -+ [instagram] Extract comments (#10788) -+ [ketnet] Extract mzsource formats (#10770) -* [limelight:media] Improve HTTP formats extraction - - -version 2016.09.27 - -Core -+ Add hdcore query parameter to akamai f4m formats -+ Delegate HLS live streams downloading to ffmpeg -+ Improved support for HTML5 subtitles - -Extractors -+ [vk] Add support for dailymotion embeds (#10661) -* [promptfile] Fix extraction (#10634) -* [kaltura] Speed up embed regular expressions (#10764) -+ [npo] Add support for anderetijden.nl (#10754) -+ [prosiebensat1] Add support for advopedia sites -* [mwave] Relax URL regular expression (#10735, #10748) -* [prosiebensat1] Fix playlist support (#10745) -+ [prosiebensat1] Add support for sat1gold sites (#10745) -+ [cbsnews:livevideo] Fix extraction and extract m3u8 formats -+ [brightcove:new] Add support for live streams -* [soundcloud] Generalize playlist entries extraction (#10733) -+ [mtv] Add support for new URL schema (#8169, #9808) -* [einthusan] Fix extraction (#10714) -+ [twitter] Support Periscope embeds (#10737) -+ [openload] Support subtitles (#10625) - - -version 2016.09.24 - -Core -+ Add support for watchTVeverywhere.com authentication provider based MSOs for - Adobe Pass authentication (#10709) - -Extractors -+ [soundcloud:playlist] Provide video id for early playlist entries (#10733) -+ [prosiebensat1] Add support for kabeleinsdoku (#10732) -* [cbs] Extract info from thunder videoPlayerService (#10728) -* [openload] Fix extraction (#10408) -+ [ustream] Support the new HLS streams (#10698) -+ [ooyala] Extract all HLS formats -+ [cartoonnetwork] Add support for Adobe Pass authentication -+ [soundcloud] Extract license metadata -+ [fox] Add support for Adobe Pass authentication (#8584) -+ [tbs] Add support for Adobe Pass authentication (#10642, #10222) -+ [trutv] Add support for Adobe Pass authentication (#10519) -+ [turner] Add support for Adobe Pass authentication - - -version 2016.09.19 - -Extractors -+ [crunchyroll] Check if already authenticated (#10700) -- [twitch:stream] Remove fallback to profile extraction when stream is offline -* [thisav] Improve title extraction (#10682) -* [vyborymos] Improve station info extraction - - -version 2016.09.18 - -Core -+ Introduce manifest_url and fragments fields in formats dictionary for - fragmented media -+ Provide manifest_url field for DASH segments, HLS and HDS -+ Provide fragments field for DASH segments -* Rework DASH segments downloader to use fragments field -+ Add helper method for Wowza Streaming Engine formats extraction - -Extractors -+ [vyborymos] Add extractor for vybory.mos.ru (#10692) -+ [xfileshare] Add title regular expression for streamin.to (#10646) -+ [globo:article] Add support for multiple videos (#10653) -+ [thisav] Recognize HTML5 videos (#10447) -* [jwplatform] Improve JWPlayer detection -+ [mangomolo] Add support for Mangomolo embeds -+ [toutv] Add support for authentication (#10669) -* [franceinter] Fix upload date extraction -* [tv4] Fix HLS and HDS formats extraction (#10659) - - -version 2016.09.15 - -Core -* Improve _hidden_inputs -+ Introduce improved explicit Adobe Pass support -+ Add --ap-mso to provide multiple-system operator identifier -+ Add --ap-username to provide MSO account username -+ Add --ap-password to provide MSO account password -+ Add --ap-list-mso 
to list all supported MSOs -+ Add support for Rogers Cable multiple-system operator (#10606) - -Extractors -* [crunchyroll] Fix authentication (#10655) -* [twitch] Fix API calls (#10654, #10660) -+ [bellmedia] Add support for more Bell Media Television sites -* [franceinter] Fix extraction (#10538, #2105) -* [kuwo] Improve error detection (#10650) -+ [go] Add support for free full episodes (#10439) -* [bilibili] Fix extraction for specific videos (#10647) -* [nhk] Fix extraction (#10633) -* [kaltura] Improve audio detection -* [kaltura] Skip chun format -+ [vimeo:ondemand] Pass Referer along with embed URL (#10624) -+ [nbc] Add support for NBC Olympics (#10361) - - -version 2016.09.11.1 - -Extractors -+ [tube8] Extract categories and tags (#10579) -+ [pornhub] Extract categories and tags (#10499) -* [openload] Temporary fix (#10408) -+ [foxnews] Add support Fox News articles (#10598) -* [viafree] Improve video id extraction (#10615) -* [iwara] Fix extraction after relaunch (#10462, #3215) -+ [tfo] Add extractor for tfo.org -* [lrt] Fix audio extraction (#10566) -* [9now] Fix extraction (#10561) -+ [canalplus] Add support for c8.fr (#10577) -* [newgrounds] Fix uploader extraction (#10584) -+ [polskieradio:category] Add support for category lists (#10576) -+ [ketnet] Add extractor for ketnet.be (#10343) -+ [canvas] Add support for een.be (#10605) -+ [telequebec] Add extractor for telequebec.tv (#1999) -* [parliamentliveuk] Fix extraction (#9137) - - -version 2016.09.08 - -Extractors -+ [jwplatform] Extract height from format label -+ [yahoo] Extract Brightcove Legacy Studio embeds (#9345) -* [videomore] Fix extraction (#10592) -* [foxgay] Fix extraction (#10480) -+ [rmcdecouverte] Add extractor for rmcdecouverte.bfmtv.com (#9709) -* [gamestar] Fix metadata extraction (#10479) -* [puls4] Fix extraction (#10583) -+ [cctv] Add extractor for CCTV and CNTV (#8153) -+ [lci] Add extractor for lci.fr (#10573) -+ [wat] Extract DASH formats -+ [viafree] Improve video id detection (#10569) -+ [trutv] Add extractor for trutv.com (#10519) -+ [nick] Add support for nickelodeon.nl (#10559) -+ [abcotvs:clips] Add support for clips.abcotvs.com -+ [abcotvs] Add support for ABC Owned Television Stations sites (#9551) -+ [miaopai] Add extractor for miaopai.com (#10556) -* [gamestar] Fix metadata extraction (#10479) -+ [bilibili] Add support for episodes (#10190) -+ [tvnoe] Add extractor for tvnoe.cz (#10524) - - -version 2016.09.04.1 - -Core -* In DASH downloader if the first segment fails, abort the whole download - process to prevent throttling (#10497) -+ Add support for --skip-unavailable-fragments and --fragment retries in - hlsnative downloader (#10165, #10448). 
-+ Add support for --skip-unavailable-fragments in DASH downloader -+ Introduce --skip-unavailable-fragments option for fragment based downloaders - that allows to skip fragments unavailable due to a HTTP error -* Fix extraction of video/audio entries with src attribute in - _parse_html5_media_entries (#10540) - -Extractors -* [theplatform] Relax URL regular expression (#10546) -* [youtube:playlist] Extend URL regular expression -* [rottentomatoes] Delegate extraction to internetvideoarchive extractor -* [internetvideoarchive] Extract all formats -* [pornvoisines] Fix extraction (#10469) -* [rottentomatoes] Fix extraction (#10467) -* [espn] Extend URL regular expression (#10549) -* [vimple] Extend URL regular expression (#10547) -* [youtube:watchlater] Fix extraction (#10544) -* [youjizz] Fix extraction (#10437) -+ [foxnews] Add support for FoxNews Insider (#10445) -+ [fc2] Recognize Flash player URLs (#10512) - - -version 2016.09.03 - -Core -* Restore usage of NAME attribute from EXT-X-MEDIA tag for formats codes in - _extract_m3u8_formats (#10522) -* Handle semicolon in mimetype2ext - -Extractors -+ [youtube] Add support for rental videos' previews (#10532) -* [youtube:playlist] Fallback to video extraction for video/playlist URLs when - no playlist is actually served (#10537) -+ [drtv] Add support for dr.dk/nyheder (#10536) -+ [facebook:plugins:video] Add extractor (#10530) -+ [go] Add extractor for *.go.com sites -* [adobepass] Check for authz_token expiration (#10527) -* [nytimes] improve extraction -* [thestar] Fix extraction (#10465) -* [glide] Fix extraction (#10478) -- [exfm] Remove extractor (#10482) -* [youporn] Fix categories and tags extraction (#10521) -+ [curiositystream] Add extractor for app.curiositystream.com -- [thvideo] Remove extractor (#10464) -* [movingimage] Fix for the new site name (#10466) -+ [cbs] Add support for once formats (#10515) -* [limelight] Skip ism snd duplicate manifests -+ [porncom] Extract categories and tags (#10510) -+ [facebook] Extract timestamp (#10508) -+ [yahoo] Extract more formats - - -version 2016.08.31 - -Extractors -* [soundcloud] Fix URL regular expression to avoid clashes with sets (#10505) -* [bandcamp:album] Fix title extraction (#10455) -* [pyvideo] Fix extraction (#10468) -+ [ctv] Add support for tsn.ca, bnn.ca and thecomedynetwork.ca (#10016) -* [9c9media] Extract more metadata -* [9c9media] Fix multiple stacks extraction (#10016) -* [adultswim] Improve video info extraction (#10492) -* [vodplatform] Improve embed regular expression -- [played] Remove extractor (#10470) -+ [tbs] Add extractor for tbs.com and tntdrama.com (#10222) -+ [cartoonnetwork] Add extractor for cartoonnetwork.com (#10110) -* [adultswim] Rework in terms of turner extractor -* [cnn] Rework in terms of turner extractor -* [nba] Rework in terms of turner extractor -+ [turner] Add base extractor for Turner Broadcasting System based sites -* [bilibili] Fix extraction (#10375) -* [openload] Fix extraction (#10408) - - -version 2016.08.28 - -Core -+ Add warning message that ffmpeg doesn't support SOCKS -* Improve thumbnail sorting -+ Extract formats from #EXT-X-MEDIA tags in _extract_m3u8_formats -* Fill IV with leading zeros for IVs shorter than 16 octets in hlsnative -+ Add ac-3 to the list of audio codecs in parse_codecs - -Extractors -* [periscope:user] Fix extraction (#10453) -* [douyutv] Fix extraction (#10153, #10318, #10444) -+ [nhk:vod] Add extractor for www3.nhk.or.jp on demand (#4437, #10424) -- [trutube] Remove extractor (#10438) -+ [usanetwork] Add 
extractor for usanetwork.com -* [crackle] Fix extraction (#10333) -* [spankbang] Fix description and uploader extraction (#10339) -* [discoverygo] Detect cable provider restricted videos (#10425) -+ [cbc] Add support for watch.cbc.ca -* [kickstarter] Silent the warning for og:description (#10415) -* [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) - - -version 2016.08.24.1 - -Extractors -+ [pluralsight] Add support for subtitles (#9681) - - -version 2016.08.24 - -Extractors -* [youtube] Fix authentication (#10392) -* [openload] Fix extraction (#10408) -+ [bravotv] Add support for Adobe Pass (#10407) -* [bravotv] Fix clip info extraction (#10407) -* [eagleplatform] Improve embedded videos detection (#10409) -* [awaan] Fix extraction -* [mtvservices:embedded] Update config URL -+ [abc:iview] Add extractor (#6148) - - -version 2016.08.22 - -Core -* Improve formats and subtitles extension auto calculation -+ Recognize full unit names in parse_filesize -+ Add support for m3u8 manifests in HTML5 multimedia tags -* Fix octal/hexadecimal number detection in js_to_json - -Extractors -+ [ivi] Add support for 720p and 1080p -+ [charlierose] Add new extractor (#10382) -* [1tv] Fix extraction (#9249) -* [twitch] Renew authentication -* [kaltura] Improve subtitles extension calculation -+ [zingmp3] Add support for video clips -* [zingmp3] Fix extraction (#10041) -* [kaltura] Improve subtitles extraction (#10279) -* [cultureunplugged] Fix extraction (#10330) -+ [cnn] Add support for money.cnn.com (#2797) -* [cbsnews] Fix extraction (#10362) -* [cbs] Fix extraction (#10393) -+ [litv] Support 'promo' URLs (#10385) -* [snotr] Fix extraction (#10338) -* [n-tv.de] Fix extraction (#10331) -* [globo:article] Relax URL and video id regular expressions (#10379) - - -version 2016.08.19 - -Core -- Remove output template description from --help -* Recognize lowercase units in parse_filesize - -Extractors -+ [porncom] Add extractor for porn.com (#2251, #10251) -+ [generic] Add support for DBTV embeds -* [vk:wallpost] Fix audio extraction for new site layout -* [vk] Fix authentication -+ [hgtvcom:show] Add extractor for hgtv.com shows (#10365) -+ [discoverygo] Add support for another GO network sites - - -version 2016.08.17 - -Core -+ Add _get_netrc_login_info - -Extractors -* [mofosex] Extract all formats (#10335) -+ [generic] Add support for vbox7 embeds -+ [vbox7] Add support for embed URLs -+ [viafree] Add extractor (#10358) -+ [mtg] Add support for viafree URLs (#10358) -* [theplatform] Extract all subtitles per language -+ [xvideos] Fix HLS extraction (#10356) -+ [amcnetworks] Add extractor -+ [bbc:playlist] Add support for pagination (#10349) -+ [fxnetworks] Add extractor (#9462) -* [cbslocal] Fix extraction for SendtoNews-based videos -* [sendtonews] Fix extraction -* [jwplatform] Extract video id from JWPlayer data -- [zippcast] Remove extractor (#10332) -+ [viceland] Add extractor (#8799) -+ [adobepass] Add base extractor for Adobe Pass Authentication -* [life:embed] Improve extraction -* [vgtv] Detect geo restricted videos (#10348) -+ [uplynk] Add extractor -* [xiami] Fix extraction (#10342) - - -version 2016.08.13 - -Core -* Show progress for curl external downloader -* Forward more options to curl external downloader - -Extractors -* [pbs] Fix description extraction -* [franceculture] Fix extraction (#10324) -* [pornotube] Fix extraction (#10322) -* [4tube] Fix metadata extraction (#10321) -* [imgur] Fix width and height extraction (#10325) -* [expotv] Improve extraction -+ [vbox7] 
Fix extraction (#10309) -- [tapely] Remove extractor (#10323) -* [muenchentv] Fix extraction (#10313) -+ [24video] Add support for .me and .xxx TLDs -* [24video] Fix comment count extraction -* [sunporno] Add support for embed URLs -* [sunporno] Fix metadata extraction (#10316) -+ [hgtv] Add extractor for hgtv.ca (#3999) -- [pbs] Remove request to unavailable API -+ [pbs] Add support for high quality HTTP formats -+ [crunchyroll] Add support for HLS formats (#10301) - - -version 2016.08.12 - -Core -* Subtitles are now written as is. Newline conversions are disabled. (#10268) -+ Recognize more formats in unified_timestamp - -Extractors -- [goldenmoustache] Remove extractor (#10298) -* [drtuber] Improve title extraction -* [drtuber] Make dislike count optional (#10297) -* [chirbit] Fix extraction (#10296) -* [francetvinfo] Relax URL regular expression -* [rtlnl] Relax URL regular expression (#10282) -* [formula1] Relax URL regular expression (#10283) -* [wat] Improve extraction (#10281) -* [ctsnews] Fix extraction - - -version 2016.08.10 - -Core -* Make --metadata-from-title non fatal when title does not match the pattern -* Introduce options for randomized sleep before each download - --min-sleep-interval and --max-sleep-interval (#9930) -* Respect default in _search_json_ld - -Extractors -+ [uol] Add extractor for uol.com.br (#4263) -* [rbmaradio] Fix extraction and extract all formats (#10242) -+ [sonyliv] Add extractor for sonyliv.com (#10258) -* [aparat] Fix extraction -* [cwtv] Extract HTTP formats -+ [rozhlas] Add extractor for prehravac.rozhlas.cz (#10253) -* [kuwo:singer] Fix extraction - - -version 2016.08.07 - -Core -+ Add support for TV Parental Guidelines ratings in parse_age_limit -+ Add decode_png (#9706) -+ Add support for partOfTVSeries in JSON-LD -* Lower master M3U8 manifest preference for better format sorting - -Extractors -+ [discoverygo] Add extractor (#10245) -* [flipagram] Make JSON-LD extraction non fatal -* [generic] Make JSON-LD extraction non fatal -+ [bbc] Add support for morph embeds (#10239) -* [tnaflixnetworkbase] Improve title extraction -* [tnaflix] Fix metadata extraction (#10249) -* [fox] Fix theplatform release URL query -* [openload] Fix extraction (#9706) -* [bbc] Skip duplicate manifest URLs -* [bbc] Improve format code -+ [bbc] Add support for DASH and F4M -* [bbc] Improve format sorting and listing -* [bbc] Improve playlist extraction -+ [pokemon] Add extractor (#10093) -+ [condenast] Add fallback scenario for video info extraction - - -version 2016.08.06 - -Core -* Add support for JSON-LD root list entries (#10203) -* Improve unified_timestamp -* Lower preference of RTSP formats in generic sorting -+ Add support for multiple properties in _og_search_property -* Improve password hiding from verbose output - -Extractors -+ [adultswim] Add support for trailers (#10235) -* [archiveorg] Improve extraction (#10219) -+ [jwplatform] Add support for playlists -+ [jwplatform] Add support for relative URLs -* [jwplatform] Improve audio detection -+ [tvplay] Capture and output native error message -+ [tvplay] Extract series metadata -+ [tvplay] Add support for subtitles (#10194) -* [tvp] Improve extraction (#7799) -* [cbslocal] Fix timestamp parsing (#10213) -+ [naver] Add support for subtitles (#8096) -* [naver] Improve extraction -* [condenast] Improve extraction -* [engadget] Relax URL regular expression -* [5min] Fix extraction -+ [nationalgeographic] Add support for Episode Guide -+ [kaltura] Add support for subtitles -* [kaltura] Optimize network 
requests -+ [vodplatform] Add extractor for vod-platform.net -- [gamekings] Remove extractor -* [limelight] Extract HTTP formats -* [ntvru] Fix extraction -+ [comedycentral] Re-add :tds and :thedailyshow shortnames - - -version 2016.08.01 - -Fixed/improved extractors -- [yandexmusic:track] Adapt to changes in track location JSON (#10193) -- [bloomberg] Support another form of player (#10187) -- [limelight] Skip DRM protected videos -- [safari] Relax regular expressions for URL matching (#10202) -- [cwtv] Add support for cwtvpr.com (#10196) - - -version 2016.07.30 - -Fixed/improved extractors -- [twitch:clips] Sort formats -- [tv2] Use m3u8_native -- [tv2:article] Fix video detection (#10188) -- rtve (#10076) -- [dailymotion:playlist] Optimize download archive processing (#10180) - - -version 2016.07.28 - -Fixed/improved extractors -- shared (#10170) -- soundcloud (#10179) -- twitch (#9767) - - -version 2016.07.26.2 - -Fixed/improved extractors -- smotri -- camdemy -- mtv -- comedycentral -- cmt -- cbc -- mgtv -- orf - - -version 2016.07.24 - -New extractors -- arkena (#8682) -- lcp (#8682) - -Fixed/improved extractors -- facebook (#10151) -- dailymail -- telegraaf -- dcn -- onet -- tvp - -Miscellaneous -- Support $Time$ in DASH manifests - - -version 2016.07.22 - -New extractors -- odatv (#9285) - -Fixed/improved extractors -- bbc -- youjizz (#10131) -- youtube (#10140) -- pornhub (#10138) -- eporner (#10139) - - -version 2016.07.17 - -New extractors -- nintendo (#9986) -- streamable (#9122) - -Fixed/improved extractors -- ard (#10095) -- mtv -- comedycentral (#10101) -- viki (#10098) -- spike (#10106) - -Miscellaneous -- Improved twitter player detection (#10090) - - -version 2016.07.16 - -New extractors -- ninenow (#5181) - -Fixed/improved extractors -- rtve (#10076) -- brightcove -- 3qsdn -- syfy (#9087, #3820, #2388) -- youtube (#10083) - -Miscellaneous -- Fix subtitle embedding for video-only and audio-only files (#10081) - - -version 2016.07.13 - -New extractors -- rudo - -Fixed/improved extractors -- biobiochiletv -- tvplay -- dbtv -- brightcove -- tmz -- youtube (#10059) -- shahid (#10062) -- vk -- ellentv (#10067) - - -version 2016.07.11 - -New Extractors -- roosterteeth (#9864) - -Fixed/improved extractors -- miomio (#9605) -- vuclip -- youtube -- vidzi (#10058) - - -version 2016.07.09.2 - -Fixed/improved extractors -- vimeo (#1638) -- facebook (#10048) -- lynda (#10047) -- animeondemand - -Fixed/improved features -- Embedding subtitles no longer throws an error with problematic inputs (#9063) - - -version 2016.07.09.1 - -Fixed/improved extractors -- youtube -- ard -- srmediathek (#9373) - - -version 2016.07.09 - -New extractors -- Flipagram (#9898) - -Fixed/improved extractors -- telecinco -- toutv -- radiocanada -- tweakers (#9516) -- lynda -- nick (#7542) -- polskieradio (#10028) -- le -- facebook (#9851) -- mgtv -- animeondemand (#10031) - -Fixed/improved features -- `--postprocessor-args` and `--downloader-args` now accepts non-ASCII inputs - on non-Windows systems - - -version 2016.07.07 - -New extractors -- kamcord (#10001) - -Fixed/improved extractors -- spiegel (#10018) -- metacafe (#8539, #3253) -- onet (#9950) -- francetv (#9955) -- brightcove (#9965) -- daum (#9972) - - -version 2016.07.06 - -Fixed/improved extractors -- youtube (#10007, #10009) -- xuite -- stitcher -- spiegel -- slideshare -- sandia -- rtvnh -- prosiebensat1 -- onionstudios - - -version 2016.07.05 - -Fixed/improved extractors -- brightcove -- yahoo (#9995) -- pornhub (#9997) -- iqiyi -- kaltura 
(#5557)
-- la7
-
-Changed features
-- Rename --cn-verification-proxy to --geo-verification-proxy
-Miscellaneous
-- Add script for displaying downloads statistics
-
-
-version 2016.07.03.1
-
-Fixed/improved extractors
-- theplatform
-- aenetworks
-- nationalgeographic
-- hrti (#9482)
-- facebook (#5701)
-- buzzfeed (#5701)
-- rai (#8617, #9157, #9232, #8552, #8551)
-- nationalgeographic (#9991)
-- iqiyi
-
-
-version 2016.07.03
-
-New extractors
-- hrti (#9482)
-
-Fixed/improved extractors
-- vk (#9981)
-- facebook (#9938)
-- xtube (#9953, #9961)
-
-
-version 2016.07.02
-
-New extractors
-- fusion (#9958)
-
-Fixed/improved extractors
-- twitch (#9975)
-- vine (#9970)
-- periscope (#9967)
-- pornhub (#8696)
-
-
-version 2016.07.01
-
-New extractors
-- 9c9media
-- ctvnews (#2156)
-- ctv (#4077)
-
-Fixed/Improved extractors
-- rds
-- meta (#8789)
-- pornhub (#9964)
-- sixplay (#2183)
-
-New features
-- Accept quoted strings across multiple lines (#9940)
diff --git a/Changelog.md b/Changelog.md
new file mode 100644
index 000000000..2e6da33fb
--- /dev/null
+++ b/Changelog.md
@@ -0,0 +1,1237 @@
+# Changelog
+
+<!--
+# Instructions for creating a release
+
+* Run `make doc`
+* Update Changelog.md and CONTRIBUTORS
+* Change "Merged with ytdl" version in Readme.md if needed
+* Add new/fixed extractors in "new features" section of Readme.md
+* Commit as `Release <version>`
+* Push to origin/release using `git push origin master:release`
+    build task will now run
+
+-->
+
+
+### 2021.10.10
+
+* [downloader/ffmpeg] Fix bug in initializing `FFmpegPostProcessor`
+* [minicurses] Fix when printing to file
+* [downloader] Fix throttledratelimit
+* [francetv] Fix extractor by [fstirlitz](https://github.com/fstirlitz), [sarnoud](https://github.com/sarnoud)
+* [NovaPlay] Add extractor by [Bojidarist](https://github.com/Bojidarist)
+* [ffmpeg] Revert "Set max probesize" - No longer needed
+* [docs] Remove incorrect dependency on VC++10
+* [build] Allow releasing without a changelog
+
+### 2021.10.09
+
+* Improved progress reporting
+    * Separate `--console-title` and `--no-progress`
+    * Add option `--progress` to show progress-bar even in quiet mode
+    * Fix and refactor `minicurses` and use it for all progress reporting
+    * Standardize use of terminal sequences and enable color support for Windows 10
+    * Add option `--progress-template` to customize progress-bar and console-title
+    * Add postprocessor hooks and progress reporting
+* [postprocessor] Add plugin support with option `--use-postprocessor`
+* [extractor] Extract storyboards from SMIL manifests by [fstirlitz](https://github.com/fstirlitz)
+* [outtmpl] Alternate form of format type `l` for `\n` delimited list
+* [outtmpl] Format type `U` for Unicode normalization
+* [outtmpl] Allow empty output template to skip a type of file
+* Merge webm formats into mkv if thumbnails are to be embedded
+* [adobepass] Add RCN as MSO by [jfogelman](https://github.com/jfogelman)
+* [ciscowebex] Add extractor by [damianoamatruda](https://github.com/damianoamatruda)
+* [Gettr] Add extractor by [i6t](https://github.com/i6t)
+* [GoPro] Add extractor by [i6t](https://github.com/i6t)
+* [N1] Add extractor by [u-spec-png](https://github.com/u-spec-png)
+* [Theta] Add video extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [Veo] Add extractor by [i6t](https://github.com/i6t)
+* [Vupload] Add extractor by [u-spec-png](https://github.com/u-spec-png)
+* [bbc] Extract better quality videos by [ajj8](https://github.com/ajj8)
+* [Bilibili] Add subtitle converter by [u-spec-png](https://github.com/u-spec-png)
+* [CBC] Clean up tests by [makeworld-the-better-one](https://github.com/makeworld-the-better-one)
+* [Douyin] Rewrite extractor by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [Funimation] Fix for /v/ URLs by [pukkandan](https://github.com/pukkandan), [Jules-A](https://github.com/Jules-A)
+* [Funimation] Sort formats according to the relevant extractor-args
+* [Hidive] Fix duplicate and incorrect formats
+* [HotStarSeries] Fix cookies by [Ashish0804](https://github.com/Ashish0804)
+* [LinkedInLearning] Add subtitles by [Ashish0804](https://github.com/Ashish0804)
+* [Mediaite] Relax valid URL by [coletdjnz](https://github.com/coletdjnz)
+* [Newgrounds] Add age_limit and fix duration by [u-spec-png](https://github.com/u-spec-png)
+* [Newgrounds] Fix view count on songs by [u-spec-png](https://github.com/u-spec-png)
+* [parliamentlive.tv] Fix extractor by [u-spec-png](https://github.com/u-spec-png)
+* [PolskieRadio] Fix extractors by [jakubadamw](https://github.com/jakubadamw), [u-spec-png](https://github.com/u-spec-png)
+* [reddit] Add embedded URL by [u-spec-png](https://github.com/u-spec-png)
+* [reddit] Fix 429 by generating a random `reddit_session` by [AjaxGb](https://github.com/AjaxGb)
+* [Rumble] Add RumbleChannelIE by [Ashish0804](https://github.com/Ashish0804)
+* [soundcloud:playlist] Detect last page correctly
+* [SovietsCloset] Add duration from m3u8 by [ChillingPepper](https://github.com/ChillingPepper)
+* [Streamable] Add codecs by [u-spec-png](https://github.com/u-spec-png)
+* [vidme] Remove extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [youtube:tab] Fall back to API when webpage fails to download by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Fix non-fatal errors in fetching player
+* Fix `--flat-playlist` when neither IE nor id is known
+* Fix `-f mp4` behaving differently from youtube-dl
+* Workaround for bug in `ssl.SSLContext.load_default_certs`
+* [aes] Improve performance slightly by [sulyi](https://github.com/sulyi)
+* [cookies] Fix keyring fallback by [mbway](https://github.com/mbway)
+* [embedsubtitle] Fix error when duration is unknown
+* [ffmpeg] Fix error when subtitle file is missing
+* [ffmpeg] Set max probesize to work around AAC HLS stream issues by [shirt](https://github.com/shirt-dev)
+* [FixupM3u8] Remove redundant run if merge is needed
+* [hls] Fix decryption issues by [shirt](https://github.com/shirt-dev), [pukkandan](https://github.com/pukkandan)
+* [http] Respect user-provided chunk size over extractor's
+* [utils] Let traverse_obj accept functions as keys
+* [docs] Add note about our custom ffmpeg builds
+* [docs] Write embedding and contributing documentation by [pukkandan](https://github.com/pukkandan), [timethrow](https://github.com/timethrow)
+* [update] Check for new version even if not updateable
+* [build] Add more files to the tarball
+* [build] Allow building with py2exe (and misc fixes)
+* [build] Use pycryptodomex by [shirt](https://github.com/shirt-dev), [pukkandan](https://github.com/pukkandan)
+* [cleanup] Some minor refactoring, improve docs and misc cleanup
+
+
+### 2021.09.25
+
+* Add new option `--netrc-location`
+* [outtmpl] Allow alternate fields using `,`
+* [outtmpl] Add format type `B` to treat the value as bytes (e.g. to limit the filename to a certain number of bytes)
+* Separate the options `--ignore-errors` and `--no-abort-on-error`
+* Basic framework for simultaneous download of multiple formats by [nao20010128nao](https://github.com/nao20010128nao)
+* [17live] Add 17.live extractor by [nao20010128nao](https://github.com/nao20010128nao)
+* [bilibili] Add BiliIntlIE and BiliIntlSeriesIE by [Ashish0804](https://github.com/Ashish0804)
+* [CAM4] Add extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [Chingari] Add extractors by [Ashish0804](https://github.com/Ashish0804)
+* [CGTN] Add extractor by [chao813](https://github.com/chao813)
+* [damtomo] Add extractor by [nao20010128nao](https://github.com/nao20010128nao)
+* [gotostage] Add extractor by [poschi3](https://github.com/poschi3)
+* [Koo] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Mediaite] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Mediaklikk] Add extractor by [tmarki](https://github.com/tmarki), [mrx23dot](https://github.com/mrx23dot), [coletdjnz](https://github.com/coletdjnz)
+* [MuseScore] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Newgrounds] Add NewgroundsUserIE and improve extractor by [u-spec-png](https://github.com/u-spec-png)
+* [nzherald] Add NZHeraldIE by [coletdjnz](https://github.com/coletdjnz)
+* [Olympics] Add replay extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Peertube] Add channel and playlist extractors by [u-spec-png](https://github.com/u-spec-png)
+* [radlive] Add extractor by [nyuszika7h](https://github.com/nyuszika7h)
+* [SovietsCloset] Add extractor by [ChillingPepper](https://github.com/ChillingPepper)
+* [Streamanity] Add extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [Theta] Add extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [Yandex] Add ZenYandexIE and ZenYandexChannelIE by [Ashish0804](https://github.com/Ashish0804)
+* [9Now] Handle episodes of series by [dalanmiller](https://github.com/dalanmiller)
+* [AnimalPlanet] Fix extractor by [Sipherdrakon](https://github.com/Sipherdrakon)
+* [Arte] Improve description extraction by [renalid](https://github.com/renalid)
+* [atv.at] Use JWT for API by [NeroBurner](https://github.com/NeroBurner)
+* [brightcove] Extract subtitles from manifests
+* [CBC] Fix CBC Gem extractors by [makeworld-the-better-one](https://github.com/makeworld-the-better-one)
+* [cbs] Report appropriate error for DRM
+* [comedycentral] Support `collection-playlist` by [nixxo](https://github.com/nixxo)
+* [DIYNetwork] Support new format by [Sipherdrakon](https://github.com/Sipherdrakon)
+* [downloader/niconico] Pass custom headers by [nao20010128nao](https://github.com/nao20010128nao)
+* [dw] Fix extractor
+* [Fancode] Fix live streams by [zenerdi0de](https://github.com/zenerdi0de)
+* [funimation] Fix for locations outside the US by [Jules-A](https://github.com/Jules-A), [pukkandan](https://github.com/pukkandan)
+* [globo] Fix GloboIE by [Ashish0804](https://github.com/Ashish0804)
+* [HiDive] Fix extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Hotstar] Add referer for subs by [Ashish0804](https://github.com/Ashish0804)
+* [itv] Fix extractor, add subtitles and thumbnails by [coletdjnz](https://github.com/coletdjnz), [sleaux-meaux](https://github.com/sleaux-meaux), [Vangelis66](https://github.com/Vangelis66)
+* [lbry] Show error message from API response
+* [Mxplayer] Use mobile API by [Ashish0804](https://github.com/Ashish0804)
+* [NDR] Rewrite NDRIE by [Ashish0804](https://github.com/Ashish0804)
+* [Nuvid] Fix extractor by [u-spec-png](https://github.com/u-spec-png)
+* [Oreilly] Handle new web URL by [MKSherbini](https://github.com/MKSherbini)
+* [pbs] Fix subtitle extraction by [coletdjnz](https://github.com/coletdjnz), [gesa](https://github.com/gesa), [raphaeldore](https://github.com/raphaeldore)
+* [peertube] Update instances by [u-spec-png](https://github.com/u-spec-png)
+* [plutotv] Fix extractor for URLs with `/en`
+* [reddit] Workaround for 429 by redirecting to old.reddit.com
+* [redtube] Fix exts
+* [soundcloud] Make playlist extraction lazy
+* [soundcloud] Retry playlist pages on `502` error and update `_CLIENT_ID`
+* [southpark] Fix SouthParkDE by [coletdjnz](https://github.com/coletdjnz)
+* [SovietsCloset] Fix playlists for games with only named categories by [ConquerorDopy](https://github.com/ConquerorDopy)
+* [SpankBang] Fix uploader by [f4pp3rk1ng](https://github.com/f4pp3rk1ng), [coletdjnz](https://github.com/coletdjnz)
+* [tiktok] Use API to fetch higher quality video by [MinePlayersPE](https://github.com/MinePlayersPE), [llacb47](https://github.com/llacb47)
+* [TikTokUser] Fix extractor using mobile API by [MinePlayersPE](https://github.com/MinePlayersPE), [llacb47](https://github.com/llacb47)
+* [videa] Fix some extraction errors by [nyuszika7h](https://github.com/nyuszika7h)
+* [VrtNU] Handle login errors by [llacb47](https://github.com/llacb47)
+* [vrv] Don't raise error when thumbnails are missing
+* [youtube] Clean up authentication code by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Fix `--mark-watched` with `--cookies-from-browser`
+* [youtube] Improvements to JS player extraction and add extractor-args to skip it by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Retry on 'Unknown Error' by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Return full URL instead of just ID
+* [youtube] Warn when trying to download clips
+* [zdf] Improve format sorting
+* [zype] Extract subtitles from the m3u8 manifest by [fstirlitz](https://github.com/fstirlitz)
+* Allow `--force-write-archive` to work with `--flat-playlist`
+* Download subtitles in order of `--sub-langs`
+* Allow `0` in `--playlist-items`
+* Handle more playlist errors with `-i`
+* Fix `--no-get-comments`
+* Fix `extra_info` being reused across runs
+* Fix compat options `no-direct-merge` and `playlist-index`
+* Dump files should obey `--trim-filename` by [sulyi](https://github.com/sulyi)
+* [aes] Add `aes_gcm_decrypt_and_verify` by [sulyi](https://github.com/sulyi), [pukkandan](https://github.com/pukkandan)
+* [aria2c] Fix IV for some AES-128 streams by [shirt](https://github.com/shirt-dev)
+* [compat] Don't ignore `HOME` (if set) on Windows
+* [cookies] Make browser names case-insensitive
+* [cookies] Print warning for cookie decoding error only once
+* [extractor] Fix root-relative URLs in MPD by [DigitalDJ](https://github.com/DigitalDJ)
+* [ffmpeg] Add `aac_adtstoasc` when merging if needed
+* [fragment,aria2c] Generalize and refactor some code
+* [fragment] Avoid repeated request for AES key
+* [fragment] Fix range header when using `-N` and media sequence by [shirt](https://github.com/shirt-dev)
+* [hls,aes] Fall back to native implementation for AES-CBC and detect `Cryptodome` in addition to `Crypto`
+* [hls] Byterange + AES128 is supported by native downloader
+* [ModifyChapters] Improve sponsor chapter merge algorithm by [nihil-admirari](https://github.com/nihil-admirari)
+* [ModifyChapters] Minor fixes
+* [WebVTT] Adjust parser to accommodate PBS subtitles
+* [utils] Improve `extract_timezone` by [dirkf](https://github.com/dirkf)
+* [options] Fix `--no-config` and refactor reading of config files
reading of config files
+* [options] Strip spaces and ignore empty entries in list-like switches
+* [test/cookies] Improve logging
+* [build] Automate more of the release process by [animelover1984](https://github.com/animelover1984), [pukkandan](https://github.com/pukkandan)
+* [build] Fix sha256 by [nihil-admirari](https://github.com/nihil-admirari)
+* [build] Bring back brew taps by [nao20010128nao](https://github.com/nao20010128nao)
+* [build] Provide `--onedir` zip for windows by [pukkandan](https://github.com/pukkandan)
+* [cleanup,docs] Add deprecation warning in docs for some counter-intuitive behaviour
+* [cleanup] Fix line endings for `nebula.py` by [glenn-slayden](https://github.com/glenn-slayden)
+* [cleanup] Improve `make clean-test` by [sulyi](https://github.com/sulyi)
+* [cleanup] Misc
+
+
+### 2021.09.02
+
+* **Native SponsorBlock** implementation by [nihil-admirari](https://github.com/nihil-admirari), [pukkandan](https://github.com/pukkandan)
+    * `--sponsorblock-remove CATS` removes specified chapters from file
+    * `--sponsorblock-mark CATS` marks the specified sponsor sections as chapters
+    * `--sponsorblock-chapter-title TMPL` to specify sponsor chapter template
+    * `--sponsorblock-api URL` to use a different API
+    * No re-encoding is done unless `--force-keyframes-at-cuts` is used
+    * The fetched sponsor sections are written to the infojson
+    * Deprecates: `--sponskrub`, `--no-sponskrub`, `--sponskrub-cut`, `--no-sponskrub-cut`, `--sponskrub-force`, `--no-sponskrub-force`, `--sponskrub-location`, `--sponskrub-args`
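+    For instance, a minimal sketch of the new workflow (the URL is a placeholder):
+    ```sh
+    # Mark all SponsorBlock categories as chapters, but cut sponsor segments out entirely
+    yt-dlp --sponsorblock-mark all --sponsorblock-remove sponsor "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```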
+* Split `--embed-chapters` from `--embed-metadata` (it still implies the former by default)
+* Add option `--remove-chapters` to remove arbitrary chapters by [nihil-admirari](https://github.com/nihil-admirari), [pukkandan](https://github.com/pukkandan)
+* Add option `--force-keyframes-at-cuts` for more accurate cuts when removing and splitting chapters by [nihil-admirari](https://github.com/nihil-admirari)
+* Let `--match-filter` reject entries early
+    * Makes redundant: `--match-title`, `--reject-title`, `--min-views`, `--max-views`
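+    As a sketch, the now-redundant `--min-views 1000` translates to a filter expression roughly like this (placeholder URL):
+    ```sh
+    # Skip entries with fewer than 1000 views before downloading anything
+    yt-dlp --match-filter "view_count >= 1000" "https://www.youtube.com/playlist?list=XXXXXXXX"
+    ```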
+* [lazy_extractor] Improvements (It now passes all tests)
+    * Bugfix for when plugin directory doesn't exist by [kidonng](https://github.com/kidonng)
+    * Create instance only after pre-checking archive
+    * Import actual class if an attribute is accessed
+    * Fix `suitable` and add flake8 test
+* [downloader/ffmpeg] Experimental support for DASH manifests (including live)
+    * Your ffmpeg must have [this patch](https://github.com/FFmpeg/FFmpeg/commit/3249c757aed678780e22e99a1a49f4672851bca9) applied for YouTube DASH to work
+* [downloader/ffmpeg] Allow passing custom arguments before `-i`
+* [BannedVideo] Add extractor by [smege1001](https://github.com/smege1001), [blackjack4494](https://github.com/blackjack4494), [pukkandan](https://github.com/pukkandan)
+* [bilibili] Add category extractor by [animelover1984](https://github.com/animelover1984)
+* [Epicon] Add extractors by [Ashish0804](https://github.com/Ashish0804)
+* [filmmodu] Add extractor by [mzbaulhaque](https://github.com/mzbaulhaque)
+* [GabTV] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Hungama] Fix `HungamaSongIE` and add `HungamaAlbumPlaylistIE` by [Ashish0804](https://github.com/Ashish0804)
+* [ManotoTV] Add new extractors by [tandy1000](https://github.com/tandy1000)
+* [Niconico] Add Search extractors by [animelover1984](https://github.com/animelover1984), [pukkandan](https://github.com/pukkandan)
+* [Patreon] Add `PatreonUserIE` by [zenerdi0de](https://github.com/zenerdi0de)
+* [peloton] Add extractor by [IONECarter](https://github.com/IONECarter), [capntrips](https://github.com/capntrips), [pukkandan](https://github.com/pukkandan)
+* [ProjectVeritas] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [radiko] Add extractors by [nao20010128nao](https://github.com/nao20010128nao)
+* [StarTV] Add extractor for `startv.com.tr` by [mrfade](https://github.com/mrfade), [coletdjnz](https://github.com/coletdjnz)
+* [tiktok] Add `TikTokUserIE` by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan)
+* [Tokentube] Add extractor by [u-spec-png](https://github.com/u-spec-png)
+* [TV2Hu] Fix `TV2HuIE` and add `TV2HuSeriesIE` by [Ashish0804](https://github.com/Ashish0804)
+* [voicy] Add extractor by [nao20010128nao](https://github.com/nao20010128nao)
+* [adobepass] Fix Verizon SAML login by [nyuszika7h](https://github.com/nyuszika7h), [ParadoxGBB](https://github.com/ParadoxGBB)
+* [afreecatv] Fix adult VODs by [wlritchi](https://github.com/wlritchi)
+* [afreecatv] Tolerate failure to parse date string by [wlritchi](https://github.com/wlritchi)
+* [aljazeera] Fix extractor by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [ATV.at] Fix extractor for ATV.at by [NeroBurner](https://github.com/NeroBurner), [coletdjnz](https://github.com/coletdjnz)
+* [bitchute] Fix test by [mahanstreamer](https://github.com/mahanstreamer)
+* [camtube] Remove obsolete extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [CDA] Add more formats by [u-spec-png](https://github.com/u-spec-png)
+* [eroprofile] Fix page skipping in albums by [jhwgh1968](https://github.com/jhwgh1968)
+* [facebook] Fix format sorting
+* [facebook] Fix metadata extraction by [kikuyan](https://github.com/kikuyan)
+* [facebook] Update onion URL by [Derkades](https://github.com/Derkades)
+* [HearThisAtIE] Fix extractor by [Ashish0804](https://github.com/Ashish0804)
+* [instagram] Add referrer to prevent throttling by [u-spec-png](https://github.com/u-spec-png), [kikuyan](https://github.com/kikuyan)
+* [iwara.tv] Extract more metadata by [BunnyHelp](https://github.com/BunnyHelp)
+* [iwara] Add thumbnail by [i6t](https://github.com/i6t)
+* [kakao] Fix extractor
+* [mediaset] Fix extraction for some videos by [nyuszika7h](https://github.com/nyuszika7h)
+* [Motherless] Fix extractor by [coletdjnz](https://github.com/coletdjnz)
+* [Nova] Fix extractor by [std-move](https://github.com/std-move)
+* [ParamountPlus] Fix geo verification by [shirt](https://github.com/shirt-dev)
+* [peertube] Handle new video URL format by [Chocobozzz](https://github.com/Chocobozzz)
+* [pornhub] Separate and fix playlist extractor by [mzbaulhaque](https://github.com/mzbaulhaque)
+* [reddit] Fix for quarantined subreddits by [ouwou](https://github.com/ouwou)
+* [ShemarooMe] Fix extractor by [Ashish0804](https://github.com/Ashish0804)
+* [soundcloud] Refetch `client_id` on 403
+* [tiktok] Fix metadata extraction
+* [TV2] Fix extractor by [Ashish0804](https://github.com/Ashish0804)
+* [tv5mondeplus] Fix extractor by [korli](https://github.com/korli)
+* [VH1,TVLand] Fix extractors by [Sipherdrakon](https://github.com/Sipherdrakon)
+* [Viafree] Fix extractor and extract subtitles by [coletdjnz](https://github.com/coletdjnz)
+* [XHamster] Extract `uploader_id` by [octotherp](https://github.com/octotherp)
+* [youtube] Add `shorts` to `_VALID_URL`
+* [youtube] Add av01 itags to known formats list by [blackjack4494](https://github.com/blackjack4494)
+* [youtube] Extract error messages from HTTPError response by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Fix subtitle names
+* [youtube] Prefer audio stream that YouTube considers default
+* [youtube] Remove annotations and deprecate `--write-annotations` by [coletdjnz](https://github.com/coletdjnz)
+* [Zee5] Fix extractor and add subtitles by [Ashish0804](https://github.com/Ashish0804)
+* [aria2c] Obey `--rate-limit`
+* [EmbedSubtitle] Continue even if some files are missing
+* [extractor] Better error message for DRM
+* [extractor] Common function `_match_valid_url`
+* [extractor] Show video id in error messages if possible
+* [FormatSort] Remove priority of `lang`
+* [options] Add `_set_from_options_callback`
+* [SubtitleConvertor] Fix bug during subtitle conversion
+* [utils] Add `parse_qs`
+* [webvtt] Fix timestamp overflow adjustment by [fstirlitz](https://github.com/fstirlitz)
+* Bugfix for `--replace-in-metadata`
+* Don't try to merge with final extension
+* Fix `--force-overwrites` when using `-k`
+* Fix `--no-prefer-free-formats` by [CeruleanSky](https://github.com/CeruleanSky)
+* Fix `-F` for extractors that directly return URL
+* Fix `-J` when there are failed videos
+* Fix `extra_info` being reused across runs
+* Fix `playlist_index` not obeying `playlist_start` and add tests
+* Fix resuming of single formats when using `--no-part`
+* Revert erroneous use of the `Content-Length` header by [fstirlitz](https://github.com/fstirlitz)
+* Use `os.replace` where applicable by [paulwrubel](https://github.com/paulwrubel)
+* [build] Add homebrew taps `yt-dlp/taps/yt-dlp` by [nao20010128nao](https://github.com/nao20010128nao)
+* [build] Fix bug in making `yt-dlp.tar.gz`
+* [docs] Fix some typos by [pukkandan](https://github.com/pukkandan), [zootedb0t](https://github.com/zootedb0t)
+* [cleanup] Replace improper use of tab in trovo by [glenn-slayden](https://github.com/glenn-slayden)
+
+
+### 2021.08.10
+
+* Add option `--replace-in-metadata`
+* Add option `--no-simulate` to not simulate even when `--print` or `--list...` are used - Deprecates `--print-json`
+* Allow entire infodict to be printed using `%()s` - makes `--dump-json` redundant
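+    A sketch of the `--dump-json` equivalent this enables (the `j` variant comes from the `[outtmpl]` format types added in 2021.08.02 below; placeholder URL):
+    ```sh
+    # Print the entire infodict as JSON without downloading
+    yt-dlp --print "%()j" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```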
+* Allow multiple `--exec` and `--exec-before-download`
+* Add regex to `--match-filter`
+* Add all format filtering operators also to `--match-filter` by [max-te](https://github.com/max-te)
+* Add compat-option `no-keep-subs`
+* [adobepass] Add MSO Cablevision by [Jessecar96](https://github.com/Jessecar96)
+* [BandCamp] Add BandcampMusicIE by [Ashish0804](https://github.com/Ashish0804)
+* [blackboardcollaborate] Add new extractor by [mzbaulhaque](https://github.com/mzbaulhaque)
+* [eroprofile] Add album downloader by [jhwgh1968](https://github.com/jhwgh1968)
+* [mirrativ] Add extractors by [nao20010128nao](https://github.com/nao20010128nao)
+* [openrec] Add extractors by [nao20010128nao](https://github.com/nao20010128nao)
+* [nbcolympics:stream] Fix extractor by [nchilada](https://github.com/nchilada), [pukkandan](https://github.com/pukkandan)
+* [nbcolympics] Update extractor for 2020 olympics by [wesnm](https://github.com/wesnm)
+* [paramountplus] Separate extractor and fix some titles by [shirt](https://github.com/shirt-dev), [pukkandan](https://github.com/pukkandan)
+* [RCTIPlus] Support events and TV by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [Newgrounds] Improve extractor and fix playlist by [u-spec-png](https://github.com/u-spec-png)
+* [aenetworks] Update `_THEPLATFORM_KEY` and `_THEPLATFORM_SECRET` by [wesnm](https://github.com/wesnm)
+* [crunchyroll] Fix thumbnail by [funniray](https://github.com/funniray)
+* [HotStar] Use API for metadata and extract subtitles by [Ashish0804](https://github.com/Ashish0804)
+* [instagram] Fix comments extraction by [u-spec-png](https://github.com/u-spec-png)
+* [peertube] Fix videos without description by [u-spec-png](https://github.com/u-spec-png)
+* [twitch:clips] Extract `display_id` by [dirkf](https://github.com/dirkf)
+* [viki] Print error message from API request
+* [Vine] Remove invalid formats by [u-spec-png](https://github.com/u-spec-png)
+* [VrtNU] Fix XSRF token by [pgaig](https://github.com/pgaig)
+* [vrv] Fix thumbnail extraction by [funniray](https://github.com/funniray)
+* [youtube] Add extractor-arg `include-live-dash` to show live dash formats
+* [youtube] Improve signature function detection by [PSlava](https://github.com/PSlava)
+* [youtube] Raise appropriate error when API pages can't be downloaded
+* Ensure `_write_ytdl_file` closes file handle on error
+* Fix `--compat-options filename` by [stdedos](https://github.com/stdedos)
+* Fix issues with infodict sanitization
+* Fix resuming when using `--no-part`
+* Fix wrong extension for intermediate files
+* Handle `BrokenPipeError` by [kikuyan](https://github.com/kikuyan)
+* Show libraries present in the verbose header
+* [extractor] Detect `sttp` as subtitles in MPD by [fstirlitz](https://github.com/fstirlitz)
+* [extractor] Reset non-repeating warnings per video
+* [ffmpeg] Fix streaming `mp4` to `stdout`
+* [ffmpeg] Allow `--ffmpeg-location` to be a file with different name
+* [utils] Fix `InAdvancePagedList.__getitem__`
+* [utils] Fix `traverse_obj` depth when `is_user_input`
+* [webvtt] Merge daisy-chained duplicate cues by [fstirlitz](https://github.com/fstirlitz)
+* [build] Use custom build of `pyinstaller` by [shirt](https://github.com/shirt-dev)
+* [tests:download] Add batch testing for extractors (`test_YourExtractor_all`)
+* [docs] Document which fields `--add-metadata` adds to the file
+* [docs] Fix some mistakes and improve the docs
+* [cleanup] Misc code cleanup
+
+
+### 2021.08.02
+
+* Add logo, banner and donate links
+* [outtmpl] Expand and escape environment variables
+* [outtmpl] Add format types `j` (json), `l` (comma delimited list), `q` (quoted for terminal)
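+    A sketch of the new format types, assuming `--print` may be repeated (placeholder URL; `tags` may be empty for some videos):
+    ```sh
+    # Print the tag list comma-delimited, and the title quoted for safe shell reuse
+    yt-dlp --print "%(tags)l" --print "%(title)q" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```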
+* [downloader] Allow streaming some unmerged formats to stdout using ffmpeg
+* [youtube] **Age-gate bypass**
+    * Add `agegate` clients by [pukkandan](https://github.com/pukkandan), [MinePlayersPE](https://github.com/MinePlayersPE)
+    * Add `thirdParty` to agegate clients to bypass more videos
+    * Simplify client definitions, expose `embedded` clients
+    * Improve age-gate detection by [coletdjnz](https://github.com/coletdjnz)
+    * Fix default global API key by [coletdjnz](https://github.com/coletdjnz)
+    * Add `creator` clients for age-gate bypass using unverified accounts by [zerodytrash](https://github.com/zerodytrash), [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [adobepass] Add MSO Sling TV by [wesnm](https://github.com/wesnm)
+* [CBS] Add ParamountPlusSeriesIE by [Ashish0804](https://github.com/Ashish0804)
+* [dplay] Add `ScienceChannelIE` by [Sipherdrakon](https://github.com/Sipherdrakon)
+* [UtreonIE] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [youtube] Add `mweb` client by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Add `player_client=all`
+* [youtube] Force `hl=en` for comments by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Fix format sorting when using alternate clients
+* [youtube] Misc cleanup by [pukkandan](https://github.com/pukkandan), [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Extract SAPISID only once
+* [CBS] Add fallback by [llacb47](https://github.com/llacb47), [pukkandan](https://github.com/pukkandan)
+* [Hotstar] Support cookies by [Ashish0804](https://github.com/Ashish0804)
+* [HotStarSeriesIE] Fix regex by [Ashish0804](https://github.com/Ashish0804)
+* [bilibili] Improve `_VALID_URL`
+* [mediaset] Fix extraction by [nixxo](https://github.com/nixxo)
+* [Mxplayer] Add h265 formats by [Ashish0804](https://github.com/Ashish0804)
+* [RCTIPlus] Remove PhantomJS dependency by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [tenplay] Add MA15+ age limit by [pento](https://github.com/pento)
+* [vidio] Fix login error detection by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [vimeo] Better extraction of original file by [Ashish0804](https://github.com/Ashish0804)
+* [generic] Support KVS player (replaces ThisVidIE) by [rigstot](https://github.com/rigstot)
+* Add compat-option `no-clean-infojson`
+* Remove `asr` appearing twice in `-F`
+* Set `home:` as the default key for `-P`
+* [utils] Fix slicing of reversed `LazyList`
+* [FormatSort] Fix bug for audio with unknown codec
+* [test:download] Support testing with `ignore_no_formats_error`
+* [cleanup] Refactor some code
+
+
+### 2021.07.24
+
+* [youtube:tab] Extract video duration early
+* [downloader] Pass `info_dict` to `progress_hook`s
+* [youtube] Fix age-gated videos for API clients when cookies are supplied by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Disable `get_video_info` age-gate workaround - This endpoint seems to be completely dead
+* [youtube] Try all clients even if age-gated
+* [youtube] Fix subtitles only being extracted from the first client
+* [youtube] Simplify `_get_text`
+* [cookies] Bugfix for Microsoft Edge on macOS
+* [cookies] Handle `sqlite` `ImportError` gracefully by [mbway](https://github.com/mbway)
+* [cookies] Handle errors when importing `keyring`
+
+### 2021.07.21
+
+* **Add option `--cookies-from-browser`** to load cookies from a browser by [mbway](https://github.com/mbway)
+    * Usage: `--cookies-from-browser BROWSER[:PROFILE_NAME_OR_PATH]`
+    * Also added `--no-cookies-from-browser`
+    * To decrypt chromium cookies, `keyring` is needed for UNIX and `pycryptodome` for Windows
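+    For example, a minimal sketch (the site is a placeholder; browser names are matched case-insensitively as of 2021.09.02):
+    ```sh
+    # Reuse the logged-in session from Firefox's default profile
+    yt-dlp --cookies-from-browser firefox "https://example.com/members-only-video"
+    ```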
+* Add option `--exec-before-download`
+* Add field `live_status`
+* [FFmpegMetadata] Add language of each stream and some refactoring
+* [douyin] Add extractor by [pukkandan](https://github.com/pukkandan), [pyx](https://github.com/pyx)
+* [pornflip] Add extractor by [mzbaulhaque](https://github.com/mzbaulhaque)
+* **[youtube] Extract data from multiple clients** by [pukkandan](https://github.com/pukkandan), [coletdjnz](https://github.com/coletdjnz)
+    * `player_client` now accepts multiple clients
+    * Default `player_client` = `android,web`
+    * This uses twice as many requests, but avoids throttling for most videos while also not losing any formats
+    * Music clients can be specifically requested and are enabled by default for `music.youtube.com` URLs
+    * Added `player_client=ios` (Known issue: formats from ios are not sorted correctly)
+    * Add age-gate bypass for android and ios clients
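+    A sketch of requesting specific clients explicitly (this simply spells out the new default; placeholder URL):
+    ```sh
+    # Extract formats from both the android and web player clients
+    yt-dlp --extractor-args "youtube:player_client=android,web" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```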
+* [youtube] Extract more thumbnails
+    * The thumbnail URLs are hard-coded and their actual existence is tested lazily
+    * Added option `--no-check-formats` to not test them
+* [youtube] Misc fixes
+    * Improve extraction of livestream metadata by [pukkandan](https://github.com/pukkandan), [krichbanana](https://github.com/krichbanana)
+    * Hide live dash formats since they can't be downloaded anyway
+    * Fix authentication when using multiple accounts by [coletdjnz](https://github.com/coletdjnz)
+    * Fix controversial videos when requested via API by [coletdjnz](https://github.com/coletdjnz)
+    * Fix session index extraction and headers for non-web player clients by [coletdjnz](https://github.com/coletdjnz)
+    * Make `--extractor-retries` work for more errors
+    * Fix sorting of 3gp format
+    * Sanity check `chapters` (and refactor related code)
+    * Make `parse_time_text` and `_extract_chapters` non-fatal
+    * Misc cleanup and bug fixes by [coletdjnz](https://github.com/coletdjnz)
+* [youtube:tab] Fix channels tab
+* [youtube:tab] Extract playlist availability by [coletdjnz](https://github.com/coletdjnz)
+* **[youtube:comments] Move comment extraction to new API** by [coletdjnz](https://github.com/coletdjnz)
+    * Adds extractor-args `comment_sort` (`top`/`new`), `max_comments`, `max_comment_depth`
+* [youtube:comments] Fix `is_favorited`, improve `like_count` parsing by [coletdjnz](https://github.com/coletdjnz)
+* [BravoTV] Improve metadata extraction by [kevinoconnor7](https://github.com/kevinoconnor7)
+* [crunchyroll:playlist] Force http
+* [yahoo:gyao:player] Relax `_VALID_URL` by [nao20010128nao](https://github.com/nao20010128nao)
+* [nebula] Authentication via tokens from cookie jar by [hheimbuerger](https://github.com/hheimbuerger), [TpmKranz](https://github.com/TpmKranz)
+* [RTP] Fix extraction and add subtitles by [fstirlitz](https://github.com/fstirlitz)
+* [viki] Rewrite extractors and add extractor-arg `video_types` to `vikichannel` by [zackmark29](https://github.com/zackmark29), [pukkandan](https://github.com/pukkandan)
+* [vlive] Extract thumbnail directly in addition to the one from Naver
+* [generic] Extract previously missed subtitles by [fstirlitz](https://github.com/fstirlitz)
+* [generic] Extract everything in the SMIL manifest and detect discarded subtitles by [fstirlitz](https://github.com/fstirlitz)
+* [embedthumbnail] Fix `_get_thumbnail_resolution`
+* [metadatafromfield] Do not detect numbers as field names
+* Fix selectors `all`, `mergeall` and add tests
+* Errors in playlist extraction should obey `--ignore-errors`
+* Fix bug where `original_url` was not propagated when `_type`=`url`
+* Revert "Merge webm formats into mkv if thumbnails are to be embedded (#173)"
+    * This was wrongly checking for `write_thumbnail`
+* Improve `extractor_args` parsing
+* Rename `NOTE` in `-F` to `MORE INFO` since it's often confused with `format_note`
+* Add `only_once` param for `write_debug` and `report_warning`
+* [extractor] Allow extracting multiple groups in `_search_regex` by [fstirlitz](https://github.com/fstirlitz)
+* [utils] Improve `traverse_obj`
+* [utils] Add `variadic`
+* [utils] Improve `js_to_json` comment regex by [fstirlitz](https://github.com/fstirlitz)
+* [webvtt] Fix timestamps
+* [compat] Remove unnecessary code
+* [docs] Fix default of multistreams
+
+
+### 2021.07.07
+
+* Merge youtube-dl: Upto [commit/a803582](https://github.com/ytdl-org/youtube-dl/commit/a8035827177d6b59aca03bd717acb6a9bdd75ada)
+* Add `--extractor-args` to pass some extractor-specific arguments. See [readme](https://github.com/yt-dlp/yt-dlp#extractor-arguments)
+    * Add extractor option `skip` for `youtube`. Eg: `--extractor-args youtube:skip=hls,dash`
+    * Deprecates `--youtube-skip-dash-manifest`, `--youtube-skip-hls-manifest`, `--youtube-include-dash-manifest`, `--youtube-include-hls-manifest`
+* Allow `--list...` options to work with `--print`, `--quiet` and other `--list...` options
+* [youtube] Use `player` API for additional video extraction requests by [coletdjnz](https://github.com/coletdjnz)
+    * **Fixes youtube premium music** (format 141) extraction
+    * Adds extractor option `player_client` = `web`/`android`
+    * **`--extractor-args youtube:player_client=android` works around the throttling** for the time being
+    * Adds extractor option `player_skip=config`
+    * Adds age-gate fallback using embedded client
+* [youtube] Choose correct Live chat API for upcoming streams by [krichbanana](https://github.com/krichbanana)
+* [youtube] Fix subtitle names for age-gated videos
+* [youtube:comments] Fix error handling and add `itct` to params by [coletdjnz](https://github.com/coletdjnz)
+* [youtube_live_chat] Fix download with cookies by [siikamiika](https://github.com/siikamiika)
+* [youtube_live_chat] Use `clickTrackingParams` by [siikamiika](https://github.com/siikamiika)
+* [Funimation] Rewrite extractor
+    * Add `FunimationShowIE` by [Mevious](https://github.com/Mevious)
+    * **Treat the different versions of an episode as different formats of a single video**
+    * This changes the video `id` and will break existing archives
+    * Compat option `seperate-video-versions` to fall back to old behavior including using the old video ids
+    * Support direct `/player/` URL
+    * Extractor options `language` and `version` to pre-select them during extraction
+    * These options may be removed in the future if we can extract all formats without additional network requests
+    * Do not rely on these for format selection and use `-f` filters instead
+* [AdobePass] Add Spectrum MSO by [kevinoconnor7](https://github.com/kevinoconnor7), [ohmybahgosh](https://github.com/ohmybahgosh)
+* [facebook] Extract description and fix title
+* [fancode] Fix extraction, support live and allow login with refresh token by [zenerdi0de](https://github.com/zenerdi0de)
+* [plutotv] Improve `_VALID_URL`
+* [RCTIPlus] Add extractor by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [Soundcloud] Allow login using oauth token by [blackjack4494](https://github.com/blackjack4494)
+* [TBS] Support livestreams by [llacb47](https://github.com/llacb47)
+* [videa] Fix extraction by [nyuszika7h](https://github.com/nyuszika7h)
+* [yahoo] Fix extraction by [llacb47](https://github.com/llacb47), [pukkandan](https://github.com/pukkandan)
+* Process videos when using `--ignore-no-formats-error` by [krichbanana](https://github.com/krichbanana)
+* Fix `--throttled-rate` when using `--load-info-json`
+* Fix `--flat-playlist` when entry has no `ie_key`
+* Fix `check_formats` catching `ExtractorError` instead of `DownloadError`
+* Fix deprecated option `--list-formats-old`
+* [downloader/ffmpeg] Fix `--ppa` when using simultaneous download
+* [extractor] Prevent unnecessary download of hls manifests and refactor `hls_split_discontinuity`
+* [fragment] Handle status of download and errors in threads correctly; and minor refactoring
+* [thumbnailsconvertor] Treat `jpeg` as `jpg`
+* [utils] Fix issues with `LazyList` reversal
+* [extractor] Allow extractors to set their own login hint
+* [cleanup] Simplify format selector code with `LazyList` and `yield from`
+* [cleanup] Clean `extractor.common._merge_subtitles` signature
+* [cleanup] Fix some typos
+
+
+### 2021.06.23
+
+* Merge youtube-dl: Upto [commit/379f52a](https://github.com/ytdl-org/youtube-dl/commit/379f52a4954013767219d25099cce9e0f9401961)
+* **Add option `--throttled-rate`** below which video data is re-extracted
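+    A sketch of its use (placeholder URL; `100K` means 100 KiB/s):
+    ```sh
+    # Assume throttling and re-extract when the download rate drops below 100K
+    yt-dlp --throttled-rate 100K "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```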
+* [fragment] **Merge during download for `-N`**, and refactor `hls`/`dash`
+* [websockets] Add `WebSocketFragmentFD` by [nao20010128nao](https://github.com/nao20010128nao), [pukkandan](https://github.com/pukkandan)
+* Allow `images` formats in addition to video/audio
+* [downloader/mhtml] Add new downloader for slideshows/storyboards by [fstirlitz](https://github.com/fstirlitz)
+* [youtube] Temporary **fix for age-gate**
+* [youtube] Support ongoing live chat by [siikamiika](https://github.com/siikamiika)
+* [youtube] Improve SAPISID cookie handling by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Login is not needed for `:ytrec`
+* [youtube] Non-fatal alert reporting for unavailable videos page by [coletdjnz](https://github.com/coletdjnz)
+* [twitcasting] Websocket support by [nao20010128nao](https://github.com/nao20010128nao)
+* [mediasite] Extract slides by [fstirlitz](https://github.com/fstirlitz)
+* [funimation] Extract subtitles
+* [pornhub] Extract `cast`
+* [hotstar] Use server time for authentication instead of local time
+* [EmbedThumbnail] Fix for already downloaded thumbnail
+* [EmbedThumbnail] Add compat-option `embed-thumbnail-atomicparsley`
+* Expand `--check-formats` to thumbnails
+* Fix id sanitization in filenames
+* Skip fixup of existing files and add `--fixup force` to force it
+* Better error handling of syntax errors in `-f`
+* Use `NamedTemporaryFile` for `--check-formats`
+* [aria2c] Lower `--min-split-size` for HTTP downloads
+* [options] Rename `--add-metadata` to `--embed-metadata`
+* [utils] Improve `LazyList` and add tests
+* [build] Build Windows x86 version with py3.7 and remove redundant tests by [pukkandan](https://github.com/pukkandan), [shirt](https://github.com/shirt-dev)
+* [docs] Clarify that `--embed-metadata` embeds chapter markers
+* [cleanup] Refactor fixup
+
+
+### 2021.06.09
+
+* Fix bug where `%(field)d` in filename template throws error
+* [outtmpl] Improve offset parsing
+* [test] More rigorous tests for `prepare_filename`
+
+### 2021.06.08
+
+* Remove support for obsolete Python versions: Only 3.6+ is now supported
+* Merge youtube-dl: Upto [commit/c2350ca](https://github.com/ytdl-org/youtube-dl/commit/c2350cac243ba1ec1586fe85b0d62d1b700047a2)
+* [hls] Fix decryption for multithreaded downloader
+* [extractor] Fix pre-checking archive for some extractors
+* [extractor] Fix FourCC fallback when parsing ISM by [fstirlitz](https://github.com/fstirlitz)
+* [twitcasting] Add TwitCastingUserIE, TwitCastingLiveIE by [pukkandan](https://github.com/pukkandan), [nao20010128nao](https://github.com/nao20010128nao)
+* [vidio] Add VidioPremierIE and VidioLiveIE by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [viki] Fix extraction from [ytdl-org/youtube-dl@59e583f](https://github.com/ytdl-org/youtube-dl/commit/59e583f7e8530ca92776c866897d895c072e2a82)
+* [youtube] Support shorts URL
+* [zoom] Extract transcripts as subtitles
+* Add field `original_url` with the user-inputted URL
+* Fix and refactor `prepare_outtmpl`
+* Make more fields available for `--print` when used with `--flat-playlist`
+* [utils] Generalize `traverse_dict` to `traverse_obj`
+* [downloader/ffmpeg] Hide FFmpeg banner unless in verbose mode by [fstirlitz](https://github.com/fstirlitz)
+* [build] Release `yt-dlp.tar.gz`
+* [build,update]
Add GNU-style SHA512 and prepare updater for similar SHA256 by [nihil-admirari](https://github.com/nihil-admirari)
+* [pyinst] Show Python version in exe metadata by [nihil-admirari](https://github.com/nihil-admirari)
+* [docs] Improve documentation of dependencies
+* [cleanup] Mark unused files
+* [cleanup] Point all shebang to `python3` by [fstirlitz](https://github.com/fstirlitz)
+* [cleanup] Remove duplicate file `trovolive.py`
+
+
+### 2021.06.01
+
+* Merge youtube-dl: Upto [commit/d495292](https://github.com/ytdl-org/youtube-dl/commit/d495292852b6c2f1bd58bc2141ff2b0265c952cf)
+* Pre-check archive and filters during playlist extraction
+* Handle Basic Auth `user:pass` in URLs by [hhirtz](https://github.com/hhirtz) and [pukkandan](https://github.com/pukkandan)
+* [archiveorg] Add YoutubeWebArchiveIE by [coletdjnz](https://github.com/coletdjnz) and [alex-gedeon](https://github.com/alex-gedeon)
+* [fancode] Add extractor by [rhsmachine](https://github.com/rhsmachine)
+* [patreon] Support vimeo embeds by [rhsmachine](https://github.com/rhsmachine)
+* [Saitosan] Add new extractor by [llacb47](https://github.com/llacb47)
+* [ShemarooMe] Add extractor by [Ashish0804](https://github.com/Ashish0804) and [pukkandan](https://github.com/pukkandan)
+* [telemundo] Add extractor by [king-millez](https://github.com/king-millez)
+* [SonyLIV] Add SonyLIVSeriesIE and subtitle support by [Ashish0804](https://github.com/Ashish0804)
+* [Hotstar] Add HotStarSeriesIE by [Ashish0804](https://github.com/Ashish0804)
+* [Voot] Add VootSeriesIE by [Ashish0804](https://github.com/Ashish0804)
+* [vidio] Support login and premium videos by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [fragment] When using `-N`, do not keep the fragment content in memory
+* [ffmpeg] Download and merge in a single step if possible
+* [ThumbnailsConvertor] Support conversion to `png` and make it the default by [louie-github](https://github.com/louie-github)
+* [VideoConvertor] Generalize with remuxer and allow conditional recoding
+* [EmbedThumbnail] Embed in `mp4`/`m4a` using mutagen by [tripulse](https://github.com/tripulse) and [pukkandan](https://github.com/pukkandan)
+* [EmbedThumbnail] Embed if any thumbnail was downloaded, not just the best
+* [EmbedThumbnail] Correctly escape filename
+* [update] Replace self without launching a subprocess in Windows
+* [update] Block further update for unsupported systems
+* Refactor `__process_playlist` by creating `LazyList`
+* Write messages to `stderr` when both `quiet` and `verbose`
+* Sanitize and sort playlist thumbnails
+* Remove `None` values from `info.json`
+* [extractor] Always prefer native hls downloader by default
+* [extractor] Skip subtitles without URI in m3u8 manifests by [hheimbuerger](https://github.com/hheimbuerger)
+* [extractor] Functions to parse `socket.io` response as `json` by [pukkandan](https://github.com/pukkandan) and [llacb47](https://github.com/llacb47)
+* [extractor] Allow `note=False` when extracting manifests
+* [utils] Escape URLs in `sanitized_Request`, not `sanitize_url`
+* [hls] Disable external downloader for `webvtt`
+* [youtube] `/live` URLs should raise error if channel is not live
+* [youtube] Bug fixes
+* [zee5] Fix m3u8 formats' extension
+* [ard] Allow URLs without `-` before id by [olifre](https://github.com/olifre)
+* [cleanup] `YoutubeDL._match_entry`
+* [cleanup] Refactor updater
+* [cleanup] Refactor ffmpeg convertors
+* [cleanup] setup.py
+
+
+### 2021.05.20
+
+* **Youtube improvements**:
+    * Support youtube music `MP`, `VL` and `browse` pages
+    * Extract more formats for youtube music by [craftingmod](https://github.com/craftingmod), [coletdjnz](https://github.com/coletdjnz) and [pukkandan](https://github.com/pukkandan)
+    * Extract multiple subtitles in same language by [pukkandan](https://github.com/pukkandan) and [tpikonen](https://github.com/tpikonen)
+    * Redirect channels that don't have a `videos` tab to their `UU` playlists
+    * Support in-channel search
+    * Sort audio-only formats correctly
+    * Always extract `maxresdefault` thumbnail
+    * Extract audio language
+    * Add subtitle language names by [nixxo](https://github.com/nixxo) and [tpikonen](https://github.com/tpikonen)
+    * Show alerts only from the final webpage
+    * Add `html5=1` param to `get_video_info` page requests by [coletdjnz](https://github.com/coletdjnz)
+    * Better message when login required
+* **Add option `--print`**: to print any field/template
+    * Makes redundant: `--get-description`, `--get-duration`, `--get-filename`, `--get-format`, `--get-id`, `--get-thumbnail`, `--get-title`, `--get-url`
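+    For example, a sketch of replacing two of the deprecated `--get-*` options (placeholder URL):
+    ```sh
+    # Print the id and title on separate lines, without downloading
+    yt-dlp --print "%(id)s" --print "%(title)s" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```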
+* Field `additional_urls` to download additional videos from metadata using [`--parse-metadata`](https://github.com/yt-dlp/yt-dlp#modifying-metadata)
+* Merge youtube-dl: Upto [commit/dfbbe29](https://github.com/ytdl-org/youtube-dl/commit/dfbbe2902fc67f0f93ee47a8077c148055c67a9b)
+* Write thumbnail of playlist and add `pl_thumbnail` outtmpl key
+* [embedthumbnail] Add `flac` support and refactor `mutagen` code by [pukkandan](https://github.com/pukkandan) and [tripulse](https://github.com/tripulse)
+* [audius:artist] Add extractor by [king-millez](https://github.com/king-millez)
+* [parlview] Add extractor by [king-millez](https://github.com/king-millez)
+* [tenplay] Fix extractor by [king-millez](https://github.com/king-millez)
+* [rmcdecouverte] Generalize `_VALID_URL`
+* Add compat-option `no-attach-infojson`
+* Add field `name` for subtitles
+* Ensure `post_extract` and `pre_process` only run once
+* Fix `--check-formats` when there is a network error
+* Standardize `write_debug` and `get_param`
+* [options] Alias `--write-comments`, `--no-write-comments`
+* [options] Refactor callbacks
+* [test:download] Only extract enough videos for `playlist_mincount`
+* [extractor] Bugfix for when `compat_opts` is not given
+* [build] Fix x86 build by [shirt](https://github.com/shirt-dev)
+* [cleanup] Code formatting, youtube tests and readme
+
+### 2021.05.11
+* **Deprecate support for python versions < 3.6**
+* **Subtitle extraction from manifests** by [fstirlitz](https://github.com/fstirlitz). See [be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details
+* **Improve output template:**
+    * Allow slicing lists/strings using `field.start:end:step`
+    * A field can also be used as offset like `field1+num+field2`
+    * A default value can be given using `field|default`
+    * Prevent invalid fields from causing errors
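+    A sketch of the default-value syntax (the directory layout and `UnknownUploader` fallback are illustrative only):
+    ```sh
+    # Fall back to "UnknownUploader" when the uploader field is missing
+    yt-dlp -o "%(uploader|UnknownUploader)s/%(title)s.%(ext)s" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```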
+* **Merge youtube-dl**: Upto [commit/a726009](https://github.com/ytdl-org/youtube-dl/commit/a7260099873acc6dc7d76cafad2f6b139087afd0)
+* **Remove options** `-l`, `-t`, `-A` completely and disable `--auto-number`, `--title`, `--literal`, `--id`
+* [Plugins] Prioritize plugins over standard extractors and prevent plugins from overwriting the standard extractor classes
+* [downloader] Fix `quiet` and `to_stderr`
+* [fragment] Ensure the file is closed on error
+* [fragment] Make sure first segment is not skipped
+* [aria2c] Fix whitespace being stripped off
+* [embedthumbnail] Fix bug where jpeg thumbnails were converted again
+* [FormatSort] Fix for when some formats have quality and others don't
+* [utils] Add `network_exceptions`
+* [utils] Escape URL while sanitizing
+* [ukcolumn] Add Extractor
+* [whowatch] Add extractor by [nao20010128nao](https://github.com/nao20010128nao)
+* [CBS] Improve `_VALID_URL` to support movies
+* [crackle] Improve extraction
+* [curiositystream] Fix collections
+* [francetvinfo] Improve video id extraction
+* [generic] Respect the encoding in manifest
+* [limelight] Obey `allow_unplayable_formats`
+* [mediasite] Generalize URL pattern by [fstirlitz](https://github.com/fstirlitz)
+* [mxplayer] Add MxplayerShowIE by [Ashish0804](https://github.com/Ashish0804)
+* [nebula] Move to nebula.app by [Lamieur](https://github.com/Lamieur)
+* [niconico] Fix HLS formats by [CXwudi](https://github.com/CXwudi), [tsukumijima](https://github.com/tsukumijima), [nao20010128nao](https://github.com/nao20010128nao) and [pukkandan](https://github.com/pukkandan)
+* [niconico] Fix title and thumbnail extraction by [CXwudi](https://github.com/CXwudi)
+* [plutotv] Extract subtitles from manifests
+* [plutotv] Fix format extraction for some URLs
+* [rmcdecouverte] Improve `_VALID_URL`
+* [sonyliv] Fix `title` and `series` extraction by [Ashish0804](https://github.com/Ashish0804)
+* [tubi] Raise "no video formats" error when video URL is empty
+* [youtube:tab] Detect playlists inside community posts
+* [youtube] Add `oembed` to reserved names
+* [zee5] Fix extraction for some URLs by [Hadi0609](https://github.com/Hadi0609)
+* [zee5] Fix py2 compatibility
+* Fix `playlist_index` and add `playlist_autonumber`. See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details
+* Add experimental option `--check-formats` to test the URLs before format selection
+* Option `--compat-options` to revert [some of yt-dlp's changes](https://github.com/yt-dlp/yt-dlp#differences-in-default-behavior)
+    * Deprecates `--list-formats-as-table`, `--list-formats-old`
+* Fix number of digits in `%(playlist_index)s`
+* Fix case sensitivity of format selector
+* Revert "[core] be able to hand over id and title using url_result"
+* Do not strip out whitespaces in `-o` and `-P`
+* Fix `preload_download_archive` writing verbose message to `stdout`
+* Move option warnings to `YoutubeDL` so that they obey `--no-warnings` and can output colors
+* Py2 compatibility for `FileNotFoundError`
+
+
+### 2021.04.22
+* **Improve output template:**
+    * Objects can be traversed like `%(field.key1.key2)s`
+    * An offset can be added to numeric fields as `%(field+N)s`
+    * Deprecates `--autonumber-start`
+* **Improve `--sub-langs`:**
+    * Treat `--sub-langs` entries as regex
+    * `all` can be used to refer to all the subtitles
+    * Language codes can be prefixed with `-` to exclude them
+    * Deprecates `--all-subs`
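+    For instance, a sketch of the new selection syntax (placeholder URL; `live_chat` is just an example of a track to exclude):
+    ```sh
+    # Download every subtitle track except live chat
+    yt-dlp --write-subs --sub-langs "all,-live_chat" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```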
+* Add option `--ignore-no-formats-error` to ignore the "no video format" and similar errors
+* Add option `--skip-playlist-after-errors` to skip the rest of a playlist after a given number of errors are encountered
+* Merge youtube-dl: Upto [commit/7e8b3f9](https://github.com/ytdl-org/youtube-dl/commit/7e8b3f9439ebefb3a3a4e5da9c0bd2b595976438)
+* [downloader] Fix bug in downloader selection
+* [BilibiliChannel] Fix pagination by [nao20010128nao](https://github.com/nao20010128nao) and [pukkandan](https://github.com/pukkandan)
+* [rai] Add support for http formats by [nixxo](https://github.com/nixxo)
+* [TubiTv] Add TubiTvShowIE by [Ashish0804](https://github.com/Ashish0804)
+* [twitcasting] Fix extractor
+* [viu:ott] Fix extractor and support series by [lkho](https://github.com/lkho) and [pukkandan](https://github.com/pukkandan)
+* [youtube:tab] Show unavailable videos in playlists by [coletdjnz](https://github.com/coletdjnz)
+* [youtube:tab] Reload with unavailable videos for all playlists
+* [youtube] Ignore invalid stretch ratio
+* [youtube] Improve channel syncid extraction to support ytcfg by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Standardize API calls for tabs, mixes and search by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Bugfix in `_extract_ytcfg`
+* [mildom:user:vod] Download only the necessary number of pages
+* [mildom] Remove proxy completely by [fstirlitz](https://github.com/fstirlitz)
+* [go] Fix `_VALID_URL`
+* [MetadataFromField] Improve regex and add tests
+* [Exec] Ensure backward compatibility when the command contains `%`
+* [extractor] Fix inconsistent use of `report_warning`
+* Ensure `mergeall` selects best format when multistreams are disabled
+* Improve the yt-dlp.sh script by [fstirlitz](https://github.com/fstirlitz)
+* [lazy_extractor] Do not load plugins
+* [ci] Disable fail-fast
+* [docs] Clarify which deprecated options still work
+* [docs] Fix typos
+
+
+### 2021.04.11
+* Add option `--convert-thumbnails` (only jpg currently supported)
+* Format selector `mergeall` to download and merge all formats
+* Pass any field to `--exec` using similar syntax to output template
+* Choose downloader for each protocol using `--downloader PROTO:NAME`
+    * Alias `--downloader` for `--external-downloader`
+    * Added `native` as an option for the downloader
+* Merge youtube-dl: Upto
[commit/4fb25ff](https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7) (except vimeo) +* [DiscoveryPlusIndia] Add DiscoveryPlusIndiaShowIE by [Ashish0804](https://github.com/Ashish0804) +* [NFHSNetwork] Add extractor by [llacb47](https://github.com/llacb47) +* [nebula] Add extractor (watchnebula.com) by [hheimbuerger](https://github.com/hheimbuerger) +* [nitter] Fix extraction of reply tweets and update instance list by [B0pol](https://github.com/B0pol) +* [nitter] Fix thumbnails by [B0pol](https://github.com/B0pol) +* [youtube] Fix thumbnail URL +* [youtube] Parse API parameters from initial webpage by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Extract comments' approximate timestamp by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Fix alert extraction +* [bilibili] Fix uploader +* [utils] Add `datetime_from_str` and `datetime_add_months` by [coletdjnz](https://github.com/coletdjnz) +* Run some `postprocessors` before actual download +* Improve argument parsing for `-P`, `-o`, `-S` +* Fix some `m3u8` not obeying `--allow-unplayable-formats` +* Fix default of `dynamic_mpd` +* Deprecate `--all-formats`, `--include-ads`, `--hls-prefer-native`, `--hls-prefer-ffmpeg` +* [docs] Improvements + +### 2021.04.03 +* Merge youtube-dl: Upto [commit/654b4f4](https://github.com/ytdl-org/youtube-dl/commit/654b4f4ff2718f38b3182c1188c5d569c14cc70a) +* Ability to set a specific field in the file's metadata using `--parse-metadata` +* Ability to select n'th best format like `-f bv*.2` +* [DiscoveryPlus] Add discoveryplus.in +* [la7] Add podcasts and podcast playlists by [nixxo](https://github.com/nixxo) +* [mildom] Update extractor with current proxy by [nao20010128nao](https://github.com/nao20010128nao) +* [ard:mediathek] Fix video id extraction +* [generic] Detect Invidious' link element +* [youtube] Show premium state in `availability` by [coletdjnz](https://github.com/coletdjnz) +* [viewsource] Add extractor to handle `view-source:` +* [sponskrub] Run before embedding thumbnail +* [docs] Improve `--parse-metadata` documentation + + +### 2021.03.24.1 +* Revert [commit/8562218](https://github.com/ytdl-org/youtube-dl/commit/8562218350a79d4709da8593bb0c538aa0824acf) + +### 2021.03.24 +* Merge youtube-dl: Upto 2021.03.25 ([commit/8562218](https://github.com/ytdl-org/youtube-dl/commit/8562218350a79d4709da8593bb0c538aa0824acf)) +* Parse metadata from multiple fields using `--parse-metadata` +* Ability to load playlist infojson using `--load-info-json` +* Write current epoch to infojson when using `--no-clean-infojson` +* [youtube_live_chat] fix bug when trying to set cookies +* [niconico] Fix for when logged in by [CXwudi](https://github.com/CXwudi) and [xtkoba](https://github.com/xtkoba) +* [linuxacadamy] Fix login + + +### 2021.03.21 +* Merge youtube-dl: Upto [commit/7e79ba7](https://github.com/ytdl-org/youtube-dl/commit/7e79ba7dd6e6649dd2ce3a74004b2044f2182881) +* Option `--no-clean-infojson` to keep private keys in the infojson +* [aria2c] Support retry/abort unavailable fragments by [damianoamatruda](https://github.com/damianoamatruda) +* [aria2c] Better default arguments +* [movefiles] Fix bugs and make more robust +* [formatSort] Fix `quality` being ignored +* [splitchapters] Fix for older ffmpeg +* [sponskrub] Pass proxy to sponskrub +* Make sure `post_hook` gets the final filename +* Recursively remove any private keys from infojson +* Embed video URL metadata inside `mp4` by [damianoamatruda](https://github.com/damianoamatruda) and 
[pukkandan](https://github.com/pukkandan) +* Merge `webm` formats into `mkv` if thumbnails are to be embedded by [damianoamatruda](https://github.com/damianoamatruda) +* Use headers and cookies when downloading subtitles by [damianoamatruda](https://github.com/damianoamatruda) +* Parse resolution in info dictionary by [damianoamatruda](https://github.com/damianoamatruda) +* More consistent warning messages by [damianoamatruda](https://github.com/damianoamatruda) and [pukkandan](https://github.com/pukkandan) +* [docs] Add deprecated options and aliases in readme +* [docs] Fix some minor mistakes + +* [niconico] Partial fix adapted from [animelover1984/youtube-dl@b5eff52](https://github.com/animelover1984/youtube-dl/commit/b5eff52dd9ed5565672ea1694b38c9296db3fade) (login and smile formats still don't work) +* [niconico] Add user extractor by [animelover1984](https://github.com/animelover1984) +* [bilibili] Add anthology support by [animelover1984](https://github.com/animelover1984) +* [amcnetworks] Fix extractor by [2ShedsJackson](https://github.com/2ShedsJackson) +* [stitcher] Merge from youtube-dl by [nixxo](https://github.com/nixxo) +* [rcs] Improved extraction by [nixxo](https://github.com/nixxo) +* [linuxacadamy] Improve regex +* [youtube] Show if video is `private`, `unlisted` etc in info (`availability`) by [coletdjnz](https://github.com/coletdjnz) and [pukkandan](https://github.com/pukkandan) +* [youtube] bugfix for channel playlist extraction +* [nbc] Improve metadata extraction by [2ShedsJackson](https://github.com/2ShedsJackson) + + +### 2021.03.15 +* **Split video by chapters**: using option `--split-chapters` + * The output file of the split files can be set with `-o`/`-P` using the prefix `chapter:` + * Additional keys `section_title`, `section_number`, `section_start`, `section_end` are available in the output template +* **Parallel fragment downloads** by [shirt](https://github.com/shirt-dev) + * Use option `--concurrent-fragments` (`-N`) to set the number of threads (default 1) +* Merge youtube-dl: Upto [commit/3be0980](https://github.com/ytdl-org/youtube-dl/commit/3be098010f667b14075e3dfad1e74e5e2becc8ea) +* [zee5] Add Show Extractor by [Ashish0804](https://github.com/Ashish0804) and [pukkandan](https://github.com/pukkandan) +* [rai] fix drm check [nixxo](https://github.com/nixxo) +* [wimtv] Add extractor by [nixxo](https://github.com/nixxo) +* [mtv] Add mtv.it and extract series metadata by [nixxo](https://github.com/nixxo) +* [pluto.tv] Add extractor by [kevinoconnor7](https://github.com/kevinoconnor7) +* [youtube] Rewrite comment extraction by [coletdjnz](https://github.com/coletdjnz) +* [embedthumbnail] Set mtime correctly +* Refactor some postprocessor/downloader code by [pukkandan](https://github.com/pukkandan) and [shirt](https://github.com/shirt-dev) + + +### 2021.03.07 +* [youtube] Fix history, mixes, community pages and trending by [pukkandan](https://github.com/pukkandan) and [coletdjnz](https://github.com/coletdjnz) +* [youtube] Fix private feeds/playlists on multi-channel accounts by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Extract alerts from continuation by [coletdjnz](https://github.com/coletdjnz) +* [cbs] Add support for ParamountPlus by [shirt](https://github.com/shirt-dev) +* [mxplayer] Rewrite extractor with show support by [pukkandan](https://github.com/pukkandan) and [Ashish0804](https://github.com/Ashish0804) +* [gedi] Improvements from youtube-dl by [nixxo](https://github.com/nixxo) +* [vimeo] Fix videos with password by 
[teesid](https://github.com/teesid) +* [lbry] Support `lbry://` url by [nixxo](https://github.com/nixxo) +* [bilibili] Change `Accept` header by [pukkandan](https://github.com/pukkandan) and [animelover1984](https://github.com/animelover1984) +* [trovo] Pass origin header +* [rai] Check for DRM by [nixxo](https://github.com/nixxo) +* [downloader] Fix bug for `ffmpeg`/`httpie` +* [update] Fix updater removing the executable bit on some UNIX distros +* [update] Fix current build hash for UNIX +* [docs] Include wget/curl/aria2c install instructions for Unix by [Ashish0804](https://github.com/Ashish0804) +* Fix some videos downloading with `m3u8` extension +* Remove "fixup is ignored" warning when fixup wasn't passed by user + + +### 2021.03.03.2 +* [build] Fix bug + +### 2021.03.03 +* [youtube] Use new browse API for continuation page extraction by [coletdjnz](https://github.com/coletdjnz) and [pukkandan](https://github.com/pukkandan) +* Fix HLS playlist downloading by [shirt](https://github.com/shirt-dev) +* Merge youtube-dl: Upto [2021.03.03](https://github.com/ytdl-org/youtube-dl/releases/tag/2021.03.03) +* [mtv] Fix extractor +* [nick] Fix extractor by [DennyDai](https://github.com/DennyDai) +* [mxplayer] Add new extractor by [codeasashu](https://github.com/codeasashu) +* [youtube] Throw error when `--extractor-retries` are exhausted +* Reduce default of `--extractor-retries` to 3 +* Fix packaging bugs by [hseg](https://github.com/hseg) + + +### 2021.03.01 +* Allow specifying path in `--external-downloader` +* Add option `--sleep-requests` to sleep b/w requests +* Add option `--extractor-retries` to retry on known extractor errors +* Extract comments only when needed +* `--get-comments` doesn't imply `--write-info-json` if `-J`, `-j` or `--print-json` are used +* Fix `get_executable_path` by [shirt](https://github.com/shirt-dev) +* [youtube] Retry on more known errors than just HTTP-5xx +* [youtube] Fix inconsistent `webpage_url` +* [tennistv] Fix format sorting +* [bilibiliaudio] Recognize the file as audio-only +* [hrfensehen] Fix wrong import +* [viki] Fix viki play pass authentication by [RobinD42](https://github.com/RobinD42) +* [readthedocs] Improvements by [shirt](https://github.com/shirt-dev) +* [hls] Fix bug with m3u8 format extraction +* [hls] Enable `--hls-use-mpegts` by default when downloading live-streams +* [embedthumbnail] Fix bug with deleting original thumbnail +* [build] Fix completion paths, zsh pip completion install by [hseg](https://github.com/hseg) +* [ci] Disable download tests unless specifically invoked +* Cleanup some code and fix typos + + +### 2021.02.24 +* Moved project to an organization [yt-dlp](https://github.com/yt-dlp) +* **Completely changed project name to yt-dlp** by [Pccode66](https://github.com/Pccode66) and [pukkandan](https://github.com/pukkandan) + * Also, `youtube-dlc` config files are no longer loaded +* Merge youtube-dl: Upto [commit/4460329](https://github.com/ytdl-org/youtube-dl/commit/44603290e5002153f3ebad6230cc73aef42cc2cd) (except tmz, gedi) +* [Readthedocs](https://yt-dlp.readthedocs.io) support by [shirt](https://github.com/shirt-dev) +* [youtube] Show if video was a live stream in info (`was_live`) +* [Zee5] Add new extractor by [Ashish0804](https://github.com/Ashish0804) and [pukkandan](https://github.com/pukkandan) +* [jwplatform] Add support for `hyland.com` +* [tennistv] Fix extractor +* [hls] Support media initialization by [shirt](https://github.com/shirt-dev) +* [hls] Added options `--hls-split-discontinuity` to better support 
media discontinuity by [shirt](https://github.com/shirt-dev) +* [ffmpeg] Allow passing custom arguments before -i using `--ppa "ffmpeg_i1:ARGS"` syntax +* Fix `--windows-filenames` removing `/` from UNIX paths +* [hls] Show warning if pycryptodome is not found +* [docs] Improvements + * Fix documentation of `Extractor Options` + * Document `all` in format selection + * Document `playable_in_embed` in output templates + + +### 2021.02.19 +* Merge youtube-dl: Upto [commit/cf2dbec](https://github.com/ytdl-org/youtube-dl/commit/cf2dbec6301177a1fddf72862de05fa912d9869d) (except kakao) +* [viki] Fix extractor +* [niconico] Extract `channel` and `channel_id` by [kurumigi](https://github.com/kurumigi) +* [youtube] Multiple page support for hashtag URLs +* [youtube] Add more invidious instances +* [youtube] Fix comment extraction when comment text is empty +* Option `--windows-filenames` to force use of windows compatible filenames +* [ExtractAudio] Bugfix +* Don't raise `parser.error` when exiting for update +* [MoveFiles] Fix for when merger can't run +* Changed `--trim-file-name` to `--trim-filenames` to be similar to related options +* Format Sort improvements: + * Prefer `vp9.2` more than other `vp9` codecs + * Remove forced priority of `quality` + * Remove unnecessary `field_preference` and misuse of `preference` from extractors +* Build improvements: + * Fix hash output by [shirt](https://github.com/shirt-dev) + * Lock python package versions for x86 and use `wheels` by [shirt](https://github.com/shirt-dev) + * Exclude `vcruntime140.dll` from UPX by [jbruchon](https://github.com/jbruchon) + * Set version number based on UTC time, not local time + * Publish on PyPi only if token is set +* [docs] Better document `--prefer-free-formats` and add `--no-prefer-free-format` + + +### 2021.02.15 +* Merge youtube-dl: Upto [2021.02.10](https://github.com/ytdl-org/youtube-dl/releases/tag/2021.02.10) (except archive.org) +* [niconico] Improved extraction and support encrypted/SMILE movies by [kurumigi](https://github.com/kurumigi), [tsukumijima](https://github.com/tsukumijima), [bbepis](https://github.com/bbepis), [pukkandan](https://github.com/pukkandan) +* Fix HLS AES-128 with multiple keys in external downloaders by [shirt](https://github.com/shirt-dev) +* [youtube_live_chat] Fix by using POST API by [siikamiika](https://github.com/siikamiika) +* [rumble] Add support for video page +* Option `--allow-unplayable-formats` to allow downloading unplayable video formats +* [ExtractAudio] Don't re-encode when file is already in a common audio format +* [youtube] Fix search continuations +* [youtube] Fix for new accounts +* Improve build/updater: by [pukkandan](https://github.com/pukkandan) and [shirt](https://github.com/shirt-dev) + * Fix SHA256 calculation in build and implement hash checking for updater + * Exit immediately in windows once the update process starts + * Fix updater for `x86.exe` + * Updater looks for both `yt-dlp` and `youtube-dlc` in releases for future-proofing + * Change optional dependency to `pycryptodome` +* Fix issue with unicode filenames in aria2c by [shirt](https://github.com/shirt-dev) +* Fix `allow_playlist_files` not being correctly passed through +* Fix for empty HTTP head requests by [shirt](https://github.com/shirt-dev) +* Fix `get_executable_path` in UNIX +* [sponskrub] Print ffmpeg output and errors to terminal +* `__real_download` should be false when ffmpeg unavailable and no download +* Show `exe`/`zip`/`source` and 32/64bit in verbose message + + +### 2021.02.09 +* 
**aria2c support for DASH/HLS**: by [shirt](https://github.com/shirt-dev)
+* **Implement Updater** (`-U`) by [shirt](https://github.com/shirt-dev)
+* [youtube] Fix comment extraction
+* [youtube_live_chat] Improve extraction
+* [youtube] Fix for channel URLs sometimes not downloading all pages
+* [aria2c] Changed default arguments to `--console-log-level=warn --summary-interval=0 --file-allocation=none -x16 -j16 -s16`
+* Add fallback for thumbnails
+* [embedthumbnail] Keep original thumbnail after conversion if write_thumbnail given
+* [embedsubtitle] Keep original subtitle after conversion if write_subtitles given
+* [pyinst.py] Move back to root dir
+* [youtube] Simplified renderer parsing and bugfixes
+* [movefiles] Fix compatibility with python2
+* [remuxvideo] Fix validation of conditional remux
+* [sponskrub] Don't raise error when the video does not exist
+* [docs] Crypto is an optional dependency
+
+
+### 2021.02.04
+
+* Merge youtube-dl: Upto [2021.02.04.1](https://github.com/ytdl-org/youtube-dl/releases/tag/2021.02.04.1)
+* **Date/time formatting in output template:**
+    * You can use [`strftime`](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) to format date/time fields. Example: `%(upload_date>%Y-%m-%d)s`
+* **Multiple output templates:**
+    * Separate output templates can be given for the different metadata files by using `-o TYPE:TEMPLATE`
+    * The allowed types are: `subtitle|thumbnail|description|annotation|infojson|pl_description|pl_infojson`
+* [youtube] More metadata extraction for channel/playlist URLs (channel, uploader, thumbnail, tags)
+* New option `--no-write-playlist-metafiles` to prevent writing playlist metadata files
+* [audius] Fix extractor
+* [youtube_live_chat] Fix `parse_yt_initial_data` and add `fragment_retries`
+* [postprocessor] Raise errors correctly
+* [metadatafromtitle] Fix bug when extracting data from numeric fields
+* Fix issue with overwriting files
+* Fix "Default format spec" appearing in quiet mode
+* [FormatSort] Allow user to prefer av01 over vp9 (The default is still vp9)
+* [FormatSort] Fix bug where `quality` had more priority than `hasvid`
+* [pyinst] Automatically detect python architecture and working directory
+* Strip out internal fields such as `_filename` from infojson
+
+
+### 2021.01.29
+* **Features from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl)**: by [animelover1984](https://github.com/animelover1984) and [bbepis](https://github.com/bbepis)
+    * Add `--get-comments`
+    * [youtube] Extract comments
+    * [bilibili] Added BiliBiliSearchIE, BilibiliChannelIE
+    * [bilibili] Extract comments
+    * [bilibili] Better video extraction
+    * Write playlist data to infojson
+    * [FFmpegMetadata] Embed infojson inside the video
+    * [EmbedThumbnail] Try embedding in mp4 using ffprobe and `-disposition`
+    * [EmbedThumbnail] Treat mka like mkv and mov like mp4
+    * [EmbedThumbnail] Embed in ogg/opus
+    * [VideoRemuxer] Conditionally remux video
+    * [VideoRemuxer] Add `-movflags +faststart` when remuxing to mp4
+    * [ffmpeg] Print entire stderr in verbose when there is an error
+    * [EmbedSubtitle] Warn when embedding ass in mp4
+    * [anvato] Use NFLTokenGenerator if possible
+* **Parse additional metadata**: New option `--parse-metadata` to extract additional metadata from existing fields
+    * The extracted fields can be used in `--output`
+    * Deprecated `--metadata-from-title`
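+    A sketch of the idea, assuming a video whose title follows the common "Artist - Title" pattern (placeholder URL):
+    ```sh
+    # Split an "Artist - Title" style title into separate artist and title fields
+    yt-dlp --parse-metadata "title:%(artist)s - %(title)s" "https://www.youtube.com/watch?v=XXXXXXXXXXX"
+    ```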
+ + +### 2021.01.29 +* **Features from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl)**: by [animelover1984](https://github.com/animelover1984) and [bbepis](https://github.com/bbepis) + * Add `--get-comments` + * [youtube] Extract comments + * [bilibili] Added BiliBiliSearchIE, BilibiliChannelIE + * [bilibili] Extract comments + * [bilibili] Better video extraction + * Write playlist data to infojson + * [FFmpegMetadata] Embed infojson inside the video + * [EmbedThumbnail] Try embedding in mp4 using ffprobe and `-disposition` + * [EmbedThumbnail] Treat mka like mkv and mov like mp4 + * [EmbedThumbnail] Embed in ogg/opus + * [VideoRemuxer] Conditionally remux video + * [VideoRemuxer] Add `-movflags +faststart` when remuxing to mp4 + * [ffmpeg] Print entire stderr in verbose when there is an error + * [EmbedSubtitle] Warn when embedding ass in mp4 + * [anvato] Use NFLTokenGenerator if possible +* **Parse additional metadata**: New option `--parse-metadata` to extract additional metadata from existing fields + * The extracted fields can be used in `--output` + * Deprecated `--metadata-from-title` +* [Audius] Add extractor +* [youtube] Extract playlist description and write it to `.description` file +* Detect existing files even when using `recode`/`remux` (`extract-audio` is partially fixed) +* Fix wrong user config from v2021.01.24 +* [youtube] Report error message from youtube as error instead of warning +* [FormatSort] Fix some fields not sorting from v2021.01.24 +* [postprocessor] Deprecate `avconv`/`avprobe`. All current functionality is left untouched, but don't expect any new features to work with avconv +* [postprocessor] Fix `write_debug` so it does not throw an error when there is no `_downloader` +* [movefiles] Don't give "can't find" warning when move is unnecessary +* Refactor `update-version`, `pyinst.py` and related files +* [ffmpeg] Document more formats that are supported for remux/recode + + +### 2021.01.24 +* Merge youtube-dl: Up to [2021.01.24](https://github.com/ytdl-org/youtube-dl/releases/tag/2021.01.24) +* Plugin support ([documentation](https://github.com/yt-dlp/yt-dlp#plugins)) +* **Multiple paths**: New option `-P`/`--paths` to give different paths for different types of files + * The syntax is `-P "type:path" -P "type:path"` (see the sketch after this section) + * Valid types are: home, temp, description, annotation, subtitle, infojson, thumbnail + * Additionally, the configuration file is taken from the home directory or the current directory +* Allow passing different arguments to different external downloaders +* [mildom] Add extractor by [nao20010128nao](https://github.com/nao20010128nao) +* Warn when using old style `--external-downloader-args` and `--post-processor-args` +* Fix `--no-overwrite` when using `--write-link` +* [sponskrub] Output `unrecognized argument` error message correctly +* [cbs] Make failure to extract title non-fatal +* Fix typecasting when pre-checking archive +* Fix issue with setting title on UNIX +* Deprecate redundant aliases in `formatSort`. The aliases remain functional for backward compatibility, but will be left undocumented +* [tests] Fix test_post_hooks +* [tests] Split core and download tests
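A minimal sketch of the new `-P` option described above; the two paths are placeholders:

```sh
# Final files land in the 'home' path, intermediate files in 'temp';
# the configuration file is also looked up in the home path
yt-dlp -P 'home:/media/videos' -P 'temp:/tmp/yt-dlp' URL
```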
+ + +### 2021.01.20 +* [TrovoLive] Add extractor (only VODs) +* [pokemon] Add `/#/player` URLs +* Improved parsing of multiple postprocessor-args and added `--ppa` as an alias +* [EmbedThumbnail] Simplify embedding in mkv +* [sponskrub] Encode filenames correctly, better debug output and error message +* [readme] Clean up options + + +### 2021.01.16 +* Merge youtube-dl: Up to [2021.01.16](https://github.com/ytdl-org/youtube-dl/releases/tag/2021.01.16) +* **Configuration files:** + * Portable configuration file: `./yt-dlp.conf` + * Allow the configuration files to be named `yt-dlp` instead of `youtube-dlc`. See [this](https://github.com/yt-dlp/yt-dlp#configuration) for details +* Add PyPI release + + +### 2021.01.14 +* Added option `--break-on-reject` +* [roosterteeth.com] Fix for bonus episodes by [Zocker1999NET](https://github.com/Zocker1999NET) +* [tiktok] Fix for when share_info is empty +* [EmbedThumbnail] Fix bug due to incorrect function name +* [docs] Changed sponskrub links to point to [yt-dlp/SponSkrub](https://github.com/yt-dlp/SponSkrub) since I am now providing both Linux and Windows releases +* [docs] Change all links to correctly point to new fork URL +* [docs] Fix typos + + +### 2021.01.12 +* [roosterteeth.com] Add subtitle support by [samiksome](https://github.com/samiksome) +* Added `--force-overwrites`, `--no-force-overwrites` by [alxnull](https://github.com/alxnull) +* Changed fork name to `yt-dlp` +* Fix typos by [FelixFrog](https://github.com/FelixFrog) +* [ci] Option to skip +* [changelog] Added unreleased changes in blackjack4494/yt-dlc + + +### 2021.01.10 +* [archive.org] Fix extractor and add support for audio and playlists by [wporr](https://github.com/wporr) +* [Animelab] Added by [mariuszskon](https://github.com/mariuszskon) +* [youtube:search] Fix view_count by [ohnonot](https://github.com/ohnonot) +* [youtube] Show if video is embeddable in info (`playable_in_embed`) +* Update version badge automatically in README +* Enable `test_youtube_search_matching` +* Create `to_screen` and similar functions in postprocessor/common + + +### 2021.01.09 +* [youtube] Fix bug in automatic caption extraction +* Add `post_hooks` to YoutubeDL by [alexmerkel](https://github.com/alexmerkel) +* Batch file enumeration improvements by [glenn-slayden](https://github.com/glenn-slayden) +* Stop immediately when reaching `--max-downloads` by [glenn-slayden](https://github.com/glenn-slayden) +* Fix incorrect ANSI sequence for restoring console-window title by [glenn-slayden](https://github.com/glenn-slayden) +* Kill child processes when yt-dlc is killed by [Unrud](https://github.com/Unrud) + + +### 2021.01.08 +* Merge youtube-dl: Up to [2021.01.08](https://github.com/ytdl-org/youtube-dl/releases/tag/2021.01.08) except stitcher ([1](https://github.com/ytdl-org/youtube-dl/commit/bb38a1215718cdf36d73ff0a7830a64cd9fa37cc), [2](https://github.com/ytdl-org/youtube-dl/commit/a563c97c5cddf55f8989ed7ea8314ef78e30107f)) +* Moved changelog to separate file + + +### 2021.01.07-1 +* [Akamai] Fix by [nixxo](https://github.com/nixxo) +* [Tiktok] Merge youtube-dl tiktok extractor by [GreyAlien502](https://github.com/GreyAlien502) +* [vlive] Add support for playlists by [kyuyeunk](https://github.com/kyuyeunk) +* [youtube_live_chat] Make sure playerOffsetMs is positive by [siikamiika](https://github.com/siikamiika) +* Ignore extra data streams in ffmpeg by [jbruchon](https://github.com/jbruchon) +* Allow passing different arguments to different postprocessors using `--postprocessor-args` +* Deprecated `--sponskrub-args`. The same can now be done using `--postprocessor-args "sponskrub:<args>"` (see the sketch below) +* [CI] Split tests into core-test and full-test
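A minimal sketch of the per-postprocessor syntax above; `<args>` stands for whatever SponSkrub flags you would previously have passed to `--sponskrub-args` (a placeholder, not a real flag):

```sh
# Old (deprecated):  yt-dlp --sponskrub --sponskrub-args '<args>' URL
# New generic form: the part before ':' selects the postprocessor
yt-dlp --sponskrub --postprocessor-args 'sponskrub:<args>' URL
```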
+ + +### 2021.01.07 +* Removed priority of `av01` codec in `-S` since most devices don't support it yet +* Added `duration_string` to be used in `--output` +* Created First Release + + +### 2021.01.05-1 +* **Changed defaults:** + * Enabled `--ignore-errors` + * Disabled `--video-multistreams` and `--audio-multistreams` + * Changed default format selection to `bv*+ba/b` when `--audio-multistreams` is disabled + * Changed default format sort order to `res,fps,codec,size,br,asr,proto,ext,has_audio,source,format_id` + * Changed `webm` to be preferred over `flv` in format sorting + * Changed default output template to `%(title)s [%(id)s].%(ext)s` + * Enabled `--list-formats-as-table` + + +### 2021.01.05 +* **Format Sort:** Added `--format-sort` (`-S`), `--format-sort-force` (`--S-force`) - See [Sorting Formats](README.md#sorting-formats) for details (and the sketch after this section) +* **Format Selection:** See [Format Selection](README.md#format-selection) for details + * New format selectors: `best*`, `worst*`, `bestvideo*`, `bestaudio*`, `worstvideo*`, `worstaudio*` + * Changed video format sorting to show video-only files and video+audio files together. + * Added `--video-multistreams`, `--no-video-multistreams`, `--audio-multistreams`, `--no-audio-multistreams` + * Added `b`,`w`,`v`,`a` as aliases for `best`, `worst`, `video` and `audio` respectively +* Shortcut Options: Added `--write-link`, `--write-url-link`, `--write-webloc-link`, `--write-desktop-link` by [h-h-h-h](https://github.com/h-h-h-h) - See [Internet Shortcut Options](README.md#internet-shortcut-options) for details +* **Sponskrub integration:** Added `--sponskrub`, `--sponskrub-cut`, `--sponskrub-force`, `--sponskrub-location`, `--sponskrub-args` - See [SponSkrub Options](README.md#sponskrub-sponsorblock-options) for details +* Added `--force-download-archive` (`--force-write-archive`) by [h-h-h-h](https://github.com/h-h-h-h) +* Added `--list-formats-as-table`, `--list-formats-old` +* **Negative Options:** Makes it possible to negate most boolean options by adding a `no-` to the switch. Useful when you want to reverse an option that is defined in a config file + * Added `--no-ignore-dynamic-mpd`, `--no-allow-dynamic-mpd`, `--allow-dynamic-mpd`, `--youtube-include-hls-manifest`, `--no-youtube-include-hls-manifest`, `--no-youtube-skip-hls-manifest`, `--no-download`, `--no-download-archive`, `--resize-buffer`, `--part`, `--mtime`, `--no-keep-fragments`, `--no-cookies`, `--no-write-annotations`, `--no-write-info-json`, `--no-write-description`, `--no-write-thumbnail`, `--youtube-include-dash-manifest`, `--post-overwrites`, `--no-keep-video`, `--no-embed-subs`, `--no-embed-thumbnail`, `--no-add-metadata`, `--no-include-ads`, `--no-write-sub`, `--no-write-auto-sub`, `--no-playlist-reverse`, `--no-restrict-filenames`, `--no-format-sort-force`, `--flat-videos`, `--no-list-formats-as-table`, `--no-sponskrub`, `--no-sponskrub-cut`, `--no-sponskrub-force` + * Renamed: `--write-subs`, `--no-write-subs`, `--no-write-auto-subs`, `--write-auto-subs`. Note that these can still be used without the ending "s" +* Relaxed validation for format filters so that any arbitrary field can be used +* Fix for embedding thumbnail in mp3 by [pauldubois98](https://github.com/pauldubois98) ([ytdl-org/youtube-dl#21569](https://github.com/ytdl-org/youtube-dl/pull/21569)) +* Make Twitch Video ID output from Playlist and VOD extractor the same. This is only a temporary fix +* Merge youtube-dl: Up to [2021.01.03](https://github.com/ytdl-org/youtube-dl/commit/8e953dcbb10a1a42f4e12e4e132657cb0100a1f8) - See [blackjack4494/yt-dlc#280](https://github.com/blackjack4494/yt-dlc/pull/280) for details + * Extractors [tiktok](https://github.com/ytdl-org/youtube-dl/commit/fb626c05867deab04425bad0c0b16b55473841a2) and [hotstar](https://github.com/ytdl-org/youtube-dl/commit/bb38a1215718cdf36d73ff0a7830a64cd9fa37cc) have not been merged +* Cleaned up the fork for public use
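A minimal sketch combining the new sorting and selection syntax; both strings are taken from the defaults listed above, and the URL is a placeholder:

```sh
# Sort formats by resolution, then fps, and use the new default
# selector: best video + best audio, falling back to best combined
yt-dlp -S 'res,fps' -f 'bv*+ba/b' URL
```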
+ + +**PS**: All uncredited changes above this point are authored by [pukkandan](https://github.com/pukkandan) + +### Unreleased changes in [blackjack4494/yt-dlc](https://github.com/blackjack4494/yt-dlc) +* Updated to youtube-dl release 2020.11.26 by [pukkandan](https://github.com/pukkandan) +* Youtube improvements by [pukkandan](https://github.com/pukkandan) + * Implemented all Youtube Feeds (ytfav, ytwatchlater, ytsubs, ythistory, ytrec) and SearchURL + * Fix some improper Youtube URLs + * Redirect channel home to /video + * Print youtube's warning message + * Handle multiple pages for feeds better +* [youtube] Fix ytsearch not returning results sometimes due to promoted content by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Temporary fix for automatic captions - disable json3 by [blackjack4494](https://github.com/blackjack4494) +* Add `--break-on-existing` by [gergesh](https://github.com/gergesh) (see the sketch below) +* Pre-check video IDs in the archive before downloading by [pukkandan](https://github.com/pukkandan) +* [bitwave.tv] New extractor by [lorpus](https://github.com/lorpus) +* [Gedi] Add extractor by [nixxo](https://github.com/nixxo) +* [Rcs] Add new extractor by [nixxo](https://github.com/nixxo) +* [skyit] New skyitalia extractor by [nixxo](https://github.com/nixxo) +* [france.tv] Fix thumbnail URL by [renalid](https://github.com/renalid) +* [ina] Support mobile links by [B0pol](https://github.com/B0pol) +* [instagram] Fix thumbnail extractor by [nao20010128nao](https://github.com/nao20010128nao) +* [SouthparkDe] Support for English URLs by [xypwn](https://github.com/xypwn) +* [spreaker] Fix SpreakerShowIE test URL by [pukkandan](https://github.com/pukkandan) +* [Vlive] Fix playlist handling when downloading a channel by [kyuyeunk](https://github.com/kyuyeunk) +* [tmz] Fix extractor by [diegorodriguezv](https://github.com/diegorodriguezv) +* [generic] Detect embedded bitchute videos by [pukkandan](https://github.com/pukkandan) +* [generic] Extract embedded youtube and twitter videos by [diegorodriguezv](https://github.com/diegorodriguezv) +* [ffmpeg] Ensure all streams are copied by [pukkandan](https://github.com/pukkandan) +* [embedthumbnail] Fix for os.rename error by [pukkandan](https://github.com/pukkandan) +* make_win.bat: don't use UPX to pack vcruntime140.dll by [jbruchon](https://github.com/jbruchon)
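A minimal sketch of `--break-on-existing` together with the archive pre-check mentioned above; the archive file name and playlist URL are placeholders:

```sh
# Video IDs are checked against the archive before downloading; stop
# the whole run as soon as an already-archived video is encountered
yt-dlp --download-archive archive.txt --break-on-existing PLAYLIST_URL
```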
diff --git a/Collaborators.md b/Collaborators.md new file mode 100644 index 000000000..0017e1cd4 --- /dev/null +++ b/Collaborators.md @@ -0,0 +1,39 @@ +# Collaborators + +This is a list of the collaborators of the project and their major contributions. See the [Changelog](Changelog.md) for more details. + +You can also find lists of all [contributors of yt-dlp](CONTRIBUTORS) and [authors of youtube-dl](https://github.com/ytdl-org/youtube-dl/blob/master/AUTHORS). + + +## [pukkandan](https://github.com/pukkandan) + +[](https://ko-fi.com/pukkandan) + +* Owner of the fork + + + +## [shirt](https://github.com/shirt-dev) + +[](https://ko-fi.com/shirt) + +* Multithreading (`-N`) and aria2c support for fragment downloads +* Support for media initialization and discontinuity in HLS +* The self-updater (`-U`) + + + +## [coletdjnz](https://github.com/coletdjnz) + +[](https://github.com/sponsors/coletdjnz) + +* YouTube improvements including: age-gate bypass, private playlists, multiple clients (to avoid throttling) and a lot of under-the-hood improvements + + + +## [Ashish0804](https://github.com/Ashish0804) + +[](https://ko-fi.com/ashish0804) + +* Added support for new websites Zee5, MXPlayer, DiscoveryPlusIndia, ShemarooMe, Utreon etc. +* Added playlist/series downloads for TubiTv, SonyLIV, Voot, HotStar etc. diff --git a/MANIFEST.in b/MANIFEST.in index 4e43e99f3..38d83a9a5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,9 @@ -include README.md -include LICENSE include AUTHORS -include ChangeLog -include youtube-dl.bash-completion -include youtube-dl.fish -include youtube-dl.1 -recursive-include docs Makefile conf.py *.rst +include Changelog.md +include LICENSE +include README.md +include completions/*/* +include supportedsites.md +include yt-dlp.1 +recursive-include devscripts * recursive-include test * @@ -1,15 +1,37 @@ -all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish +all: yt-dlp doc pypi-files +clean: clean-test clean-dist clean-cache +completions: completion-bash completion-fish completion-zsh +doc: README.md CONTRIBUTING.md +ot: offlinetest +tar: yt-dlp.tar.gz + +# Keep this list in sync with MANIFEST.in +# intended use: when building a source distribution, +# make pypi-files && python setup.py sdist +pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt completions yt-dlp.1 devscripts/* test/* + +.PHONY: all clean install test tar pypi-files completions ot offlinetest codetest + +clean-test: + rm -rf *.3gp *.annotations.xml *.ape *.avi *.description *.dump *.flac *.flv *.frag *.frag.aria2 *.frag.urls \ *.info.json *.jpeg *.jpg *.live_chat.json *.m4a *.m4v *.mkv *.mp3 *.mp4 *.ogg *.opus *.part* *.png *.sbv *.srt \ *.swf *.swp *.ttml *.vtt *.wav *.webm *.webp *.ytdl test/testdata/player-*.js +clean-dist: + rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS .mailmap +clean-cache: + find . -name "*.pyc" -o -name "*.class" -delete -clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe - find . -name "*.pyc" -delete - find . -name "*.class" -delete +completion-bash: completions/bash/yt-dlp +completion-fish: completions/fish/yt-dlp.fish +completion-zsh: completions/zsh/_yt-dlp +lazy-extractors: yt_dlp/extractor/lazy_extractors.py PREFIX ?= /usr/local +DESTDIR ?= . 
BINDIR ?= $(PREFIX)/bin MANDIR ?= $(PREFIX)/man SHAREDIR ?= $(PREFIX)/share -PYTHON ?= /usr/bin/env python +PYTHON ?= /usr/bin/env python3 # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) @@ -17,61 +39,40 @@ SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then ech # set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2 MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi) -install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish - install -d $(DESTDIR)$(BINDIR) - install -m 755 youtube-dl $(DESTDIR)$(BINDIR) - install -d $(DESTDIR)$(MANDIR)/man1 - install -m 644 youtube-dl.1 $(DESTDIR)$(MANDIR)/man1 - install -d $(DESTDIR)$(SYSCONFDIR)/bash_completion.d - install -m 644 youtube-dl.bash-completion $(DESTDIR)$(SYSCONFDIR)/bash_completion.d/youtube-dl - install -d $(DESTDIR)$(SHAREDIR)/zsh/site-functions - install -m 644 youtube-dl.zsh $(DESTDIR)$(SHAREDIR)/zsh/site-functions/_youtube-dl - install -d $(DESTDIR)$(SYSCONFDIR)/fish/completions - install -m 644 youtube-dl.fish $(DESTDIR)$(SYSCONFDIR)/fish/completions/youtube-dl.fish +install: yt-dlp yt-dlp.1 completions + install -Dm755 yt-dlp $(DESTDIR)$(BINDIR) + install -Dm644 yt-dlp.1 $(DESTDIR)$(MANDIR)/man1 + install -Dm644 completions/bash/yt-dlp $(DESTDIR)$(SHAREDIR)/bash-completion/completions/yt-dlp + install -Dm644 completions/zsh/_yt-dlp $(DESTDIR)$(SHAREDIR)/zsh/site-functions/_yt-dlp + install -Dm644 completions/fish/yt-dlp.fish $(DESTDIR)$(SHAREDIR)/fish/vendor_completions.d/yt-dlp.fish codetest: flake8 . 
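As a quick sketch of how the rewritten install rule above is meant to be used (the staging path and prefix below are placeholders, not values from the Makefile):

```sh
# Builds yt-dlp, the man page and the completions (prerequisites of
# 'install'), then stages everything under /tmp/stage/usr/...
make install DESTDIR=/tmp/stage PREFIX=/usr
```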
test: - nosetests --verbose test + $(PYTHON) -m pytest $(MAKE) codetest -ot: offlinetest - -# Keep this list in sync with devscripts/run_tests.sh offlinetest: codetest - $(PYTHON) -m nose --verbose test \ - --exclude test_age_restriction.py \ - --exclude test_download.py \ - --exclude test_socks.py \ - --exclude test_subtitles.py \ - --exclude test_write_annotations.py \ - --exclude test_youtube_lists.py \ - --exclude test_youtube_signature.py - -tar: youtube-dl.tar.gz - -.PHONY: all clean install test tar bash-completion pypi-files zsh-completion fish-completion ot offlinetest codetest + $(PYTHON) -m pytest -k "not download" -pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish - -youtube-dl: youtube_dl/*.py youtube_dl/*/*.py +yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip - for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ + for d in yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor ; do \ mkdir -p zip/$$d ;\ cp -pPR $$d/*.py zip/$$d/ ;\ done - touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py - mv zip/youtube_dl/__main__.py zip/ - cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py + touch -t 200001010101 zip/yt_dlp/*.py zip/yt_dlp/*/*.py + mv zip/yt_dlp/__main__.py zip/ + cd zip ; zip -q ../yt-dlp yt_dlp/*.py yt_dlp/*/*.py __main__.py rm -rf zip - echo '#!$(PYTHON)' > youtube-dl - cat youtube-dl.zip >> youtube-dl - rm youtube-dl.zip - chmod a+x youtube-dl + echo '#!$(PYTHON)' > yt-dlp + cat yt-dlp.zip >> yt-dlp + rm yt-dlp.zip + chmod a+x yt-dlp -README.md: youtube_dl/*.py youtube_dl/*/*.py - COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py +README.md: yt_dlp/*.py yt_dlp/*/*.py + COLUMNS=80 $(PYTHON) yt_dlp/__main__.py --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md @@ -79,34 +80,29 @@ CONTRIBUTING.md: README.md README.txt: README.md pandoc -f $(MARKDOWN) -t plain README.md -o README.txt -youtube-dl.1: README.md - $(PYTHON) devscripts/prepare_manpage.py youtube-dl.1.temp.md - pandoc -s -f $(MARKDOWN) -t man youtube-dl.1.temp.md -o youtube-dl.1 - rm -f youtube-dl.1.temp.md +yt-dlp.1: README.md + $(PYTHON) devscripts/prepare_manpage.py yt-dlp.1.temp.md + pandoc -s -f $(MARKDOWN) -t man yt-dlp.1.temp.md -o yt-dlp.1 + rm -f yt-dlp.1.temp.md -youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in +completions/bash/yt-dlp: yt_dlp/*.py yt_dlp/*/*.py devscripts/bash-completion.in + mkdir -p completions/bash $(PYTHON) devscripts/bash-completion.py -bash-completion: youtube-dl.bash-completion - -youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in +completions/zsh/_yt-dlp: yt_dlp/*.py yt_dlp/*/*.py devscripts/zsh-completion.in + mkdir -p completions/zsh $(PYTHON) devscripts/zsh-completion.py -zsh-completion: youtube-dl.zsh - -youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in +completions/fish/yt-dlp.fish: yt_dlp/*.py yt_dlp/*/*.py devscripts/fish-completion.in + mkdir -p completions/fish $(PYTHON) devscripts/fish-completion.py -fish-completion: youtube-dl.fish - -lazy-extractors: youtube_dl/extractor/lazy_extractors.py - -_EXTRACTOR_FILES = $(shell find youtube_dl/extractor -iname '*.py' -and -not -iname 'lazy_extractors.py') -youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) 
+_EXTRACTOR_FILES = $(shell find yt_dlp/extractor -iname '*.py' -and -not -iname 'lazy_extractors.py') +yt_dlp/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ -youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog AUTHORS - @tar -czf youtube-dl.tar.gz --transform "s|^|youtube-dl/|" --owner 0 --group 0 \ +yt-dlp.tar.gz: all + @tar -czf $(DESTDIR)/yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ --exclude '*.pyc' \ @@ -114,10 +110,15 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*~' \ --exclude '__pycache__' \ --exclude '.git' \ - --exclude 'docs/_build' \ -- \ - bin devscripts test youtube_dl docs \ - ChangeLog AUTHORS LICENSE README.md README.txt \ - Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \ - youtube-dl.zsh youtube-dl.fish setup.py setup.cfg \ - youtube-dl + bin README.md Changelog.md LICENSE \ + CONTRIBUTING.md Collaborators.md CONTRIBUTORS AUTHORS \ + Makefile MANIFEST.in yt-dlp.1 README.txt completions \ + setup.py setup.cfg yt-dlp yt_dlp requirements.txt \ + devscripts test tox.ini pytest.ini + +AUTHORS: .mailmap + git shortlog -s -n | cut -f2 | sort > AUTHORS + +.mailmap: + git shortlog -s -e -n | awk '!(out[$$NF]++) { $$1="";sub(/^[ \t]+/,""); print}' > .mailmap @@ -33,9 +33,6 @@ To install on Hyperbola run: # OPTIONS -h, --help Print this help text and exit --version Print program version and exit - -U, --update Update this program to latest version. - Make sure that you have sufficient - permissions (run with sudo if needed) -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist diff --git a/bin/youtube-dl b/bin/youtube-dl deleted file mode 100755 index 73bf9b06d..000000000 --- a/bin/youtube-dl +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python - -import hypervideo_dl - -if __name__ == '__main__': - hypervideo_dl.main() diff --git a/bin/yt-dlp b/bin/yt-dlp new file mode 100644 index 000000000..baecdeb0a --- /dev/null +++ b/bin/yt-dlp @@ -0,0 +1,6 @@ +#!/usr/bin/python + +import hypervideo_dl + +if __name__ == '__main__': + hypervideo_dl.main() @@ -10,9 +10,8 @@ run_rebrand() { printf '%b%s%b%s%b\n' '\e[1;32m' '==> ' '\e[0m\033[1m' 'Rebrand...' '\e[m' - mv -T youtube_dl hypervideo_dl - mv bin/youtube-dl bin/hypervideo - mv youtube-dl.plugin.zsh hypervideo.plugin.zsh + mv -T yt_dlp hypervideo_dl + mv bin/yt-dlp bin/hypervideo find . -type f \( \ -iname "*" \ @@ -20,29 +19,28 @@ run_rebrand() ! -iname ".travis.yml" \ ! -iname ".gitlab-ci.yml" \ ! -path "./.git*" \) \ - -exec grep -rIl 'youtube_dl' {} + -exec sed -i 's|youtube_dl|hypervideo_dl|g' {} \; + -exec grep -rIl 'yt_dlp' {} + -exec sed -i 's|yt_dlp|hypervideo_dl|g' {} \; - find . -name "tox.ini" -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; - find . -name "setup.cfg" -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; - find . -name "Makefile" -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; - find . -name "*.in" -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; - find . -name "*.zsh" -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; - find docs -type f -exec sed -i 's|youtube-dl|hypervideo|g' {} \; + find . 
-name "tox.ini" -type f -exec sed -i 's|yt-dlp|hypervideo|g' {} \; + find . -name "setup.cfg" -type f -exec sed -i 's|yt-dlp|hypervideo|g' {} \; + find . -name "Makefile" -type f -exec sed -i 's|yt-dlp|hypervideo|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|yt-dlp|hypervideo|g' {} \; + find . -name "*.in" -type f -exec sed -i 's|yt-dlp|hypervideo|g' {} \; + find docs -type f -exec sed -i 's|yt-dlp|hypervideo|g' {} \; # fixes URLs - find . -name "*.py" -type f -exec sed -i 's|hypervideo+test+video|youtube-dl+test+video|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|https://github.com/ytdl-org/hypervideo/|https://github.com/ytdl-org/youtube-dl/|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|https://github.com/rg3/hypervideo/issues/|https://github.com/rg3/youtube-dl/issues/|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|http://hypervideo.bandcamp.com/|http://youtube-dl.bandcamp.com/|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|http://youtube-dl.bandcamp.com/track/hypervideo-test-song|http://youtube-dl.bandcamp.com/track/youtube-dl-test-song|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|https://www.dropbox.com/s/nelirfsxnmcfbfh/hypervideo|https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|http://8tracks.com/ytdl/hypervideo-test-tracks-a|http://8tracks.com/ytdl/youtube-dl-test-tracks-a|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|http://phihag.de/2014/hypervideo/|http://phihag.de/2014/youtube-dl/|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|https://hypervideo-demo.neocities.org/|https://youtube-dl-demo.neocities.org/|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|https://travis-ci.org/ytdl-org/hypervideo/|https://travis-ci.org/ytdl-org/youtube-dl/|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|https://soundcloud.com/jaimemf/hypervideo-test|https://soundcloud.com/jaimemf/youtube-dl-test|g' {} \; - find . -name "*.py" -type f -exec sed -i 's|http://streamcloud.eu/skp9j99s4bpz/hypervideo_test_video|http://streamcloud.eu/skp9j99s4bpz/youtube_dl_test_video|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|hypervideo+test+video|yt-dlp+test+video|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|https://github.com/ytdl-org/hypervideo/|https://github.com/ytdl-org/yt-dlp/|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|https://github.com/rg3/hypervideo/issues/|https://github.com/rg3/yt-dlp/issues/|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|http://hypervideo.bandcamp.com/|http://yt-dlp.bandcamp.com/|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|http://yt-dlp.bandcamp.com/track/hypervideo-test-song|http://yt-dlp.bandcamp.com/track/yt-dlp-test-song|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|https://www.dropbox.com/s/nelirfsxnmcfbfh/hypervideo|https://www.dropbox.com/s/nelirfsxnmcfbfh/yt-dlp|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|http://8tracks.com/ytdl/hypervideo-test-tracks-a|http://8tracks.com/ytdl/yt-dlp-test-tracks-a|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|http://phihag.de/2014/hypervideo/|http://phihag.de/2014/yt-dlp/|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|https://hypervideo-demo.neocities.org/|https://yt-dlp-demo.neocities.org/|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|https://travis-ci.org/ytdl-org/hypervideo/|https://travis-ci.org/ytdl-org/yt-dlp/|g' {} \; + find . 
-name "*.py" -type f -exec sed -i 's|https://soundcloud.com/jaimemf/hypervideo-test|https://soundcloud.com/jaimemf/yt-dlp-test|g' {} \; + find . -name "*.py" -type f -exec sed -i 's|http://streamcloud.eu/skp9j99s4bpz/hypervideo_test_video|http://streamcloud.eu/skp9j99s4bpz/yt_dlp_test_video|g' {} \; find . -type f \( \ -iname "*" \ @@ -50,7 +48,7 @@ run_rebrand() ! -iname ".travis.yml" \ ! -iname ".gitlab-ci.yml" \ ! -path "./.git*" \) \ - -exec grep -rIl 'YOUTUBE-DL' {} + -exec sed -i 's|YOUTUBE-DL|HYPERVIDEO|g' {} \; + -exec grep -rIl 'YT-DLP' {} + -exec sed -i 's|YT-DLP|HYPERVIDEO|g' {} \; # Set current version sed -i "s|__version.*|__version__ = '${VERSION}'|" hypervideo_dl/version.py @@ -59,7 +57,7 @@ run_rebrand() run_clean() { printf '%b%s%b%s%b\n' '\e[1;32m' '==> ' '\e[0m\033[1m' 'Clean tempfiles...' '\e[m' - rm -rf -- hypervideo.1.temp.md hypervideo.1 hypervideo.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ hypervideo.tar.gz hypervideo.zsh hypervideo.fish hypervideo_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp hypervideo hypervideo.exe + rm -rf -- hypervideo.1.temp.md hypervideo.1 hypervideo.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ hypervideo.tar.gz hypervideo_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp hypervideo hypervideo.exe find . -name "*.pyc" -delete find . -name "*.class" -delete } @@ -87,10 +85,9 @@ run_copy() run_reset() { printf '%b%s%b%s%b\n' '\e[1;32m' '==> ' '\e[0m\033[1m' 'Restore compatibility...' '\e[m' - mv -T hypervideo_dl youtube_dl - mv bin/hypervideo bin/youtube-dl - mv hypervideo.plugin.zsh youtube-dl.plugin.zsh - rm -v hypervideo hypervideo.1 hypervideo.bash-completion hypervideo.fish hypervideo.zsh + mv -T hypervideo_dl yt_dlp + mv bin/hypervideo bin/yt-dlp + rm -v hypervideo hypervideo.1 hypervideo.bash-completion printf '%b%s%b%s%b\n' '\e[1;32m' '==> ' '\e[0m\033[1m' 'Please execute: "git checkout ." 
for complete restore' '\e[m' } diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index 28bd23727..21f52798e 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,4 +1,4 @@ -__youtube_dl() +__yt_dlp() { local cur prev opts fileopts diropts keywords COMPREPLY=() @@ -26,4 +26,4 @@ __youtube_dl() fi } -complete -F __youtube_dl youtube-dl +complete -F __yt_dlp yt-dlp diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py index 3d1391334..46b4b2ff5 100755 --- a/devscripts/bash-completion.py +++ b/devscripts/bash-completion.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals import os @@ -6,9 +6,9 @@ from os.path import dirname as dirn import sys sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) -import youtube_dl +import yt_dlp -BASH_COMPLETION_FILE = "youtube-dl.bash-completion" +BASH_COMPLETION_FILE = "completions/bash/yt-dlp" BASH_COMPLETION_TEMPLATE = "devscripts/bash-completion.in" @@ -26,5 +26,5 @@ def build_completion(opt_parser): f.write(filled_template) -parser = youtube_dl.parseOpts()[0] +parser = yt_dlp.parseOpts()[0] build_completion(parser) diff --git a/devscripts/buildserver.py b/devscripts/buildserver.py index 4a4295ba9..cd544b816 100644 --- a/devscripts/buildserver.py +++ b/devscripts/buildserver.py @@ -1,3 +1,5 @@ +# UNUSED + #!/usr/bin/python3 import argparse @@ -12,7 +14,7 @@ import traceback import os.path sys.path.insert(0, os.path.dirname(os.path.dirname((os.path.abspath(__file__))))) -from youtube_dl.compat import ( +from yt_dlp.compat import ( compat_input, compat_http_server, compat_str, @@ -325,7 +327,7 @@ class YoutubeDLBuilder(object): authorizedUsers = ['fraca7', 'phihag', 'rg3', 'FiloSottile', 'ytdl-org'] def __init__(self, **kwargs): - if self.repoName != 'youtube-dl': + if self.repoName != 'yt-dlp': raise BuildError('Invalid repository "%s"' % self.repoName) if self.user not in self.authorizedUsers: raise HTTPError('Unauthorized user "%s"' % self.user, 401) diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 740f04de0..50f6bebc6 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals """ @@ -15,8 +15,8 @@ import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import gettestcases -from youtube_dl.utils import compat_urllib_parse_urlparse -from youtube_dl.utils import compat_urllib_request +from yt_dlp.utils import compat_urllib_parse_urlparse +from yt_dlp.utils import compat_urllib_request if len(sys.argv) > 1: METHOD = 'LIST' diff --git a/devscripts/fish-completion.in b/devscripts/fish-completion.in index eb79765da..32938fbb4 100644 --- a/devscripts/fish-completion.in +++ b/devscripts/fish-completion.in @@ -2,4 +2,4 @@ {{commands}} -complete --command youtube-dl --arguments ":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" +complete --command yt-dlp --arguments ":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" diff --git a/devscripts/fish-completion.py b/devscripts/fish-completion.py index 51d19dd33..fb45e0280 100755 --- a/devscripts/fish-completion.py +++ b/devscripts/fish-completion.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals import optparse @@ -7,13 +7,14 @@ from os.path import dirname as dirn import sys sys.path.insert(0, 
dirn(dirn((os.path.abspath(__file__))))) -import youtube_dl -from youtube_dl.utils import shell_quote +import yt_dlp +from yt_dlp.utils import shell_quote -FISH_COMPLETION_FILE = 'youtube-dl.fish' +FISH_COMPLETION_FILE = 'completions/fish/yt-dlp.fish' FISH_COMPLETION_TEMPLATE = 'devscripts/fish-completion.in' EXTRA_ARGS = { + 'remux-video': ['--arguments', 'mp4 mkv', '--exclusive'], 'recode-video': ['--arguments', 'mp4 flv ogg webm mkv', '--exclusive'], # Options that need a file parameter @@ -30,7 +31,7 @@ def build_completion(opt_parser): for group in opt_parser.option_groups: for option in group.option_list: long_option = option.get_opt_string().strip('-') - complete_cmd = ['complete', '--command', 'youtube-dl', '--long-option', long_option] + complete_cmd = ['complete', '--command', 'yt-dlp', '--long-option', long_option] if option._short_opts: complete_cmd += ['--short-option', option._short_opts[0].strip('-')] if option.help != optparse.SUPPRESS_HELP: @@ -45,5 +46,5 @@ def build_completion(opt_parser): f.write(filled_template) -parser = youtube_dl.parseOpts()[0] +parser = yt_dlp.parseOpts()[0] build_completion(parser) diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py index e3df42cc2..0979eee5b 100644 --- a/devscripts/generate_aes_testdata.py +++ b/devscripts/generate_aes_testdata.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import unicode_literals import codecs @@ -7,8 +8,8 @@ import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import intlist_to_bytes -from youtube_dl.aes import aes_encrypt, key_expansion +from yt_dlp.utils import intlist_to_bytes +from yt_dlp.aes import aes_encrypt, key_expansion secret_msg = b'Secret message goes here' diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index c4e5fc1f4..da89e070d 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -1,19 +1,31 @@ # coding: utf-8 -from __future__ import unicode_literals - import re +from ..utils import bug_reports_message, write_string + + +class LazyLoadMetaClass(type): + def __getattr__(cls, name): + if '_real_class' not in cls.__dict__: + write_string( + f'WARNING: Falling back to normal extractor since lazy extractor ' + f'{cls.__name__} does not have attribute {name}{bug_reports_message()}') + return getattr(cls._get_real_class(), name) + -class LazyLoadExtractor(object): +class LazyLoadExtractor(metaclass=LazyLoadMetaClass): _module = None + _WORKING = True @classmethod - def ie_key(cls): - return cls.__name__[:-2] + def _get_real_class(cls): + if '_real_class' not in cls.__dict__: + mod = __import__(cls._module, fromlist=(cls.__name__,)) + cls._real_class = getattr(mod, cls.__name__) + return cls._real_class def __new__(cls, *args, **kwargs): - mod = __import__(cls._module, fromlist=(cls.__name__,)) - real_cls = getattr(mod, cls.__name__) + real_cls = cls._get_real_class() instance = real_cls.__new__(real_cls) instance.__init__(*args, **kwargs) return instance diff --git a/devscripts/logo.ico b/devscripts/logo.ico Binary files differnew file mode 100644 index 000000000..5503a4350 --- /dev/null +++ b/devscripts/logo.ico diff --git a/devscripts/make_contributing.py b/devscripts/make_contributing.py index 226d1a5d6..6b1b8219c 100755 --- a/devscripts/make_contributing.py +++ b/devscripts/make_contributing.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals import io @@ -7,6 
+7,8 @@ import re def main(): + return # This is unused in yt-dlp + parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') options, args = parser.parse_args() if len(args) != 2: @@ -20,8 +22,7 @@ def main(): bug_text = re.search( r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1) dev_text = re.search( - r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING YOUTUBE-DL', - readme).group(1) + r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING yt-dlp', readme).group(1) out = bug_text + dev_text diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 878ae72b1..427045b98 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import unicode_literals, print_function from inspect import getsource @@ -6,27 +7,35 @@ import os from os.path import dirname as dirn import sys -print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) - sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) lazy_extractors_filename = sys.argv[1] if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) -from youtube_dl.extractor import _ALL_CLASSES -from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor +# Block plugins from loading +plugins_dirname = 'ytdlp_plugins' +plugins_blocked_dirname = 'ytdlp_plugins_blocked' +if os.path.exists(plugins_dirname): + os.rename(plugins_dirname, plugins_blocked_dirname) + +from yt_dlp.extractor import _ALL_CLASSES +from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor + +if os.path.exists(plugins_blocked_dirname): + os.rename(plugins_blocked_dirname, plugins_dirname) with open('devscripts/lazy_load_template.py', 'rt') as f: module_template = f.read() +CLASS_PROPERTIES = ['ie_key', 'working', '_match_valid_url', 'suitable', '_match_id', 'get_temp_id'] module_contents = [ - module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', - 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] + module_template, + *[getsource(getattr(InfoExtractor, k)) for k in CLASS_PROPERTIES], + '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] ie_template = ''' class {name}({bases}): - _VALID_URL = {valid_url!r} _module = '{module}' ''' @@ -47,14 +56,17 @@ def get_base_name(base): def build_lazy_ie(ie, name): - valid_url = getattr(ie, '_VALID_URL', None) s = ie_template.format( name=name, bases=', '.join(map(get_base_name, ie.__bases__)), - valid_url=valid_url, module=ie.__module__) + valid_url = getattr(ie, '_VALID_URL', None) + if valid_url: + s += f' _VALID_URL = {valid_url!r}\n' + if not ie._WORKING: + s += ' _WORKING = False\n' if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: - s += '\n' + getsource(ie.suitable) + s += f'\n{getsource(ie.suitable)}' if hasattr(ie, '_make_valid_url'): # search extractors s += make_valid_template.format(valid_url=ie._make_valid_url()) @@ -92,7 +104,7 @@ for ie in ordered_cls: names.append(name) module_contents.append( - '_ALL_CLASSES = [{0}]'.format(', '.join(names))) + '\n_ALL_CLASSES = [{0}]'.format(', '.join(names))) module_src = '\n'.join(module_contents) + '\n' diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index 8fbce0796..3f56af744 100755 --- a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 + +# yt-dlp --help | make_readme.py +# This must be run in a console of correct width + from __future__ 
import unicode_literals import io @@ -13,12 +18,12 @@ if isinstance(helptext, bytes): with io.open(README_FILE, encoding='utf-8') as f: oldreadme = f.read() -header = oldreadme[:oldreadme.index('# OPTIONS')] +header = oldreadme[:oldreadme.index('## General Options:')] footer = oldreadme[oldreadme.index('# CONFIGURATION'):] -options = helptext[helptext.index(' General Options:') + 19:] +options = helptext[helptext.index(' General Options:'):] options = re.sub(r'(?m)^ (\w.+)$', r'## \1', options) -options = '# OPTIONS\n' + options + '\n' +options = options + '\n' with io.open(README_FILE, 'w', encoding='utf-8') as f: f.write(header) diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 764795bc5..17a34843f 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals import io @@ -7,10 +7,10 @@ import os import sys -# Import youtube_dl +# Import yt_dlp ROOT_DIR = os.path.join(os.path.dirname(__file__), '..') sys.path.insert(0, ROOT_DIR) -import youtube_dl +import yt_dlp def main(): @@ -33,7 +33,7 @@ def main(): ie_md += ' (Currently broken)' yield ie_md - ies = sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()) + ies = sorted(yt_dlp.gen_extractors(), key=lambda i: i.IE_NAME.lower()) out = '# Supported sites\n' + ''.join( ' - ' + md + '\n' for md in gen_ies_md(ies)) diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 76bf873e1..485b39e9f 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import unicode_literals import io @@ -8,7 +9,7 @@ import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') -PREFIX = r'''%YOUTUBE-DL(1) +PREFIX = r'''%yt-dlp(1) # NAME @@ -16,7 +17,7 @@ youtube\-dl \- download videos from youtube.com or other video platforms # SYNOPSIS -**youtube-dl** \[OPTIONS\] URL [URL...] +**yt-dlp** \[OPTIONS\] URL [URL...] ''' @@ -33,7 +34,7 @@ def main(): readme = f.read() readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) - readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) + readme = re.sub(r'\s+yt-dlp \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) readme = PREFIX + readme readme = filter_options(readme) diff --git a/devscripts/run_tests.bat b/devscripts/run_tests.bat index 01a79b6dd..b8bb393d9 100644 --- a/devscripts/run_tests.bat +++ b/devscripts/run_tests.bat @@ -1,17 +1,16 @@ +@setlocal @echo off +cd /d %~dp0.. -rem Keep this list in sync with the `offlinetest` target in Makefile -set DOWNLOAD_TESTS="age_restriction^|download^|socks^|subtitles^|write_annotations^|youtube_lists^|youtube_signature" - -if "%YTDL_TEST_SET%" == "core" ( - set test_set="-I test_("%DOWNLOAD_TESTS%")\.py" - set multiprocess_args="" -) else if "%YTDL_TEST_SET%" == "download" ( - set test_set="-I test_(?!"%DOWNLOAD_TESTS%").+\.py" - set multiprocess_args="--processes=4 --process-timeout=540" +if ["%~1"]==[""] ( + set "test_set="test"" +) else if ["%~1"]==["core"] ( + set "test_set="-m not download"" +) else if ["%~1"]==["download"] ( + set "test_set="-m "download"" ) else ( - echo YTDL_TEST_SET is not set or invalid + echo.Invalid test type "%~1". 
Use "core" ^| "download" exit /b 1 ) -nosetests test --verbose %test_set:"=% %multiprocess_args:"=% +pytest %test_set% diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh index b8f48b9df..c9a75ba00 100755 --- a/devscripts/run_tests.sh +++ b/devscripts/run_tests.sh @@ -1,22 +1,14 @@ -#!/bin/bash - -# Keep this list in sync with the `offlinetest` target in Makefile -DOWNLOAD_TESTS="age_restriction|download|socks|subtitles|write_annotations|youtube_lists|youtube_signature" - -test_set="" -multiprocess_args="" - -case "$YTDL_TEST_SET" in - core) - test_set="-I test_($DOWNLOAD_TESTS)\.py" - ;; - download) - test_set="-I test_(?!$DOWNLOAD_TESTS).+\.py" - multiprocess_args="--processes=4 --process-timeout=540" - ;; - *) - break - ;; -esac - -nosetests test --verbose $test_set $multiprocess_args +#!/bin/sh + +if [ -z $1 ]; then + test_set='test' +elif [ $1 = 'core' ]; then + test_set="-m not download" +elif [ $1 = 'download' ]; then + test_set="-m download" +else + echo 'Invalid test type "'$1'". Use "core" | "download"' + exit 1 +fi + +python3 -m pytest "$test_set" diff --git a/devscripts/zsh-completion.in b/devscripts/zsh-completion.in index b394a1ae7..9117d339e 100644 --- a/devscripts/zsh-completion.in +++ b/devscripts/zsh-completion.in @@ -1,6 +1,6 @@ -#compdef youtube-dl +#compdef yt-dlp -__youtube_dl() { +__yt_dlp() { local curcontext="$curcontext" fileopts diropts cur prev typeset -A opt_args fileopts="{{fileopts}}" @@ -16,6 +16,8 @@ __youtube_dl() { _path_files elif [[ ${prev} =~ ${diropts} ]]; then _path_files -/ + elif [[ ${prev} == "--remux-video" ]]; then + _arguments '*: :(mp4 mkv)' elif [[ ${prev} == "--recode-video" ]]; then _arguments '*: :(mp4 flv ogg webm mkv)' else @@ -25,4 +27,4 @@ __youtube_dl() { esac } -__youtube_dl
\ No newline at end of file +__yt_dlp
\ No newline at end of file diff --git a/devscripts/zsh-completion.py b/devscripts/zsh-completion.py index 60aaf76cc..780df0de6 100755 --- a/devscripts/zsh-completion.py +++ b/devscripts/zsh-completion.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals import os @@ -6,9 +6,9 @@ from os.path import dirname as dirn import sys sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) -import youtube_dl +import yt_dlp -ZSH_COMPLETION_FILE = "youtube-dl.zsh" +ZSH_COMPLETION_FILE = "completions/zsh/_yt-dlp" ZSH_COMPLETION_TEMPLATE = "devscripts/zsh-completion.in" @@ -45,5 +45,5 @@ def build_completion(opt_parser): f.write(template) -parser = youtube_dl.parseOpts()[0] +parser = yt_dlp.parseOpts()[0] build_completion(parser) diff --git a/docs/Changelog.md b/docs/Changelog.md new file mode 100644 index 000000000..99de25fb1 --- /dev/null +++ b/docs/Changelog.md @@ -0,0 +1,5 @@ +--- +orphan: true +--- +```{include} ../Changelog.md +``` diff --git a/docs/Collaborators.md b/docs/Collaborators.md new file mode 100644 index 000000000..5f493d814 --- /dev/null +++ b/docs/Collaborators.md @@ -0,0 +1,5 @@ +--- +orphan: true +--- +```{include} ../Collaborators.md +``` diff --git a/docs/LICENSE.md b/docs/LICENSE.md new file mode 100644 index 000000000..8521669f8 --- /dev/null +++ b/docs/LICENSE.md @@ -0,0 +1,6 @@ +--- +orphan: true +--- +# LICENSE +```{include} ../LICENSE +``` diff --git a/docs/Makefile b/docs/Makefile index 712218045..1a8e3cb1c 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -85,17 +85,17 @@ qthelp: @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/youtube-dl.qhcp" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/yt-dlp.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/youtube-dl.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/yt-dlp.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/youtube-dl" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/youtube-dl" + @echo "# mkdir -p $$HOME/.local/share/devhelp/yt-dlp" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/yt-dlp" @echo "# devhelp" epub: diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..451bedaec --- /dev/null +++ b/docs/README.md @@ -0,0 +1,2 @@ +```{include} ../README.md +``` diff --git a/docs/conf.py b/docs/conf.py index 0aaf1b8fc..c4010bbc7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,21 +1,12 @@ # coding: utf-8 # -# youtube-dl documentation build configuration file, created by -# sphinx-quickstart on Fri Mar 14 21:05:43 2014. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. 
+# yt-dlp documentation build configuration file import sys import os -# Allows to import youtube_dl -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Allows to import yt-dlp +sys.path.insert(0, os.path.abspath('..')) # -- General configuration ------------------------------------------------ @@ -23,28 +14,26 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', + 'myst_parser', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -# The suffix of source filenames. -source_suffix = '.rst' - # The master toctree document. -master_doc = 'index' +master_doc = 'README' # General information about the project. -project = u'youtube-dl' -copyright = u'2014, Ricardo Garcia Gonzalez' +project = u'yt-dlp' +author = u'yt-dlp' +copyright = u'UNLICENSE' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -from youtube_dl.version import __version__ +from yt_dlp.version import __version__ version = __version__ # The full version, including alpha/beta/rc tags. release = version @@ -62,10 +51,18 @@ pygments_style = 'sphinx' # a list of builtin themes. html_theme = 'default' +# Disable highlights +highlight_language = 'none' + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] -# Output file base name for HTML help builder. -htmlhelp_basename = 'youtube-dldoc' +# Enable heading anchors +myst_heading_anchors = 4 + +# Suppress heading warnings +suppress_warnings = [ + 'myst.header', +] diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index b746ff95b..000000000 --- a/docs/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -Welcome to youtube-dl's documentation! -====================================== - -*youtube-dl* is a command-line program to download videos from YouTube.com and more sites. -It can also be used in Python code. - -Developer guide ---------------- - -This section contains information for using *youtube-dl* from Python programs. - -.. toctree:: - :maxdepth: 2 - - module_guide - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/docs/module_guide.rst b/docs/module_guide.rst deleted file mode 100644 index 03d72882e..000000000 --- a/docs/module_guide.rst +++ /dev/null @@ -1,67 +0,0 @@ -Using the ``youtube_dl`` module -=============================== - -When using the ``youtube_dl`` module, you start by creating an instance of :class:`YoutubeDL` and adding all the available extractors: - -.. code-block:: python - - >>> from youtube_dl import YoutubeDL - >>> ydl = YoutubeDL() - >>> ydl.add_default_info_extractors() - -Extracting video information ----------------------------- - -You use the :meth:`YoutubeDL.extract_info` method for getting the video information, which returns a dictionary: - -.. 
code-block:: python - - >>> info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False) - [youtube] Setting language - [youtube] BaW_jenozKc: Downloading webpage - [youtube] BaW_jenozKc: Downloading video info webpage - [youtube] BaW_jenozKc: Extracting video information - >>> info['title'] - 'youtube-dl test video "\'/\\ä↭𝕐' - >>> info['height'], info['width'] - (720, 1280) - -If you want to download or play the video you can get its url: - -.. code-block:: python - - >>> info['url'] - 'https://...' - -Extracting playlist information -------------------------------- - -The playlist information is extracted in a similar way, but the dictionary is a bit different: - -.. code-block:: python - - >>> playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False) - [TED] open_source_open_world: Downloading playlist webpage - ... - >>> playlist['title'] - 'Open-source, open world' - - - -You can access the videos in the playlist with the ``entries`` field: - -.. code-block:: python - - >>> for video in playlist['entries']: - ... print('Video #%d: %s' % (video['playlist_index'], video['title'])) - - Video #1: How Arduino is open-sourcing imagination - Video #2: The year open data went worldwide - Video #3: Massive-scale online collaboration - Video #4: The art of asking - Video #5: How cognitive surplus will change the world - Video #6: The birth of Wikipedia - Video #7: Coding a better government - Video #8: The era of open innovation - Video #9: The currency of the new economy is trust - diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..f0694bdc0 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +myst-parser diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ed0d5e9d9..55c023415 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,1228 +1,5 @@ -# Supported sites - - **1tv**: Первый канал - - **20min** - - **220.ro** - - **23video** - - **247sports** - - **24video** - - **3qsdn**: 3Q SDN - - **3sat** - - **4tube** - - **56.com** - - **5min** - - **6play** - - **7plus** - - **8tracks** - - **91porn** - - **9c9media** - - **9gag** - - **9now.com.au** - - **abc.net.au** - - **abc.net.au:iview** - - **abcnews** - - **abcnews:video** - - **abcotvs**: ABC Owned Television Stations - - **abcotvs:clips** - - **AcademicEarth:Course** - - **acast** - - **acast:channel** - - **ADN**: Anime Digital Network - - **AdobeConnect** - - **adobetv** - - **adobetv:channel** - - **adobetv:embed** - - **adobetv:show** - - **adobetv:video** - - **AdultSwim** - - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - - **aenetworks:collection** - - **aenetworks:show** - - **afreecatv**: afreecatv.com - - **AirMozilla** - - **AliExpressLive** - - **AlJazeera** - - **Allocine** - - **AlphaPorno** - - **Amara** - - **AMCNetworks** - - **AmericasTestKitchen** - - **AmericasTestKitchenSeason** - - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - - **AnimeOnDemand** - - **Anvato** - - **aol.com**: Yahoo screen and movies - - **APA** - - **Aparat** - - **AppleConnect** - - **AppleDaily**: 臺灣蘋果日報 - - **ApplePodcasts** - - **appletrailers** - - **appletrailers:section** - - **archive.org**: archive.org videos - - **ArcPublishing** - - **ARD** - - **ARD:mediathek** - - **ARDBetaMediathek** - - **Arkena** - - **arte.sky.it** - - **ArteTV** - - **ArteTVEmbed** - - **ArteTVPlaylist** - - **AsianCrush** - - 
**AsianCrushPlaylist** - - **AtresPlayer** - - **ATTTechChannel** - - **ATVAt** - - **AudiMedia** - - **AudioBoom** - - **audiomack** - - **audiomack:album** - - **AWAAN** - - **awaan:live** - - **awaan:season** - - **awaan:video** - - **AZMedien**: AZ Medien videos - - **BaiduVideo**: 百度视频 - - **bandaichannel** - - **Bandcamp** - - **Bandcamp:album** - - **Bandcamp:weekly** - - **bangumi.bilibili.com**: BiliBili番剧 - - **bbc**: BBC - - **bbc.co.uk**: BBC iPlayer - - **bbc.co.uk:article**: BBC articles - - **bbc.co.uk:iplayer:episodes** - - **bbc.co.uk:iplayer:group** - - **bbc.co.uk:playlist** - - **BBVTV** - - **Beatport** - - **Beeg** - - **BehindKink** - - **Bellator** - - **BellMedia** - - **Bet** - - **bfi:player** - - **bfmtv** - - **bfmtv:article** - - **bfmtv:live** - - **BibelTV** - - **Bigflix** - - **Bild**: Bild.de - - **BiliBili** - - **BilibiliAudio** - - **BilibiliAudioAlbum** - - **BiliBiliPlayer** - - **BioBioChileTV** - - **Biography** - - **BIQLE** - - **BitChute** - - **BitChuteChannel** - - **BleacherReport** - - **BleacherReportCMS** - - **Bloomberg** - - **BokeCC** - - **BongaCams** - - **BostonGlobe** - - **Box** - - **Bpb**: Bundeszentrale für politische Bildung - - **BR**: Bayerischer Rundfunk - - **BravoTV** - - **Break** - - **brightcove:legacy** - - **brightcove:new** - - **BRMediathek**: Bayerischer Rundfunk Mediathek - - **bt:article**: Bergens Tidende Articles - - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - - **BusinessInsider** - - **BuzzFeed** - - **BYUtv** - - **Camdemy** - - **CamdemyFolder** - - **CamModels** - - **CamTube** - - **CamWithHer** - - **canalc2.tv** - - **Canalplus**: mycanal.fr and piwiplus.fr - - **Canvas** - - **CanvasEen**: canvas.be and een.be - - **CarambaTV** - - **CarambaTVPage** - - **CartoonNetwork** - - **cbc.ca** - - **cbc.ca:olympics** - - **cbc.ca:player** - - **cbc.ca:watch** - - **cbc.ca:watch:video** - - **CBS** - - **CBSInteractive** - - **CBSLocal** - - **CBSLocalArticle** - - **cbsnews**: CBS News - - **cbsnews:embed** - - **cbsnews:livevideo**: CBS News Live Videos - - **cbssports** - - **cbssports:embed** - - **CCMA** - - **CCTV**: 央视网 - - **CDA** - - **CeskaTelevize** - - **CeskaTelevizePorady** - - **channel9**: Channel 9 - - **CharlieRose** - - **Chaturbate** - - **Chilloutzone** - - **chirbit** - - **chirbit:profile** - - **cielotv.it** - - **Cinchcast** - - **Cinemax** - - **CiscoLiveSearch** - - **CiscoLiveSession** - - **CJSW** - - **cliphunter** - - **Clippit** - - **ClipRs** - - **Clipsyndicate** - - **CloserToTruth** - - **CloudflareStream** - - **Cloudy** - - **Clubic** - - **Clyp** - - **cmt.com** - - **CNBC** - - **CNBCVideo** - - **CNN** - - **CNNArticle** - - **CNNBlogs** - - **ComedyCentral** - - **ComedyCentralTV** - - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - - **CONtv** - - **Corus** - - **Coub** - - **Cracked** - - **Crackle** - - **CrooksAndLiars** - - **crunchyroll** - - **crunchyroll:playlist** - - **CSpan**: C-SPAN - - **CtsNews**: 華視新聞 - - **CTV** - - **CTVNews** - - **cu.ntv.co.jp**: Nippon Television Network - - **Culturebox** - - **CultureUnplugged** - - **curiositystream** - - **curiositystream:collection** - - **CWTV** - - **DagelijkseKost**: dagelijksekost.een.be - - **DailyMail** - - **dailymotion** - - **dailymotion:playlist** - - **dailymotion:user** - - 
**daum.net** - - **daum.net:clip** - - **daum.net:playlist** - - **daum.net:user** - - **DBTV** - - **DctpTv** - - **DeezerPlaylist** - - **defense.gouv.fr** - - **democracynow** - - **DHM**: Filmarchiv - Deutsches Historisches Museum - - **Digg** - - **DigitallySpeaking** - - **Digiteka** - - **Discovery** - - **DiscoveryGo** - - **DiscoveryGoPlaylist** - - **DiscoveryNetworksDe** - - **DiscoveryPlus** - - **DiscoveryVR** - - **Disney** - - **dlive:stream** - - **dlive:vod** - - **Dotsub** - - **DouyuShow** - - **DouyuTV**: 斗鱼 - - **DPlay** - - **DRBonanza** - - **Dropbox** - - **DrTuber** - - **drtv** - - **drtv:live** - - **DTube** - - **Dumpert** - - **dvtv**: http://video.aktualne.cz/ - - **dw** - - **dw:article** - - **EaglePlatform** - - **EbaumsWorld** - - **EchoMsk** - - **egghead:course**: egghead.io course - - **egghead:lesson**: egghead.io lesson - - **ehftv** - - **eHow** - - **EinsUndEinsTV** - - **Einthusan** - - **eitb.tv** - - **EllenTube** - - **EllenTubePlaylist** - - **EllenTubeVideo** - - **ElPais**: El País - - **Embedly** - - **EMPFlix** - - **Engadget** - - **Eporner** - - **EroProfile** - - **Escapist** - - **ESPN** - - **ESPNArticle** - - **EsriVideo** - - **Europa** - - **EWETV** - - **ExpoTV** - - **Expressen** - - **ExtremeTube** - - **EyedoTV** - - **facebook** - - **FacebookPluginsVideo** - - **faz.net** - - **fc2** - - **fc2:embed** - - **Fczenit** - - **filmon** - - **filmon:channel** - - **Filmweb** - - **FiveThirtyEight** - - **FiveTV** - - **Flickr** - - **Folketinget**: Folketinget (ft.dk; Danish parliament) - - **FootyRoom** - - **Formula1** - - **FOX** - - **FOX9** - - **FOX9News** - - **Foxgay** - - **foxnews**: Fox News and Fox Business Video - - **foxnews:article** - - **FoxSports** - - **france2.fr:generation-what** - - **FranceCulture** - - **FranceInter** - - **FranceTV** - - **FranceTVEmbed** - - **francetvinfo.fr** - - **FranceTVJeunesse** - - **FranceTVSite** - - **Freesound** - - **freespeech.org** - - **FreshLive** - - **FrontendMasters** - - **FrontendMastersCourse** - - **FrontendMastersLesson** - - **FujiTVFODPlus7** - - **Funimation** - - **Funk** - - **Fusion** - - **Fux** - - **Gaia** - - **GameInformer** - - **GameSpot** - - **GameStar** - - **Gaskrank** - - **Gazeta** - - **GDCVault** - - **GediDigital** - - **generic**: Generic downloader that works on some sites - - **Gfycat** - - **GiantBomb** - - **Giga** - - **GlattvisionTV** - - **Glide**: Glide mobile video messages (glide.me) - - **Globo** - - **GloboArticle** - - **Go** - - **GodTube** - - **Golem** - - **google:podcasts** - - **google:podcasts:feed** - - **GoogleDrive** - - **Goshgay** - - **GPUTechConf** - - **Groupon** - - **hbo** - - **HearThisAt** - - **Heise** - - **HellPorno** - - **Helsinki**: helsinki.fi - - **HentaiStigma** - - **hetklokhuis** - - **hgtv.com:show** - - **HGTVDe** - - **HiDive** - - **HistoricFilms** - - **history:player** - - **history:topic**: History.com Topic - - **hitbox** - - **hitbox:live** - - **HitRecord** - - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - - **HornBunny** - - **HotNewHipHop** - - **hotstar** - - **hotstar:playlist** - - **Howcast** - - **HowStuffWorks** - - **HRTi** - - **HRTiPlaylist** - - **Huajiao**: 花椒直播 - - **HuffPost**: Huffington Post - - **Hungama** - - **HungamaSong** - - **Hypem** - - **ign.com** - - **IGNArticle** - - **IGNVideo** - - **IHeartRadio** - - **iheartradio:podcast** - - **imdb**: Internet Movie Database trailers - - **imdb:list**: Internet Movie Database lists - - 
**Imgur** - - **imgur:album** - - **imgur:gallery** - - **Ina** - - **Inc** - - **IndavideoEmbed** - - **InfoQ** - - **Instagram** - - **instagram:tag**: Instagram hashtag search - - **instagram:user**: Instagram user profile - - **Internazionale** - - **InternetVideoArchive** - - **IPrima** - - **iqiyi**: 爱奇艺 - - **Ir90Tv** - - **ITTF** - - **ITV** - - **ITVBTCC** - - **ivi**: ivi.ru - - **ivi:compilation**: ivi.ru compilations - - **ivideon**: Ivideon TV - - **Iwara** - - **Izlesene** - - **Jamendo** - - **JamendoAlbum** - - **JeuxVideo** - - **Joj** - - **Jove** - - **JWPlatform** - - **Kakao** - - **Kaltura** - - **Kankan** - - **Karaoketv** - - **KarriereVideos** - - **Katsomo** - - **KeezMovies** - - **Ketnet** - - **khanacademy** - - **khanacademy:unit** - - **KickStarter** - - **KinjaEmbed** - - **KinoPoisk** - - **KonserthusetPlay** - - **KrasView**: Красвью - - **Ku6** - - **KUSI** - - **kuwo:album**: 酷我音乐 - 专辑 - - **kuwo:category**: 酷我音乐 - 分类 - - **kuwo:chart**: 酷我音乐 - 排行榜 - - **kuwo:mv**: 酷我音乐 - MV - - **kuwo:singer**: 酷我音乐 - 歌手 - - **kuwo:song**: 酷我音乐 - - **la7.it** - - **laola1tv** - - **laola1tv:embed** - - **lbry** - - **lbry:channel** - - **LCI** - - **Lcp** - - **LcpPlay** - - **Le**: 乐视网 - - **Lecture2Go** - - **Lecturio** - - **LecturioCourse** - - **LecturioDeCourse** - - **LEGO** - - **Lemonde** - - **Lenta** - - **LePlaylist** - - **LetvCloud**: 乐视云 - - **Libsyn** - - **life**: Life.ru - - **life:embed** - - **limelight** - - **limelight:channel** - - **limelight:channel_list** - - **LineLive** - - **LineLiveChannel** - - **LineTV** - - **linkedin:learning** - - **linkedin:learning:course** - - **LinuxAcademy** - - **LiTV** - - **LiveJournal** - - **LiveLeak** - - **LiveLeakEmbed** - - **livestream** - - **livestream:original** - - **LnkGo** - - **loc**: Library of Congress - - **LocalNews8** - - **LoveHomePorn** - - **lrt.lt** - - **lynda**: lynda.com videos - - **lynda:course**: lynda.com online courses - - **m6** - - **mailru**: Видео@Mail.Ru - - **mailru:music**: Музыка@Mail.Ru - - **mailru:music:search**: Музыка@Mail.Ru - - **MallTV** - - **mangomolo:live** - - **mangomolo:video** - - **ManyVids** - - **MaoriTV** - - **Markiza** - - **MarkizaPage** - - **massengeschmack.tv** - - **MatchTV** - - **MDR**: MDR.DE and KiKA - - **MedalTV** - - **media.ccc.de** - - **media.ccc.de:lists** - - **Medialaan** - - **Mediaset** - - **Mediasite** - - **MediasiteCatalog** - - **MediasiteNamedCatalog** - - **Medici** - - **megaphone.fm**: megaphone.fm embedded players - - **Meipai**: 美拍 - - **MelonVOD** - - **META** - - **metacafe** - - **Metacritic** - - **mewatch** - - **Mgoon** - - **MGTV**: 芒果TV - - **MiaoPai** - - **minds** - - **minds:channel** - - **minds:group** - - **MinistryGrid** - - **Minoto** - - **miomio.tv** - - **MiTele**: mitele.es - - **mixcloud** - - **mixcloud:playlist** - - **mixcloud:user** - - **MLB** - - **MLBVideo** - - **Mnet** - - **MNetTV** - - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - - **Mofosex** - - **MofosexEmbed** - - **Mojvideo** - - **Morningstar**: morningstar.com - - **Motherless** - - **MotherlessGroup** - - **Motorsport**: motorsport.com - - **MovieClips** - - **MovieFap** - - **Moviezine** - - **MovingImage** - - **MSN** - - **mtg**: MTG services - - **mtv** - - **mtv.de** - - **mtv:video** - - **mtvjapan** - - **mtvservices:embedded** - - **MTVUutisetArticle** - - **MuenchenTV**: münchen.tv - - **mva**: Microsoft Virtual Academy videos - - **mva:course**: Microsoft Virtual Academy courses 
- - **Mwave** - - **MwaveMeetGreet** - - **MyChannels** - - **MySpace** - - **MySpace:album** - - **MySpass** - - **Myvi** - - **MyVidster** - - **MyviEmbed** - - **MyVisionTV** - - **n-tv.de** - - **natgeo:video** - - **NationalGeographicTV** - - **Naver** - - **NBA** - - **nba:watch** - - **nba:watch:collection** - - **NBAChannel** - - **NBAEmbed** - - **NBAWatchEmbed** - - **NBC** - - **NBCNews** - - **nbcolympics** - - **nbcolympics:stream** - - **NBCSports** - - **NBCSportsStream** - - **NBCSportsVPlayer** - - **ndr**: NDR.de - Norddeutscher Rundfunk - - **ndr:embed** - - **ndr:embed:base** - - **NDTV** - - **NerdCubedFeed** - - **netease:album**: 网易云音乐 - 专辑 - - **netease:djradio**: 网易云音乐 - 电台 - - **netease:mv**: 网易云音乐 - MV - - **netease:playlist**: 网易云音乐 - 歌单 - - **netease:program**: 网易云音乐 - 电台节目 - - **netease:singer**: 网易云音乐 - 歌手 - - **netease:song**: 网易云音乐 - - **NetPlus** - - **Netzkino** - - **Newgrounds** - - **NewgroundsPlaylist** - - **Newstube** - - **NextMedia**: 蘋果日報 - - **NextMediaActionNews**: 蘋果日報 - 動新聞 - - **NextTV**: 壹電視 - - **Nexx** - - **NexxEmbed** - - **nfl.com** (Currently broken) - - **nfl.com:article** (Currently broken) - - **NhkVod** - - **NhkVodProgram** - - **nhl.com** - - **nick.com** - - **nick.de** - - **nickelodeon:br** - - **nickelodeonru** - - **nicknight** - - **niconico**: ニコニコ動画 - - **NiconicoPlaylist** - - **Nintendo** - - **njoy**: N-JOY - - **njoy:embed** - - **NJPWWorld**: 新日本プロレスワールド - - **NobelPrize** - - **NonkTube** - - **Noovo** - - **Normalboots** - - **NosVideo** - - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - - **NovaEmbed** - - **nowness** - - **nowness:playlist** - - **nowness:series** - - **Noz** - - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - - **npo.nl:live** - - **npo.nl:radio** - - **npo.nl:radio:fragment** - - **Npr** - - **NRK** - - **NRKPlaylist** - - **NRKRadioPodkast** - - **NRKSkole**: NRK Skole - - **NRKTV**: NRK TV and NRK Radio - - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte - - **NRKTVEpisode** - - **NRKTVEpisodes** - - **NRKTVSeason** - - **NRKTVSeries** - - **NRLTV** - - **ntv.ru** - - **Nuvid** - - **NYTimes** - - **NYTimesArticle** - - **NYTimesCooking** - - **NZZ** - - **ocw.mit.edu** - - **OdaTV** - - **Odnoklassniki** - - **OktoberfestTV** - - **OnDemandKorea** - - **onet.pl** - - **onet.tv** - - **onet.tv:channel** - - **OnetMVP** - - **OnionStudios** - - **Ooyala** - - **OoyalaExternal** - - **OraTV** - - **orf:burgenland**: Radio Burgenland - - **orf:fm4**: radio FM4 - - **orf:fm4:story**: fm4.orf.at stories - - **orf:iptv**: iptv.ORF.at - - **orf:kaernten**: Radio Kärnten - - **orf:noe**: Radio Niederösterreich - - **orf:oberoesterreich**: Radio Oberösterreich - - **orf:oe1**: Radio Österreich 1 - - **orf:oe3**: Radio Österreich 3 - - **orf:salzburg**: Radio Salzburg - - **orf:steiermark**: Radio Steiermark - - **orf:tirol**: Radio Tirol - - **orf:tvthek**: ORF TVthek - - **orf:vorarlberg**: Radio Vorarlberg - - **orf:wien**: Radio Wien - - **OsnatelTV** - - **OutsideTV** - - **PacktPub** - - **PacktPubCourse** - - **PalcoMP3:artist** - - **PalcoMP3:song** - - **PalcoMP3:video** - - **pandora.tv**: 판도라TV - - **ParamountNetwork** - - **parliamentlive.tv**: UK parliament videos - - **Patreon** - - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), 
WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - - **PearVideo** - - **PeerTube** - - **People** - - **PerformGroup** - - **periscope**: Periscope - - **periscope:user**: Periscope user videos - - **PhilharmonieDeParis**: Philharmonie de Paris - - **phoenix.de** - - **Photobucket** - - **Picarto** - - **PicartoVod** - - **Piksel** - - **Pinkbike** - - **Pinterest** - - **PinterestCollection** - - **Pladform** - - **Platzi** - - **PlatziCourse** - - **play.fm** - - **player.sky.it** - - **PlayPlusTV** - - **PlayStuff** - - **PlaysTV** - - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - - **Playvid** - - **Playwire** - - **pluralsight** - - 
**pluralsight:course** - - **podomatic** - - **Pokemon** - - **PolskieRadio** - - **PolskieRadioCategory** - - **Popcorntimes** - - **PopcornTV** - - **PornCom** - - **PornerBros** - - **PornHd** - - **PornHub**: PornHub and Thumbzilla - - **PornHubPagedVideoList** - - **PornHubUser** - - **PornHubUserVideosUpload** - - **Pornotube** - - **PornoVoisines** - - **PornoXO** - - **PornTube** - - **PressTV** - - **prosiebensat1**: ProSiebenSat.1 Digital - - **puhutv** - - **puhutv:serie** - - **Puls4** - - **Pyvideo** - - **qqmusic**: QQ音乐 - - **qqmusic:album**: QQ音乐 - 专辑 - - **qqmusic:playlist**: QQ音乐 - 歌单 - - **qqmusic:singer**: QQ音乐 - 歌手 - - **qqmusic:toplist**: QQ音乐 - 排行榜 - - **QuantumTV** - - **Qub** - - **Quickline** - - **QuicklineLive** - - **R7** - - **R7Article** - - **radio.de** - - **radiobremen** - - **radiocanada** - - **radiocanada:audiovideo** - - **radiofrance** - - **RadioJavan** - - **Rai** - - **RaiPlay** - - **RaiPlayLive** - - **RaiPlayPlaylist** - - **RayWenderlich** - - **RayWenderlichCourse** - - **RBMARadio** - - **RDS**: RDS.ca - - **RedBull** - - **RedBullEmbed** - - **RedBullTV** - - **RedBullTVRrnContent** - - **Reddit** - - **RedditR** - - **RedTube** - - **RegioTV** - - **RENTV** - - **RENTVArticle** - - **Restudy** - - **Reuters** - - **ReverbNation** - - **RICE** - - **RMCDecouverte** - - **RockstarGames** - - **RoosterTeeth** - - **RottenTomatoes** - - **Roxwel** - - **Rozhlas** - - **RTBF** - - **rte**: Raidió Teilifís Éireann TV - - **rte:radio**: Raidió Teilifís Éireann radio - - **rtl.nl**: rtl.nl and rtlxl.nl - - **rtl2** - - **rtl2:you** - - **rtl2:you:series** - - **RTP** - - **RTS**: RTS.ch - - **rtve.es:alacarta**: RTVE a la carta - - **rtve.es:infantil**: RTVE infantil - - **rtve.es:live**: RTVE.es live streams - - **rtve.es:television** - - **RTVNH** - - **RTVS** - - **RUHD** - - **RumbleEmbed** - - **rutube**: Rutube videos - - **rutube:channel**: Rutube channels - - **rutube:embed**: Rutube embedded videos - - **rutube:movie**: Rutube movies - - **rutube:person**: Rutube person videos - - **rutube:playlist**: Rutube playlists - - **RUTV**: RUTV.RU - - **Ruutu** - - **Ruv** - - **safari**: safaribooksonline.com online video - - **safari:api** - - **safari:course**: safaribooksonline.com online courses - - **SAKTV** - - **SaltTV** - - **SampleFocus** - - **Sapo**: SAPO Vídeos - - **savefrom.net** - - **SBS**: sbs.com.au - - **schooltv** - - **screen.yahoo:search**: Yahoo screen search - - **Screencast** - - **ScreencastOMatic** - - **ScrippsNetworks** - - **scrippsnetworks:watch** - - **SCTE** - - **SCTECourse** - - **Seeker** - - **SenateISVP** - - **SendtoNews** - - **Servus** - - **Sexu** - - **SeznamZpravy** - - **SeznamZpravyArticle** - - **Shahid** - - **ShahidShow** - - **Shared**: shared.sx - - **ShowRoomLive** - - **simplecast** - - **simplecast:episode** - - **simplecast:podcast** - - **Sina** - - **sky.it** - - **sky:news** - - **sky:sports** - - **sky:sports:news** - - **skyacademy.it** - - **SkylineWebcams** - - **skynewsarabia:article** - - **skynewsarabia:video** - - **Slideshare** - - **SlidesLive** - - **Slutload** - - **Snotr** - - **Sohu** - - **SonyLIV** - - **soundcloud** - - **soundcloud:playlist** - - **soundcloud:search**: Soundcloud search - - **soundcloud:set** - - **soundcloud:trackstation** - - **soundcloud:user** - - **SoundcloudEmbed** - - **soundgasm** - - **soundgasm:profile** - - **southpark.cc.com** - - **southpark.cc.com:español** - - **southpark.de** - - **southpark.nl** - - **southparkstudios.dk** - - 
**SpankBang** - - **SpankBangPlaylist** - - **Spankwire** - - **Spiegel** - - **sport.francetvinfo.fr** - - **Sport5** - - **SportBox** - - **SportDeutschland** - - **spotify** - - **spotify:show** - - **Spreaker** - - **SpreakerPage** - - **SpreakerShow** - - **SpreakerShowPage** - - **SpringboardPlatform** - - **Sprout** - - **sr:mediathek**: Saarländischer Rundfunk - - **SRGSSR** - - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - - **stanfordoc**: Stanford Open ClassRoom - - **Steam** - - **Stitcher** - - **StitcherShow** - - **StoryFire** - - **StoryFireSeries** - - **StoryFireUser** - - **Streamable** - - **streamcloud.eu** - - **StreamCZ** - - **StreetVoice** - - **StretchInternet** - - **stv:player** - - **SunPorno** - - **sverigesradio:episode** - - **sverigesradio:publication** - - **SVT** - - **SVTPage** - - **SVTPlay**: SVT Play and Öppet arkiv - - **SVTSeries** - - **SWRMediathek** - - **Syfy** - - **SztvHu** - - **t-online.de** - - **Tagesschau** - - **tagesschau:player** - - **Tass** - - **TBS** - - **TDSLifeway** - - **Teachable** - - **TeachableCourse** - - **teachertube**: teachertube.com videos - - **teachertube:user:collection**: teachertube.com user and collection videos - - **TeachingChannel** - - **Teamcoco** - - **TeamTreeHouse** - - **TechTalks** - - **techtv.mit.edu** - - **ted** - - **Tele13** - - **Tele5** - - **TeleBruxelles** - - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - - **Telegraaf** - - **TeleMB** - - **TeleQuebec** - - **TeleQuebecEmission** - - **TeleQuebecLive** - - **TeleQuebecSquat** - - **TeleQuebecVideo** - - **TeleTask** - - **Telewebion** - - **TennisTV** - - **TenPlay** - - **TF1** - - **TFO** - - **TheIntercept** - - **ThePlatform** - - **ThePlatformFeed** - - **TheScene** - - **TheStar** - - **TheSun** - - **TheWeatherChannel** - - **ThisAmericanLife** - - **ThisAV** - - **ThisOldHouse** - - **TikTok** - - **TikTokUser** (Currently broken) - - **tinypic**: tinypic.com videos - - **TMZ** - - **TMZArticle** - - **TNAFlix** - - **TNAFlixNetworkEmbed** - - **toggle** - - **ToonGoggles** - - **tou.tv** - - **Toypics**: Toypics video - - **ToypicsUser**: Toypics user profile - - **TrailerAddict** (Currently broken) - - **Trilulilu** - - **Trovo** - - **TrovoVod** - - **TruNews** - - **TruTV** - - **Tube8** - - **TubiTv** - - **Tumblr** - - **tunein:clip** - - **tunein:program** - - **tunein:station** - - **tunein:topic** - - **TunePk** - - **Turbo** - - **tv.dfb.de** - - **TV2** - - **tv2.hu** - - **TV2Article** - - **TV2DK** - - **TV2DKBornholmPlay** - - **TV4**: tv4.se and tv4play.se - - **TV5MondePlus**: TV5MONDE+ - - **tv5unis** - - **tv5unis:video** - - **tv8.it** - - **TVA** - - **TVANouvelles** - - **TVANouvellesArticle** - - **TVC** - - **TVCArticle** - - **TVer** - - **tvigle**: Интернет-телевидение Tvigle.ru - - **tvland.com** - - **TVN24** - - **TVNet** - - **TVNoe** - - **TVNow** - - **TVNowAnnual** - - **TVNowNew** - - **TVNowSeason** - - **TVNowShow** - - **tvp**: Telewizja Polska - - **tvp:embed**: Telewizja Polska - - **tvp:series** - - **TVPlayer** - - **TVPlayHome** - - **Tweakers** - - **TwitCasting** - - **twitch:clips** - - **twitch:stream** - - **twitch:vod** - - **TwitchCollection** - - **TwitchVideos** - - **TwitchVideosClips** - - **TwitchVideosCollections** - - **twitter** - - **twitter:amplify** - - **twitter:broadcast** - - **twitter:card** - - **udemy** - - **udemy:course** - - **UDNEmbed**: 聯合影音 - - **UFCArabia** - - **UFCTV** - - **UKTVPlay** - - **umg:de**: Universal Music 
Deutschland - - **Unistra** - - **Unity** - - **uol.com.br** - - **uplynk** - - **uplynk:preplay** - - **Urort**: NRK P3 Urørt - - **URPlay** - - **USANetwork** - - **USAToday** - - **ustream** - - **ustream:channel** - - **ustudio** - - **ustudio:embed** - - **Varzesh3** - - **Vbox7** - - **VeeHD** - - **Veoh** - - **Vesti**: Вести.Ru - - **Vevo** - - **VevoPlaylist** - - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - - **vh1.com** - - **vhx:embed** - - **Viafree** - - **vice** - - **vice:article** - - **vice:show** - - **Vidbit** - - **Viddler** - - **Videa** - - **video.arnes.si**: Arnes Video - - **video.google:search**: Google Video search - - **video.sky.it** - - **video.sky.it:live** - - **VideoDetective** - - **videofy.me** - - **videomore** - - **videomore:season** - - **videomore:video** - - **VideoPress** - - **Vidio** - - **VidLii** - - **vidme** - - **vidme:user** - - **vidme:user:likes** - - **vier**: vier.be and vijf.be - - **vier:videos** - - **viewlift** - - **viewlift:embed** - - **Viidea** - - **viki** - - **viki:channel** - - **vimeo** - - **vimeo:album** - - **vimeo:channel** - - **vimeo:group** - - **vimeo:likes**: Vimeo user likes - - **vimeo:ondemand** - - **vimeo:review**: Review pages on vimeo - - **vimeo:user** - - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) - - **Vimple**: Vimple - one-click video hosting - - **Vine** - - **vine:user** - - **Viqeo** - - **Viu** - - **viu:ott** - - **viu:playlist** - - **Vivo**: vivo.sx - - **vk**: VK - - **vk:uservideos**: VK - User's Videos - - **vk:wallpost** - - **vlive** - - **vlive:channel** - - **vlive:post** - - **Vodlocker** - - **VODPl** - - **VODPlatform** - - **VoiceRepublic** - - **Voot** - - **VoxMedia** - - **VoxMediaVolume** - - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - - **Vrak** - - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - - **VrtNU**: VrtNU.be - - **vrv** - - **vrv:series** - - **VShare** - - **VTM** - - **VTXTV** - - **vube**: Vube.com - - **VuClip** - - **VVVVID** - - **VVVVIDShow** - - **VyboryMos** - - **Vzaar** - - **Wakanim** - - **Walla** - - **WalyTV** - - **washingtonpost** - - **washingtonpost:article** - - **wat.tv** - - **WatchBox** - - **WatchIndianPorn**: Watch Indian Porn - - **WDR** - - **wdr:mobile** - - **WDRElefant** - - **WDRPage** - - **Webcaster** - - **WebcasterFeed** - - **WebOfStories** - - **WebOfStoriesPlaylist** - - **Weibo** - - **WeiboMobile** - - **WeiqiTV**: WQTV - - **Wistia** - - **WistiaPlaylist** - - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - - **WorldStarHipHop** - - **WSJ**: Wall Street Journal - - **WSJArticle** - - **WWE** - - **XBef** - - **XboxClips** - - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - - **XHamster** - - **XHamsterEmbed** - - **XHamsterUser** - - **xiami:album**: 虾米音乐 - 专辑 - - **xiami:artist**: 虾米音乐 - 歌手 - - **xiami:collection**: 虾米音乐 - 精选集 - - **xiami:song**: 虾米音乐 - - **ximalaya**: 喜马拉雅FM - - **ximalaya:album**: 喜马拉雅FM 专辑 - - **XMinus** - - **XNXX** - - **Xstream** - - **XTube** - - **XTubeUser**: XTube user profile - - **Xuite**: 隨意窩Xuite影音 - - **XVideos** - - **XXXYMovies** - - **Yahoo**: Yahoo screen and movies - - **yahoo:gyao** - - **yahoo:gyao:player** - - **yahoo:japannews**: Yahoo! 
Japan News
- - **YandexDisk**
- - **yandexmusic:album**: Яндекс.Музыка - Альбом
- - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы
- - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки
- - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
- - **yandexmusic:track**: Яндекс.Музыка - Трек
- - **YandexVideo**
- - **YapFiles**
- - **YesJapan**
- - **yinyuetai:video**: 音悦Tai
- - **Ynet**
- - **YouJizz**
- - **youku**: 优酷
- - **youku:show**
- - **YouNowChannel**
- - **YouNowLive**
- - **YouNowMoment**
- - **YouPorn**
- - **YourPorn**
- - **YourUpload**
- - **youtube**: YouTube.com
- - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
- - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
- - **youtube:playlist**: YouTube.com playlists
- - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- - **youtube:search**: YouTube.com searches
- - **youtube:search:date**: YouTube.com searches, newest videos first
- - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- - **youtube:tab**: YouTube.com tab
- - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- - **YoutubeYtBe**
- - **YoutubeYtUser**
- - **Zapiks**
- - **Zattoo**
- - **ZattooLive**
- - **ZDF**
- - **ZDFChannel**
- - **Zhihu**
- - **zingmp3**: mp3.zing.vn
- - **zingmp3:album**
- - **zoom**
- - **Zype**
+---
+orphan: true
+---
+```{include} ../supportedsites.md
+```
diff --git a/docs/ytdlp_plugins.md b/docs/ytdlp_plugins.md
new file mode 100644
index 000000000..483b9c46e
--- /dev/null
+++ b/docs/ytdlp_plugins.md
@@ -0,0 +1,6 @@
+---
+orphan: true
+---
+# ytdlp_plugins
+
+See [https://github.com/yt-dlp/yt-dlp/tree/master/ytdlp_plugins](https://github.com/yt-dlp/yt-dlp/tree/master/ytdlp_plugins).
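The module guide removed above still describes the embedding API accurately: `YoutubeDL.extract_info(url, download=False)` returns an info dict with fields such as `title`, `width`, `height` and `url`, and playlist extractions additionally carry an `entries` list of per-video info dicts. A minimal sketch of that same flow against the renamed `yt_dlp` package follows; only the `extract_info` interface shown in the old guide is assumed, and the URLs are the illustrative ones from that guide:

```python
# Minimal sketch of the embedding flow from the removed module guide,
# updated for the youtube_dl -> yt_dlp rename. URLs are illustrative.
import yt_dlp

with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    # download=False extracts metadata only, without downloading media
    info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False)
    print(info['title'], info['height'], info['width'])

    # Playlist info dicts expose the individual videos via 'entries'
    playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False)
    for video in playlist['entries']:
        print('Video #%d: %s' % (video['playlist_index'], video['title']))
```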
diff --git a/pyinst.py b/pyinst.py
new file mode 100644
index 000000000..ed410e0f2
--- /dev/null
+++ b/pyinst.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+from __future__ import unicode_literals
+import sys
+import platform
+
+from PyInstaller.utils.hooks import collect_submodules
+from PyInstaller.utils.win32.versioninfo import (
+    VarStruct, VarFileInfo, StringStruct, StringTable,
+    StringFileInfo, FixedFileInfo, VSVersionInfo, SetVersion,
+)
+import PyInstaller.__main__
+
+arch = platform.architecture()[0][:2]
+assert arch in ('32', '64')
+_x86 = '_x86' if arch == '32' else ''
+
+# Compatibility with older arguments
+opts = sys.argv[1:]
+if opts[0:1] in (['32'], ['64']):
+    if arch != opts[0]:
+        raise Exception(f'{opts[0]}bit executable cannot be built on a {arch}bit system')
+    opts = opts[1:]
+opts = opts or ['--onefile']
+
+print(f'Building {arch}bit version with options {opts}')
+
+FILE_DESCRIPTION = 'yt-dlp%s' % (' (32 Bit)' if _x86 else '')
+
+exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec'))
+VERSION = locals()['__version__']
+
+VERSION_LIST = VERSION.split('.')
+VERSION_LIST = list(map(int, VERSION_LIST)) + [0] * (4 - len(VERSION_LIST))
+
+print('Version: %s%s' % (VERSION, _x86))
+print('Remember to update the version using devscripts\\update-version.py')
+
+VERSION_FILE = VSVersionInfo(
+    ffi=FixedFileInfo(
+        filevers=VERSION_LIST,
+        prodvers=VERSION_LIST,
+        mask=0x3F,
+        flags=0x0,
+        OS=0x4,
+        fileType=0x1,
+        subtype=0x0,
+        date=(0, 0),
+    ),
+    kids=[
+        StringFileInfo([
+            StringTable(
+                '040904B0', [
+                    StringStruct('Comments', 'yt-dlp%s Command Line Interface.' % _x86),
+                    StringStruct('CompanyName', 'https://github.com/yt-dlp'),
+                    StringStruct('FileDescription', FILE_DESCRIPTION),
+                    StringStruct('FileVersion', VERSION),
+                    StringStruct('InternalName', 'yt-dlp%s' % _x86),
+                    StringStruct(
+                        'LegalCopyright',
+                        'pukkandan.ytdlp@gmail.com | UNLICENSE',
+                    ),
+                    StringStruct('OriginalFilename', 'yt-dlp%s.exe' % _x86),
+                    StringStruct('ProductName', 'yt-dlp%s' % _x86),
+                    StringStruct(
+                        'ProductVersion',
+                        '%s%s on Python %s' % (VERSION, _x86, platform.python_version())),
+                ])]),
+        VarFileInfo([VarStruct('Translation', [0, 1200])])
+    ]
+)
+
+
+def pycryptodome_module():
+    try:
+        import Cryptodome  # noqa: F401
+    except ImportError:
+        try:
+            import Crypto  # noqa: F401
+            print('WARNING: Using Crypto since Cryptodome is not available. '
+                  'Install with: pip install pycryptodomex', file=sys.stderr)
+            return 'Crypto'
+        except ImportError:
+            pass
+    return 'Cryptodome'
+
+
+dependencies = [pycryptodome_module(), 'mutagen'] + collect_submodules('websockets')
+excluded_modules = ['test', 'ytdlp_plugins', 'youtube-dl', 'youtube-dlc']
+
+PyInstaller.__main__.run([
+    '--name=yt-dlp%s' % _x86,
+    '--icon=devscripts/logo.ico',
+    *[f'--exclude-module={module}' for module in excluded_modules],
+    *[f'--hidden-import={module}' for module in dependencies],
+    '--upx-exclude=vcruntime140.dll',
+    '--noconfirm',
+    *opts,
+    'yt_dlp/__main__.py',
+])
+SetVersion('dist/%syt-dlp%s.exe' % ('yt-dlp/' if '--onedir' in opts else '', _x86), VERSION_FILE)
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 000000000..52feb4aba
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+addopts = -ra -v --strict-markers
+markers =
+    download
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..cecd08eae
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+mutagen
+pycryptodomex
+websockets
diff --git a/setup.cfg b/setup.cfg
@@ -2,5 +2,5 @@ universal = True
 
 [flake8]
-exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv
-ignore = E402,E501,E731,E741,W503
+exclude = yt_dlp/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv,devscripts/create-github-release.py,devscripts/release.sh,devscripts/show-downloads-statistics.py
+ignore = E402,E501,E731,E741,W503
\ No newline at end of file
diff --git a/setup.py b/setup.py
@@ -1,68 +1,66 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # coding: utf-8
-
-from __future__ import print_function
-
 import os.path
 import warnings
 import sys
 
 try:
-    from setuptools import setup, Command
+    from setuptools import setup, Command, find_packages
     setuptools_available = True
 except ImportError:
     from distutils.core import setup, Command
     setuptools_available = False
 from distutils.spawn import spawn
 
-try:
-    # This will create an exe that needs Microsoft Visual C++ 2008
-    # Redistributable Package
+# Get the version from yt_dlp/version.py without importing the package
+exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec'))
+
+
+DESCRIPTION = 'Command-line program to download videos from YouTube.com and many other video platforms.'
+
+LONG_DESCRIPTION = '\n\n'.join((
+    'Official repository: <https://github.com/yt-dlp/yt-dlp>',
+    '**PS**: Some links in this document will not work since this is a copy of the README.md from GitHub',
+    open('README.md', 'r', encoding='utf-8').read()))
+
+REQUIREMENTS = ['mutagen', 'pycryptodomex', 'websockets']
+
+
+if sys.argv[1:2] == ['py2exe']:
     import py2exe
-except ImportError:
-    if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
-        print('Cannot import py2exe', file=sys.stderr)
-        exit(1)
-
-py2exe_options = {
-    'bundle_files': 1,
-    'compressed': 1,
-    'optimize': 2,
-    'dist_dir': '.',
-    'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
-}
-
-# Get the version from youtube_dl/version.py without importing the package
-exec(compile(open('youtube_dl/version.py').read(),
-             'youtube_dl/version.py', 'exec'))
-
-DESCRIPTION = 'YouTube video downloader'
-LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites'
-
-py2exe_console = [{
-    'script': './youtube_dl/__main__.py',
-    'dest_base': 'youtube-dl',
-    'version': __version__,
-    'description': DESCRIPTION,
-    'comments': LONG_DESCRIPTION,
-    'product_name': 'youtube-dl',
-    'product_version': __version__,
-}]
-
-py2exe_params = {
-    'console': py2exe_console,
-    'options': {'py2exe': py2exe_options},
-    'zipfile': None
-}
-
-if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
-    params = py2exe_params
+    warnings.warn(
+        'Building with py2exe is not officially supported. '
+        'The recommended way is to use "pyinst.py" to build using pyinstaller')
+    params = {
+        'console': [{
+            'script': './yt_dlp/__main__.py',
+            'dest_base': 'yt-dlp',
+            'version': __version__,
+            'description': DESCRIPTION,
+            'comments': LONG_DESCRIPTION.split('\n')[0],
+            'product_name': 'yt-dlp',
+            'product_version': __version__,
+        }],
+        'options': {
+            'py2exe': {
+                'bundle_files': 0,
+                'compressed': 1,
+                'optimize': 2,
+                'dist_dir': './dist',
+                'excludes': ['Crypto', 'Cryptodome'],  # py2exe cannot import Crypto
+                'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
+            }
+        },
+        'zipfile': None
+    }
+
 else:
     files_spec = [
-        ('etc/bash_completion.d', ['youtube-dl.bash-completion']),
-        ('etc/fish/completions', ['youtube-dl.fish']),
-        ('share/doc/youtube_dl', ['README.txt']),
-        ('share/man/man1', ['youtube-dl.1'])
+        ('share/bash-completion/completions', ['completions/bash/yt-dlp']),
+        ('share/zsh/site-functions', ['completions/zsh/_yt-dlp']),
+        ('share/fish/vendor_completions.d', ['completions/fish/yt-dlp.fish']),
+        ('share/doc/yt_dlp', ['README.txt']),
+        ('share/man/man1', ['yt-dlp.1'])
     ]
     root = os.path.dirname(os.path.abspath(__file__))
     data_files = []
@@ -70,7 +68,7 @@ else:
     resfiles = []
     for fn in files:
         if not os.path.exists(fn):
-            warnings.warn('Skipping file %s since it is not present. Type make to build all automatically generated files.' % fn)
+            warnings.warn('Skipping file %s since it is not present. Try running `make pypi-files` first' % fn)
         else:
             resfiles.append(fn)
         data_files.append((dirname, resfiles))
@@ -78,10 +76,12 @@ else:
     params = {
         'data_files': data_files,
    }
+
    if setuptools_available:
-        params['entry_points'] = {'console_scripts': ['youtube-dl = youtube_dl:main']}
+        params['entry_points'] = {'console_scripts': ['yt-dlp = yt_dlp:main']}
    else:
-        params['scripts'] = ['bin/youtube-dl']
+        params['scripts'] = ['yt-dlp']
+
 
 class build_lazy_extractors(Command):
     description = 'Build the extractor lazy loading module'
@@ -94,54 +94,43 @@ class build_lazy_extractors(Command):
         pass
 
     def run(self):
-        spawn(
-            [sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'],
-            dry_run=self.dry_run,
-        )
+        spawn([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py'],
+              dry_run=self.dry_run)
+
+
+if setuptools_available:
+    packages = find_packages(exclude=('youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins'))
+else:
+    packages = ['yt_dlp', 'yt_dlp.downloader', 'yt_dlp.extractor', 'yt_dlp.postprocessor']
+
 
 setup(
-    name='youtube_dl',
+    name='yt-dlp',
     version=__version__,
-    description=DESCRIPTION,
-    long_description=LONG_DESCRIPTION,
-    url='https://git.conocimientoslibres.ga/software/hypervideo.git',
-    author='Ricardo Garcia',
-    author_email='ytdl@yt-dl.org',
     maintainer='Jesús E..',
     maintainer_email='heckyel@hyperbola.info',
     license='CC0-1.0',
-    packages=[
-        'youtube_dl',
-        'youtube_dl.extractor', 'youtube_dl.downloader',
-        'youtube_dl.postprocessor'],
-
-    # Provokes warning on most systems (why?!)
-    # test_suite = 'nose.collector',
-    # test_requires = ['nosetest'],
-
+    description=DESCRIPTION,
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type='text/markdown',
+    url='https://git.conocimientoslibres.ga/software/hypervideo.git',
+    packages=packages,
+    install_requires=REQUIREMENTS,
     classifiers=[
         'Topic :: Multimedia :: Video',
         'Development Status :: 5 - Production/Stable',
         'Environment :: Console',
-        'License :: Public Domain',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.6',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.2',
-        'Programming Language :: Python :: 3.3',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: Implementation',
         'Programming Language :: Python :: Implementation :: CPython',
-        'Programming Language :: Python :: Implementation :: IronPython',
-        'Programming Language :: Python :: Implementation :: Jython',
         'Programming Language :: Python :: Implementation :: PyPy',
+        'License :: Public Domain',
+        'Operating System :: OS Independent',
     ],
+    python_requires='>=3.6',
     cmdclass={'build_lazy_extractors': build_lazy_extractors},
     **params
diff --git a/supportedsites.md b/supportedsites.md
new file mode 100644
index 000000000..02be6b918
--- /dev/null
+++ b/supportedsites.md
@@ -0,0 +1,1364 @@
+# Supported sites
+ - **17live**
+ - **17live:clip**
+ - **1tv**: Первый канал
+ - **20min**
+ - **220.ro**
+ - **23video**
+ - **247sports**
+ - **24video**
+ - **3qsdn**: 3Q SDN
+ - **3sat**
+ - **4tube**
+ - **56.com**
+ - **5min**
+ - **6play**
+ - **7plus**
+ - **8tracks**
+ - **91porn**
+ - **9c9media**
+ - **9gag**
+ - **9now.com.au**
+ - **abc.net.au**
+ - **abc.net.au:iview**
+ - **abcnews**
+ - **abcnews:video**
+ - **abcotvs**: ABC Owned Television Stations
+ - **abcotvs:clips**
+ - **AcademicEarth:Course**
+ - **acast**
+ - **acast:channel**
+ - **ADN**: Anime Digital Network
+ - **AdobeConnect**
+ - **adobetv**
+ - **adobetv:channel**
+ - **adobetv:embed**
+ - **adobetv:show**
+ - **adobetv:video**
+ - **AdultSwim**
+ - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault
+ - **aenetworks:collection**
+ - **aenetworks:show**
+ - **afreecatv**: afreecatv.com
+ - **AirMozilla**
+ - **AliExpressLive**
+ - **AlJazeera**
+ - **Allocine**
+ - **AlphaPorno**
+ - **Alura**
+ - **AluraCourse**
+ - **Amara**
+ - **AMCNetworks**
+ - **AmericasTestKitchen**
+ - **AmericasTestKitchenSeason**
+ - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
+ - **AnimalPlanet**
+ - **AnimeLab**
+ - **AnimeLabShows**
+ - **AnimeOnDemand**
+ - **Anvato**
+ - **aol.com**: Yahoo screen and movies
+ - **APA**
+ - **Aparat**
+ - **AppleConnect**
+ - **AppleDaily**: 臺灣蘋果日報
+ - **ApplePodcasts**
+ - **appletrailers**
+ - **appletrailers:section**
+ - **archive.org**: archive.org video and audio
+ - **ArcPublishing**
+ - **ARD**
+ - **ARD:mediathek**
+ - **ARDBetaMediathek**
+ - **Arkena**
+ - **arte.sky.it**
+ - **ArteTV**
+ - **ArteTVEmbed**
+ - **ArteTVPlaylist**
+ - **AsianCrush**
+ - **AsianCrushPlaylist**
+ - **AtresPlayer**
+ - **ATTTechChannel**
+ - **ATVAt**
+ - **AudiMedia**
+ - **AudioBoom**
+ - **audiomack**
+ - **audiomack:album**
+ - **Audius**: Audius.co
+ - **audius:artist**: Audius.co 
profile/artist pages + - **audius:playlist**: Audius.co playlists + - **audius:track**: Audius track ID or API link. Prepend with "audius:" + - **AWAAN** + - **awaan:live** + - **awaan:season** + - **awaan:video** + - **AZMedien**: AZ Medien videos + - **BaiduVideo**: 百度视频 + - **bandaichannel** + - **Bandcamp** + - **Bandcamp:album** + - **Bandcamp:weekly** + - **BandcampMusic** + - **bangumi.bilibili.com**: BiliBili番剧 + - **BannedVideo** + - **bbc**: BBC + - **bbc.co.uk**: BBC iPlayer + - **bbc.co.uk:article**: BBC articles + - **bbc.co.uk:iplayer:episodes** + - **bbc.co.uk:iplayer:group** + - **bbc.co.uk:playlist** + - **BBVTV** + - **Beatport** + - **Beeg** + - **BehindKink** + - **Bellator** + - **BellMedia** + - **Bet** + - **bfi:player** + - **bfmtv** + - **bfmtv:article** + - **bfmtv:live** + - **BibelTV** + - **Bigflix** + - **Bild**: Bild.de + - **BiliBili** + - **Bilibili category extractor** + - **BilibiliAudio** + - **BilibiliAudioAlbum** + - **BilibiliChannel** + - **BiliBiliPlayer** + - **BiliBiliSearch**: Bilibili video search, "bilisearch" keyword + - **BiliIntl** + - **BiliIntlSeries** + - **BioBioChileTV** + - **Biography** + - **BIQLE** + - **BitChute** + - **BitChuteChannel** + - **bitwave:replay** + - **bitwave:stream** + - **BlackboardCollaborate** + - **BleacherReport** + - **BleacherReportCMS** + - **Bloomberg** + - **BokeCC** + - **BongaCams** + - **BostonGlobe** + - **Box** + - **Bpb**: Bundeszentrale für politische Bildung + - **BR**: Bayerischer Rundfunk + - **BravoTV** + - **Break** + - **brightcove:legacy** + - **brightcove:new** + - **BRMediathek**: Bayerischer Rundfunk Mediathek + - **bt:article**: Bergens Tidende Articles + - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - **BusinessInsider** + - **BuzzFeed** + - **BYUtv** + - **CAM4** + - **Camdemy** + - **CamdemyFolder** + - **CamModels** + - **CamWithHer** + - **canalc2.tv** + - **Canalplus**: mycanal.fr and piwiplus.fr + - **Canvas** + - **CanvasEen**: canvas.be and een.be + - **CarambaTV** + - **CarambaTVPage** + - **CartoonNetwork** + - **cbc.ca** + - **cbc.ca:player** + - **CBS** + - **CBSInteractive** + - **CBSLocal** + - **CBSLocalArticle** + - **cbsnews**: CBS News + - **cbsnews:embed** + - **cbsnews:livevideo**: CBS News Live Videos + - **cbssports** + - **cbssports:embed** + - **CCMA** + - **CCTV**: 央视网 + - **CDA** + - **CeskaTelevize** + - **CeskaTelevizePorady** + - **CGTN** + - **channel9**: Channel 9 + - **CharlieRose** + - **Chaturbate** + - **Chilloutzone** + - **Chingari** + - **ChingariUser** + - **chirbit** + - **chirbit:profile** + - **cielotv.it** + - **Cinchcast** + - **Cinemax** + - **CiscoLiveSearch** + - **CiscoLiveSession** + - **ciscowebex**: Cisco Webex + - **CJSW** + - **cliphunter** + - **Clippit** + - **ClipRs** + - **Clipsyndicate** + - **CloserToTruth** + - **CloudflareStream** + - **Cloudy** + - **Clubic** + - **Clyp** + - **cmt.com** + - **CNBC** + - **CNBCVideo** + - **CNN** + - **CNNArticle** + - **CNNBlogs** + - **ComedyCentral** + - **ComedyCentralTV** + - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED + - **CONtv** + - **Corus** + - **Coub** + - **Cracked** + - **Crackle** + - **CrooksAndLiars** + - **crunchyroll** + - **crunchyroll:playlist** + - **CSpan**: C-SPAN + - **CtsNews**: 華視新聞 + - **CTV** + - **CTVNews** + - **cu.ntv.co.jp**: Nippon 
Television Network + - **CultureUnplugged** + - **curiositystream** + - **curiositystream:collection** + - **CWTV** + - **DagelijkseKost**: dagelijksekost.een.be + - **DailyMail** + - **dailymotion** + - **dailymotion:playlist** + - **dailymotion:user** + - **damtomo:record** + - **damtomo:video** + - **daum.net** + - **daum.net:clip** + - **daum.net:playlist** + - **daum.net:user** + - **DBTV** + - **DctpTv** + - **DeezerAlbum** + - **DeezerPlaylist** + - **defense.gouv.fr** + - **democracynow** + - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **Digg** + - **DigitallySpeaking** + - **Digiteka** + - **Discovery** + - **DiscoveryGo** + - **DiscoveryGoPlaylist** + - **DiscoveryNetworksDe** + - **DiscoveryPlus** + - **DiscoveryPlusIndia** + - **DiscoveryPlusIndiaShow** + - **DiscoveryVR** + - **Disney** + - **DIYNetwork** + - **dlive:stream** + - **dlive:vod** + - **DoodStream** + - **Dotsub** + - **Douyin** + - **DouyuShow** + - **DouyuTV**: 斗鱼 + - **DPlay** + - **DRBonanza** + - **Dropbox** + - **DrTuber** + - **drtv** + - **drtv:live** + - **DTube** + - **duboku**: www.duboku.co + - **duboku:list**: www.duboku.co entire series + - **Dumpert** + - **dvtv**: http://video.aktualne.cz/ + - **dw** + - **dw:article** + - **EaglePlatform** + - **EbaumsWorld** + - **EchoMsk** + - **egghead:course**: egghead.io course + - **egghead:lesson**: egghead.io lesson + - **ehftv** + - **eHow** + - **EinsUndEinsTV** + - **Einthusan** + - **eitb.tv** + - **EllenTube** + - **EllenTubePlaylist** + - **EllenTubeVideo** + - **Elonet** + - **ElPais**: El País + - **Embedly** + - **EMPFlix** + - **Engadget** + - **Epicon** + - **EpiconSeries** + - **Eporner** + - **EroProfile** + - **EroProfile:album** + - **Escapist** + - **ESPN** + - **ESPNArticle** + - **EsriVideo** + - **Europa** + - **EWETV** + - **ExpoTV** + - **Expressen** + - **ExtremeTube** + - **EyedoTV** + - **facebook** + - **FacebookPluginsVideo** + - **fancode:live** + - **fancode:vod** + - **faz.net** + - **fc2** + - **fc2:embed** + - **Fczenit** + - **Filmmodu** + - **filmon** + - **filmon:channel** + - **Filmweb** + - **FiveThirtyEight** + - **FiveTV** + - **Flickr** + - **Folketinget**: Folketinget (ft.dk; Danish parliament) + - **FootyRoom** + - **Formula1** + - **FOX** + - **FOX9** + - **FOX9News** + - **Foxgay** + - **foxnews**: Fox News and Fox Business Video + - **foxnews:article** + - **FoxSports** + - **FranceCulture** + - **FranceInter** + - **FranceTV** + - **francetvinfo.fr** + - **FranceTVSite** + - **Freesound** + - **freespeech.org** + - **FreshLive** + - **FrontendMasters** + - **FrontendMastersCourse** + - **FrontendMastersLesson** + - **FujiTVFODPlus7** + - **Funimation** + - **funimation:page** + - **funimation:show** + - **Funk** + - **Fusion** + - **Fux** + - **GabTV** + - **Gaia** + - **GameInformer** + - **GameSpot** + - **GameStar** + - **Gaskrank** + - **Gazeta** + - **GDCVault** + - **GediDigital** + - **gem.cbc.ca** + - **gem.cbc.ca:live** + - **gem.cbc.ca:playlist** + - **generic**: Generic downloader that works on some sites + - **Gettr** + - **Gfycat** + - **GiantBomb** + - **Giga** + - **GlattvisionTV** + - **Glide**: Glide mobile video messages (glide.me) + - **Globo** + - **GloboArticle** + - **Go** + - **GodTube** + - **Golem** + - **google:podcasts** + - **google:podcasts:feed** + - **GoogleDrive** + - **GoPro** + - **Goshgay** + - **GoToStage** + - **GPUTechConf** + - **Groupon** + - **hbo** + - **HearThisAt** + - **Heise** + - **HellPorno** + - **Helsinki**: helsinki.fi + - **HentaiStigma** + - 
**hetklokhuis** + - **hgtv.com:show** + - **HGTVDe** + - **HiDive** + - **HistoricFilms** + - **history:player** + - **history:topic**: History.com Topic + - **hitbox** + - **hitbox:live** + - **HitRecord** + - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau + - **HornBunny** + - **HotNewHipHop** + - **hotstar** + - **hotstar:playlist** + - **hotstar:series** + - **Howcast** + - **HowStuffWorks** + - **hrfernsehen** + - **HRTi** + - **HRTiPlaylist** + - **Huajiao**: 花椒直播 + - **HuffPost**: Huffington Post + - **Hungama** + - **HungamaAlbumPlaylist** + - **HungamaSong** + - **Hypem** + - **ign.com** + - **IGNArticle** + - **IGNVideo** + - **IHeartRadio** + - **iheartradio:podcast** + - **imdb**: Internet Movie Database trailers + - **imdb:list**: Internet Movie Database lists + - **Imgur** + - **imgur:album** + - **imgur:gallery** + - **Ina** + - **Inc** + - **IndavideoEmbed** + - **InfoQ** + - **Instagram** + - **instagram:tag**: Instagram hashtag search + - **instagram:user**: Instagram user profile + - **Internazionale** + - **InternetVideoArchive** + - **IPrima** + - **iqiyi**: 爱奇艺 + - **Ir90Tv** + - **ITTF** + - **ITV** + - **ITVBTCC** + - **ivi**: ivi.ru + - **ivi:compilation**: ivi.ru compilations + - **ivideon**: Ivideon TV + - **Iwara** + - **Izlesene** + - **Jamendo** + - **JamendoAlbum** + - **JeuxVideo** + - **Joj** + - **Jove** + - **JWPlatform** + - **Kakao** + - **Kaltura** + - **Kankan** + - **Karaoketv** + - **KarriereVideos** + - **Katsomo** + - **KeezMovies** + - **Ketnet** + - **khanacademy** + - **khanacademy:unit** + - **KickStarter** + - **KinjaEmbed** + - **KinoPoisk** + - **KonserthusetPlay** + - **Koo** + - **KrasView**: Красвью + - **Ku6** + - **KUSI** + - **kuwo:album**: 酷我音乐 - 专辑 + - **kuwo:category**: 酷我音乐 - 分类 + - **kuwo:chart**: 酷我音乐 - 排行榜 + - **kuwo:mv**: 酷我音乐 - MV + - **kuwo:singer**: 酷我音乐 - 歌手 + - **kuwo:song**: 酷我音乐 + - **la7.it** + - **la7.it:pod:episode** + - **la7.it:podcast** + - **laola1tv** + - **laola1tv:embed** + - **lbry** + - **lbry:channel** + - **LCI** + - **Lcp** + - **LcpPlay** + - **Le**: 乐视网 + - **Lecture2Go** + - **Lecturio** + - **LecturioCourse** + - **LecturioDeCourse** + - **LEGO** + - **Lemonde** + - **Lenta** + - **LePlaylist** + - **LetvCloud**: 乐视云 + - **Libsyn** + - **life**: Life.ru + - **life:embed** + - **limelight** + - **limelight:channel** + - **limelight:channel_list** + - **LineLive** + - **LineLiveChannel** + - **LineTV** + - **linkedin:learning** + - **linkedin:learning:course** + - **LinuxAcademy** + - **LiTV** + - **LiveJournal** + - **livestream** + - **livestream:original** + - **LnkGo** + - **loc**: Library of Congress + - **LocalNews8** + - **LoveHomePorn** + - **lrt.lt** + - **lynda**: lynda.com videos + - **lynda:course**: lynda.com online courses + - **m6** + - **MagentaMusik360** + - **mailru**: Видео@Mail.Ru + - **mailru:music**: Музыка@Mail.Ru + - **mailru:music:search**: Музыка@Mail.Ru + - **MallTV** + - **mangomolo:live** + - **mangomolo:video** + - **ManotoTV**: Manoto TV (Episode) + - **ManotoTVLive**: Manoto TV (Live) + - **ManotoTVShow**: Manoto TV (Show) + - **ManyVids** + - **MaoriTV** + - **Markiza** + - **MarkizaPage** + - **massengeschmack.tv** + - **MatchTV** + - **MDR**: MDR.DE and KiKA + - **MedalTV** + - **media.ccc.de** + - **media.ccc.de:lists** + - **Mediaite** + - **MediaKlikk** + - **Medialaan** + - **Mediaset** + - **Mediasite** + - **MediasiteCatalog** + - **MediasiteNamedCatalog** + - **Medici** + - **megaphone.fm**: megaphone.fm embedded players + - 
**Meipai**: 美拍 + - **MelonVOD** + - **META** + - **metacafe** + - **Metacritic** + - **mewatch** + - **Mgoon** + - **MGTV**: 芒果TV + - **MiaoPai** + - **mildom**: Record ongoing live by specific user in Mildom + - **mildom:user:vod**: Download all VODs from specific user in Mildom + - **mildom:vod**: Download a VOD in Mildom + - **minds** + - **minds:channel** + - **minds:group** + - **MinistryGrid** + - **Minoto** + - **miomio.tv** + - **mirrativ** + - **mirrativ:user** + - **MiTele**: mitele.es + - **mixcloud** + - **mixcloud:playlist** + - **mixcloud:user** + - **MLB** + - **MLBVideo** + - **Mnet** + - **MNetTV** + - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net + - **Mofosex** + - **MofosexEmbed** + - **Mojvideo** + - **Morningstar**: morningstar.com + - **Motherless** + - **MotherlessGroup** + - **Motorsport**: motorsport.com + - **MovieClips** + - **MovieFap** + - **Moviezine** + - **MovingImage** + - **MSN** + - **mtg**: MTG services + - **mtv** + - **mtv.de** + - **mtv.it** + - **mtv.it:programma** + - **mtv:video** + - **mtvjapan** + - **mtvservices:embedded** + - **MTVUutisetArticle** + - **MuenchenTV**: münchen.tv + - **MuseScore** + - **mva**: Microsoft Virtual Academy videos + - **mva:course**: Microsoft Virtual Academy courses + - **Mwave** + - **MwaveMeetGreet** + - **Mxplayer** + - **MxplayerShow** + - **MyChannels** + - **MySpace** + - **MySpace:album** + - **MySpass** + - **Myvi** + - **MyVideoGe** + - **MyVidster** + - **MyviEmbed** + - **MyVisionTV** + - **n-tv.de** + - **N1Info:article** + - **N1InfoAsset** + - **natgeo:video** + - **NationalGeographicTV** + - **Naver** + - **Naver:live** + - **NBA** + - **nba:watch** + - **nba:watch:collection** + - **NBAChannel** + - **NBAEmbed** + - **NBAWatchEmbed** + - **NBC** + - **NBCNews** + - **nbcolympics** + - **nbcolympics:stream** + - **NBCSports** + - **NBCSportsStream** + - **NBCSportsVPlayer** + - **ndr**: NDR.de - Norddeutscher Rundfunk + - **ndr:embed** + - **ndr:embed:base** + - **NDTV** + - **Nebula** + - **NerdCubedFeed** + - **netease:album**: 网易云音乐 - 专辑 + - **netease:djradio**: 网易云音乐 - 电台 + - **netease:mv**: 网易云音乐 - MV + - **netease:playlist**: 网易云音乐 - 歌单 + - **netease:program**: 网易云音乐 - 电台节目 + - **netease:singer**: 网易云音乐 - 歌手 + - **netease:song**: 网易云音乐 + - **NetPlus** + - **Netzkino** + - **Newgrounds** + - **Newgrounds:playlist** + - **Newgrounds:user** + - **Newstube** + - **NextMedia**: 蘋果日報 + - **NextMediaActionNews**: 蘋果日報 - 動新聞 + - **NextTV**: 壹電視 + - **Nexx** + - **NexxEmbed** + - **NFHSNetwork** + - **nfl.com** (Currently broken) + - **nfl.com:article** (Currently broken) + - **NhkVod** + - **NhkVodProgram** + - **nhl.com** + - **nick.com** + - **nick.de** + - **nickelodeon:br** + - **nickelodeonru** + - **nicknight** + - **niconico**: ニコニコ動画 + - **NiconicoPlaylist** + - **NiconicoUser** + - **nicovideo:search**: Nico video searches + - **nicovideo:search:date**: Nico video searches, newest first + - **nicovideo:search_url**: Nico video search URLs + - **Nintendo** + - **Nitter** + - **njoy**: N-JOY + - **njoy:embed** + - **NJPWWorld**: 新日本プロレスワールド + - **NobelPrize** + - **NonkTube** + - **Noovo** + - **Normalboots** + - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz + - **NovaEmbed** + - **NovaPlay** + - **nowness** + - **nowness:playlist** + - **nowness:series** + - **Noz** + - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **npo.nl:live** + - **npo.nl:radio** + - **npo.nl:radio:fragment** + - 
**Npr** + - **NRK** + - **NRKPlaylist** + - **NRKRadioPodkast** + - **NRKSkole**: NRK Skole + - **NRKTV**: NRK TV and NRK Radio + - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte + - **NRKTVEpisode** + - **NRKTVEpisodes** + - **NRKTVSeason** + - **NRKTVSeries** + - **NRLTV** + - **ntv.ru** + - **Nuvid** + - **NYTimes** + - **NYTimesArticle** + - **NYTimesCooking** + - **nzherald** + - **NZZ** + - **ocw.mit.edu** + - **OdaTV** + - **Odnoklassniki** + - **OktoberfestTV** + - **OlympicsReplay** + - **OnDemandKorea** + - **onet.pl** + - **onet.tv** + - **onet.tv:channel** + - **OnetMVP** + - **OnionStudios** + - **Ooyala** + - **OoyalaExternal** + - **openrec** + - **openrec:capture** + - **OraTV** + - **orf:burgenland**: Radio Burgenland + - **orf:fm4**: radio FM4 + - **orf:fm4:story**: fm4.orf.at stories + - **orf:iptv**: iptv.ORF.at + - **orf:kaernten**: Radio Kärnten + - **orf:noe**: Radio Niederösterreich + - **orf:oberoesterreich**: Radio Oberösterreich + - **orf:oe1**: Radio Österreich 1 + - **orf:oe3**: Radio Österreich 3 + - **orf:salzburg**: Radio Salzburg + - **orf:steiermark**: Radio Steiermark + - **orf:tirol**: Radio Tirol + - **orf:tvthek**: ORF TVthek + - **orf:vorarlberg**: Radio Vorarlberg + - **orf:wien**: Radio Wien + - **OsnatelTV** + - **OutsideTV** + - **PacktPub** + - **PacktPubCourse** + - **PalcoMP3:artist** + - **PalcoMP3:song** + - **PalcoMP3:video** + - **pandora.tv**: 판도라TV + - **ParamountNetwork** + - **ParamountPlus** + - **ParamountPlusSeries** + - **parliamentlive.tv**: UK parliament videos + - **Parlview** + - **Patreon** + - **PatreonUser** + - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) + - **PearVideo** + - **PeerTube** + - **PeerTube:Playlist** + - **peloton** + - **peloton:live**: Peloton Live + - **People** + - **PerformGroup** + - **periscope**: Periscope + - **periscope:user**: Periscope user videos + - **PhilharmonieDeParis**: Philharmonie de Paris + - **phoenix.de** + - **Photobucket** + - **Picarto** + - **PicartoVod** + - **Piksel** + - **Pinkbike** + - **Pinterest** + - **PinterestCollection** + - **Pladform** + - **Platzi** + - **PlatziCourse** + - **play.fm** + - **player.sky.it** + - **PlayPlusTV** + - **PlayStuff** + - **PlaysTV** + - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz + - **Playvid** + - **Playwire** + - **pluralsight** + - **pluralsight:course** + - **PlutoTV** + - **podomatic** + - **Pokemon** + - **PokemonWatch** + - **PolskieRadio** + - **PolskieRadioCategory** + - **Popcorntimes** + - **PopcornTV** + - **PornCom** + - **PornerBros** + - **PornFlip** + - **PornHd** + - **PornHub**: PornHub and Thumbzilla + - **PornHubPagedVideoList** + - **PornHubPlaylist** + - **PornHubUser** + - **PornHubUserVideosUpload** + - **Pornotube** + - **PornoVoisines** + - **PornoXO** + - **PornTube** + - **PressTV** + - **ProjectVeritas** + - **prosiebensat1**: ProSiebenSat.1 Digital + - **puhutv** + - **puhutv:serie** + - **Puls4** + - **Pyvideo** + - **qqmusic**: QQ音乐 + - **qqmusic:album**: QQ音乐 - 专辑 + - **qqmusic:playlist**: QQ音乐 - 歌单 + - **qqmusic:singer**: QQ音乐 - 歌手 + - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **QuantumTV** + - **Qub** + - **Quickline** + - **QuicklineLive** + - **R7** + - **R7Article** + - **Radiko** + - 
**RadikoRadio** + - **radio.de** + - **radiobremen** + - **radiocanada** + - **radiocanada:audiovideo** + - **radiofrance** + - **RadioJavan** + - **radlive** + - **radlive:channel** + - **radlive:season** + - **Rai** + - **RaiPlay** + - **RaiPlayLive** + - **RaiPlayPlaylist** + - **RayWenderlich** + - **RayWenderlichCourse** + - **RBMARadio** + - **RCS** + - **RCSEmbeds** + - **RCSVarious** + - **RCTIPlus** + - **RCTIPlusSeries** + - **RCTIPlusTV** + - **RDS**: RDS.ca + - **RedBull** + - **RedBullEmbed** + - **RedBullTV** + - **RedBullTVRrnContent** + - **Reddit** + - **RedditR** + - **RedTube** + - **RegioTV** + - **RENTV** + - **RENTVArticle** + - **Restudy** + - **Reuters** + - **ReverbNation** + - **RICE** + - **RMCDecouverte** + - **RockstarGames** + - **RoosterTeeth** + - **RottenTomatoes** + - **Roxwel** + - **Rozhlas** + - **RTBF** + - **rte**: Raidió Teilifís Éireann TV + - **rte:radio**: Raidió Teilifís Éireann radio + - **rtl.nl**: rtl.nl and rtlxl.nl + - **rtl2** + - **rtl2:you** + - **rtl2:you:series** + - **RTP** + - **RTS**: RTS.ch + - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil + - **rtve.es:live**: RTVE.es live streams + - **rtve.es:television** + - **RTVNH** + - **RTVS** + - **RUHD** + - **RumbleChannel** + - **RumbleEmbed** + - **rutube**: Rutube videos + - **rutube:channel**: Rutube channels + - **rutube:embed**: Rutube embedded videos + - **rutube:movie**: Rutube movies + - **rutube:person**: Rutube person videos + - **rutube:playlist**: Rutube playlists + - **RUTV**: RUTV.RU + - **Ruutu** + - **Ruv** + - **safari**: safaribooksonline.com online video + - **safari:api** + - **safari:course**: safaribooksonline.com online courses + - **Saitosan** + - **SAKTV** + - **SaltTV** + - **SampleFocus** + - **Sapo**: SAPO Vídeos + - **savefrom.net** + - **SBS**: sbs.com.au + - **schooltv** + - **ScienceChannel** + - **screen.yahoo:search**: Yahoo screen search + - **Screencast** + - **ScreencastOMatic** + - **ScrippsNetworks** + - **scrippsnetworks:watch** + - **SCTE** + - **SCTECourse** + - **Seeker** + - **SenateISVP** + - **SendtoNews** + - **Servus** + - **Sexu** + - **SeznamZpravy** + - **SeznamZpravyArticle** + - **Shahid** + - **ShahidShow** + - **Shared**: shared.sx + - **ShemarooMe** + - **ShowRoomLive** + - **simplecast** + - **simplecast:episode** + - **simplecast:podcast** + - **Sina** + - **sky.it** + - **sky:news** + - **sky:sports** + - **sky:sports:news** + - **skyacademy.it** + - **SkylineWebcams** + - **skynewsarabia:article** + - **skynewsarabia:video** + - **Slideshare** + - **SlidesLive** + - **Slutload** + - **Snotr** + - **Sohu** + - **SonyLIV** + - **SonyLIVSeries** + - **soundcloud** + - **soundcloud:playlist** + - **soundcloud:search**: Soundcloud search + - **soundcloud:set** + - **soundcloud:trackstation** + - **soundcloud:user** + - **SoundcloudEmbed** + - **soundgasm** + - **soundgasm:profile** + - **southpark.cc.com** + - **southpark.cc.com:español** + - **southpark.de** + - **southpark.nl** + - **southparkstudios.dk** + - **SovietsCloset** + - **SovietsClosetPlaylist** + - **SpankBang** + - **SpankBangPlaylist** + - **Spankwire** + - **Spiegel** + - **Sport5** + - **SportBox** + - **SportDeutschland** + - **spotify** + - **spotify:show** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage** + - **SpringboardPlatform** + - **Sprout** + - **sr:mediathek**: Saarländischer Rundfunk + - **SRGSSR** + - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites + - 
**stanfordoc**: Stanford Open ClassRoom + - **startv** + - **Steam** + - **Stitcher** + - **StitcherShow** + - **StoryFire** + - **StoryFireSeries** + - **StoryFireUser** + - **Streamable** + - **Streamanity** + - **streamcloud.eu** + - **StreamCZ** + - **StreetVoice** + - **StretchInternet** + - **stv:player** + - **SunPorno** + - **sverigesradio:episode** + - **sverigesradio:publication** + - **SVT** + - **SVTPage** + - **SVTPlay**: SVT Play and Öppet arkiv + - **SVTSeries** + - **SWRMediathek** + - **Syfy** + - **SztvHu** + - **t-online.de** + - **Tagesschau** + - **tagesschau:player** + - **Tass** + - **TBS** + - **TDSLifeway** + - **Teachable** + - **TeachableCourse** + - **teachertube**: teachertube.com videos + - **teachertube:user:collection**: teachertube.com user and collection videos + - **TeachingChannel** + - **Teamcoco** + - **TeamTreeHouse** + - **TechTalks** + - **techtv.mit.edu** + - **ted** + - **Tele13** + - **Tele5** + - **TeleBruxelles** + - **Telecinco**: telecinco.es, cuatro.com and mediaset.es + - **Telegraaf** + - **TeleMB** + - **Telemundo** + - **TeleQuebec** + - **TeleQuebecEmission** + - **TeleQuebecLive** + - **TeleQuebecSquat** + - **TeleQuebecVideo** + - **TeleTask** + - **Telewebion** + - **TennisTV** + - **TenPlay** + - **TF1** + - **TFO** + - **TheIntercept** + - **ThePlatform** + - **ThePlatformFeed** + - **TheScene** + - **TheStar** + - **TheSun** + - **ThetaStream** + - **ThetaVideo** + - **TheWeatherChannel** + - **ThisAmericanLife** + - **ThisAV** + - **ThisOldHouse** + - **TikTok** + - **tiktok:user** + - **tinypic**: tinypic.com videos + - **TMZ** + - **TNAFlix** + - **TNAFlixNetworkEmbed** + - **toggle** + - **Tokentube** + - **Tokentube:channel** + - **ToonGoggles** + - **tou.tv** + - **Toypics**: Toypics video + - **ToypicsUser**: Toypics user profile + - **TrailerAddict** (Currently broken) + - **Trilulilu** + - **Trovo** + - **TrovoVod** + - **TruNews** + - **TruTV** + - **Tube8** + - **TubiTv** + - **TubiTvShow** + - **Tumblr** + - **tunein:clip** + - **tunein:program** + - **tunein:station** + - **tunein:topic** + - **TunePk** + - **Turbo** + - **tv.dfb.de** + - **TV2** + - **TV2Article** + - **TV2DK** + - **TV2DKBornholmPlay** + - **tv2play.hu** + - **tv2playseries.hu** + - **TV4**: tv4.se and tv4play.se + - **TV5MondePlus**: TV5MONDE+ + - **tv5unis** + - **tv5unis:video** + - **tv8.it** + - **TVA** + - **TVANouvelles** + - **TVANouvellesArticle** + - **TVC** + - **TVCArticle** + - **TVer** + - **tvigle**: Интернет-телевидение Tvigle.ru + - **tvland.com** + - **TVN24** + - **TVNet** + - **TVNoe** + - **TVNow** + - **TVNowAnnual** + - **TVNowFilm** + - **TVNowNew** + - **TVNowSeason** + - **TVNowShow** + - **tvp**: Telewizja Polska + - **tvp:embed**: Telewizja Polska + - **tvp:series** + - **TVPlayer** + - **TVPlayHome** + - **Tweakers** + - **TwitCasting** + - **TwitCastingLive** + - **TwitCastingUser** + - **twitch:clips** + - **twitch:stream** + - **twitch:vod** + - **TwitchCollection** + - **TwitchVideos** + - **TwitchVideosClips** + - **TwitchVideosCollections** + - **twitter** + - **twitter:amplify** + - **twitter:broadcast** + - **twitter:card** + - **twitter:shortener** + - **udemy** + - **udemy:course** + - **UDNEmbed**: 聯合影音 + - **UFCArabia** + - **UFCTV** + - **ukcolumn** + - **UKTVPlay** + - **umg:de**: Universal Music Deutschland + - **Unistra** + - **Unity** + - **uol.com.br** + - **uplynk** + - **uplynk:preplay** + - **Urort**: NRK P3 Urørt + - **URPlay** + - **USANetwork** + - **USAToday** + - **ustream** + - 
**ustream:channel** + - **ustudio** + - **ustudio:embed** + - **Utreon** + - **Varzesh3** + - **Vbox7** + - **VeeHD** + - **Veo** + - **Veoh** + - **Vesti**: Вести.Ru + - **Vevo** + - **VevoPlaylist** + - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet + - **vh1.com** + - **vhx:embed** + - **Viafree** + - **vice** + - **vice:article** + - **vice:show** + - **Vidbit** + - **Viddler** + - **Videa** + - **video.arnes.si**: Arnes Video + - **video.google:search**: Google Video search + - **video.sky.it** + - **video.sky.it:live** + - **VideoDetective** + - **videofy.me** + - **videomore** + - **videomore:season** + - **videomore:video** + - **VideoPress** + - **Vidio** + - **VidioLive** + - **VidioPremier** + - **VidLii** + - **vier**: vier.be and vijf.be + - **vier:videos** + - **viewlift** + - **viewlift:embed** + - **Viidea** + - **viki** + - **viki:channel** + - **vimeo** + - **vimeo:album** + - **vimeo:channel** + - **vimeo:group** + - **vimeo:likes**: Vimeo user likes + - **vimeo:ondemand** + - **vimeo:review**: Review pages on vimeo + - **vimeo:user** + - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) + - **Vimple**: Vimple - one-click video hosting + - **Vine** + - **vine:user** + - **Viqeo** + - **Viu** + - **viu:ott** + - **viu:playlist** + - **Vivo**: vivo.sx + - **vk**: VK + - **vk:uservideos**: VK - User's Videos + - **vk:wallpost** + - **vlive** + - **vlive:channel** + - **vlive:post** + - **Vodlocker** + - **VODPl** + - **VODPlatform** + - **VoiceRepublic** + - **voicy** + - **voicy:channel** + - **Voot** + - **VootSeries** + - **VoxMedia** + - **VoxMediaVolume** + - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **Vrak** + - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza + - **VrtNU**: VrtNU.be + - **vrv** + - **vrv:series** + - **VShare** + - **VTM** + - **VTXTV** + - **vube**: Vube.com + - **VuClip** + - **Vupload** + - **VVVVID** + - **VVVVIDShow** + - **VyboryMos** + - **Vzaar** + - **Wakanim** + - **Walla** + - **WalyTV** + - **washingtonpost** + - **washingtonpost:article** + - **wat.tv** + - **WatchBox** + - **WatchIndianPorn**: Watch Indian Porn + - **WDR** + - **wdr:mobile** + - **WDRElefant** + - **WDRPage** + - **web.archive:youtube**: web.archive.org saved youtube videos + - **Webcaster** + - **WebcasterFeed** + - **WebOfStories** + - **WebOfStoriesPlaylist** + - **Weibo** + - **WeiboMobile** + - **WeiqiTV**: WQTV + - **whowatch** + - **WimTV** + - **Wistia** + - **WistiaPlaylist** + - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **WorldStarHipHop** + - **WSJ**: Wall Street Journal + - **WSJArticle** + - **WWE** + - **XBef** + - **XboxClips** + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing + - **XHamster** + - **XHamsterEmbed** + - **XHamsterUser** + - **xiami:album**: 虾米音乐 - 专辑 + - **xiami:artist**: 虾米音乐 - 歌手 + - **xiami:collection**: 虾米音乐 - 精选集 + - **xiami:song**: 虾米音乐 + - **ximalaya**: 喜马拉雅FM + - **ximalaya:album**: 喜马拉雅FM 专辑 + - **XMinus** + - **XNXX** + - **Xstream** + - **XTube** + - **XTubeUser**: XTube user profile + - **Xuite**: 隨意窩Xuite影音 + - **XVideos** + - **XXXYMovies** + - **Yahoo**: Yahoo screen and movies + - **yahoo:gyao** + - **yahoo:gyao:player** + - **yahoo:japannews**: Yahoo! 
Japan News + - **YandexDisk** + - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы + - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки + - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист + - **yandexmusic:track**: Яндекс.Музыка - Трек + - **YandexVideo** + - **YapFiles** + - **YesJapan** + - **yinyuetai:video**: 音悦Tai + - **Ynet** + - **YouJizz** + - **youku**: 优酷 + - **youku:show** + - **YouNowChannel** + - **YouNowLive** + - **YouNowMoment** + - **YouPorn** + - **YourPorn** + - **YourUpload** + - **youtube**: YouTube.com + - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication) + - **youtube:history**: Youtube watch history, ":ythis" for short (requires authentication) + - **youtube:playlist**: YouTube.com playlists + - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) + - **youtube:search**: YouTube.com searches, "ytsearch" keyword + - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword + - **youtube:search_url**: YouTube.com search URLs + - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) + - **youtube:tab**: YouTube.com tab + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtBe**: youtu.be + - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword + - **Zapiks** + - **Zattoo** + - **ZattooLive** + - **ZDF** + - **ZDFChannel** + - **Zee5** + - **zee5:series** + - **ZenYandex** + - **ZenYandexChannel** + - **Zhihu** + - **zingmp3**: mp3.zing.vn + - **zingmp3:album** + - **zoom** + - **Zype** diff --git a/test/helper.py b/test/helper.py index e62aab11e..5c0e645f9 100644 --- a/test/helper.py +++ b/test/helper.py @@ -10,23 +10,31 @@ import types import ssl import sys -import youtube_dl.extractor -from youtube_dl import YoutubeDL -from youtube_dl.compat import ( +import yt_dlp.extractor +from yt_dlp import YoutubeDL +from yt_dlp.compat import ( compat_os_name, compat_str, ) -from youtube_dl.utils import ( +from yt_dlp.utils import ( preferredencoding, write_string, ) +if 'pytest' in sys.modules: + import pytest + is_download_test = pytest.mark.download +else: + def is_download_test(testClass): + return testClass + + def get_params(override=None): PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "parameters.json") + 'parameters.json') LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "local_parameters.json") + 'local_parameters.json') with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: parameters = json.load(pf) if os.path.exists(LOCAL_PARAMETERS_FILE): @@ -90,7 +98,7 @@ class FakeYDL(YoutubeDL): def gettestcases(include_onlymatching=False): - for ie in youtube_dl.extractor.gen_extractors(): + for ie in yt_dlp.extractor.gen_extractors(): for tc in ie.get_testcases(include_onlymatching): yield tc @@ -190,7 +198,10 @@ def expect_info_dict(self, got_dict, expected_dict): expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields if got_dict.get('_type') not in ('playlist', 'multi_video'): - for key in ('id', 'url', 'title', 'ext'): + mandatory_fields = ['id', 'title'] + if expected_dict.get('ext'): + mandatory_fields.extend(('url', 'ext')) + for key in mandatory_fields: self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) # Check for 
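The `test/helper.py` hunk above introduces a conditional decorator: when the suite runs under pytest, `is_download_test` becomes the project-defined `pytest.mark.download` marker, so network-dependent tests can be selected or excluded with `-m download` / `-m "not download"`; under a plain unittest runner it degrades to an identity decorator. The same hunk also relaxes `expect_info_dict` so that `url` and `ext` are mandatory only when the test case itself declares an `ext` (that check continues just below). A minimal self-contained sketch of the marker pattern, assuming the custom `download` mark is registered in the pytest configuration to avoid unknown-mark warnings:

```python
import sys
import unittest

if 'pytest' in sys.modules:
    import pytest
    # Under pytest, tag decorated test classes with the custom
    # "download" mark; `pytest -m "not download"` then skips them.
    is_download_test = pytest.mark.download
else:
    def is_download_test(test_class):
        # Plain unittest runner: no-op identity decorator.
        return test_class


@is_download_test
class TestDownloadExample(unittest.TestCase):
    def test_marker_is_transparent(self):
        self.assertTrue(True)
```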
mandatory fields that are automatically set by YoutubeDL for key in ['webpage_url', 'extractor', 'extractor_key']: diff --git a/test/parameters.json b/test/parameters.json index 65fd54428..9ca7d2ca9 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -1,40 +1,46 @@ { - "consoletitle": false, - "continuedl": true, - "forcedescription": false, - "forcefilename": false, - "forceformat": false, - "forcethumbnail": false, - "forcetitle": false, - "forceurl": false, + "check_formats": false, + "consoletitle": false, + "continuedl": true, + "forcedescription": false, + "forcefilename": false, + "forceformat": false, + "forcethumbnail": false, + "forcetitle": false, + "forceurl": false, + "force_write_download_archive": false, "format": "best", - "ignoreerrors": false, - "listformats": null, - "logtostderr": false, - "matchtitle": null, - "max_downloads": null, - "nooverwrites": false, - "nopart": false, - "noprogress": false, - "outtmpl": "%(id)s.%(ext)s", - "password": null, - "playlistend": -1, - "playliststart": 1, - "prefer_free_formats": false, - "quiet": false, - "ratelimit": null, - "rejecttitle": null, - "retries": 10, - "simulate": false, - "subtitleslang": null, + "ignoreerrors": false, + "listformats": null, + "logtostderr": false, + "matchtitle": null, + "max_downloads": null, + "overwrites": null, + "nopart": false, + "noprogress": false, + "outtmpl": "%(id)s.%(ext)s", + "password": null, + "playliststart": 1, + "prefer_free_formats": false, + "quiet": false, + "ratelimit": null, + "rejecttitle": null, + "retries": 10, + "simulate": false, + "subtitleslang": null, "subtitlesformat": "best", - "test": true, - "updatetime": true, - "usenetrc": false, - "username": null, - "verbose": true, - "writedescription": false, - "writeinfojson": true, + "test": true, + "updatetime": true, + "usenetrc": false, + "username": null, + "verbose": true, + "writedescription": false, + "writeinfojson": true, + "writeannotations": false, + "writelink": false, + "writeurllink": false, + "writewebloclink": false, + "writedesktoplink": false, "writesubtitles": false, "allsubtitles": false, "listsubtitles": false, diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index dd69a681b..cbca22c91 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals @@ -10,10 +10,10 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, expect_dict, expect_value, http_server_port -from youtube_dl.compat import compat_etree_fromstring, compat_http_server -from youtube_dl.extractor.common import InfoExtractor -from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError +from yt_dlp.compat import compat_etree_fromstring, compat_http_server +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.extractor import YoutubeIE, get_info_extractor +from yt_dlp.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError import threading @@ -35,13 +35,13 @@ class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler) assert False -class TestIE(InfoExtractor): +class DummyIE(InfoExtractor): pass class TestInfoExtractor(unittest.TestCase): def setUp(self): - self.ie = TestIE(FakeYDL()) + self.ie = DummyIE(FakeYDL()) def test_ie_key(self): 
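The `TestIE` → `DummyIE` rename in the `test_InfoExtractor.py` hunk above is most likely about pytest collection: pytest gathers any class whose name starts with `Test` in a test module, and a non-test helper class with a constructor provokes a `PytestCollectionWarning` ("cannot collect test class ... because it has a __init__ constructor"). A short illustration of the collision, with illustrative class bodies (the interrupted `test_ie_key` method resumes below):

```python
# Saved as test_example.py and run under pytest:

class TestIE:
    # pytest tries to collect this as a test class and emits
    # PytestCollectionWarning because of the constructor.
    def __init__(self, downloader):
        self.downloader = downloader


class DummyIE:
    # Identical helper, invisible to pytest's Test* collector.
    def __init__(self, downloader):
        self.downloader = downloader
```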
self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) @@ -440,371 +440,430 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ def test_parse_m3u8_formats(self): _TEST_CASES = [ ( - # https://github.com/ytdl-org/youtube-dl/issues/11507 - # http://pluzz.francetv.fr/videos/le_ministere.html - 'pluzz_francetv_11507', - 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', + # https://github.com/ytdl-org/youtube-dl/issues/11995 + # http://teamcoco.com/video/clueless-gamer-super-bowl-for-honor + 'img_bipbop_adv_example_fmp4', + 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', [{ - 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', + 'format_id': 'aud1-English', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a1/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', + 'language': 'en', 'ext': 'mp4', - 'format_id': '180', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.66.30', - 'tbr': 180, - 'width': 256, - 'height': 144, + 'protocol': 'm3u8_native', + 'audio_ext': 'mp4', }, { - 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', + 'format_id': 'aud2-English', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', + 'language': 'en', 'ext': 'mp4', - 'format_id': '303', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.66.30', - 'tbr': 303, - 'width': 320, - 'height': 180, + 'protocol': 'm3u8_native', + 'audio_ext': 'mp4', }, { - 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', + 'format_id': 'aud3-English', + 'url': 
'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a3/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', + 'language': 'en', 'ext': 'mp4', - 'format_id': '575', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.66.30', - 'tbr': 575, - 'width': 512, - 'height': 288, + 'protocol': 'm3u8_native', + 'audio_ext': 'mp4', }, { - 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', + 'format_id': '530', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '831', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.77.30', - 'tbr': 831, - 'width': 704, - 'height': 396, + 'protocol': 'm3u8_native', + 'width': 480, + 'height': 270, + 'vcodec': 'avc1.640015', }, { - 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', + 'format_id': '561', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'protocol': 'm3u8', - 'format_id': '1467', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.77.30', - 'tbr': 1467, - 'width': 1024, - 'height': 576, - }] - ), - ( - # https://github.com/ytdl-org/youtube-dl/issues/11995 - # http://teamcoco.com/video/clueless-gamer-super-bowl-for-honor - 'teamcoco_11995', - 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', - [{ - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'protocol': 'm3u8_native', + 'width': 480, + 'height': 270, + 'vcodec': 'avc1.640015', + }, { + 'format_id': '753', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'audio-0-Default', - 'protocol': 'm3u8', - 'vcodec': 'none', + 'protocol': 'm3u8_native', + 'width': 480, + 'height': 270, + 'vcodec': 'avc1.640015', }, { - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', - 'manifest_url': 
'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'format_id': '895', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v3/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'audio-1-Default', - 'protocol': 'm3u8', - 'vcodec': 'none', + 'protocol': 'm3u8_native', + 'width': 640, + 'height': 360, + 'vcodec': 'avc1.64001e', }, { - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'format_id': '926', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v3/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '71', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.5', - 'vcodec': 'none', - 'tbr': 71, + 'protocol': 'm3u8_native', + 'width': 640, + 'height': 360, + 'vcodec': 'avc1.64001e', }, { - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'format_id': '1118', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v3/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '413', - 'protocol': 'm3u8', - 'acodec': 'none', - 'vcodec': 'avc1.42001e', - 'tbr': 413, - 'width': 400, - 'height': 224, + 'protocol': 'm3u8_native', + 'width': 640, + 'height': 360, + 'vcodec': 'avc1.64001e', }, { - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'format_id': '1265', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v4/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '522', - 'protocol': 'm3u8', - 'acodec': 'none', - 'vcodec': 'avc1.42001e', - 'tbr': 522, - 'width': 400, - 'height': 224, + 'protocol': 'm3u8_native', + 'width': 768, + 'height': 432, + 'vcodec': 'avc1.64001e', }, { - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-1m_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'format_id': '1295', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v4/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '1205', - 'protocol': 'm3u8', - 'acodec': 'none', - 'vcodec': 'avc1.4d001e', - 'tbr': 1205, - 'width': 640, - 'height': 360, + 'protocol': 'm3u8_native', + 'width': 768, + 'height': 432, + 'vcodec': 'avc1.64001e', }, { - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-2m_v4.m3u8', - 'manifest_url': 
'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', + 'format_id': '1487', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v4/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '2374', - 'protocol': 'm3u8', - 'acodec': 'none', - 'vcodec': 'avc1.4d001f', - 'tbr': 2374, - 'width': 1024, - 'height': 576, - }] - ), - ( - # https://github.com/ytdl-org/youtube-dl/issues/12211 - # http://video.toggle.sg/en/series/whoopie-s-world/ep3/478601 - 'toggle_mobile_12211', - 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', - [{ - 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8', - 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', + 'protocol': 'm3u8_native', + 'width': 768, + 'height': 432, + 'vcodec': 'avc1.64001e', + }, { + 'format_id': '2168', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v5/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'audio-English', - 'protocol': 'm3u8', - 'language': 'eng', - 'vcodec': 'none', + 'protocol': 'm3u8_native', + 'width': 960, + 'height': 540, + 'vcodec': 'avc1.640020', }, { - 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8', - 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', + 'format_id': '2198', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v5/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'audio-Undefined', - 'protocol': 'm3u8', - 'language': 'und', - 'vcodec': 'none', + 'protocol': 'm3u8_native', + 'width': 960, + 'height': 540, + 'vcodec': 'avc1.640020', }, { - 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8', - 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', + 'format_id': '2390', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v5/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '155', - 'protocol': 'm3u8', - 'tbr': 155.648, - 'width': 320, - 'height': 180, + 'protocol': 'm3u8_native', + 'width': 960, + 'height': 540, + 'vcodec': 'avc1.640020', }, { - 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8', - 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', + 'format_id': '3168', + 'url': 
'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v6/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '502', - 'protocol': 'm3u8', - 'tbr': 502.784, - 'width': 480, - 'height': 270, + 'protocol': 'm3u8_native', + 'width': 1280, + 'height': 720, + 'vcodec': 'avc1.640020', }, { - 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8', - 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', + 'format_id': '3199', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v6/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '827', - 'protocol': 'm3u8', - 'tbr': 827.392, - 'width': 640, - 'height': 360, + 'protocol': 'm3u8_native', + 'width': 1280, + 'height': 720, + 'vcodec': 'avc1.640020', }, { - 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8', - 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', + 'format_id': '3391', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v6/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '1396', - 'protocol': 'm3u8', - 'tbr': 1396.736, - 'width': 854, - 'height': 480, - }] - ), - ( - # http://www.twitch.tv/riotgames/v/6528877 - 'twitch_vod', - 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', - [{ - 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', + 'protocol': 'm3u8_native', + 'width': 1280, + 'height': 720, + 'vcodec': 'avc1.640020', + }, { + 'format_id': '4670', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v7/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'Audio Only', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'none', - 'tbr': 182.725, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 
'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', + 'format_id': '4701', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v7/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'Mobile', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.42C00D', - 'tbr': 280.474, - 'width': 400, - 'height': 226, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', + 'format_id': '4893', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v7/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'Low', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.42C01E', - 'tbr': 628.347, - 'width': 640, - 'height': 360, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', + 'format_id': '6170', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v8/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'Medium', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.42C01E', - 'tbr': 893.387, - 'width': 852, - 'height': 480, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8', - 'manifest_url': 
'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', + 'format_id': '6200', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v8/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'High', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.42C01F', - 'tbr': 1603.789, - 'width': 1280, - 'height': 720, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', + 'format_id': '6392', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v8/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': 'Source', - 'protocol': 'm3u8', - 'acodec': 'mp4a.40.2', - 'vcodec': 'avc1.100.31', - 'tbr': 3214.134, - 'width': 1280, - 'height': 720, - }] - ), - ( - # http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 - # EXT-X-STREAM-INF tag with NAME attribute that is not defined - # in HLS specification - 'vidio', - 'https://www.vidio.com/videos/165683/playlist.m3u8', - [{ - 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8', - 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', + }, { + 'format_id': '7968', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v9/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '270p 3G', - 'protocol': 'm3u8', - 'tbr': 300, - 'width': 480, - 'height': 270, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8', - 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', + 'format_id': '7998', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v9/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '360p SD', - 'protocol': 'm3u8', - 'tbr': 600, - 'width': 640, - 'height': 360, + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', }, { - 'url': 
'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8', - 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', + 'format_id': '8190', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v9/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', 'ext': 'mp4', - 'format_id': '720p HD', - 'protocol': 'm3u8', - 'tbr': 1200, - 'width': 1280, - 'height': 720, - }] + 'protocol': 'm3u8_native', + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.64002a', + }], + {} ), ( - # https://github.com/ytdl-org/youtube-dl/issues/18923 - # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa - 'ted_18923', - 'http://hls.ted.com/talks/31241.m3u8', + 'bipbop_16x9', + 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', [{ - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '600k-Audio', + 'format_id': 'bipbop_audio-BipBop Audio 2', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/alternate_audio_aac/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'language': 'eng', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, 'vcodec': 'none', + 'audio_ext': 'mp4', + 'video_ext': 'none', }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '68', + 'format_id': '41', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear0/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'tbr': 41.457, + 'ext': 'mp4', + 'fps': None, + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + 'audio_ext': 'mp4', + 'video_ext': 'none', + 'abr': 41.457, }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '163', - 'acodec': 'none', - 'width': 320, - 'height': 180, - }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '481', - 'acodec': 'none', - 'width': 512, - 'height': 288, - }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '769', - 'acodec': 'none', - 'width': 512, - 'height': 288, - }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '984', - 'acodec': 'none', - 'width': 512, - 'height': 288, + 'format_id': '263', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear1/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'tbr': 263.851, + 'ext': 'mp4', + 'fps': None, + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, + 'width': 416, + 'height': 234, + 'vcodec': 'avc1.4d400d', + 'acodec': 'mp4a.40.2', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 263.851, + 'abr': 0, }, { - 'url': 
'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '1255', - 'acodec': 'none', + 'format_id': '577', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'tbr': 577.61, + 'ext': 'mp4', + 'fps': None, + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, 'width': 640, 'height': 360, + 'vcodec': 'avc1.4d401e', + 'acodec': 'mp4a.40.2', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 577.61, + 'abr': 0, }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '1693', - 'acodec': 'none', - 'width': 853, - 'height': 480, + 'format_id': '915', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear3/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'tbr': 915.905, + 'ext': 'mp4', + 'fps': None, + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, + 'width': 960, + 'height': 540, + 'vcodec': 'avc1.4d401f', + 'acodec': 'mp4a.40.2', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 915.905, + 'abr': 0, }, { - 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b', - 'format_id': '2462', - 'acodec': 'none', + 'format_id': '1030', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear4/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'tbr': 1030.138, + 'ext': 'mp4', + 'fps': None, + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, 'width': 1280, 'height': 720, - }] + 'vcodec': 'avc1.4d401f', + 'acodec': 'mp4a.40.2', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 1030.138, + 'abr': 0, + }, { + 'format_id': '1924', + 'format_index': None, + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear5/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8', + 'tbr': 1924.009, + 'ext': 'mp4', + 'fps': None, + 'protocol': 'm3u8_native', + 'preference': None, + 'quality': None, + 'width': 1920, + 'height': 1080, + 'vcodec': 'avc1.4d401f', + 'acodec': 'mp4a.40.2', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 1924.009, + 'abr': 0, + }], + { + 'en': [{ + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }, { + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng_forced/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }], + 'fr': [{ + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }, { + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra_forced/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }], + 'es': [{ + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa/prog_index.m3u8', + 'ext': 'vtt', + 
'protocol': 'm3u8_native' + }, { + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa_forced/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }], + 'ja': [{ + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }, { + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn_forced/prog_index.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_native' + }], + } ), ] - for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: + for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES: with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_m3u8_formats( + formats, subs = self.ie._parse_m3u8_formats_and_subtitles( f.read(), m3u8_url, ext='mp4') self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subs, expected_subs, None) def test_parse_mpd_formats(self): _TEST_CASES = [ @@ -890,7 +949,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 5997.485, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/pull/14844 'urls_only', @@ -973,7 +1033,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 4400, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/issues/20346 # Media considered unfragmented even though it contains @@ -1019,18 +1080,328 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 360, 'height': 360, 'fps': 30, - }] + }], + {}, + ), ( + 'subtitles', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/', + [{ + 'format_id': 'audio=128001', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'm4a', + 'tbr': 128.001, + 'asr': 48000, + 'format_note': 'DASH audio', + 'container': 'm4a_dash', + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + 'audio_ext': 'm4a', + 'video_ext': 'none', + 'abr': 128.001, + }, { + 'format_id': 'video=100000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 336, + 'height': 144, + 'tbr': 100, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 100, + }, { + 'format_id': 'video=326000', + 'manifest_url': 
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 562, + 'height': 240, + 'tbr': 326, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 326, + }, { + 'format_id': 'video=698000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 844, + 'height': 360, + 'tbr': 698, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 698, + }, { + 'format_id': 'video=1493000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 1126, + 'height': 480, + 'tbr': 1493, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 1493, + }, { + 'format_id': 'video=4482000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 1688, + 'height': 720, + 'tbr': 4482, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + 'video_ext': 'mp4', + 'audio_ext': 'none', + 'vbr': 4482, + }], + { + 'en': [ + { + 'ext': 'mp4', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + } + ] + }, ) ] - for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES: + for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES: with 
io.open('./test/testdata/mpd/%s.mpd' % mpd_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_mpd_formats( + formats, subtitles = self.ie._parse_mpd_formats_and_subtitles( compat_etree_fromstring(f.read().encode('utf-8')), mpd_base_url=mpd_base_url, mpd_url=mpd_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subtitles, expected_subtitles, None) + + def test_parse_ism_formats(self): + _TEST_CASES = [ + ( + 'sintel', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + [{ + 'format_id': 'audio-128', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'ext': 'isma', + 'tbr': 128, + 'asr': 48000, + 'vcodec': 'none', + 'acodec': 'AACL', + 'protocol': 'ism', + '_download_params': { + 'stream_type': 'audio', + 'duration': 8880746666, + 'timescale': 10000000, + 'width': 0, + 'height': 0, + 'fourcc': 'AACL', + 'codec_private_data': '1190', + 'sampling_rate': 48000, + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'audio_ext': 'isma', + 'video_ext': 'none', + 'abr': 128, + }, { + 'format_id': 'video-100', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'ext': 'ismv', + 'width': 336, + 'height': 144, + 'tbr': 100, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': { + 'stream_type': 'video', + 'duration': 8880746666, + 'timescale': 10000000, + 'width': 336, + 'height': 144, + 'fourcc': 'AVC1', + 'codec_private_data': '00000001674D401FDA0544EFFC2D002CBC40000003004000000C03C60CA80000000168EF32C8', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 100, + }, { + 'format_id': 'video-326', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'ext': 'ismv', + 'width': 562, + 'height': 240, + 'tbr': 326, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': { + 'stream_type': 'video', + 'duration': 8880746666, + 'timescale': 10000000, + 'width': 562, + 'height': 240, + 'fourcc': 'AVC1', + 'codec_private_data': '00000001674D401FDA0241FE23FFC3BC83BA44000003000400000300C03C60CA800000000168EF32C8', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 326, + }, { + 'format_id': 'video-698', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'ext': 'ismv', + 'width': 844, + 'height': 360, + 'tbr': 698, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': { + 'stream_type': 'video', + 'duration': 
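The parser API change running through these hunks is uniform: `_parse_m3u8_formats` and `_parse_mpd_formats` give way to `_parse_m3u8_formats_and_subtitles` / `_parse_mpd_formats_and_subtitles`, the new ISM test exercises `_parse_ism_formats_and_subtitles`, each returns a `(formats, subtitles)` pair with subtitles keyed by language code, and each test-case tuple gains an expected-subtitles member. A self-contained sketch of the calling convention, with purely illustrative values (the ISM `_download_params` fixture continues below):

```python
def parse_manifest_and_subtitles(manifest_doc):
    """Stand-in for the _parse_*_formats_and_subtitles convention:
    return (formats, subtitles) rather than formats alone."""
    formats = [{
        'format_id': 'video-100',
        'url': 'https://example.invalid/v1/prog_index.m3u8',
        'ext': 'mp4',
        'protocol': 'm3u8_native',
    }]
    # Subtitles: language code -> list of subtitle-format dicts.
    subtitles = {
        'en': [{
            'url': 'https://example.invalid/subs/eng/prog_index.m3u8',
            'ext': 'vtt',
            'protocol': 'm3u8_native',
        }],
    }
    return formats, subtitles


formats, subtitles = parse_manifest_and_subtitles('<manifest/>')
assert list(subtitles) == ['en'] and formats[0]['format_id'] == 'video-100'
```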
8880746666, + 'timescale': 10000000, + 'width': 844, + 'height': 360, + 'fourcc': 'AVC1', + 'codec_private_data': '00000001674D401FDA0350BFB97FF06AF06AD1000003000100000300300F1832A00000000168EF32C8', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 698, + }, { + 'format_id': 'video-1493', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'ext': 'ismv', + 'width': 1126, + 'height': 480, + 'tbr': 1493, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': { + 'stream_type': 'video', + 'duration': 8880746666, + 'timescale': 10000000, + 'width': 1126, + 'height': 480, + 'fourcc': 'AVC1', + 'codec_private_data': '00000001674D401FDA011C3DE6FFF0D890D871000003000100000300300F1832A00000000168EF32C8', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 1493, + }, { + 'format_id': 'video-4482', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'ext': 'ismv', + 'width': 1688, + 'height': 720, + 'tbr': 4482, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': { + 'stream_type': 'video', + 'duration': 8880746666, + 'timescale': 10000000, + 'width': 1688, + 'height': 720, + 'fourcc': 'AVC1', + 'codec_private_data': '00000001674D401FDA01A816F97FFC1ABC1AB440000003004000000C03C60CA80000000168EF32C8', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 4482, + }], + { + 'eng': [ + { + 'ext': 'ismt', + 'protocol': 'ism', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', + '_download_params': { + 'stream_type': 'text', + 'duration': 8880746666, + 'timescale': 10000000, + 'fourcc': 'TTML', + 'codec_private_data': '' + } + } + ] + }, + ), + ] + + for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES: + with io.open('./test/testdata/ism/%s.Manifest' % ism_file, + mode='r', encoding='utf-8') as f: + formats, subtitles = self.ie._parse_ism_formats_and_subtitles( + compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) + expect_value(self, subtitles, expected_subtitles, None) def test_parse_f4m_formats(self): _TEST_CASES = [ diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a35effe0e..bd2d752e2 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -10,14 +10,15 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import copy +import json from test.helper import FakeYDL, assertRegexpMatches -from youtube_dl import YoutubeDL -from 
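
# Illustrative sketch of driving the renamed manifest parsers outside the
# tests above; the harness mirrors test_InfoExtractor.py, and the ism_url
# value is an arbitrary placeholder.
import io
import xml.etree.ElementTree as ET

from test.helper import FakeYDL
from yt_dlp.extractor.common import InfoExtractor


class SketchIE(InfoExtractor):  # a bare subclass is enough to reach the helpers
    pass


ie = SketchIE(FakeYDL())
with io.open('./test/testdata/ism/sintel.Manifest', encoding='utf-8') as f:
    manifest = ET.fromstring(f.read().encode('utf-8'))
# Both the MPD and ISM parsers now return a (formats, subtitles) pair
formats, subtitles = ie._parse_ism_formats_and_subtitles(
    manifest, ism_url='https://example.com/sintel.ism/Manifest')
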
youtube_dl.compat import compat_str, compat_urllib_error -from youtube_dl.extractor import YoutubeIE -from youtube_dl.extractor.common import InfoExtractor -from youtube_dl.postprocessor.common import PostProcessor -from youtube_dl.utils import ExtractorError, match_filter_func +from yt_dlp import YoutubeDL +from yt_dlp.compat import compat_os_name, compat_setenv, compat_str, compat_urllib_error +from yt_dlp.extractor import YoutubeIE +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.postprocessor.common import PostProcessor +from yt_dlp.utils import ExtractorError, int_or_none, match_filter_func, LazyList TEST_URL = 'http://localhost/sample.mp4' @@ -29,11 +30,15 @@ class YDL(FakeYDL): self.msgs = [] def process_info(self, info_dict): + info_dict.pop('__original_infodict', None) self.downloaded_info_dicts.append(info_dict) def to_screen(self, msg): self.msgs.append(msg) + def dl(self, *args, **kwargs): + assert False, 'Downloader must not be invoked for test_YoutubeDL' + def _make_result(formats, **kwargs): res = { @@ -42,6 +47,7 @@ def _make_result(formats, **kwargs): 'title': 'testttitle', 'extractor': 'testex', 'extractor_key': 'TestEx', + 'webpage_url': 'http://example.com/watch?v=shenanigans', } res.update(**kwargs) return res @@ -77,7 +83,7 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'mp4') - # No prefer_free_formats => prefer mp4 and flv for greater compatibility + # No prefer_free_formats => prefer mp4 and webm ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ @@ -103,7 +109,7 @@ class TestFormatSelection(unittest.TestCase): yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['ext'], 'flv') + self.assertEqual(downloaded['ext'], 'webm') def test_format_selection(self): formats = [ @@ -115,35 +121,24 @@ class TestFormatSelection(unittest.TestCase): ] info_dict = _make_result(formats) - ydl = YDL({'format': '20/47'}) - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '47') - - ydl = YDL({'format': '20/71/worst'}) - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '35') - - ydl = YDL() - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '2') - - ydl = YDL({'format': 'webm/mp4'}) - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '47') - - ydl = YDL({'format': '3gp/40/mp4'}) - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '35') - - ydl = YDL({'format': 'example-with-dashes'}) - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'example-with-dashes') + def test(inp, *expected, multi=False): + ydl = YDL({ + 'format': inp, + 'allow_multiple_video_streams': multi, + 'allow_multiple_audio_streams': multi, + }) + ydl.process_ie_result(info_dict.copy()) + downloaded = map(lambda x: x['format_id'], ydl.downloaded_info_dicts) + self.assertEqual(list(downloaded), list(expected)) + + test('20/47', '47') + test('20/71/worst', '35') + test(None, '2') + test('webm/mp4', '47') + test('3gp/40/mp4', '35') + test('example-with-dashes', 
'example-with-dashes') + test('all', '35', 'example-with-dashes', '45', '47', '2') # Order doesn't actually matter for this + test('mergeall', '2+47+45+example-with-dashes+35', multi=True) def test_format_selection_audio(self): formats = [ @@ -310,6 +305,9 @@ class TestFormatSelection(unittest.TestCase): self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) def test_youtube_format_selection(self): + # FIXME: Rewrite in accordance with the new format sorting options + return + order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', # Apple HTTP Live Streaming @@ -347,7 +345,7 @@ class TestFormatSelection(unittest.TestCase): yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '137+141') + self.assertEqual(downloaded['format_id'], '248+172') self.assertEqual(downloaded['ext'], 'mp4') info_dict = _make_result(list(formats_order), extractor='youtube') @@ -456,15 +454,13 @@ class TestFormatSelection(unittest.TestCase): def test_invalid_format_specs(self): def assert_syntax_error(format_spec): - ydl = YDL({'format': format_spec}) - info_dict = _make_result([{'format_id': 'foo', 'url': TEST_URL}]) - self.assertRaises(SyntaxError, ydl.process_ie_result, info_dict) + self.assertRaises(SyntaxError, YDL, {'format': format_spec}) assert_syntax_error('bestvideo,,best') assert_syntax_error('+bestaudio') assert_syntax_error('bestvideo+') assert_syntax_error('/') - assert_syntax_error('bestvideo+bestvideo+bestaudio') + assert_syntax_error('[720<height]') def test_format_filtering(self): formats = [ @@ -535,19 +531,19 @@ class TestFormatSelection(unittest.TestCase): def test_default_format_spec(self): ydl = YDL({'simulate': True}) - self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best') + self.assertEqual(ydl._default_format_spec({}), 'bestvideo*+bestaudio/best') ydl = YDL({}) self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') ydl = YDL({'simulate': True}) - self.assertEqual(ydl._default_format_spec({'is_live': True}), 'bestvideo+bestaudio/best') + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'bestvideo*+bestaudio/best') ydl = YDL({'outtmpl': '-'}) self.assertEqual(ydl._default_format_spec({}), 'best/bestvideo+bestaudio') ydl = YDL({}) - self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo+bestaudio/best') + self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo*+bestaudio/best') self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best/bestvideo+bestaudio') @@ -568,6 +564,7 @@ class TestYoutubeDL(unittest.TestCase): 'subtitles': subtitles, 'automatic_captions': auto_captions, 'extractor': 'TEST', + 'webpage_url': 'http://example.com/watch?v=shenanigans', } def get_info(params={}): @@ -597,6 +594,26 @@ class TestYoutubeDL(unittest.TestCase): self.assertTrue(subs) self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + result = get_info({'writesubtitles': True, 'subtitleslangs': ['all', '-en']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['en', 'fr', '-en']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['fr'])) + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['-en', 'en']}) + subs = 
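
# Illustrative sketch of the format specs the rewritten test() helper covers:
# 'all' keeps every format, while 'mergeall' merges all of them into a single
# file and requires the new multi-stream switches. Invalid specs now fail at
# construction time, which is why assert_syntax_error() merely instantiates.
from yt_dlp import YoutubeDL

ydl = YoutubeDL({
    'format': 'mergeall',
    'allow_multiple_video_streams': True,
    'allow_multiple_audio_streams': True,
})
# YoutubeDL({'format': '[720<height]'})  # would raise SyntaxError right here
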
result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['en'])) + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['e.+']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'en'])) + result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) subs = result['requested_subtitles'] self.assertTrue(subs) @@ -623,47 +640,195 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(test_dict['extractor'], 'Foo') self.assertEqual(test_dict['playlist'], 'funny videos') - def test_prepare_filename(self): - info = { - 'id': '1234', - 'ext': 'mp4', - 'width': None, - 'height': 1080, - 'title1': '$PATH', - 'title2': '%PATH%', - } + outtmpl_info = { + 'id': '1234', + 'ext': 'mp4', + 'width': None, + 'height': 1080, + 'title1': '$PATH', + 'title2': '%PATH%', + 'title3': 'foo/bar\\test', + 'title4': 'foo "bar" test', + 'title5': 'áéí 𝐀', + 'timestamp': 1618488000, + 'duration': 100000, + 'playlist_index': 1, + 'playlist_autonumber': 2, + '_last_playlist_index': 100, + 'n_entries': 10, + 'formats': [{'id': 'id1'}, {'id': 'id2'}, {'id': 'id3'}] + } - def fname(templ, na_placeholder='NA'): - params = {'outtmpl': templ} - if na_placeholder != 'NA': - params['outtmpl_na_placeholder'] = na_placeholder + def test_prepare_outtmpl_and_filename(self): + def test(tmpl, expected, *, info=None, **params): + params['outtmpl'] = tmpl ydl = YoutubeDL(params) - return ydl.prepare_filename(info) - self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') - self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' - # Replace missing fields with 'NA' by default - self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') - # Or by provided placeholder - self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') - self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') - self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') - self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') - self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') - self.assertEqual(fname('%(height)06d.%(ext)s'), '001080.mp4') - self.assertEqual(fname('%(height) 06d.%(ext)s'), ' 01080.mp4') - self.assertEqual(fname('%(height) 06d.%(ext)s'), ' 01080.mp4') - self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4') - self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4') - self.assertEqual(fname('%(height) 0 6d.%(ext)s'), ' 01080.mp4') - self.assertEqual(fname('%%'), '%') - self.assertEqual(fname('%%%%'), '%%') - self.assertEqual(fname('%%(height)06d.%(ext)s'), '%(height)06d.mp4') - self.assertEqual(fname('%(width)06d.%(ext)s'), 'NA.mp4') - self.assertEqual(fname('%(width)06d.%%(ext)s'), 'NA.%(ext)s') - self.assertEqual(fname('%%(width)06d.%(ext)s'), '%(width)06d.mp4') - self.assertEqual(fname('Hello %(title1)s'), 'Hello $PATH') - self.assertEqual(fname('Hello %(title2)s'), 'Hello %PATH%') + ydl._num_downloads = 1 + self.assertEqual(ydl.validate_outtmpl(tmpl), None) + + out = ydl.evaluate_outtmpl(tmpl, info or self.outtmpl_info) + fname = ydl.prepare_filename(info or self.outtmpl_info) + + if not isinstance(expected, (list, tuple)): + expected = (expected, expected) + for (name, got), expect in zip((('outtmpl', out), ('filename', fname)), expected): + if callable(expect): + self.assertTrue(expect(got), f'Wrong {name} from {tmpl}') + else: + 
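
# Sketch of the subtitleslangs queue semantics asserted above: each entry is
# a regex applied in order, and a leading '-' discards languages matched so
# far, so ['all', '-en'] reads as "everything except English".
params = {
    'writesubtitles': True,
    'subtitleslangs': ['all', '-en'],  # with {en, es, fr} available -> {es, fr}
}
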
self.assertEqual(got, expect, f'Wrong {name} from {tmpl}') + + # Side-effects + original_infodict = dict(self.outtmpl_info) + test('foo.bar', 'foo.bar') + original_infodict['epoch'] = self.outtmpl_info.get('epoch') + self.assertTrue(isinstance(original_infodict['epoch'], int)) + test('%(epoch)d', int_or_none) + self.assertEqual(original_infodict, self.outtmpl_info) + + # Auto-generated fields + test('%(id)s.%(ext)s', '1234.mp4') + test('%(duration_string)s', ('27:46:40', '27-46-40')) + test('%(resolution)s', '1080p') + test('%(playlist_index)s', '001') + test('%(playlist_autonumber)s', '02') + test('%(autonumber)s', '00001') + test('%(autonumber+2)03d', '005', autonumber_start=3) + test('%(autonumber)s', '001', autonumber_size=3) + + # Escaping % + test('%', '%') + test('%%', '%') + test('%%%%', '%%') + test('%s', '%s') + test('%%%s', '%%s') + test('%d', '%d') + test('%abc%', '%abc%') + test('%%(width)06d.%(ext)s', '%(width)06d.mp4') + test('%%%(height)s', '%1080') + test('%(width)06d.%(ext)s', 'NA.mp4') + test('%(width)06d.%%(ext)s', 'NA.%(ext)s') + test('%%(width)06d.%(ext)s', '%(width)06d.mp4') + + # ID sanitization + test('%(id)s', '_abcd', info={'id': '_abcd'}) + test('%(some_id)s', '_abcd', info={'some_id': '_abcd'}) + test('%(formats.0.id)s', '_abcd', info={'formats': [{'id': '_abcd'}]}) + test('%(id)s', '-abcd', info={'id': '-abcd'}) + test('%(id)s', '.abcd', info={'id': '.abcd'}) + test('%(id)s', 'ab__cd', info={'id': 'ab__cd'}) + test('%(id)s', ('ab:cd', 'ab -cd'), info={'id': 'ab:cd'}) + + # Invalid templates + self.assertTrue(isinstance(YoutubeDL.validate_outtmpl('%(title)'), ValueError)) + test('%(invalid@tmpl|def)s', 'none', outtmpl_na_placeholder='none') + test('%(..)s', 'NA') + + # Entire info_dict + def expect_same_infodict(out): + got_dict = json.loads(out) + for info_field, expected in self.outtmpl_info.items(): + self.assertEqual(got_dict.get(info_field), expected, info_field) + return True + + test('%()j', (expect_same_infodict, str)) + + # NA placeholder + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(x|def)s-%(id)s.%(ext)s' + test(NA_TEST_OUTTMPL, 'NA-NA-def-1234.mp4') + test(NA_TEST_OUTTMPL, 'none-none-def-1234.mp4', outtmpl_na_placeholder='none') + test(NA_TEST_OUTTMPL, '--def-1234.mp4', outtmpl_na_placeholder='') + + # String formatting + FMT_TEST_OUTTMPL = '%%(height)%s.%%(ext)s' + test(FMT_TEST_OUTTMPL % 's', '1080.mp4') + test(FMT_TEST_OUTTMPL % 'd', '1080.mp4') + test(FMT_TEST_OUTTMPL % '6d', ' 1080.mp4') + test(FMT_TEST_OUTTMPL % '-6d', '1080 .mp4') + test(FMT_TEST_OUTTMPL % '06d', '001080.mp4') + test(FMT_TEST_OUTTMPL % ' 06d', ' 01080.mp4') + test(FMT_TEST_OUTTMPL % ' 06d', ' 01080.mp4') + test(FMT_TEST_OUTTMPL % '0 6d', ' 01080.mp4') + test(FMT_TEST_OUTTMPL % '0 6d', ' 01080.mp4') + test(FMT_TEST_OUTTMPL % ' 0 6d', ' 01080.mp4') + + # Type casting + test('%(id)d', '1234') + test('%(height)c', '1') + test('%(ext)c', 'm') + test('%(id)d %(id)r', "1234 '1234'") + test('%(id)r %(height)r', "'1234' 1080") + test('%(ext)s-%(ext|def)d', 'mp4-def') + test('%(width|0)04d', '0000') + test('a%(width|)d', 'a', outtmpl_na_placeholder='none') + + FORMATS = self.outtmpl_info['formats'] + sanitize = lambda x: x.replace(':', ' -').replace('"', "'") + + # Custom type casting + test('%(formats.:.id)l', 'id1, id2, id3') + test('%(formats.:.id)#l', ('id1\nid2\nid3', 'id1 id2 id3')) + test('%(ext)l', 'mp4') + test('%(formats.:.id) 15l', ' id1, id2, id3') + test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS)))) + test('%(title5).3B', 'á') + 
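
# Sketch of the outtmpl API exercised by test(): templates are validated up
# front and can be evaluated against an info dict without any download.
from yt_dlp import YoutubeDL

ydl = YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})
info = {'id': '1234', 'ext': 'mp4', 'height': 1080}
assert YoutubeDL.validate_outtmpl('%(height)06d.%(ext)s') is None  # None == valid
print(ydl.evaluate_outtmpl('%(height)06d.%(ext)s', info))  # -> 001080.mp4
print(ydl.prepare_filename(info))  # -> 1234.mp4
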
test('%(title5)U', 'áéí 𝐀') + test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') + test('%(title5)+U', 'áéí A') + test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A') + if compat_os_name == 'nt': + test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'")) + else: + test('%(title4)q', ('\'foo "bar" test\'', "'foo 'bar' test'")) + + # Internal formatting + test('%(timestamp-1000>%H-%M-%S)s', '11-43-20') + test('%(title|%)s %(title|%%)s', '% %%') + test('%(id+1-height+3)05d', '00158') + test('%(width+100)05d', 'NA') + test('%(formats.0) 15s', ('% 15s' % FORMATS[0], '% 15s' % sanitize(str(FORMATS[0])))) + test('%(formats.0)r', (repr(FORMATS[0]), sanitize(repr(FORMATS[0])))) + test('%(height.0)03d', '001') + test('%(-height.0)04d', '-001') + test('%(formats.-1.id)s', FORMATS[-1]['id']) + test('%(formats.0.id.-1)d', FORMATS[0]['id'][-1]) + test('%(formats.3)s', 'NA') + test('%(formats.:2:-1)r', repr(FORMATS[:2:-1])) + test('%(formats.0.id.-1+id)f', '1235.000000') + test('%(formats.0.id.-1+formats.1.id.-1)d', '3') + + # Alternates + test('%(title,id)s', '1234') + test('%(width-100,height+20|def)d', '1100') + test('%(width-100,height+width|def)s', 'def') + test('%(timestamp-x>%H\\,%M\\,%S,timestamp>%H\\,%M\\,%S)s', '12,00,00') + + # Laziness + def gen(): + yield from range(5) + raise self.assertTrue(False, 'LazyList should not be evaluated till here') + test('%(key.4)s', '4', info={'key': LazyList(gen())}) + + # Empty filename + test('%(foo|)s-%(bar|)s.%(ext)s', '-.mp4') + # test('%(foo|)s.%(ext)s', ('.mp4', '_.mp4')) # fixme + # test('%(foo|)s', ('', '_')) # fixme + + # Environment variable expansion for prepare_filename + compat_setenv('__yt_dlp_var', 'expanded') + envvar = '%__yt_dlp_var%' if compat_os_name == 'nt' else '$__yt_dlp_var' + test(envvar, (envvar, 'expanded')) + if compat_os_name == 'nt': + test('%s%', ('%s%', '%s%')) + compat_setenv('s', 'expanded') + test('%s%', ('%s%', 'expanded')) # %s% should be expanded before escaping %s + compat_setenv('(test)s', 'expanded') + test('%(test)s%', ('NA%', 'expanded')) # Environment should take priority over template + + # Path expansion and escaping + test('Hello %(title1)s', 'Hello $PATH') + test('Hello %(title2)s', 'Hello %PATH%') + test('%(title3)s', ('foo/bar\\test', 'foo_bar_test')) + test('folder/%(title3)s', ('folder/foo/bar\\test', 'folder%sfoo_bar_test' % os.path.sep)) def test_format_note(self): ydl = YoutubeDL() @@ -722,7 +887,7 @@ class TestYoutubeDL(unittest.TestCase): def process_info(self, info_dict): super(YDL, self).process_info(info_dict) - def _match_entry(self, info_dict, incomplete): + def _match_entry(self, info_dict, incomplete=False): res = super(FilterYDL, self)._match_entry(info_dict, incomplete) if res is None: self.downloaded_info_dicts.append(info_dict) @@ -738,6 +903,7 @@ class TestYoutubeDL(unittest.TestCase): 'playlist_id': '42', 'uploader': "變態妍字幕版 太妍 тест", 'creator': "тест ' 123 ' тест--", + 'webpage_url': 'http://example.com/watch?v=shenanigans', } second = { 'id': '2', @@ -749,6 +915,7 @@ class TestYoutubeDL(unittest.TestCase): 'filesize': 5 * 1024, 'playlist_id': '43', 'uploader': "тест 123", + 'webpage_url': 'http://example.com/watch?v=SHENANIGANS', } videos = [first, second] @@ -831,54 +998,32 @@ class TestYoutubeDL(unittest.TestCase): ydl.process_ie_result(copy.deepcopy(playlist)) return ydl.downloaded_info_dicts - def get_ids(params): - return [int(v['id']) for v in get_downloaded_info_dicts(params)] - - result = get_ids({}) - self.assertEqual(result, [1, 2, 3, 4]) - - result = get_ids({'playlistend': 
10}) - self.assertEqual(result, [1, 2, 3, 4]) - - result = get_ids({'playlistend': 2}) - self.assertEqual(result, [1, 2]) - - result = get_ids({'playliststart': 10}) - self.assertEqual(result, []) - - result = get_ids({'playliststart': 2}) - self.assertEqual(result, [2, 3, 4]) - - result = get_ids({'playlist_items': '2-4'}) - self.assertEqual(result, [2, 3, 4]) - - result = get_ids({'playlist_items': '2,4'}) - self.assertEqual(result, [2, 4]) - - result = get_ids({'playlist_items': '10'}) - self.assertEqual(result, []) - - result = get_ids({'playlist_items': '3-10'}) - self.assertEqual(result, [3, 4]) - - result = get_ids({'playlist_items': '2-4,3-4,3'}) - self.assertEqual(result, [2, 3, 4]) + def test_selection(params, expected_ids): + results = [ + (v['playlist_autonumber'] - 1, (int(v['id']), v['playlist_index'])) + for v in get_downloaded_info_dicts(params)] + self.assertEqual(results, list(enumerate(zip(expected_ids, expected_ids)))) + + test_selection({}, [1, 2, 3, 4]) + test_selection({'playlistend': 10}, [1, 2, 3, 4]) + test_selection({'playlistend': 2}, [1, 2]) + test_selection({'playliststart': 10}, []) + test_selection({'playliststart': 2}, [2, 3, 4]) + test_selection({'playlist_items': '2-4'}, [2, 3, 4]) + test_selection({'playlist_items': '2,4'}, [2, 4]) + test_selection({'playlist_items': '10'}, []) + test_selection({'playlist_items': '0'}, []) # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591 - # @{ - result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) - self.assertEqual(result[0]['playlist_index'], 2) - self.assertEqual(result[1]['playlist_index'], 3) - - result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) - self.assertEqual(result[0]['playlist_index'], 2) - self.assertEqual(result[1]['playlist_index'], 3) - self.assertEqual(result[2]['playlist_index'], 4) - - result = get_downloaded_info_dicts({'playlist_items': '4,2'}) - self.assertEqual(result[0]['playlist_index'], 4) - self.assertEqual(result[1]['playlist_index'], 2) - # @} + test_selection({'playlist_items': '2-4,3-4,3'}, [2, 3, 4]) + test_selection({'playlist_items': '4,2'}, [4, 2]) + + # Tests for https://github.com/yt-dlp/yt-dlp/issues/720 + # https://github.com/yt-dlp/yt-dlp/issues/302 + test_selection({'playlistreverse': True}, [4, 3, 2, 1]) + test_selection({'playliststart': 2, 'playlistreverse': True}, [4, 3, 2]) + test_selection({'playlist_items': '2,4', 'playlistreverse': True}, [4, 2]) + test_selection({'playlist_items': '4,2'}, [4, 2]) def test_urlopen_no_file_protocol(self): # see https://github.com/ytdl-org/youtube-dl/issues/8227 diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 05f48bd74..c514413a4 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -10,7 +10,7 @@ import tempfile import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import YoutubeDLCookieJar +from yt_dlp.utils import YoutubeDLCookieJar class TestYoutubeDLCookieJar(unittest.TestCase): diff --git a/test/test_aes.py b/test/test_aes.py index cc89fb6ab..46db59e57 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python - +#!/usr/bin/env python3 from __future__ import unicode_literals # Allow direct execution @@ -8,8 +7,20 @@ import sys import unittest sys.path.insert(0, 
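
# Sketch of the playlist selection rules test_selection() pins down:
# playlist_items preserves the requested order, playlist_index keeps the
# position within the source playlist, and the new playlist_autonumber
# counts the downloads themselves.
params = {
    'playlist_items': '4,2',   # entry 4 is downloaded first, then entry 2
    # 'playlistreverse': True  # on its own would yield [4, 3, 2, 1]
}
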
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text -from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +from yt_dlp.aes import ( + aes_decrypt, + aes_encrypt, + aes_cbc_decrypt, + aes_cbc_decrypt_bytes, + aes_cbc_encrypt, + aes_ctr_decrypt, + aes_ctr_encrypt, + aes_gcm_decrypt_and_verify, + aes_gcm_decrypt_and_verify_bytes, + aes_decrypt_text +) +from yt_dlp.compat import compat_pycrypto_AES +from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes import base64 # the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' @@ -28,18 +39,43 @@ class TestAES(unittest.TestCase): self.assertEqual(decrypted, msg) def test_cbc_decrypt(self): - data = bytes_to_intlist( - b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" - ) - decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) + data = b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\x27\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd' + decrypted = intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist(data), self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + if compat_pycrypto_AES: + decrypted = aes_cbc_decrypt_bytes(data, intlist_to_bytes(self.key), intlist_to_bytes(self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_cbc_encrypt(self): data = bytes_to_intlist(self.secret_msg) encrypted = intlist_to_bytes(aes_cbc_encrypt(data, self.key, self.iv)) self.assertEqual( encrypted, - b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd") + b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd') + + def test_ctr_decrypt(self): + data = bytes_to_intlist(b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') + decrypted = intlist_to_bytes(aes_ctr_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + + def test_ctr_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + encrypted = intlist_to_bytes(aes_ctr_encrypt(data, self.key, self.iv)) + self.assertEqual( + encrypted, + b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') + + def test_gcm_decrypt(self): + data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f.\x08\xb4T\xe4/\x17\xbd' + authentication_tag = b'\xe8&I\x80rI\x07\x9d}YWuU@:e' + + decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify( + bytes_to_intlist(data), self.key, bytes_to_intlist(authentication_tag), self.iv[:12])) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + if compat_pycrypto_AES: + decrypted = aes_gcm_decrypt_and_verify_bytes( + data, intlist_to_bytes(self.key), authentication_tag, intlist_to_bytes(self.iv[:12])) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_decrypt_text(self): password = intlist_to_bytes(self.key).decode('utf-8') diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 6f5513faa..70f9f4845 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals # Allow direct execution @@ -7,10 +7,9 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import try_rm +from test.helper import try_rm, is_download_test - -from youtube_dl import YoutubeDL +from yt_dlp import 
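
# Illustrative sketch of the new AES helpers imported above; the key/iv/message
# values here are arbitrary stand-ins, not the generated testdata.
from yt_dlp.aes import aes_ctr_decrypt, aes_ctr_encrypt
from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes

key = bytes_to_intlist(b'0123456789abcdef')  # 16 bytes -> AES-128
iv = bytes_to_intlist(b'\x00' * 16)
msg = bytes_to_intlist(b'secret message!!')
ct = aes_ctr_encrypt(msg, key, iv)
# CTR mode is symmetric, so decryption mirrors encryption
assert intlist_to_bytes(aes_ctr_decrypt(ct, key, iv)) == b'secret message!!'
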
YoutubeDL def _download_restricted(url, filename, age): @@ -32,6 +31,7 @@ def _download_restricted(url, filename, age): return res +@is_download_test class TestAgeRestriction(unittest.TestCase): def _assert_restricted(self, url, filename, age, old_age=None): self.assertTrue(_download_restricted(url, filename, old_age)) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 365b66bad..68c1c68d3 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals @@ -12,7 +12,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import gettestcases -from youtube_dl.extractor import ( +from yt_dlp.extractor import ( FacebookIE, gen_extractors, YoutubeIE, @@ -35,6 +35,8 @@ class TestAllURLsMatching(unittest.TestCase): assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') + assertTab('https://www.youtube.com/AsapSCIENCE') + assertTab('https://www.youtube.com/embedded') assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') @@ -47,7 +49,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668 self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) - self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) + # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) # /v/ is no longer valid self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) @@ -66,9 +68,9 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) - # def test_youtube_search_matching(self): - # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + def test_youtube_search_matching(self): + self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) diff --git a/test/test_cache.py b/test/test_cache.py index a16160142..8c4f85387 100644 --- a/test/test_cache.py +++ b/test/test_cache.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -13,7 +13,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL -from youtube_dl.cache import Cache +from yt_dlp.cache import Cache def _is_empty(d): diff --git a/test/test_compat.py b/test/test_compat.py index 86ff389fd..c9bc4d7fb 100644 --- 
a/test/test_compat.py +++ b/test/test_compat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -10,7 +10,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.compat import ( +from yt_dlp.compat import ( compat_getenv, compat_setenv, compat_etree_Element, @@ -19,6 +19,8 @@ from youtube_dl.compat import ( compat_shlex_split, compat_str, compat_struct_unpack, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, @@ -28,11 +30,11 @@ from youtube_dl.compat import ( class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' - compat_setenv('YOUTUBE_DL_COMPAT_GETENV', test_str) - self.assertEqual(compat_getenv('YOUTUBE_DL_COMPAT_GETENV'), test_str) + compat_setenv('yt_dlp_COMPAT_GETENV', test_str) + self.assertEqual(compat_getenv('yt_dlp_COMPAT_GETENV'), test_str) def test_compat_setenv(self): - test_var = 'YOUTUBE_DL_COMPAT_SETENV' + test_var = 'yt_dlp_COMPAT_SETENV' test_str = 'тест' compat_setenv(test_var, test_str) compat_getenv(test_var) @@ -46,13 +48,34 @@ class TestCompat(unittest.TestCase): compat_setenv('HOME', old_home or '') def test_all_present(self): - import youtube_dl.compat - all_names = youtube_dl.compat.__all__ + import yt_dlp.compat + all_names = yt_dlp.compat.__all__ present_names = set(filter( lambda c: '_' in c and not c.startswith('_'), - dir(youtube_dl.compat))) - set(['unicode_literals']) + dir(yt_dlp.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_quote(self): + self.assertEqual(compat_urllib_parse_quote('abc def'), 'abc%20def') + self.assertEqual(compat_urllib_parse_quote('/user/abc+def'), '/user/abc%2Bdef') + self.assertEqual(compat_urllib_parse_quote('/user/abc+def', safe='+'), '%2Fuser%2Fabc+def') + self.assertEqual(compat_urllib_parse_quote(''), '') + self.assertEqual(compat_urllib_parse_quote('%'), '%25') + self.assertEqual(compat_urllib_parse_quote('%', safe='%'), '%') + self.assertEqual(compat_urllib_parse_quote('津波'), '%E6%B4%A5%E6%B3%A2') + self.assertEqual( + compat_urllib_parse_quote('''<meta property="og:description" content="▁▂▃▄%▅▆▇█" /> +%<a href="https://ar.wikipedia.org/wiki/تسونامي">%a''', safe='<>=":%/ \r\n'), + '''<meta property="og:description" content="%E2%96%81%E2%96%82%E2%96%83%E2%96%84%%E2%96%85%E2%96%86%E2%96%87%E2%96%88" /> +%<a href="https://ar.wikipedia.org/wiki/%D8%AA%D8%B3%D9%88%D9%86%D8%A7%D9%85%D9%8A">%a''') + self.assertEqual( + compat_urllib_parse_quote('''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%25Things%''', safe='% '), + '''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%''') + + def test_compat_urllib_parse_quote_plus(self): + self.assertEqual(compat_urllib_parse_quote_plus('abc def'), 'abc+def') + self.assertEqual(compat_urllib_parse_quote_plus('/abc def'), '%2Fabc+def') + def test_compat_urllib_parse_unquote(self): self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') diff --git a/test/test_cookies.py b/test/test_cookies.py new file mode 100644 index 000000000..7d509ebe8 --- /dev/null +++ b/test/test_cookies.py @@ -0,0 +1,107 @@ +import unittest +from datetime import datetime, timezone + +from yt_dlp import 
cookies +from yt_dlp.cookies import ( + LinuxChromeCookieDecryptor, + MacChromeCookieDecryptor, + WindowsChromeCookieDecryptor, + parse_safari_cookies, + pbkdf2_sha1, +) + + +class Logger: + def debug(self, message): + print(f'[verbose] {message}') + + def info(self, message): + print(message) + + def warning(self, message, only_once=False): + self.error(message) + + def error(self, message): + raise Exception(message) + + +class MonkeyPatch: + def __init__(self, module, temporary_values): + self._module = module + self._temporary_values = temporary_values + self._backup_values = {} + + def __enter__(self): + for name, temp_value in self._temporary_values.items(): + self._backup_values[name] = getattr(self._module, name) + setattr(self._module, name, temp_value) + + def __exit__(self, exc_type, exc_val, exc_tb): + for name, backup_value in self._backup_values.items(): + setattr(self._module, name, backup_value) + + +class TestCookies(unittest.TestCase): + def test_chrome_cookie_decryptor_linux_derive_key(self): + key = LinuxChromeCookieDecryptor.derive_key(b'abc') + self.assertEqual(key, b'7\xa1\xec\xd4m\xfcA\xc7\xb19Z\xd0\x19\xdcM\x17') + + def test_chrome_cookie_decryptor_mac_derive_key(self): + key = MacChromeCookieDecryptor.derive_key(b'abc') + self.assertEqual(key, b'Y\xe2\xc0\xd0P\xf6\xf4\xe1l\xc1\x8cQ\xcb|\xcdY') + + def test_chrome_cookie_decryptor_linux_v10(self): + with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b''}): + encrypted_value = b'v10\xccW%\xcd\xe6\xe6\x9fM" \xa7\xb0\xca\xe4\x07\xd6' + value = 'USD' + decryptor = LinuxChromeCookieDecryptor('Chrome', Logger()) + self.assertEqual(decryptor.decrypt(encrypted_value), value) + + def test_chrome_cookie_decryptor_linux_v11(self): + with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b'', + 'KEYRING_AVAILABLE': True}): + encrypted_value = b'v11#\x81\x10>`w\x8f)\xc0\xb2\xc1\r\xf4\x1al\xdd\x93\xfd\xf8\xf8N\xf2\xa9\x83\xf1\xe9o\x0elVQd' + value = 'tz=Europe.London' + decryptor = LinuxChromeCookieDecryptor('Chrome', Logger()) + self.assertEqual(decryptor.decrypt(encrypted_value), value) + + def test_chrome_cookie_decryptor_windows_v10(self): + with MonkeyPatch(cookies, { + '_get_windows_v10_key': lambda *args, **kwargs: b'Y\xef\xad\xad\xeerp\xf0Y\xe6\x9b\x12\xc2<z\x16]\n\xbb\xb8\xcb\xd7\x9bA\xc3\x14e\x99{\xd6\xf4&' + }): + encrypted_value = b'v10T\xb8\xf3\xb8\x01\xa7TtcV\xfc\x88\xb8\xb8\xef\x05\xb5\xfd\x18\xc90\x009\xab\xb1\x893\x85)\x87\xe1\xa9-\xa3\xad=' + value = '32101439' + decryptor = WindowsChromeCookieDecryptor('', Logger()) + self.assertEqual(decryptor.decrypt(encrypted_value), value) + + def test_chrome_cookie_decryptor_mac_v10(self): + with MonkeyPatch(cookies, {'_get_mac_keyring_password': lambda *args, **kwargs: b'6eIDUdtKAacvlHwBVwvg/Q=='}): + encrypted_value = b'v10\xb3\xbe\xad\xa1[\x9fC\xa1\x98\xe0\x9a\x01\xd9\xcf\xbfc' + value = '2021-06-01-22' + decryptor = MacChromeCookieDecryptor('', Logger()) + self.assertEqual(decryptor.decrypt(encrypted_value), value) + + def test_safari_cookie_parsing(self): + cookies = \ + b'cook\x00\x00\x00\x01\x00\x00\x00i\x00\x00\x01\x00\x01\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00Y' \ + b'\x00\x00\x00\x00\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x008\x00\x00\x00B\x00\x00\x00F\x00\x00\x00H' \ + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x03\xa5>\xc3A\x00\x00\x80\xc3\x07:\xc3A' \ + b'localhost\x00foo\x00/\x00test%20%3Bcookie\x00\x00\x00\x054\x07\x17 \x05\x00\x00\x00Kbplist00\xd1\x01' \ + 
b'\x02_\x10\x18NSHTTPCookieAcceptPolicy\x10\x02\x08\x0b&\x00\x00\x00\x00\x00\x00\x01\x01\x00\x00\x00' \ + b'\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(' + + jar = parse_safari_cookies(cookies) + self.assertEqual(len(jar), 1) + cookie = list(jar)[0] + self.assertEqual(cookie.domain, 'localhost') + self.assertEqual(cookie.port, None) + self.assertEqual(cookie.path, '/') + self.assertEqual(cookie.name, 'foo') + self.assertEqual(cookie.value, 'test%20%3Bcookie') + self.assertFalse(cookie.secure) + expected_expiration = datetime(2021, 6, 18, 21, 39, 19, tzinfo=timezone.utc) + self.assertEqual(cookie.expires, int(expected_expiration.timestamp())) + + def test_pbkdf2_sha1(self): + key = pbkdf2_sha1(b'peanuts', b' ' * 16, 1, 16) + self.assertEqual(key, b'g\xe1\x8e\x0fQ\x1c\x9b\xf3\xc9`!\xaa\x90\xd9\xd34') diff --git a/test/test_download.py b/test/test_download.py index ebe820dfc..d7c469f3d 100644..100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals @@ -10,12 +10,13 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( assertGreaterEqual, + expect_info_dict, expect_warnings, get_params, gettestcases, - expect_info_dict, - try_rm, + is_download_test, report_warning, + try_rm, ) @@ -24,24 +25,24 @@ import io import json import socket -import youtube_dl.YoutubeDL -from youtube_dl.compat import ( +import yt_dlp.YoutubeDL +from yt_dlp.compat import ( compat_http_client, compat_urllib_error, compat_HTTPError, ) -from youtube_dl.utils import ( +from yt_dlp.utils import ( DownloadError, ExtractorError, format_bytes, UnavailableVideoError, ) -from youtube_dl.extractor import get_info_extractor +from yt_dlp.extractor import get_info_extractor RETRIES = 3 -class YoutubeDL(youtube_dl.YoutubeDL): +class YoutubeDL(yt_dlp.YoutubeDL): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen self.processed_info_dicts = [] @@ -64,6 +65,7 @@ def _file_md5(fn): defs = gettestcases() +@is_download_test class TestDownload(unittest.TestCase): # Parallel testing in nosetests. See # http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html @@ -71,6 +73,8 @@ class TestDownload(unittest.TestCase): maxDiff = None + COMPLETED_TESTS = {} + def __str__(self): """Identify each test with the `add_ie` attribute, if available.""" @@ -92,7 +96,10 @@ class TestDownload(unittest.TestCase): def generator(test_case, tname): def test_template(self): - ie = youtube_dl.extractor.get_info_extractor(test_case['name'])() + if self.COMPLETED_TESTS.get(tname): + return + self.COMPLETED_TESTS[tname] = True + ie = yt_dlp.extractor.get_info_extractor(test_case['name'])() other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])] is_playlist = any(k.startswith('playlist') for k in test_case) test_cases = test_case.get( @@ -106,8 +113,13 @@ def generator(test_case, tname): for tc in test_cases: info_dict = tc.get('info_dict', {}) - if not (info_dict.get('id') and info_dict.get('ext')): - raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?') + params = tc.get('params', {}) + if not info_dict.get('id'): + raise Exception('Test definition incorrect. 
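
# The MonkeyPatch helper above is a generic attribute-swapping context
# manager; a self-contained usage sketch against a stand-in object (the
# import path assumes the helper stays in test/test_cookies.py):
import types

from test.test_cookies import MonkeyPatch

fake_module = types.SimpleNamespace(KEYRING_AVAILABLE=False)
with MonkeyPatch(fake_module, {'KEYRING_AVAILABLE': True}):
    assert fake_module.KEYRING_AVAILABLE is True
assert fake_module.KEYRING_AVAILABLE is False  # original value restored on exit
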
\'id\' key is not present') + elif not info_dict.get('ext'): + if params.get('skip_download') and params.get('ignore_no_formats_error'): + continue + raise Exception('Test definition incorrect. The output file cannot be known. \'ext\' key is not present') if 'skip' in test_case: print_skipping(test_case['skip']) @@ -121,6 +133,7 @@ def generator(test_case, tname): params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') + params.setdefault('playlistend', test_case.get('playlist_mincount')) params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) @@ -134,7 +147,7 @@ def generator(test_case, tname): expect_warnings(ydl, test_case.get('expected_warnings', [])) def get_tc_filename(tc): - return ydl.prepare_filename(tc.get('info_dict', {})) + return ydl.prepare_filename(dict(tc.get('info_dict', {}))) res_dict = None @@ -247,12 +260,12 @@ def generator(test_case, tname): # And add them to TestDownload -for n, test_case in enumerate(defs): - tname = 'test_' + str(test_case['name']) - i = 1 - while hasattr(TestDownload, tname): - tname = 'test_%s_%d' % (test_case['name'], i) - i += 1 +tests_counter = {} +for test_case in defs: + name = test_case['name'] + i = tests_counter.get(name, 0) + tests_counter[name] = i + 1 + tname = f'test_{name}_{i}' if i else f'test_{name}' test_method = generator(test_case, tname) test_method.__name__ = str(tname) ie_list = test_case.get('add_ie') @@ -261,5 +274,22 @@ for n, test_case in enumerate(defs): del test_method +def batch_generator(name, num_tests): + + def test_template(self): + for i in range(num_tests): + getattr(self, f'test_{name}_{i}' if i else f'test_{name}')() + + return test_template + + +for name, num_tests in tests_counter.items(): + test_method = batch_generator(name, num_tests) + test_method.__name__ = f'test_{name}_all' + test_method.add_ie = '' + setattr(TestDownload, test_method.__name__, test_method) + del test_method + + if __name__ == '__main__': unittest.main() diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 750472281..03ae8c62a 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -10,10 +10,10 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import http_server_port, try_rm -from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server -from youtube_dl.downloader.http import HttpFD -from youtube_dl.utils import encodeFilename +from yt_dlp import YoutubeDL +from yt_dlp.compat import compat_http_server +from yt_dlp.downloader.http import HttpFD +from yt_dlp.utils import encodeFilename import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/test_execution.py b/test/test_execution.py index 32948d93e..cf6b6b913 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -10,7 +10,7 @@ import os import subprocess sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import encodeArgument +from yt_dlp.utils import encodeArgument rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -23,29 +23,29 @@ except AttributeError: class 
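
# Sketch of the test-naming scheme the rewritten registration loop produces:
# the first case for an extractor keeps the bare name, duplicates gain a
# numeric suffix, and batch_generator adds a *_all method that runs the lot.
tests_counter = {}
for name in ['Youtube', 'Youtube', 'Vimeo']:
    i = tests_counter.get(name, 0)
    tests_counter[name] = i + 1
    print(f'test_{name}_{i}' if i else f'test_{name}')
# -> test_Youtube, test_Youtube_1, test_Vimeo (plus test_Youtube_all, ...)
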
TestExecution(unittest.TestCase): def test_import(self): - subprocess.check_call([sys.executable, '-c', 'import youtube_dl'], cwd=rootDir) + subprocess.check_call([sys.executable, '-c', 'import yt_dlp'], cwd=rootDir) def test_module_exec(self): if sys.version_info >= (2, 7): # Python 2.6 doesn't support package execution - subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, '-m', 'yt_dlp', '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_main_exec(self): - subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'yt_dlp/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_cmdline_umlauts(self): p = subprocess.Popen( - [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'], + [sys.executable, 'yt_dlp/__main__.py', encodeArgument('ä'), '--version'], cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) _, stderr = p.communicate() self.assertFalse(stderr) def test_lazy_extractors(self): try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) finally: try: - os.remove('youtube_dl/extractor/lazy_extractors.py') + os.remove('yt_dlp/extractor/lazy_extractors.py') except (IOError, OSError): pass diff --git a/test/test_http.py b/test/test_http.py index 3ee0a5dda..40df167e0 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -9,8 +9,8 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import http_server_port -from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server, compat_urllib_request +from yt_dlp import YoutubeDL +from yt_dlp.compat import compat_http_server, compat_urllib_request import ssl import threading diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c24b8ca74..8b2b60403 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.jsinterp import JSInterpreter +from yt_dlp.jsinterp import JSInterpreter class TestJSInterpreter(unittest.TestCase): diff --git a/test/test_netrc.py b/test/test_netrc.py index 7cf3a6a2e..36b943591 100644 --- a/test/test_netrc.py +++ b/test/test_netrc.py @@ -7,7 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import ( +from yt_dlp.extractor import ( gen_extractors, ) diff --git a/test/test_options.py b/test/test_options.py index 3a25a6ba3..42d9183a9 100644 --- a/test/test_options.py +++ b/test/test_options.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.options import _hide_login_info +from yt_dlp.options import _hide_login_info class 
TestOptions(unittest.TestCase): diff --git a/test/test_overwrites.py b/test/test_overwrites.py new file mode 100644 index 000000000..f5d10a409 --- /dev/null +++ b/test/test_overwrites.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +from __future__ import unicode_literals + +import os +from os.path import join +import subprocess +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import is_download_test, try_rm + + +root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +download_file = join(root_dir, 'test.webm') + + +@is_download_test +class TestOverwrites(unittest.TestCase): + def setUp(self): + # create an empty file + open(download_file, 'a').close() + + def test_default_overwrites(self): + outp = subprocess.Popen( + [ + sys.executable, 'yt_dlp/__main__.py', + '-o', 'test.webm', + 'https://www.youtube.com/watch?v=jNQXAC9IVRw' + ], cwd=root_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sout, serr = outp.communicate() + self.assertTrue(b'has already been downloaded' in sout) + # if the file has no content, it has not been redownloaded + self.assertTrue(os.path.getsize(download_file) < 1) + + def test_yes_overwrites(self): + outp = subprocess.Popen( + [ + sys.executable, 'yt_dlp/__main__.py', '--yes-overwrites', + '-o', 'test.webm', + 'https://www.youtube.com/watch?v=jNQXAC9IVRw' + ], cwd=root_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sout, serr = outp.communicate() + self.assertTrue(b'has already been downloaded' not in sout) + # if the file has no content, it has not been redownloaded + self.assertTrue(os.path.getsize(download_file) > 1) + + def tearDown(self): + try_rm(join(root_dir, 'test.webm')) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_post_hooks.py b/test/test_post_hooks.py new file mode 100644 index 000000000..1555a23e0 --- /dev/null +++ b/test/test_post_hooks.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +from __future__ import unicode_literals + +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_params, try_rm, is_download_test +import yt_dlp.YoutubeDL +from yt_dlp.utils import DownloadError + + +class YoutubeDL(yt_dlp.YoutubeDL): + def __init__(self, *args, **kwargs): + super(YoutubeDL, self).__init__(*args, **kwargs) + self.to_stderr = self.to_screen + + +TEST_ID = 'gr51aVj-mLg' +EXPECTED_NAME = 'gr51aVj-mLg' + + +@is_download_test +class TestPostHooks(unittest.TestCase): + def setUp(self): + self.stored_name_1 = None + self.stored_name_2 = None + self.params = get_params({ + 'skip_download': False, + 'writeinfojson': False, + 'quiet': True, + 'verbose': False, + 'cachedir': False, + }) + self.files = [] + + def test_post_hooks(self): + self.params['post_hooks'] = [self.hook_one, self.hook_two] + ydl = YoutubeDL(self.params) + ydl.download([TEST_ID]) + self.assertEqual(self.stored_name_1, EXPECTED_NAME, 'Not the expected name from hook 1') + self.assertEqual(self.stored_name_2, EXPECTED_NAME, 'Not the expected name from hook 2') + + def test_post_hook_exception(self): + self.params['post_hooks'] = [self.hook_three] + ydl = YoutubeDL(self.params) + self.assertRaises(DownloadError, ydl.download, [TEST_ID]) + + def hook_one(self, filename): + self.stored_name_1, _ = os.path.splitext(os.path.basename(filename)) + self.files.append(filename) + + def hook_two(self, filename): + self.stored_name_2, _ = 
os.path.splitext(os.path.basename(filename)) + self.files.append(filename) + + def hook_three(self, filename): + self.files.append(filename) + raise Exception('Test exception for \'%s\'' % filename) + + def tearDown(self): + for f in self.files: + try_rm(f) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index 4209d1d9a..090c7b47b 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals @@ -6,12 +6,557 @@ from __future__ import unicode_literals import os import sys import unittest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.postprocessor import MetadataFromTitlePP +from yt_dlp import YoutubeDL +from yt_dlp.compat import compat_shlex_quote +from yt_dlp.postprocessor import ( + ExecPP, + FFmpegThumbnailsConvertorPP, + MetadataFromFieldPP, + MetadataParserPP, + ModifyChaptersPP +) + +class TestMetadataFromField(unittest.TestCase): -class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): - pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') - self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') + self.assertEqual( + MetadataParserPP.format_to_regex('%(title)s - %(artist)s'), + r'(?P<title>.+)\ \-\ (?P<artist>.+)') + self.assertEqual(MetadataParserPP.format_to_regex(r'(?P<x>.+)'), r'(?P<x>.+)') + + def test_field_to_template(self): + self.assertEqual(MetadataParserPP.field_to_template('title'), '%(title)s') + self.assertEqual(MetadataParserPP.field_to_template('1'), '1') + self.assertEqual(MetadataParserPP.field_to_template('foo bar'), 'foo bar') + self.assertEqual(MetadataParserPP.field_to_template(' literal'), ' literal') + + def test_metadatafromfield(self): + self.assertEqual( + MetadataFromFieldPP.to_action('%(title)s \\: %(artist)s:%(title)s : %(artist)s'), + (MetadataParserPP.Actions.INTERPRET, '%(title)s : %(artist)s', '%(title)s : %(artist)s')) + + +class TestConvertThumbnail(unittest.TestCase): + def test_escaping(self): + pp = FFmpegThumbnailsConvertorPP() + if not pp.available: + print('Skipping: ffmpeg not found') + return + + file = 'test/testdata/thumbnails/foo %d bar/foo_%d.{}' + tests = (('webp', 'png'), ('png', 'jpg')) + + for inp, out in tests: + out_file = file.format(out) + if os.path.exists(out_file): + os.remove(out_file) + pp.convert_thumbnail(file.format(inp), out) + assert os.path.exists(out_file) + + for _, out in tests: + os.remove(file.format(out)) + + +class TestExec(unittest.TestCase): + def test_parse_cmd(self): + pp = ExecPP(YoutubeDL(), '') + info = {'filepath': 'file name'} + cmd = 'echo %s' % compat_shlex_quote(info['filepath']) + + self.assertEqual(pp.parse_cmd('echo', info), cmd) + self.assertEqual(pp.parse_cmd('echo {}', info), cmd) + self.assertEqual(pp.parse_cmd('echo %(filepath)q', info), cmd) + + +class TestModifyChaptersPP(unittest.TestCase): + def setUp(self): + self._pp = ModifyChaptersPP(YoutubeDL()) + + @staticmethod + def _sponsor_chapter(start, end, cat, remove=False): + c = {'start_time': start, 'end_time': end, '_categories': [(cat, start, end)]} + if remove: + c['remove'] = True + return c + + @staticmethod + def _chapter(start, end, title=None, remove=False): + c = {'start_time': start, 'end_time': end} + if title is not None: + c['title'] = title + if remove: + c['remove'] = True + return c + + def _chapters(self, ends, titles): + 
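
# Sketch of the post_hooks API the new test file exercises: every hook is a
# callable that receives the final filename once a download finishes, and an
# exception raised inside a hook surfaces as DownloadError.
from yt_dlp import YoutubeDL

def archive_hook(filename):
    print('finished:', filename)  # e.g. hand the file off to another tool

ydl = YoutubeDL({'post_hooks': [archive_hook]})
# ydl.download(['gr51aVj-mLg'])  # commented out: would hit the network
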
self.assertEqual(len(ends), len(titles)) + start = 0 + chapters = [] + for e, t in zip(ends, titles): + chapters.append(self._chapter(start, e, t)) + start = e + return chapters + + def _remove_marked_arrange_sponsors_test_impl( + self, chapters, expected_chapters, expected_removed): + actual_chapters, actual_removed = ( + self._pp._remove_marked_arrange_sponsors(chapters)) + for c in actual_removed: + c.pop('title', None) + c.pop('_categories', None) + actual_chapters = [{ + 'start_time': c['start_time'], + 'end_time': c['end_time'], + 'title': c['title'], + } for c in actual_chapters] + self.assertSequenceEqual(expected_chapters, actual_chapters) + self.assertSequenceEqual(expected_removed, actual_removed) + + def test_remove_marked_arrange_sponsors_CanGetThroughUnaltered(self): + chapters = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, chapters, []) + + def test_remove_marked_arrange_sponsors_ChapterWithSponsors(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(30, 40, 'preview'), + self._sponsor_chapter(50, 60, 'sponsor')] + expected = self._chapters( + [10, 20, 30, 40, 50, 60, 70], + ['c', '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Preview/Recap', + 'c', '[SponsorBlock]: Sponsor', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_UniqueNamesForOverlappingSponsors(self): + chapters = self._chapters([120], ['c']) + [ + self._sponsor_chapter(10, 45, 'sponsor'), self._sponsor_chapter(20, 40, 'selfpromo'), + self._sponsor_chapter(50, 70, 'sponsor'), self._sponsor_chapter(60, 85, 'selfpromo'), + self._sponsor_chapter(90, 120, 'selfpromo'), self._sponsor_chapter(100, 110, 'sponsor')] + expected = self._chapters( + [10, 20, 40, 45, 50, 60, 70, 85, 90, 100, 110, 120], + ['c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + '[SponsorBlock]: Sponsor', + 'c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion', + 'c', '[SponsorBlock]: Unpaid/Self Promotion', '[SponsorBlock]: Unpaid/Self Promotion, Sponsor', + '[SponsorBlock]: Unpaid/Self Promotion']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChapterWithCuts(self): + cuts = [self._chapter(10, 20, remove=True), + self._sponsor_chapter(30, 40, 'sponsor', remove=True), + self._chapter(50, 60, remove=True)] + chapters = self._chapters([70], ['c']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([40], ['c']), cuts) + + def test_remove_marked_arrange_sponsors_ChapterWithSponsorsAndCuts(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(30, 40, 'selfpromo', remove=True), + self._sponsor_chapter(50, 60, 'interaction')] + expected = self._chapters([10, 20, 40, 50, 60], + ['c', '[SponsorBlock]: Sponsor', 'c', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 40, remove=True)]) + + def test_remove_marked_arrange_sponsors_ChapterWithSponsorCutInTheMiddle(self): + cuts = [self._sponsor_chapter(20, 30, 'selfpromo', remove=True), + self._chapter(40, 50, remove=True)] + chapters = self._chapters([70], ['c']) + [self._sponsor_chapter(10, 60, 'sponsor')] + cuts + expected = 
self._chapters( + [10, 40, 50], ['c', '[SponsorBlock]: Sponsor', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) +
+ def test_remove_marked_arrange_sponsors_ChapterWithCutHidingSponsor(self): + cuts = [self._sponsor_chapter(20, 50, 'selfpromo', remove=True)] + chapters = self._chapters([60], ['c']) + [ + self._sponsor_chapter(10, 20, 'intro'), + self._sponsor_chapter(30, 40, 'sponsor'), + self._sponsor_chapter(50, 60, 'outro'), + ] + cuts + expected = self._chapters( + [10, 20, 30], ['c', '[SponsorBlock]: Intermission/Intro Animation', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) +
+ def test_remove_marked_arrange_sponsors_ChapterWithAdjacentSponsors(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(20, 30, 'selfpromo'), + self._sponsor_chapter(30, 40, 'interaction')] + expected = self._chapters( + [10, 20, 30, 40, 70], + ['c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) +
+ def test_remove_marked_arrange_sponsors_ChapterWithAdjacentCuts(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(20, 30, 'interaction', remove=True), + self._chapter(30, 40, remove=True), + self._sponsor_chapter(40, 50, 'selfpromo', remove=True), + self._sponsor_chapter(50, 60, 'interaction')] + expected = self._chapters([10, 20, 30, 40], + ['c', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(20, 50, remove=True)]) +
+ def test_remove_marked_arrange_sponsors_ChapterWithOverlappingSponsors(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 30, 'sponsor'), + self._sponsor_chapter(20, 50, 'selfpromo'), + self._sponsor_chapter(40, 60, 'interaction')] + expected = self._chapters( + [10, 20, 30, 40, 50, 60, 70], + ['c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion', '[SponsorBlock]: Unpaid/Self Promotion, Interaction Reminder', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) +
+ def test_remove_marked_arrange_sponsors_ChapterWithOverlappingCuts(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 30, 'sponsor', remove=True), + self._sponsor_chapter(20, 50, 'selfpromo', remove=True), + self._sponsor_chapter(40, 60, 'interaction', remove=True)] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([20], ['c']), [self._chapter(10, 60, remove=True)]) +
+ def test_remove_marked_arrange_sponsors_ChapterWithRunsOfOverlappingSponsors(self): + chapters = self._chapters([170], ['c']) + [ + self._sponsor_chapter(0, 30, 'intro'), + self._sponsor_chapter(20, 50, 'sponsor'), + self._sponsor_chapter(40, 60, 'selfpromo'), + self._sponsor_chapter(70, 90, 'sponsor'), + self._sponsor_chapter(80, 100, 'sponsor'), + self._sponsor_chapter(90, 110, 'sponsor'), + self._sponsor_chapter(120, 140, 'selfpromo'), + self._sponsor_chapter(130, 160, 'interaction'), + self._sponsor_chapter(150, 170, 'outro')] + expected = self._chapters( + [20, 30, 40, 50, 60, 70, 110, 120, 130, 140, 150, 160, 170], + ['[SponsorBlock]: Intermission/Intro Animation', '[SponsorBlock]: Intermission/Intro Animation, Sponsor', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', '[SponsorBlock]: Unpaid/Self Promotion', 'c', + '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion, Interaction Reminder', + '[SponsorBlock]: Interaction Reminder', + '[SponsorBlock]: Interaction Reminder, Endcards/Credits', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) +
+ def test_remove_marked_arrange_sponsors_ChapterWithRunsOfOverlappingCuts(self): + chapters = self._chapters([170], ['c']) + [ + self._chapter(0, 30, remove=True), + self._sponsor_chapter(20, 50, 'sponsor', remove=True), + self._chapter(40, 60, remove=True), + self._sponsor_chapter(70, 90, 'sponsor', remove=True), + self._chapter(80, 100, remove=True), + self._chapter(90, 110, remove=True), + self._sponsor_chapter(120, 140, 'sponsor', remove=True), + self._sponsor_chapter(130, 160, 'selfpromo', remove=True), + self._chapter(150, 170, remove=True)] + expected_cuts = [self._chapter(0, 60, remove=True), + self._chapter(70, 110, remove=True), + self._chapter(120, 170, remove=True)] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([20], ['c']), expected_cuts) +
+ def test_remove_marked_arrange_sponsors_OverlappingSponsorsDifferentTitlesAfterCut(self): + chapters = self._chapters([60], ['c']) + [ + self._sponsor_chapter(10, 60, 'sponsor'), + self._sponsor_chapter(10, 40, 'intro'), + self._sponsor_chapter(30, 50, 'interaction'), + self._sponsor_chapter(30, 50, 'selfpromo', remove=True), + self._sponsor_chapter(40, 50, 'interaction'), + self._sponsor_chapter(50, 60, 'outro')] + expected = self._chapters( + [10, 30, 40], ['c', '[SponsorBlock]: Sponsor, Intermission/Intro Animation', '[SponsorBlock]: Sponsor, Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 50, remove=True)]) +
+ def test_remove_marked_arrange_sponsors_SponsorsNoLongerOverlapAfterCut(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 30, 'sponsor'), + self._sponsor_chapter(20, 50, 'interaction'), + self._sponsor_chapter(30, 50, 'selfpromo', remove=True), + self._sponsor_chapter(40, 60, 'sponsor'), + self._sponsor_chapter(50, 60, 'interaction')] + expected = self._chapters( + [10, 20, 40, 50], ['c', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Sponsor, Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 50, remove=True)]) +
+ def test_remove_marked_arrange_sponsors_SponsorsStillOverlapAfterCut(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 60, 'sponsor'), + self._sponsor_chapter(20, 60, 'interaction'), + self._sponsor_chapter(30, 50, 'selfpromo', remove=True)] + expected = self._chapters( + [10, 20, 40, 50], ['c', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Sponsor, Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 50, remove=True)]) +
+ def test_remove_marked_arrange_sponsors_ChapterWithRunsOfOverlappingSponsorsAndCuts(self): + chapters = self._chapters([200], ['c']) + [ + self._sponsor_chapter(10, 40, 'sponsor'), + self._sponsor_chapter(10, 30, 'intro'), + self._chapter(20, 30, remove=True), + self._sponsor_chapter(30, 40, 'selfpromo'), + self._sponsor_chapter(50, 70, 'sponsor'), + self._sponsor_chapter(60, 
80, 'interaction'), + self._chapter(70, 80, remove=True), + self._sponsor_chapter(70, 90, 'sponsor'), + self._sponsor_chapter(80, 100, 'interaction'), + self._sponsor_chapter(120, 170, 'selfpromo'), + self._sponsor_chapter(130, 180, 'outro'), + self._chapter(140, 150, remove=True), + self._chapter(150, 160, remove=True)] + expected = self._chapters( + [10, 20, 30, 40, 50, 70, 80, 100, 110, 130, 140, 160], + ['c', '[SponsorBlock]: Sponsor, Intermission/Intro Animation', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + 'c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Interaction Reminder', + '[SponsorBlock]: Interaction Reminder', 'c', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion, Endcards/Credits', '[SponsorBlock]: Endcards/Credits', 'c']) + expected_cuts = [self._chapter(20, 30, remove=True), + self._chapter(70, 80, remove=True), + self._chapter(140, 160, remove=True)] + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, expected_cuts) + + def test_remove_marked_arrange_sponsors_SponsorOverlapsMultipleChapters(self): + chapters = (self._chapters([20, 40, 60, 80, 100], ['c1', 'c2', 'c3', 'c4', 'c5']) + + [self._sponsor_chapter(10, 90, 'sponsor')]) + expected = self._chapters([10, 90, 100], ['c1', '[SponsorBlock]: Sponsor', 'c5']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutOverlapsMultipleChapters(self): + cuts = [self._chapter(10, 90, remove=True)] + chapters = self._chapters([20, 40, 60, 80, 100], ['c1', 'c2', 'c3', 'c4', 'c5']) + cuts + expected = self._chapters([10, 20], ['c1', 'c5']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorsWithinSomeChaptersAndOverlappingOthers(self): + chapters = (self._chapters([10, 40, 60, 80], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(20, 30, 'sponsor'), + self._sponsor_chapter(50, 70, 'selfpromo')]) + expected = self._chapters([10, 20, 30, 40, 50, 70, 80], + ['c1', 'c2', '[SponsorBlock]: Sponsor', 'c2', 'c3', + '[SponsorBlock]: Unpaid/Self Promotion', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutsWithinSomeChaptersAndOverlappingOthers(self): + cuts = [self._chapter(20, 30, remove=True), self._chapter(50, 70, remove=True)] + chapters = self._chapters([10, 40, 60, 80], ['c1', 'c2', 'c3', 'c4']) + cuts + expected = self._chapters([10, 30, 40, 50], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_ChaptersAfterLastSponsor(self): + chapters = (self._chapters([20, 40, 50, 60], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(10, 30, 'music_offtopic')]) + expected = self._chapters( + [10, 30, 40, 50, 60], + ['c1', '[SponsorBlock]: Non-Music Section', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChaptersAfterLastCut(self): + cuts = [self._chapter(10, 30, remove=True)] + chapters = self._chapters([20, 40, 50, 60], ['c1', 'c2', 'c3', 'c4']) + cuts + expected = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorStartsAtChapterStart(self): + chapters = (self._chapters([10, 20, 40], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(20, 30, 
'sponsor')]) + expected = self._chapters([10, 20, 30, 40], ['c1', 'c2', '[SponsorBlock]: Sponsor', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutStartsAtChapterStart(self): + cuts = [self._chapter(20, 30, remove=True)] + chapters = self._chapters([10, 20, 40], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10, 20, 30], ['c1', 'c2', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorEndsAtChapterEnd(self): + chapters = (self._chapters([10, 30, 40], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(20, 30, 'sponsor')]) + expected = self._chapters([10, 20, 30, 40], ['c1', 'c2', '[SponsorBlock]: Sponsor', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutEndsAtChapterEnd(self): + cuts = [self._chapter(20, 30, remove=True)] + chapters = self._chapters([10, 30, 40], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10, 20, 30], ['c1', 'c2', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorCoincidesWithChapters(self): + chapters = (self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(10, 30, 'sponsor')]) + expected = self._chapters([10, 30, 40], ['c1', '[SponsorBlock]: Sponsor', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutCoincidesWithChapters(self): + cuts = [self._chapter(10, 30, remove=True)] + chapters = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + cuts + expected = self._chapters([10, 20], ['c1', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorsAtVideoBoundaries(self): + chapters = (self._chapters([20, 40, 60], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(0, 10, 'intro'), self._sponsor_chapter(50, 60, 'outro')]) + expected = self._chapters( + [10, 20, 40, 50, 60], ['[SponsorBlock]: Intermission/Intro Animation', 'c1', 'c2', 'c3', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutsAtVideoBoundaries(self): + cuts = [self._chapter(0, 10, remove=True), self._chapter(50, 60, remove=True)] + chapters = self._chapters([20, 40, 60], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10, 30, 40], ['c1', 'c2', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorsOverlapChaptersAtVideoBoundaries(self): + chapters = (self._chapters([10, 40, 50], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(0, 20, 'intro'), self._sponsor_chapter(30, 50, 'outro')]) + expected = self._chapters( + [20, 30, 50], ['[SponsorBlock]: Intermission/Intro Animation', 'c2', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutsOverlapChaptersAtVideoBoundaries(self): + cuts = [self._chapter(0, 20, remove=True), self._chapter(30, 50, remove=True)] + chapters = self._chapters([10, 40, 50], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10], ['c2']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def 
test_remove_marked_arrange_sponsors_EverythingSponsored(self): + chapters = (self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(0, 20, 'intro'), self._sponsor_chapter(20, 40, 'outro')]) + expected = self._chapters([20, 40], ['[SponsorBlock]: Intermission/Intro Animation', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_EverythingCut(self): + cuts = [self._chapter(0, 20, remove=True), self._chapter(20, 40, remove=True)] + chapters = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, [], [self._chapter(0, 40, remove=True)]) + + def test_remove_marked_arrange_sponsors_TinyChaptersInTheOriginalArePreserved(self): + chapters = self._chapters([0.1, 0.2, 0.3, 0.4], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, chapters, []) + + def test_remove_marked_arrange_sponsors_TinySponsorsAreIgnored(self): + chapters = [self._sponsor_chapter(0, 0.1, 'intro'), self._chapter(0.1, 0.2, 'c1'), + self._sponsor_chapter(0.2, 0.3, 'sponsor'), self._chapter(0.3, 0.4, 'c2'), + self._sponsor_chapter(0.4, 0.5, 'outro')] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([0.3, 0.5], ['c1', 'c2']), []) + + def test_remove_marked_arrange_sponsors_TinyChaptersResultingFromCutsAreIgnored(self): + cuts = [self._chapter(1.5, 2.5, remove=True)] + chapters = self._chapters([2, 3, 3.5], ['c1', 'c2', 'c3']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([2, 2.5], ['c1', 'c3']), cuts) + + def test_remove_marked_arrange_sponsors_SingleTinyChapterIsPreserved(self): + cuts = [self._chapter(0.5, 2, remove=True)] + chapters = self._chapters([2], ['c']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([0.5], ['c']), cuts) + + def test_remove_marked_arrange_sponsors_TinyChapterAtTheStartPrependedToTheNext(self): + cuts = [self._chapter(0.5, 2, remove=True)] + chapters = self._chapters([2, 4], ['c1', 'c2']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([2.5], ['c2']), cuts) + + def test_remove_marked_arrange_sponsors_TinyChaptersResultingFromSponsorOverlapAreIgnored(self): + chapters = self._chapters([1, 3, 4], ['c1', 'c2', 'c3']) + [ + self._sponsor_chapter(1.5, 2.5, 'sponsor')] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([1.5, 2.5, 4], ['c1', '[SponsorBlock]: Sponsor', 'c3']), []) + + def test_remove_marked_arrange_sponsors_TinySponsorsOverlapsAreIgnored(self): + chapters = self._chapters([2, 3, 5], ['c1', 'c2', 'c3']) + [ + self._sponsor_chapter(1, 3, 'sponsor'), + self._sponsor_chapter(2.5, 4, 'selfpromo') + ] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([1, 3, 4, 5], [ + 'c1', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Unpaid/Self Promotion', 'c3']), []) + + def test_remove_marked_arrange_sponsors_TinySponsorsPrependedToTheNextSponsor(self): + chapters = self._chapters([4], ['c']) + [ + self._sponsor_chapter(1.5, 2, 'sponsor'), + self._sponsor_chapter(2, 4, 'selfpromo') + ] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([1.5, 4], ['c', '[SponsorBlock]: Unpaid/Self Promotion']), []) + + def test_remove_marked_arrange_sponsors_SmallestSponsorInTheOverlapGetsNamed(self): + self._pp._sponsorblock_chapter_title = '[SponsorBlock]: %(name)s' + chapters = 
self._chapters([10], ['c']) + [ + self._sponsor_chapter(2, 8, 'sponsor'), + self._sponsor_chapter(4, 6, 'selfpromo') + ] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([2, 4, 6, 8, 10], [ + 'c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Sponsor', 'c' + ]), []) + + def test_make_concat_opts_CommonCase(self): + sponsor_chapters = [self._chapter(1, 2, 's1'), self._chapter(10, 20, 's2')] + expected = '''ffconcat version 1.0 +file 'file:test' +outpoint 1.000000 +file 'file:test' +inpoint 2.000000 +outpoint 10.000000 +file 'file:test' +inpoint 20.000000 +''' + opts = self._pp._make_concat_opts(sponsor_chapters, 30) + self.assertEqual(expected, ''.join(self._pp._concat_spec(['test'] * len(opts), opts))) + + def test_make_concat_opts_NoZeroDurationChunkAtVideoStart(self): + sponsor_chapters = [self._chapter(0, 1, 's1'), self._chapter(10, 20, 's2')] + expected = '''ffconcat version 1.0 +file 'file:test' +inpoint 1.000000 +outpoint 10.000000 +file 'file:test' +inpoint 20.000000 +''' + opts = self._pp._make_concat_opts(sponsor_chapters, 30) + self.assertEqual(expected, ''.join(self._pp._concat_spec(['test'] * len(opts), opts))) + + def test_make_concat_opts_NoZeroDurationChunkAtVideoEnd(self): + sponsor_chapters = [self._chapter(1, 2, 's1'), self._chapter(10, 20, 's2')] + expected = '''ffconcat version 1.0 +file 'file:test' +outpoint 1.000000 +file 'file:test' +inpoint 2.000000 +outpoint 10.000000 +''' + opts = self._pp._make_concat_opts(sponsor_chapters, 20) + self.assertEqual(expected, ''.join(self._pp._concat_spec(['test'] * len(opts), opts))) + + def test_quote_for_concat_RunsOfQuotes(self): + self.assertEqual( + r"'special '\'' '\'\''characters'\'\'\''galore'", + self._pp._quote_for_ffmpeg("special ' ''characters'''galore")) + + def test_quote_for_concat_QuotesAtStart(self): + self.assertEqual( + r"\'\'\''special '\'' characters '\'' galore'", + self._pp._quote_for_ffmpeg("'''special ' characters ' galore")) + + def test_quote_for_concat_QuotesAtEnd(self): + self.assertEqual( + r"'special '\'' characters '\'' galore'\'\'\'", + self._pp._quote_for_ffmpeg("special ' characters ' galore'''")) diff --git a/test/test_socks.py b/test/test_socks.py index 1e68eb0da..cf1f613ab 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -14,13 +14,15 @@ import subprocess from test.helper import ( FakeYDL, get_params, + is_download_test, ) -from youtube_dl.compat import ( +from yt_dlp.compat import ( compat_str, compat_urllib_request, ) +@is_download_test class TestMultipleSocks(unittest.TestCase): @staticmethod def _check_params(attrs): @@ -76,6 +78,7 @@ class TestMultipleSocks(unittest.TestCase): params['secondary_server_ip']) +@is_download_test class TestSocks(unittest.TestCase): _SKIP_SOCKS_TEST = True diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 550e0ca00..9b39dbd39 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals # Allow direct execution @@ -7,10 +7,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, md5 +from test.helper import FakeYDL, md5, is_download_test -from youtube_dl.extractor import ( +from yt_dlp.extractor import ( YoutubeIE, DailymotionIE, TEDIE, @@ -19,6 +19,7 @@ 
from youtube_dl.extractor import ( CeskaTelevizeIE, LyndaIE, NPOIE, + PBSIE, ComedyCentralIE, NRKTVIE, RaiPlayIE, @@ -30,6 +31,7 @@ from youtube_dl.extractor import ( ) +@is_download_test class BaseTestSubtitles(unittest.TestCase): url = None IE = None @@ -55,6 +57,7 @@ class BaseTestSubtitles(unittest.TestCase): return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) +@is_download_test class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE @@ -64,8 +67,8 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) - self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') + self.assertEqual(md5(subtitles['en']), '688dd1ce0981683867e7fe6fde2a224b') + self.assertEqual(md5(subtitles['it']), '31324d30b8430b309f7f5979a504a769') for lang in ['fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) @@ -73,13 +76,13 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'ttml' subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54') + self.assertEqual(md5(subtitles['en']), 'c97ddf1217390906fa9fbd34901f3da2') def test_youtube_subtitles_vtt_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') + self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d') def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' @@ -88,9 +91,15 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) + def test_youtube_no_automatic_captions(self): + self.url = 'QRS8MkLhQmM' + self.DL.params['writeautomaticsub'] = True + subtitles = self.getSubtitles() + self.assertTrue(not subtitles) + def test_youtube_translated_subtitles(self): # This video has a subtitles track, which can be translated - self.url = 'Ky9eprVWzlI' + self.url = 'i0ZabxXmH4Y' self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() @@ -105,6 +114,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@is_download_test class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE @@ -128,6 +138,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@is_download_test class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE @@ -143,6 +154,7 @@ class TestTedSubtitles(BaseTestSubtitles): self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) +@is_download_test class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE @@ -164,6 +176,7 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@is_download_test class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE @@ -185,6 +198,7 @@ class TestWallaSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@is_download_test class TestCeskaTelevizeSubtitles(BaseTestSubtitles): 
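For orientation, every test class in this file follows the same driver pattern: set subtitle-related params on the test's YoutubeDL, extract metadata without downloading any media, then md5 the returned tracks. A minimal standalone sketch of that flow, assuming network access and reusing the PBS URL from the test added further down in this file ('requested_subtitles' is the info-dict key the test helper reads):

    from yt_dlp import YoutubeDL

    params = {
        'writesubtitles': True,    # request manually created subtitles
        'subtitleslangs': ['en'],  # limit to one language
        'subtitlesformat': 'vtt',  # preferred format, as in the vtt tests
        'skip_download': True,     # subtitles/metadata only, no media
    }
    with YoutubeDL(params) as ydl:
        info = ydl.extract_info(
            'https://www.pbs.org/video/how-fantasy-reflects-our-world-picecq/',
            download=False)
        # The selected tracks land under 'requested_subtitles'
        print(sorted(info.get('requested_subtitles') or {}))  # expect: ['en']
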
url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE @@ -206,6 +220,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@is_download_test class TestLyndaSubtitles(BaseTestSubtitles): url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' IE = LyndaIE @@ -218,6 +233,7 @@ class TestLyndaSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') +@is_download_test class TestNPOSubtitles(BaseTestSubtitles): url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' IE = NPOIE @@ -230,6 +246,7 @@ class TestNPOSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') +@is_download_test class TestMTVSubtitles(BaseTestSubtitles): url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE @@ -245,6 +262,7 @@ class TestMTVSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '78206b8d8a0cfa9da64dc026eea48961') +@is_download_test class TestNRKSubtitles(BaseTestSubtitles): url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' IE = NRKTVIE @@ -257,6 +275,7 @@ class TestNRKSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') +@is_download_test class TestRaiPlaySubtitles(BaseTestSubtitles): IE = RaiPlayIE @@ -277,6 +296,7 @@ class TestRaiPlaySubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') +@is_download_test class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' IE = VikiIE @@ -289,6 +309,7 @@ class TestVikiSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') +@is_download_test class TestThePlatformSubtitles(BaseTestSubtitles): # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) @@ -303,6 +324,7 @@ class TestThePlatformSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +@is_download_test class TestThePlatformFeedSubtitles(BaseTestSubtitles): url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' IE = ThePlatformFeedIE @@ -315,6 +337,7 @@ class TestThePlatformFeedSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '48649a22e82b2da21c9a67a395eedade') +@is_download_test class TestRtveSubtitles(BaseTestSubtitles): url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' IE = RTVEALaCartaIE @@ -329,6 +352,7 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') +@is_download_test class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE @@ -349,5 +373,42 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') +@is_download_test +class TestPBSSubtitles(BaseTestSubtitles): + url = 'https://www.pbs.org/video/how-fantasy-reflects-our-world-picecq/' + IE = PBSIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] 
= True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + + def test_subtitles_dfxp_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'dfxp' + subtitles = self.getSubtitles() + self.assertIn(md5(subtitles['en']), ['643b034254cdc3768ff1e750b6b5873b']) + + def test_subtitles_vtt_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'vtt' + subtitles = self.getSubtitles() + self.assertIn( + md5(subtitles['en']), ['937a05711555b165d4c55a9667017045', 'f49ea998d6824d94959c8152a368ff73']) + + def test_subtitles_srt_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'srt' + subtitles = self.getSubtitles() + self.assertIn(md5(subtitles['en']), ['2082c21b43759d9bf172931b2f2ca371']) + + def test_subtitles_sami_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'sami' + subtitles = self.getSubtitles() + self.assertIn(md5(subtitles['en']), ['4256b16ac7da6a6780fafd04294e85cd']) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py deleted file mode 100644 index 6c1b7ec91..000000000 --- a/test/test_unicode_literals.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -import io -import re - -rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -IGNORED_FILES = [ - 'setup.py', # http://bugs.python.org/issue13943 - 'conf.py', - 'buildserver.py', -] - -IGNORED_DIRS = [ - '.git', - '.tox', -] - -from test.helper import assertRegexpMatches - - -class TestUnicodeLiterals(unittest.TestCase): - def test_all_files(self): - for dirpath, dirnames, filenames in os.walk(rootDir): - for ignore_dir in IGNORED_DIRS: - if ignore_dir in dirnames: - # If we remove the directory from dirnames os.walk won't - # recurse into it - dirnames.remove(ignore_dir) - for basename in filenames: - if not basename.endswith('.py'): - continue - if basename in IGNORED_FILES: - continue - - fn = os.path.join(dirpath, basename) - with io.open(fn, encoding='utf-8') as inf: - code = inf.read() - - if "'" not in code and '"' not in code: - continue - assertRegexpMatches( - self, - code, - r'(?:(?:#.*?|\s*)\n)*from __future__ import (?:[a-z_]+,\s*)*unicode_literals', - 'unicode_literals import missing in %s' % fn) - - m = re.search(r'(?<=\s)u[\'"](?!\)|,|$)', code) - if m is not None: - self.assertTrue( - m is None, - 'u present in %s, around %s' % ( - fn, code[m.start() - 10:m.end() + 10])) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 259c4763e..7fc431505 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -12,10 +12,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Various small unit tests import io +import itertools import json import xml.etree.ElementTree -from youtube_dl.utils import ( +from yt_dlp.utils import ( age_restricted, args_to_str, encode_base_n, @@ -23,6 +24,7 @@ from youtube_dl.utils import ( clean_html, clean_podcast_url, date_from_str, + datetime_from_str, DateRange, detect_exe_version, determine_ext, @@ -60,11 +62,13 @@ from 
youtube_dl.utils import ( parse_iso8601, parse_resolution, parse_bitrate, + parse_qs, pkcs1pad, read_batch_urls, sanitize_filename, sanitize_path, sanitize_url, + sanitized_Request, expand_path, prepend_extension, replace_extension, @@ -105,15 +109,15 @@ from youtube_dl.utils import ( cli_valueless_option, cli_bool_option, parse_codecs, + iri_to_uri, + LazyList, ) -from youtube_dl.compat import ( +from yt_dlp.compat import ( compat_chr, compat_etree_fromstring, compat_getenv, compat_os_name, compat_setenv, - compat_urlparse, - compat_parse_qs, ) @@ -123,6 +127,7 @@ class TestUtil(unittest.TestCase): self.assertTrue(timeconvert('bougrg') is None) def test_sanitize_filename(self): + self.assertEqual(sanitize_filename(''), '') self.assertEqual(sanitize_filename('abc'), 'abc') self.assertEqual(sanitize_filename('abc_d-e'), 'abc_d-e') @@ -236,17 +241,27 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar') self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') + self.assertEqual(sanitize_url('foo bar'), 'foo bar') + + def test_extract_basic_auth(self): + auth_header = lambda url: sanitized_Request(url).get_header('Authorization') + self.assertFalse(auth_header('http://foo.bar')) + self.assertFalse(auth_header('http://:foo.bar')) + self.assertEqual(auth_header('http://@foo.bar'), 'Basic Og==') + self.assertEqual(auth_header('http://:pass@foo.bar'), 'Basic OnBhc3M=') + self.assertEqual(auth_header('http://user:@foo.bar'), 'Basic dXNlcjo=') + self.assertEqual(auth_header('http://user:pass@foo.bar'), 'Basic dXNlcjpwYXNz') def test_expand_path(self): def env(var): return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) - compat_setenv('YOUTUBE_DL_EXPATH_PATH', 'expanded') - self.assertEqual(expand_path(env('YOUTUBE_DL_EXPATH_PATH')), 'expanded') + compat_setenv('yt_dlp_EXPATH_PATH', 'expanded') + self.assertEqual(expand_path(env('yt_dlp_EXPATH_PATH')), 'expanded') self.assertEqual(expand_path(env('HOME')), compat_getenv('HOME')) self.assertEqual(expand_path('~'), compat_getenv('HOME')) self.assertEqual( - expand_path('~/%s' % env('YOUTUBE_DL_EXPATH_PATH')), + expand_path('~/%s' % env('yt_dlp_EXPATH_PATH')), '%s/expanded' % compat_getenv('HOME')) def test_prepend_extension(self): @@ -310,8 +325,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week')) self.assertEqual(date_from_str('now+14day'), date_from_str('now+2week')) - self.assertEqual(date_from_str('now+365day'), date_from_str('now+1year')) - self.assertEqual(date_from_str('now+30day'), date_from_str('now+1month')) + self.assertEqual(date_from_str('20200229+365day'), date_from_str('20200229+1year')) + self.assertEqual(date_from_str('20210131+28day'), date_from_str('20210131+1month')) + + def test_datetime_from_str(self): + self.assertEqual(datetime_from_str('yesterday', precision='day'), datetime_from_str('now-1day', precision='auto')) + self.assertEqual(datetime_from_str('now+7day', precision='day'), datetime_from_str('now+1week', precision='auto')) + self.assertEqual(datetime_from_str('now+14day', precision='day'), datetime_from_str('now+2week', precision='auto')) + self.assertEqual(datetime_from_str('20200229+365day', precision='day'), datetime_from_str('20200229+1year', precision='auto')) + self.assertEqual(datetime_from_str('20210131+28day', 
precision='day'), datetime_from_str('20210131+1month', precision='auto')) + self.assertEqual(datetime_from_str('20210131+59day', precision='day'), datetime_from_str('20210131+2month', precision='auto')) + self.assertEqual(datetime_from_str('now+1day', precision='hour'), datetime_from_str('now+24hours', precision='auto')) + self.assertEqual(datetime_from_str('now+23hours', precision='hour'), datetime_from_str('now+23hours', precision='auto')) def test_daterange(self): _20century = DateRange("19000101", "20000101") @@ -662,38 +687,36 @@ class TestUtil(unittest.TestCase): self.assertTrue(isinstance(data, bytes)) def test_update_url_query(self): - def query_dict(url): - return compat_parse_qs(compat_urlparse.urlparse(url).query) - self.assertEqual(query_dict(update_url_query( + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})), - query_dict('http://example.com/path?quality=HD&format=mp4')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?quality=HD&format=mp4')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})), - query_dict('http://example.com/path?system=LINUX&system=WINDOWS')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?system=LINUX&system=WINDOWS')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'fields': 'id,formats,subtitles'})), - query_dict('http://example.com/path?fields=id,formats,subtitles')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?fields=id,formats,subtitles')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})), - query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path?manifest=f4m', {'manifest': []})), - query_dict('http://example.com/path')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})), - query_dict('http://example.com/path?system=LINUX')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?system=LINUX')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'fields': b'id,formats,subtitles'})), - query_dict('http://example.com/path?fields=id,formats,subtitles')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?fields=id,formats,subtitles')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'width': 1080, 'height': 720})), - query_dict('http://example.com/path?width=1080&height=720')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?width=1080&height=720')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'bitrate': 5020.43})), - query_dict('http://example.com/path?bitrate=5020.43')) - self.assertEqual(query_dict(update_url_query( + parse_qs('http://example.com/path?bitrate=5020.43')) + self.assertEqual(parse_qs(update_url_query( 'http://example.com/path', {'test': '第二行тест'})), - query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) + 
parse_qs('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) def test_multipart_encode(self): self.assertEqual( @@ -1028,6 +1051,9 @@ class TestUtil(unittest.TestCase): on = js_to_json('{ "040": "040" }') self.assertEqual(json.loads(on), {'040': '040'}) + on = js_to_json('[1,//{},\n2]') + self.assertEqual(json.loads(on), [1, 2]) + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') @@ -1178,12 +1204,26 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') '9999 51') def test_match_str(self): - self.assertRaises(ValueError, match_str, 'xy>foobar', {}) + # Unary self.assertFalse(match_str('xy', {'x': 1200})) self.assertTrue(match_str('!xy', {'x': 1200})) self.assertTrue(match_str('x', {'x': 1200})) self.assertFalse(match_str('!x', {'x': 1200})) self.assertTrue(match_str('x', {'x': 0})) + self.assertTrue(match_str('is_live', {'is_live': True})) + self.assertFalse(match_str('is_live', {'is_live': False})) + self.assertFalse(match_str('is_live', {'is_live': None})) + self.assertFalse(match_str('is_live', {})) + self.assertFalse(match_str('!is_live', {'is_live': True})) + self.assertTrue(match_str('!is_live', {'is_live': False})) + self.assertTrue(match_str('!is_live', {'is_live': None})) + self.assertTrue(match_str('!is_live', {})) + self.assertTrue(match_str('title', {'title': 'abc'})) + self.assertTrue(match_str('title', {'title': ''})) + self.assertFalse(match_str('!title', {'title': 'abc'})) + self.assertFalse(match_str('!title', {'title': ''})) + + # Numeric self.assertFalse(match_str('x>0', {'x': 0})) self.assertFalse(match_str('x>0', {})) self.assertTrue(match_str('x>?0', {})) @@ -1191,10 +1231,26 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str('x>2K', {'x': 1200})) self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) + self.assertTrue(match_str('x > 1:0:0', {'x': 3700})) + + # String self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'})) self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'})) self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'})) + self.assertTrue(match_str('y^=foo', {'y': 'foobar42'})) + self.assertFalse(match_str('y!^=foo', {'y': 'foobar42'})) + self.assertFalse(match_str('y^=bar', {'y': 'foobar42'})) + self.assertTrue(match_str('y!^=bar', {'y': 'foobar42'})) + self.assertRaises(ValueError, match_str, 'x^=42', {'x': 42}) + self.assertTrue(match_str('y*=bar', {'y': 'foobar42'})) + self.assertFalse(match_str('y!*=bar', {'y': 'foobar42'})) + self.assertFalse(match_str('y*=baz', {'y': 'foobar42'})) + self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'})) + self.assertTrue(match_str('y$=42', {'y': 'foobar42'})) + self.assertFalse(match_str('y$=43', {'y': 'foobar42'})) + + # And self.assertFalse(match_str( 'like_count > 100 & dislike_count <? 50 & description', {'like_count': 90, 'description': 'foo'})) @@ -1207,18 +1263,35 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertFalse(match_str( 'like_count > 100 & dislike_count <? 
50 & description', {'like_count': 190, 'dislike_count': 10})) - self.assertTrue(match_str('is_live', {'is_live': True})) - self.assertFalse(match_str('is_live', {'is_live': False})) - self.assertFalse(match_str('is_live', {'is_live': None})) - self.assertFalse(match_str('is_live', {})) - self.assertFalse(match_str('!is_live', {'is_live': True})) - self.assertTrue(match_str('!is_live', {'is_live': False})) - self.assertTrue(match_str('!is_live', {'is_live': None})) - self.assertTrue(match_str('!is_live', {})) - self.assertTrue(match_str('title', {'title': 'abc'})) - self.assertTrue(match_str('title', {'title': ''})) - self.assertFalse(match_str('!title', {'title': 'abc'})) - self.assertFalse(match_str('!title', {'title': ''})) + + # Regex + self.assertTrue(match_str(r'x~=\bbar', {'x': 'foo bar'})) + self.assertFalse(match_str(r'x~=\bbar.+', {'x': 'foo bar'})) + self.assertFalse(match_str(r'x~=^FOO', {'x': 'foo bar'})) + self.assertTrue(match_str(r'x~=(?i)^FOO', {'x': 'foo bar'})) + + # Quotes + self.assertTrue(match_str(r'x^="foo"', {'x': 'foo "bar"'})) + self.assertFalse(match_str(r'x^="foo "', {'x': 'foo "bar"'})) + self.assertFalse(match_str(r'x$="bar"', {'x': 'foo "bar"'})) + self.assertTrue(match_str(r'x$=" \"bar\""', {'x': 'foo "bar"'})) + + # Escaping & + self.assertFalse(match_str(r'x=foo & bar', {'x': 'foo & bar'})) + self.assertTrue(match_str(r'x=foo \& bar', {'x': 'foo & bar'})) + self.assertTrue(match_str(r'x=foo \& bar & x^=foo', {'x': 'foo & bar'})) + self.assertTrue(match_str(r'x="foo \& bar" & x^=foo', {'x': 'foo & bar'})) + + # Example from docs + self.assertTrue(match_str( + r"!is_live & like_count>?100 & description~='(?i)\bcats \& dogs\b'", + {'description': 'Raining Cats & Dogs'})) + + # Incomplete + self.assertFalse(match_str('id!=foo', {'id': 'foo'}, True)) + self.assertTrue(match_str('x', {'id': 'foo'}, True)) + self.assertTrue(match_str('!x', {'id': 'foo'}, True)) + self.assertFalse(match_str('x', {'id': 'foo'}, False)) def test_parse_dfxp_time_expr(self): self.assertEqual(parse_dfxp_time_expr(None), None) @@ -1424,8 +1497,8 @@ Line 1 self.assertEqual(caesar('ebg', 'acegik', -2), 'abc') def test_rot47(self): - self.assertEqual(rot47('youtube-dl'), r'J@FEF36\5=') - self.assertEqual(rot47('YOUTUBE-DL'), r'*~&%&qt\s{') + self.assertEqual(rot47('yt-dlp'), r'JE\5=A') + self.assertEqual(rot47('YT-DLP'), r'*%\s{!') def test_urshift(self): self.assertEqual(urshift(3, 1), 1) @@ -1471,10 +1544,81 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_iri_to_uri(self): + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), + 'https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b') # Same + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=Käsesoßenrührlöffel'), # German for cheese sauce stirring spoon + 'https://www.google.com/search?q=K%C3%A4seso%C3%9Fenr%C3%BChrl%C3%B6ffel') + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=lt<+gt>+eq%3D+amp%26+percent%25+hash%23+colon%3A+tilde~#trash=?&garbage=#'), + 'https://www.google.com/search?q=lt%3C+gt%3E+eq%3D+amp%26+percent%25+hash%23+colon%3A+tilde~#trash=?&garbage=#') + self.assertEqual( + iri_to_uri('http://правозащита38.рф/category/news/'), + 'http://xn--38-6kcaak9aj5chl4a3g.xn--p1ai/category/news/') + self.assertEqual( + iri_to_uri('http://www.правозащита38.рф/category/news/'), + 
'http://www.xn--38-6kcaak9aj5chl4a3g.xn--p1ai/category/news/') + self.assertEqual( + iri_to_uri('https://i❤.ws/emojidomain/👍👏🤝💪'), + 'https://xn--i-7iq.ws/emojidomain/%F0%9F%91%8D%F0%9F%91%8F%F0%9F%A4%9D%F0%9F%92%AA') + self.assertEqual( + iri_to_uri('http://日本語.jp/'), + 'http://xn--wgv71a119e.jp/') + self.assertEqual( + iri_to_uri('http://导航.中国/'), + 'http://xn--fet810g.xn--fiqs8s/') + def test_clean_podcast_url(self): self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + def test_LazyList(self): + it = list(range(10)) + + self.assertEqual(list(LazyList(it)), it) + self.assertEqual(LazyList(it).exhaust(), it) + self.assertEqual(LazyList(it)[5], it[5]) + + self.assertEqual(LazyList(it)[5:], it[5:]) + self.assertEqual(LazyList(it)[:5], it[:5]) + self.assertEqual(LazyList(it)[::2], it[::2]) + self.assertEqual(LazyList(it)[1::2], it[1::2]) + self.assertEqual(LazyList(it)[5::-1], it[5::-1]) + self.assertEqual(LazyList(it)[6:2:-2], it[6:2:-2]) + self.assertEqual(LazyList(it)[::-1], it[::-1]) + + self.assertTrue(LazyList(it)) + self.assertFalse(LazyList(range(0))) + self.assertEqual(len(LazyList(it)), len(it)) + self.assertEqual(repr(LazyList(it)), repr(it)) + self.assertEqual(str(LazyList(it)), str(it)) + + self.assertEqual(list(LazyList(it).reverse()), it[::-1]) + self.assertEqual(list(LazyList(it).reverse()[1:3:7]), it[::-1][1:3:7]) + self.assertEqual(list(LazyList(it).reverse()[::-1]), it) + + def test_LazyList_laziness(self): + + def test(ll, idx, val, cache): + self.assertEqual(ll[idx], val) + self.assertEqual(getattr(ll, '_LazyList__cache'), list(cache)) + + ll = LazyList(range(10)) + test(ll, 0, 0, range(1)) + test(ll, 5, 5, range(6)) + test(ll, -3, 7, range(10)) + + ll = LazyList(range(10)).reverse() + test(ll, -1, 0, range(1)) + test(ll, 3, 6, range(10)) + + ll = LazyList(itertools.count()) + test(ll, 10, 10, range(11)) + ll.reverse() + test(ll, -15, 14, range(15)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py index c1465fe8c..86b039a4a 100644 --- a/test/test_verbose_output.py +++ b/test/test_verbose_output.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -17,7 +17,7 @@ class TestVerboseOutput(unittest.TestCase): def test_private_info_arg(self): outp = subprocess.Popen( [ - sys.executable, 'youtube_dl/__main__.py', '-v', + sys.executable, 'yt_dlp/__main__.py', '-v', '--username', 'johnsmith@gmail.com', '--password', 'secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -30,7 +30,7 @@ class TestVerboseOutput(unittest.TestCase): def test_private_info_shortarg(self): outp = subprocess.Popen( [ - sys.executable, 'youtube_dl/__main__.py', '-v', + sys.executable, 'yt_dlp/__main__.py', '-v', '-u', 'johnsmith@gmail.com', '-p', 'secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -43,7 +43,7 @@ class TestVerboseOutput(unittest.TestCase): def test_private_info_eq(self): outp = subprocess.Popen( [ - 
sys.executable, 'youtube_dl/__main__.py', '-v', + sys.executable, 'yt_dlp/__main__.py', '-v', '--username=johnsmith@gmail.com', '--password=secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -56,7 +56,7 @@ class TestVerboseOutput(unittest.TestCase): def test_private_info_shortarg_eq(self): outp = subprocess.Popen( [ - sys.executable, 'youtube_dl/__main__.py', '-v', + sys.executable, 'yt_dlp/__main__.py', '-v', '-u=johnsmith@gmail.com', '-p=secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py deleted file mode 100644 index 41abdfe3b..000000000 --- a/test/test_write_annotations.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import get_params, try_rm - - -import io - -import xml.etree.ElementTree - -import youtube_dl.YoutubeDL -import youtube_dl.extractor - - -class YoutubeDL(youtube_dl.YoutubeDL): - def __init__(self, *args, **kwargs): - super(YoutubeDL, self).__init__(*args, **kwargs) - self.to_stderr = self.to_screen - - -params = get_params({ - 'writeannotations': True, - 'skip_download': True, - 'writeinfojson': False, - 'format': 'flv', -}) - - -TEST_ID = 'gr51aVj-mLg' -ANNOTATIONS_FILE = TEST_ID + '.annotations.xml' -EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] - - -class TestAnnotations(unittest.TestCase): - def setUp(self): - # Clear old files - self.tearDown() - - def test_info_json(self): - expected = list(EXPECTED_ANNOTATIONS) # Two annotations could have the same text. - ie = youtube_dl.extractor.YoutubeIE() - ydl = YoutubeDL(params) - ydl.add_info_extractor(ie) - ydl.download([TEST_ID]) - self.assertTrue(os.path.exists(ANNOTATIONS_FILE)) - annoxml = None - with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof: - annoxml = xml.etree.ElementTree.parse(annof) - self.assertTrue(annoxml is not None, 'Failed to parse annotations XML') - root = annoxml.getroot() - self.assertEqual(root.tag, 'document') - annotationsTag = root.find('annotations') - self.assertEqual(annotationsTag.tag, 'annotations') - annotations = annotationsTag.findall('annotation') - - # Not all the annotations have TEXT children and the annotations are returned unsorted. - for a in annotations: - self.assertEqual(a.tag, 'annotation') - if a.get('type') == 'text': - textTag = a.find('TEXT') - text = textTag.text - self.assertTrue(text in expected) # assertIn only added in python 2.7 - # remove the first occurrence, there could be more than one annotation with the same text - expected.remove(text) - # We should have seen (and removed) all the expected annotation texts. 
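The deleted check above consumes expected titles one by one with list.remove() so that duplicate annotation texts are counted correctly. For reference, a compact way to express the same "all expected items seen, duplicates included" assertion is Counter subtraction; this is only a generic illustration, not part of the patch's code:

    from collections import Counter

    expected = Counter(['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'])
    found = Counter(['Note', 'Title', 'Speech bubble', 'Spotlight', 'Label'])

    # Subtraction keeps only positive counts, i.e. texts seen too few times
    missing = expected - found
    assert not missing, 'Not all expected annotations were found: %s' % dict(missing)
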
- self.assertEqual(len(expected), 0, 'Not all expected annotations were found.') - - def tearDown(self): - try_rm(ANNOTATIONS_FILE) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index cf2fdf14f..e831393e4 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals # Allow direct execution @@ -7,16 +7,17 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL +from test.helper import FakeYDL, is_download_test -from youtube_dl.extractor import ( +from yt_dlp.extractor import ( YoutubePlaylistIE, YoutubeTabIE, YoutubeIE, ) +@is_download_test class TestYoutubeLists(unittest.TestCase): def assertIsPlaylist(self, info): """Make sure the info has '_type' set to 'playlist'""" diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py index e18e71101..402681cad 100644 --- a/test/test_youtube_misc.py +++ b/test/test_youtube_misc.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals # Allow direct execution @@ -8,7 +8,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import YoutubeIE +from yt_dlp.extractor import YoutubeIE class TestYoutubeMisc(unittest.TestCase): diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 627d4cb92..dcf6ab60d 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import unicode_literals @@ -12,9 +12,9 @@ import io import re import string -from test.helper import FakeYDL -from youtube_dl.extractor import YoutubeIE -from youtube_dl.compat import compat_str, compat_urlretrieve +from test.helper import FakeYDL, is_download_test +from yt_dlp.extractor import YoutubeIE +from yt_dlp.compat import compat_str, compat_urlretrieve _TESTS = [ ( @@ -65,6 +65,7 @@ _TESTS = [ ] +@is_download_test class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( @@ -87,6 +88,7 @@ class TestPlayerInfo(unittest.TestCase): self.assertEqual(player_id, expected_player_id) +@is_download_test class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/testdata/ism/sintel.Manifest b/test/testdata/ism/sintel.Manifest new file mode 100644 index 000000000..2ff8c2447 --- /dev/null +++ b/test/testdata/ism/sintel.Manifest @@ -0,0 +1,988 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- Created with Unified Streaming Platform (version=1.10.18-20255) --> +<SmoothStreamingMedia + MajorVersion="2" + MinorVersion="0" + TimeScale="10000000" + Duration="8880746666"> + <StreamIndex + Type="audio" + QualityLevels="1" + TimeScale="10000000" + Name="audio" + Chunks="445" + Url="QualityLevels({bitrate})/Fragments(audio={start time})"> + <QualityLevel + Index="0" + Bitrate="128001" + CodecPrivateData="1190" + SamplingRate="48000" + Channels="2" + BitsPerSample="16" + PacketSize="4" + AudioTag="255" + FourCC="AACL" /> + <c t="0" d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c 
d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c 
d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c 
d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="20053333" /> + <c d="20053333" /> + <c d="20053334" /> + <c d="19840000" /> + <c d="746666" /> + </StreamIndex> + <StreamIndex + Type="text" + QualityLevels="1" + TimeScale="10000000" + Language="eng" + Subtype="CAPT" + Name="textstream_eng" + Chunks="11" + Url="QualityLevels({bitrate})/Fragments(textstream_eng={start time})"> + <QualityLevel + Index="0" + Bitrate="1000" + CodecPrivateData="" + FourCC="TTML" /> + <c t="0" d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="600000000" /> + <c d="240000000" /> + </StreamIndex> + <StreamIndex + Type="video" + QualityLevels="5" + TimeScale="10000000" + Name="video" + Chunks="444" + Url="QualityLevels({bitrate})/Fragments(video={start time})" + MaxWidth="1688" + MaxHeight="720" + DisplayWidth="1689" + DisplayHeight="720"> + <QualityLevel + Index="0" + Bitrate="100000" + CodecPrivateData="00000001674D401FDA0544EFFC2D002CBC40000003004000000C03C60CA80000000168EF32C8" + MaxWidth="336" + MaxHeight="144" + FourCC="AVC1" /> + <QualityLevel + Index="1" + Bitrate="326000" + CodecPrivateData="00000001674D401FDA0241FE23FFC3BC83BA44000003000400000300C03C60CA800000000168EF32C8" + MaxWidth="562" + MaxHeight="240" + FourCC="AVC1" /> + <QualityLevel + Index="2" + Bitrate="698000" + CodecPrivateData="00000001674D401FDA0350BFB97FF06AF06AD1000003000100000300300F1832A00000000168EF32C8" + MaxWidth="844" + MaxHeight="360" + FourCC="AVC1" /> + <QualityLevel + Index="3" + Bitrate="1493000" + 
CodecPrivateData="00000001674D401FDA011C3DE6FFF0D890D871000003000100000300300F1832A00000000168EF32C8" + MaxWidth="1126" + MaxHeight="480" + FourCC="AVC1" /> + <QualityLevel + Index="4" + Bitrate="4482000" + CodecPrivateData="00000001674D401FDA01A816F97FFC1ABC1AB440000003004000000C03C60CA80000000168EF32C8" + MaxWidth="1688" + MaxHeight="720" + FourCC="AVC1" /> + <c t="0" d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c 
d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c 
d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + <c d="20000000" /> + </StreamIndex> +</SmoothStreamingMedia> diff --git a/test/testdata/m3u8/bipbop_16x9.m3u8 b/test/testdata/m3u8/bipbop_16x9.m3u8 new file mode 100644 index 000000000..1ce87dd04 --- /dev/null +++ b/test/testdata/m3u8/bipbop_16x9.m3u8 @@ -0,0 +1,38 @@ +#EXTM3U + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 1",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 2",AUTOSELECT=NO,DEFAULT=NO,URI="alternate_audio_aac/prog_index.m3u8" + + +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,LANGUAGE="en",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/eng/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="en",URI="subtitles/eng_forced/prog_index.m3u8" 
+#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="fr",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/fra/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="fr",URI="subtitles/fra_forced/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="es",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/spa/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="es",URI="subtitles/spa_forced/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="ja",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/jpn/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語 (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="ja",URI="subtitles/jpn_forced/prog_index.m3u8" + + +#EXT-X-STREAM-INF:BANDWIDTH=263851,CODECS="mp4a.40.2, avc1.4d400d",RESOLUTION=416x234,AUDIO="bipbop_audio",SUBTITLES="subs" +gear1/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=28451,CODECS="avc1.4d400d",URI="gear1/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=577610,CODECS="mp4a.40.2, avc1.4d401e",RESOLUTION=640x360,AUDIO="bipbop_audio",SUBTITLES="subs" +gear2/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=181534,CODECS="avc1.4d401e",URI="gear2/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=915905,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=960x540,AUDIO="bipbop_audio",SUBTITLES="subs" +gear3/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=297056,CODECS="avc1.4d401f",URI="gear3/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=1030138,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1280x720,AUDIO="bipbop_audio",SUBTITLES="subs" +gear4/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=339492,CODECS="avc1.4d401f",URI="gear4/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=1924009,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1920x1080,AUDIO="bipbop_audio",SUBTITLES="subs" +gear5/prog_index.m3u8 +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=669554,CODECS="avc1.4d401f",URI="gear5/iframe_index.m3u8" + +#EXT-X-STREAM-INF:BANDWIDTH=41457,CODECS="mp4a.40.2",AUDIO="bipbop_audio",SUBTITLES="subs" +gear0/prog_index.m3u8 diff --git a/test/testdata/m3u8/img_bipbop_adv_example_fmp4.m3u8 b/test/testdata/m3u8/img_bipbop_adv_example_fmp4.m3u8 new file mode 100644 index 000000000..620ce04c5 --- /dev/null +++ b/test/testdata/m3u8/img_bipbop_adv_example_fmp4.m3u8 @@ -0,0 +1,76 @@ +#EXTM3U +#EXT-X-VERSION:6 +#EXT-X-INDEPENDENT-SEGMENTS + + +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=2168183,BANDWIDTH=2177116,CODECS="avc1.640020,mp4a.40.2",RESOLUTION=960x540,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v5/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=7968416,BANDWIDTH=8001098,CODECS="avc1.64002a,mp4a.40.2",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v9/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=6170000,BANDWIDTH=6312875,CODECS="avc1.64002a,mp4a.40.2",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v8/prog_index.m3u8 
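The #EXT-X-STREAM-INF attribute lists in these bipbop fixtures are comma-separated, but quoted values such as CODECS="avc1.640020,mp4a.40.2" contain commas of their own, so a naive split(',') mangles them. Below is a minimal Python sketch of a tolerant parser for such lines; it is illustrative only, and the parse_attrs helper is hypothetical rather than yt-dlp's actual HLS code:

    import re

    def parse_attrs(line):
        # Capture ATTR=value pairs; a quoted value keeps its embedded commas.
        pair = re.compile(r'([A-Z0-9-]+)=(?:"([^"]*)"|([^",]+))')
        return {m.group(1): m.group(2) if m.group(2) is not None else m.group(3)
                for m in pair.finditer(line)}

    # Attribute list taken from the v5 variant of the fixture above:
    attrs = parse_attrs(
        'AVERAGE-BANDWIDTH=2168183,BANDWIDTH=2177116,'
        'CODECS="avc1.640020,mp4a.40.2",RESOLUTION=960x540,FRAME-RATE=60.000')
    width, height = map(int, attrs['RESOLUTION'].split('x'))  # 960, 540
    bandwidth = int(attrs['BANDWIDTH'])                       # 2177116

The remaining variant, I-frame, and rendition lines below follow the same attribute grammar.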
+#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=4670769,BANDWIDTH=4943747,CODECS="avc1.64002a,mp4a.40.2",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v7/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=3168702,BANDWIDTH=3216424,CODECS="avc1.640020,mp4a.40.2",RESOLUTION=1280x720,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v6/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=1265132,BANDWIDTH=1268994,CODECS="avc1.64001e,mp4a.40.2",RESOLUTION=768x432,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v4/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=895755,BANDWIDTH=902298,CODECS="avc1.64001e,mp4a.40.2",RESOLUTION=640x360,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v3/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=530721,BANDWIDTH=541052,CODECS="avc1.640015,mp4a.40.2",RESOLUTION=480x270,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud1",SUBTITLES="sub1" +v2/prog_index.m3u8 + + +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=2390686,BANDWIDTH=2399619,CODECS="avc1.640020,ac-3",RESOLUTION=960x540,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v5/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=8190919,BANDWIDTH=8223601,CODECS="avc1.64002a,ac-3",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v9/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=6392503,BANDWIDTH=6535378,CODECS="avc1.64002a,ac-3",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v8/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=4893272,BANDWIDTH=5166250,CODECS="avc1.64002a,ac-3",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v7/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=3391205,BANDWIDTH=3438927,CODECS="avc1.640020,ac-3",RESOLUTION=1280x720,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v6/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=1487635,BANDWIDTH=1491497,CODECS="avc1.64001e,ac-3",RESOLUTION=768x432,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v4/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=1118258,BANDWIDTH=1124801,CODECS="avc1.64001e,ac-3",RESOLUTION=640x360,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v3/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=753224,BANDWIDTH=763555,CODECS="avc1.640015,ac-3",RESOLUTION=480x270,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud2",SUBTITLES="sub1" +v2/prog_index.m3u8 + + +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=2198686,BANDWIDTH=2207619,CODECS="avc1.640020,ec-3",RESOLUTION=960x540,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v5/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=7998919,BANDWIDTH=8031601,CODECS="avc1.64002a,ec-3",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v9/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=6200503,BANDWIDTH=6343378,CODECS="avc1.64002a,ec-3",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v8/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=4701272,BANDWIDTH=4974250,CODECS="avc1.64002a,ec-3",RESOLUTION=1920x1080,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v7/prog_index.m3u8 
+#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=3199205,BANDWIDTH=3246927,CODECS="avc1.640020,ec-3",RESOLUTION=1280x720,FRAME-RATE=60.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v6/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=1295635,BANDWIDTH=1299497,CODECS="avc1.64001e,ec-3",RESOLUTION=768x432,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v4/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=926258,BANDWIDTH=932801,CODECS="avc1.64001e,ec-3",RESOLUTION=640x360,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v3/prog_index.m3u8 +#EXT-X-STREAM-INF:AVERAGE-BANDWIDTH=561224,BANDWIDTH=571555,CODECS="avc1.640015,ec-3",RESOLUTION=480x270,FRAME-RATE=30.000,CLOSED-CAPTIONS="cc1",AUDIO="aud3",SUBTITLES="sub1" +v2/prog_index.m3u8 + + +#EXT-X-I-FRAME-STREAM-INF:AVERAGE-BANDWIDTH=183689,BANDWIDTH=187492,CODECS="avc1.64002a",RESOLUTION=1920x1080,URI="v7/iframe_index.m3u8" +#EXT-X-I-FRAME-STREAM-INF:AVERAGE-BANDWIDTH=132672,BANDWIDTH=136398,CODECS="avc1.640020",RESOLUTION=1280x720,URI="v6/iframe_index.m3u8" +#EXT-X-I-FRAME-STREAM-INF:AVERAGE-BANDWIDTH=97767,BANDWIDTH=101378,CODECS="avc1.640020",RESOLUTION=960x540,URI="v5/iframe_index.m3u8" +#EXT-X-I-FRAME-STREAM-INF:AVERAGE-BANDWIDTH=75722,BANDWIDTH=77818,CODECS="avc1.64001e",RESOLUTION=768x432,URI="v4/iframe_index.m3u8" +#EXT-X-I-FRAME-STREAM-INF:AVERAGE-BANDWIDTH=63522,BANDWIDTH=65091,CODECS="avc1.64001e",RESOLUTION=640x360,URI="v3/iframe_index.m3u8" +#EXT-X-I-FRAME-STREAM-INF:AVERAGE-BANDWIDTH=39678,BANDWIDTH=40282,CODECS="avc1.640015",RESOLUTION=480x270,URI="v2/iframe_index.m3u8" + + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud1",LANGUAGE="en",NAME="English",AUTOSELECT=YES,DEFAULT=YES,CHANNELS="2",URI="a1/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud2",LANGUAGE="en",NAME="English",AUTOSELECT=YES,DEFAULT=YES,CHANNELS="6",URI="a2/prog_index.m3u8" +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud3",LANGUAGE="en",NAME="English",AUTOSELECT=YES,DEFAULT=YES,CHANNELS="6",URI="a3/prog_index.m3u8" + + +#EXT-X-MEDIA:TYPE=CLOSED-CAPTIONS,GROUP-ID="cc1",LANGUAGE="en",NAME="English",AUTOSELECT=YES,DEFAULT=YES,INSTREAM-ID="CC1" + + +#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="sub1",LANGUAGE="en",NAME="English",AUTOSELECT=YES,DEFAULT=YES,FORCED=NO,URI="s1/en/prog_index.m3u8" diff --git a/test/testdata/m3u8/pluzz_francetv_11507.m3u8 b/test/testdata/m3u8/pluzz_francetv_11507.m3u8 deleted file mode 100644 index 0809f5aa0..000000000 --- a/test/testdata/m3u8/pluzz_francetv_11507.m3u8 +++ /dev/null @@ -1,14 +0,0 @@ -#EXTM3U -
#EXT-X-VERSION:5 -
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Francais",DEFAULT=NO,FORCED=NO,URI="http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8",LANGUAGE="fra" -
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="fra",NAME="Francais",DEFAULT=YES, AUTOSELECT=YES -#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=180000,RESOLUTION=256x144,CODECS="avc1.66.30, mp4a.40.2" -http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0 -#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=303000,RESOLUTION=320x180,CODECS="avc1.66.30, mp4a.40.2" -http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0 -#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=575000,RESOLUTION=512x288,CODECS="avc1.66.30, mp4a.40.2" -http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0 -#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=831000,RESOLUTION=704x396,CODECS="avc1.77.30, mp4a.40.2" -http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0 -#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=1467000,RESOLUTION=1024x576,CODECS="avc1.77.30, mp4a.40.2" -http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0 diff --git a/test/testdata/m3u8/teamcoco_11995.m3u8 b/test/testdata/m3u8/teamcoco_11995.m3u8 deleted file mode 100644 index a6e421697..000000000 --- a/test/testdata/m3u8/teamcoco_11995.m3u8 +++ /dev/null @@ -1,16 +0,0 @@ -#EXTM3U -#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio-0",NAME="Default",AUTOSELECT=YES,DEFAULT=YES,URI="hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8" -#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio-1",NAME="Default",AUTOSELECT=YES,DEFAULT=YES,URI="hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8" -#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=37862000,CODECS="avc1.4d001f",URI="hls/CONAN_020217_Highlight_show-2m_iframe.m3u8" -#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=18750000,CODECS="avc1.4d001e",URI="hls/CONAN_020217_Highlight_show-1m_iframe.m3u8" -#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=6535000,CODECS="avc1.42001e",URI="hls/CONAN_020217_Highlight_show-400k_iframe.m3u8" -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2374000,RESOLUTION=1024x576,CODECS="avc1.4d001f,mp4a.40.2",AUDIO="audio-0" -hls/CONAN_020217_Highlight_show-2m_v4.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1205000,RESOLUTION=640x360,CODECS="avc1.4d001e,mp4a.40.2",AUDIO="audio-0" -hls/CONAN_020217_Highlight_show-1m_v4.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=522000,RESOLUTION=400x224,CODECS="avc1.42001e,mp4a.40.2",AUDIO="audio-0" -hls/CONAN_020217_Highlight_show-400k_v4.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=413000,RESOLUTION=400x224,CODECS="avc1.42001e,mp4a.40.5",AUDIO="audio-1" -hls/CONAN_020217_Highlight_show-400k_v4.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=71000,CODECS="mp4a.40.5",AUDIO="audio-1" -hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8 diff --git a/test/testdata/m3u8/ted_18923.m3u8 b/test/testdata/m3u8/ted_18923.m3u8 deleted file mode 100644 index 52a27118b..000000000 --- 
a/test/testdata/m3u8/ted_18923.m3u8 +++ /dev/null @@ -1,28 +0,0 @@ -#EXTM3U -#EXT-X-VERSION:4 -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360 -/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180 -/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 -/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 -/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 -/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480 -/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720 -/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b -#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES -/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b - -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" -#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" - -#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400 diff --git a/test/testdata/m3u8/toggle_mobile_12211.m3u8 b/test/testdata/m3u8/toggle_mobile_12211.m3u8 deleted file mode 100644 index 69604e683..000000000 --- a/test/testdata/m3u8/toggle_mobile_12211.m3u8 +++ /dev/null @@ -1,13 +0,0 @@ -#EXTM3U -#EXT-X-VERSION:4 -#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="eng",NAME="English",URI="http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8" 
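In master playlists like this toggle_mobile fixture, audio travels in separate renditions: each #EXT-X-MEDIA line declares a rendition inside a group (GROUP-ID="audio"), and the #EXT-X-STREAM-INF variants further down opt into that group via AUDIO="audio". A rough sketch of that resolution step, reusing the hypothetical parse_attrs helper from the earlier example:

    from collections import defaultdict

    def group_renditions(media_lines):
        # Index #EXT-X-MEDIA renditions by (TYPE, GROUP-ID).
        groups = defaultdict(list)
        for line in media_lines:
            attrs = parse_attrs(line.partition(':')[2])
            groups[(attrs.get('TYPE'), attrs.get('GROUP-ID'))].append(attrs)
        return groups

    # A variant carrying AUDIO="audio" would then draw its audio tracks from
    # groups[('AUDIO', 'audio')], i.e. the two renditions ("eng" and "und")
    # declared in this playlist.

The playlist's remaining rendition and variant lines continue below.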
-#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="und",NAME="Undefined",URI="http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8" - -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=155648,RESOLUTION=320x180,AUDIO="audio" -http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=502784,RESOLUTION=480x270,AUDIO="audio" -http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=827392,RESOLUTION=640x360,AUDIO="audio" -http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8 -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1396736,RESOLUTION=854x480,AUDIO="audio" -http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8 diff --git a/test/testdata/m3u8/twitch_vod.m3u8 b/test/testdata/m3u8/twitch_vod.m3u8 deleted file mode 100644 index 7617277ca..000000000 --- a/test/testdata/m3u8/twitch_vod.m3u8 +++ /dev/null @@ -1,20 +0,0 @@ -#EXTM3U -#EXT-X-TWITCH-INFO:ORIGIN="s3",CLUSTER="edgecast_vod",REGION="EU",MANIFEST-CLUSTER="edgecast_vod",USER-IP="109.171.17.81" -#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="chunked",NAME="Source",AUTOSELECT=YES,DEFAULT=YES -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=3214134,CODECS="avc1.100.31,mp4a.40.2",RESOLUTION="1280x720",VIDEO="chunked" -https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8 -#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="high",NAME="High",AUTOSELECT=YES,DEFAULT=YES -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1603789,CODECS="avc1.42C01F,mp4a.40.2",RESOLUTION="1280x720",VIDEO="high" -https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8 -#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="medium",NAME="Medium",AUTOSELECT=YES,DEFAULT=YES -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=893387,CODECS="avc1.42C01E,mp4a.40.2",RESOLUTION="852x480",VIDEO="medium" -https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8 -#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="low",NAME="Low",AUTOSELECT=YES,DEFAULT=YES -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=628347,CODECS="avc1.42C01E,mp4a.40.2",RESOLUTION="640x360",VIDEO="low" -https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8 -#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="mobile",NAME="Mobile",AUTOSELECT=YES,DEFAULT=YES -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=280474,CODECS="avc1.42C00D,mp4a.40.2",RESOLUTION="400x226",VIDEO="mobile" -https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8 -#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="audio_only",NAME="Audio Only",AUTOSELECT=NO,DEFAULT=NO -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=182725,CODECS="mp4a.40.2",VIDEO="audio_only" -https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8 diff --git a/test/testdata/m3u8/vidio.m3u8 b/test/testdata/m3u8/vidio.m3u8 deleted file mode 100644 index 89c244469..000000000 --- a/test/testdata/m3u8/vidio.m3u8 +++ /dev/null @@ -1,10 +0,0 @@ -#EXTM3U - -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=300000,RESOLUTION=480x270,NAME="270p 3G" 
-https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8 - -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=600000,RESOLUTION=640x360,NAME="360p SD" -https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8 - -#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1200000,RESOLUTION=1280x720,NAME="720p HD" -https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8 diff --git a/test/testdata/mpd/subtitles.mpd b/test/testdata/mpd/subtitles.mpd new file mode 100644 index 000000000..6f948adba --- /dev/null +++ b/test/testdata/mpd/subtitles.mpd @@ -0,0 +1,351 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- Created with Unified Streaming Platform (version=1.10.18-20255) --> +<MPD + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns="urn:mpeg:dash:schema:mpd:2011" + xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd" + type="static" + mediaPresentationDuration="PT14M48S" + maxSegmentDuration="PT1M" + minBufferTime="PT10S" + profiles="urn:mpeg:dash:profile:isoff-live:2011"> + <Period + id="1" + duration="PT14M48S"> + <BaseURL>dash/</BaseURL> + <AdaptationSet + id="1" + group="1" + contentType="audio" + segmentAlignment="true" + audioSamplingRate="48000" + mimeType="audio/mp4" + codecs="mp4a.40.2" + startWithSAP="1"> + <AudioChannelConfiguration + schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" + value="2" /> + <Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" /> + <SegmentTemplate + timescale="48000" + initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash" + media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash"> + <SegmentTimeline> + <S t="0" d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" 
r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="96256" r="2" /> + <S d="95232" /> + <S d="3584" /> + </SegmentTimeline> + </SegmentTemplate> + <Representation + id="audio=128001" + bandwidth="128001"> + </Representation> + </AdaptationSet> + <AdaptationSet + id="2" + group="3" + contentType="text" + lang="en" + mimeType="application/mp4" + codecs="stpp" + startWithSAP="1"> + <Role schemeIdUri="urn:mpeg:dash:role:2011" value="subtitle" /> + <SegmentTemplate + timescale="1000" + initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash" + media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash"> + <SegmentTimeline> + <S t="0" d="60000" r="9" /> + <S d="24000" /> + </SegmentTimeline> + </SegmentTemplate> + <Representation + 
id="textstream_eng=1000" + bandwidth="1000"> + </Representation> + </AdaptationSet> + <AdaptationSet + id="3" + group="2" + contentType="video" + par="960:409" + minBandwidth="100000" + maxBandwidth="4482000" + maxWidth="1689" + maxHeight="720" + segmentAlignment="true" + mimeType="video/mp4" + codecs="avc1.4D401F" + startWithSAP="1"> + <Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" /> + <SegmentTemplate + timescale="12288" + initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash" + media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash"> + <SegmentTimeline> + <S t="0" d="24576" r="443" /> + </SegmentTimeline> + </SegmentTemplate> + <Representation + id="video=100000" + bandwidth="100000" + width="336" + height="144" + sar="2880:2863" + scanType="progressive"> + </Representation> + <Representation + id="video=326000" + bandwidth="326000" + width="562" + height="240" + sar="115200:114929" + scanType="progressive"> + </Representation> + <Representation + id="video=698000" + bandwidth="698000" + width="844" + height="360" + sar="86400:86299" + scanType="progressive"> + </Representation> + <Representation + id="video=1493000" + bandwidth="1493000" + width="1126" + height="480" + sar="230400:230267" + scanType="progressive"> + </Representation> + <Representation + id="video=4482000" + bandwidth="4482000" + width="1688" + height="720" + sar="86400:86299" + scanType="progressive"> + </Representation> + </AdaptationSet> + </Period> +</MPD> diff --git a/test/testdata/thumbnails/foo %d bar/foo_%d.webp b/test/testdata/thumbnails/foo %d bar/foo_%d.webp Binary files differnew file mode 100644 index 000000000..d64d0839f --- /dev/null +++ b/test/testdata/thumbnails/foo %d bar/foo_%d.webp @@ -1,5 +1,7 @@ [tox] envlist = py26,py27,py33,py34,py35 + +# Needed? [testenv] deps = nose @@ -9,5 +11,5 @@ passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_socks.py -commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html +commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=yt_dlp --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube-dl.plugin.zsh b/youtube-dl.plugin.zsh deleted file mode 100644 index 17ab1341a..000000000 --- a/youtube-dl.plugin.zsh +++ /dev/null @@ -1,24 +0,0 @@ -# This allows the youtube-dl command to be installed in ZSH using antigen. -# Antigen is a bundle manager. It allows you to enhance the functionality of -# your zsh session by installing bundles and themes easily. - -# Antigen documentation: -# http://antigen.sharats.me/ -# https://github.com/zsh-users/antigen - -# Install youtube-dl: -# antigen bundle ytdl-org/youtube-dl -# Bundles installed by antigen are available for use immediately. - -# Update youtube-dl (and all other antigen bundles): -# antigen update - -# The antigen command will download the git repository to a folder and then -# execute an enabling script (this file). The complete process for loading the -# code is documented here: -# https://github.com/zsh-users/antigen#notes-on-writing-plugins - -# This specific script just aliases youtube-dl to the python script that this -# library provides. This requires updating the PYTHONPATH to ensure that the -# full set of code can be located. 
-alias youtube-dl="PYTHONPATH=$(dirname $0) $(dirname $0)/bin/youtube-dl" diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py deleted file mode 100755 index fe30758ef..000000000 --- a/youtube_dl/YoutubeDL.py +++ /dev/null @@ -1,2469 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from __future__ import absolute_import, unicode_literals - -import collections -import contextlib -import copy -import datetime -import errno -import fileinput -import io -import itertools -import json -import locale -import operator -import os -import platform -import re -import shutil -import subprocess -import socket -import sys -import time -import tokenize -import traceback -import random - -from string import ascii_letters - -from .compat import ( - compat_basestring, - compat_cookiejar, - compat_get_terminal_size, - compat_http_client, - compat_kwargs, - compat_numeric_types, - compat_os_name, - compat_str, - compat_tokenize_tokenize, - compat_urllib_error, - compat_urllib_request, - compat_urllib_request_DataHandler, -) -from .utils import ( - age_restricted, - args_to_str, - ContentTooShortError, - date_from_str, - DateRange, - DEFAULT_OUTTMPL, - determine_ext, - determine_protocol, - DownloadError, - encode_compat_str, - encodeFilename, - error_to_compat_str, - expand_path, - ExtractorError, - format_bytes, - formatSeconds, - GeoRestrictedError, - int_or_none, - ISO3166Utils, - locked_file, - make_HTTPS_handler, - MaxDownloadsReached, - orderedSet, - PagedList, - parse_filesize, - PerRequestProxyHandler, - platform_name, - PostProcessingError, - preferredencoding, - prepend_extension, - register_socks_protocols, - render_table, - replace_extension, - SameFileError, - sanitize_filename, - sanitize_path, - sanitize_url, - sanitized_Request, - std_headers, - str_or_none, - subtitles_filename, - UnavailableVideoError, - url_basename, - version_tuple, - write_json_file, - write_string, - YoutubeDLCookieJar, - YoutubeDLCookieProcessor, - YoutubeDLHandler, - YoutubeDLRedirectHandler, -) -from .cache import Cache -from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER -from .extractor.openload import PhantomJSwrapper -from .downloader import get_suitable_downloader -from .downloader.rtmp import rtmpdump_version -from .postprocessor import ( - FFmpegFixupM3u8PP, - FFmpegFixupM4aPP, - FFmpegFixupStretchedPP, - FFmpegMergerPP, - FFmpegPostProcessor, - get_postprocessor, -) -from .version import __version__ - -if compat_os_name == 'nt': - import ctypes - - -class YoutubeDL(object): - """YoutubeDL class. - - YoutubeDL objects are the ones responsible of downloading the - actual video file and writing it to disk if the user has requested - it, among some other tasks. In most cases there should be one per - program. As, given a video URL, the downloader doesn't know how to - extract all the needed information, task that InfoExtractors do, it - has to pass the URL to one of them. - - For this, YoutubeDL objects have a method that allows - InfoExtractors to be registered in a given order. When it is passed - a URL, the YoutubeDL object handles it to the first InfoExtractor it - finds that reports being able to handle it. The InfoExtractor extracts - all the information about the video or videos the URL refers to, and - YoutubeDL process the extracted information, possibly using a File - Downloader to download the video. - - YoutubeDL objects accept a lot of parameters. In order not to saturate - the object constructor with arguments, it receives a dictionary of - options instead. 
These options are available through the params - attribute for the InfoExtractors to use. The YoutubeDL also - registers itself as the downloader in charge for the InfoExtractors - that are added to it, so this is a "mutual registration". - - Available options: - - username: Username for authentication purposes. - password: Password for authentication purposes. - videopassword: Password for accessing a video. - ap_mso: Adobe Pass multiple-system operator identifier. - ap_username: Multiple-system operator account username. - ap_password: Multiple-system operator account password. - usenetrc: Use netrc for authentication instead. - verbose: Print additional info to stdout. - quiet: Do not print messages to stdout. - no_warnings: Do not print out anything for warnings. - forceurl: Force printing final URL. - forcetitle: Force printing title. - forceid: Force printing ID. - forcethumbnail: Force printing thumbnail URL. - forcedescription: Force printing description. - forcefilename: Force printing final filename. - forceduration: Force printing duration. - forcejson: Force printing info_dict as JSON. - dump_single_json: Force printing the info_dict of the whole playlist - (or video) as a single JSON line. - simulate: Do not download the video files. - format: Video format code. See options.py for more information. - outtmpl: Template for output names. - outtmpl_na_placeholder: Placeholder for unavailable meta fields. - restrictfilenames: Do not allow "&" and spaces in file names - ignoreerrors: Do not stop on download errors. - force_generic_extractor: Force downloader to use the generic extractor - nooverwrites: Prevent overwriting files. - playliststart: Playlist item to start at. - playlistend: Playlist item to end at. - playlist_items: Specific indices of playlist to download. - playlistreverse: Download playlist items in reverse order. - playlistrandom: Download playlist items in random order. - matchtitle: Download only matching titles. - rejecttitle: Reject downloads for matching titles. - logger: Log messages to a logging.Logger instance. - logtostderr: Log messages to stderr instead of stdout. - writedescription: Write the video description to a .description file - writeinfojson: Write the video description to a .info.json file - writeannotations: Write the video annotations to a .annotations.xml file - writethumbnail: Write the thumbnail image to a file - write_all_thumbnails: Write all thumbnail formats to files - writesubtitles: Write the video subtitles to a file - writeautomaticsub: Write the automatically generated subtitles to a file - allsubtitles: Downloads all the subtitles of the video - (requires writesubtitles or writeautomaticsub) - listsubtitles: Lists all available subtitles for the video - subtitlesformat: The format code for subtitles - subtitleslangs: List of languages of the subtitles to download - keepvideo: Keep the video file after post-processing - daterange: A DateRange object, download only if the upload_date is in the range. - skip_download: Skip the actual download of the video file - cachedir: Location of the cache files in the filesystem. - False to disable filesystem cache. - noplaylist: Download single video instead of a playlist if in doubt. - age_limit: An integer representing the user's age in years. - Unsuitable videos for the given age are skipped. - min_views: An integer representing the minimum view count the video - must have in order to not be skipped. - Videos without view count information are always - downloaded. None for no limit. 
- max_views: An integer representing the maximum view count. - Videos that are more popular than that are not - downloaded. - Videos without view count information are always - downloaded. None for no limit. - download_archive: File name of a file where all downloads are recorded. - Videos already present in the file are not downloaded - again. - cookiefile: File name where cookies should be read from and dumped to. - nocheckcertificate:Do not verify SSL certificates - prefer_insecure: Use HTTP instead of HTTPS to retrieve information. - At the moment, this is only supported by YouTube. - proxy: URL of the proxy server to use - geo_verification_proxy: URL of the proxy to use for IP address verification - on geo-restricted sites. - socket_timeout: Time to wait for unresponsive hosts, in seconds - bidi_workaround: Work around buggy terminals without bidirectional text - support, using fridibi - debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well - default_search: Prepend this string if an input url is not valid. - 'auto' for elaborate guessing - encoding: Use this encoding instead of the system-specified. - extract_flat: Do not resolve URLs, return the immediate result. - Pass in 'in_playlist' to only show this behavior for - playlist items. - postprocessors: A list of dictionaries, each with an entry - * key: The name of the postprocessor. See - youtube_dl/postprocessor/__init__.py for a list. - as well as any further keyword arguments for the - postprocessor. - progress_hooks: A list of functions that get called on download - progress, with a dictionary with the entries - * status: One of "downloading", "error", or "finished". - Check this first and ignore unknown values. - - If status is one of "downloading", or "finished", the - following properties may also be present: - * filename: The final filename (always present) - * tmpfilename: The filename we're currently writing to - * downloaded_bytes: Bytes on disk - * total_bytes: Size of the whole file, None if unknown - * total_bytes_estimate: Guess of the eventual file size, - None if unavailable. - * elapsed: The number of seconds since download started. - * eta: The estimated time in seconds, None if unknown - * speed: The download speed in bytes/second, None if - unknown - * fragment_index: The counter of the currently - downloaded video fragment. - * fragment_count: The number of fragments (= individual - files that will be merged) - - Progress hooks are guaranteed to be called at least once - (with status "finished") if the download is successful. - merge_output_format: Extension to use when merging formats. - fixup: Automatically correct known faults of the file. - One of: - - "never": do nothing - - "warn": only emit a warning - - "detect_or_warn": check whether we can do anything - about it, warn otherwise (default) - source_address: Client-side IP address to bind to. - call_home: Boolean, true iff we are allowed to contact the - youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download when - used alone or a lower bound of a range for randomized - sleep before each download (minimum possible number - of seconds to sleep) when used along with - max_sleep_interval. - max_sleep_interval:Upper bound of a range for randomized sleep before each - download (maximum possible number of seconds to sleep). - Must only be used along with sleep_interval. - Actual sleep time will be a random float from range - [sleep_interval; max_sleep_interval]. 
-    listformats:       Print an overview of available video formats and exit.
-    list_thumbnails:   Print a table of all thumbnails and exit.
-    match_filter:      A function that gets called with the info_dict of
-                       every video.
-                       If it returns a message, the video is ignored.
-                       If it returns None, the video is downloaded.
-                       match_filter_func in utils.py is one example of this.
-    no_color:          Do not emit color codes in output.
-    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
-                       HTTP header
-    geo_bypass_country:
-                       Two-letter ISO 3166-1 alpha-2 country code that will be
-                       used for explicit geographic restriction bypassing via
-                       faking X-Forwarded-For HTTP header
-    geo_bypass_ip_block:
-                       IP range in CIDR notation that will be used similarly to
-                       geo_bypass_country
-
-    The following options determine which downloader is picked:
-    external_downloader: Executable of the external downloader to call.
-                       None or unset for standard (built-in) downloader.
-    hls_prefer_native: Use the native HLS downloader if True, use
-                       ffmpeg/avconv if False, and fall back to the
-                       downloader suggested by the extractor if None.
-
-    The following parameters are not used by YoutubeDL itself; they are used by
-    the downloader (see youtube_dl/downloader/common.py):
-    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
-    noresizebuffer, retries, continuedl, noprogress, consoletitle,
-    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
-    http_chunk_size.
-
-    The following options are used by the post processors:
-    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
-                       otherwise prefer ffmpeg.
-    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
-                       to the binary or its containing directory.
-    postprocessor_args: A list of additional command-line arguments for the
-                        postprocessor.
-
-    The following options are used by the YouTube extractor:
-    youtube_include_dash_manifest: If True (default), DASH manifests and related
-                        data will be downloaded and processed by extractor.
-                        You can reduce network I/O by disabling it if you don't
-                        care about DASH.
-    """
-
-    _NUMERIC_FIELDS = set((
-        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
-        'timestamp', 'upload_year', 'upload_month', 'upload_day',
-        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
-        'average_rating', 'comment_count', 'age_limit',
-        'start_time', 'end_time',
-        'chapter_number', 'season_number', 'episode_number',
-        'track_number', 'disc_number', 'release_year',
-        'playlist_index',
-    ))
-
-    params = None
-    _ies = []
-    _pps = []
-    _download_retcode = None
-    _num_downloads = None
-    _playlist_level = 0
-    _playlist_urls = set()
-    _screen_file = None
-
-    def __init__(self, params=None, auto_init=True):
-        """Create a YoutubeDL object with the given options."""
-        if params is None:
-            params = {}
-        self._ies = []
-        self._ies_instances = {}
-        self._pps = []
-        self._progress_hooks = []
-        self._download_retcode = 0
-        self._num_downloads = 0
-        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
-        self._err_file = sys.stderr
-        self.params = {
-            # Default parameters
-            'nocheckcertificate': False,
-        }
-        self.params.update(params)
-        self.cache = Cache(self)
-
-        def check_deprecated(param, option, suggestion):
-            if self.params.get(param) is not None:
-                self.report_warning(
-                    '%s is deprecated. Use %s instead.'
% (option, suggestion))
-                return True
-            return False
-
-        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
-            if self.params.get('geo_verification_proxy') is None:
-                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
-
-        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N is the number of digits')
-        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
-        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
-
-        if params.get('bidi_workaround', False):
-            try:
-                import pty
-                master, slave = pty.openpty()
-                width = compat_get_terminal_size().columns
-                if width is None:
-                    width_args = []
-                else:
-                    width_args = ['-w', str(width)]
-                sp_kwargs = dict(
-                    stdin=subprocess.PIPE,
-                    stdout=slave,
-                    stderr=self._err_file)
-                try:
-                    self._output_process = subprocess.Popen(
-                        ['bidiv'] + width_args, **sp_kwargs
-                    )
-                except OSError:
-                    self._output_process = subprocess.Popen(
-                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
-                self._output_channel = os.fdopen(master, 'rb')
-            except OSError as ose:
-                if ose.errno == errno.ENOENT:
-                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.')
-                else:
-                    raise
-
-        if (sys.platform != 'win32'
-                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
-                and not params.get('restrictfilenames', False)):
-            # Unicode filesystem API will throw errors (#1474, #13027)
-            self.report_warning(
-                'Assuming --restrict-filenames since file system encoding '
-                'cannot encode all characters. '
-                'Set the LC_ALL environment variable to fix this.')
-            self.params['restrictfilenames'] = True
-
-        if isinstance(params.get('outtmpl'), bytes):
-            self.report_warning(
-                'Parameter outtmpl is bytes, but should be a unicode string. '
-                'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
-
-        self._setup_opener()
-
-        if auto_init:
-            self.print_debug_header()
-            self.add_default_info_extractors()
-
-        for pp_def_raw in self.params.get('postprocessors', []):
-            pp_class = get_postprocessor(pp_def_raw['key'])
-            pp_def = dict(pp_def_raw)
-            del pp_def['key']
-            pp = pp_class(self, **compat_kwargs(pp_def))
-            self.add_post_processor(pp)
-
-        for ph in self.params.get('progress_hooks', []):
-            self.add_progress_hook(ph)
-
-        register_socks_protocols()
-
-    def warn_if_short_id(self, argv):
-        # short YouTube ID starting with dash?
-        idxs = [
-            i for i, a in enumerate(argv)
-            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
-        if idxs:
-            correct_argv = (
-                ['youtube-dl']
-                + [a for i, a in enumerate(argv) if i not in idxs]
-                + ['--'] + [argv[i] for i in idxs]
-            )
-            self.report_warning(
-                'Long argument string detected. '
-                'Use -- to separate parameters and URLs, like this:\n%s\n' %
-                args_to_str(correct_argv))
-
-    def add_info_extractor(self, ie):
-        """Add an InfoExtractor object to the end of the list."""
-        self._ies.append(ie)
-        if not isinstance(ie, type):
-            self._ies_instances[ie.ie_key()] = ie
-            ie.set_downloader(self)
-
-    def get_info_extractor(self, ie_key):
-        """
-        Get an instance of an IE with name ie_key. It will try to get one from
-        the _ies list; if there is no instance, it will create a new one and
-        add it to the extractor list.
- """ - ie = self._ies_instances.get(ie_key) - if ie is None: - ie = get_info_extractor(ie_key)() - self.add_info_extractor(ie) - return ie - - def add_default_info_extractors(self): - """ - Add the InfoExtractors returned by gen_extractors to the end of the list - """ - for ie in gen_extractor_classes(): - self.add_info_extractor(ie) - - def add_post_processor(self, pp): - """Add a PostProcessor object to the end of the chain.""" - self._pps.append(pp) - pp.set_downloader(self) - - def add_progress_hook(self, ph): - """Add the progress hook (currently only for the file downloader)""" - self._progress_hooks.append(ph) - - def _bidi_workaround(self, message): - if not hasattr(self, '_output_channel'): - return message - - assert hasattr(self, '_output_process') - assert isinstance(message, compat_str) - line_count = message.count('\n') + 1 - self._output_process.stdin.write((message + '\n').encode('utf-8')) - self._output_process.stdin.flush() - res = ''.join(self._output_channel.readline().decode('utf-8') - for _ in range(line_count)) - return res[:-len('\n')] - - def to_screen(self, message, skip_eol=False): - """Print message to stdout if not in quiet mode.""" - return self.to_stdout(message, skip_eol, check_quiet=True) - - def _write_string(self, s, out=None): - write_string(s, out=out, encoding=self.params.get('encoding')) - - def to_stdout(self, message, skip_eol=False, check_quiet=False): - """Print message to stdout if not in quiet mode.""" - if self.params.get('logger'): - self.params['logger'].debug(message) - elif not check_quiet or not self.params.get('quiet', False): - message = self._bidi_workaround(message) - terminator = ['\n', ''][skip_eol] - output = message + terminator - - self._write_string(output, self._screen_file) - - def to_stderr(self, message): - """Print message to stderr.""" - assert isinstance(message, compat_str) - if self.params.get('logger'): - self.params['logger'].error(message) - else: - message = self._bidi_workaround(message) - output = message + '\n' - self._write_string(output, self._err_file) - - def to_console_title(self, message): - if not self.params.get('consoletitle', False): - return - if compat_os_name == 'nt': - if ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) - elif 'TERM' in os.environ: - self._write_string('\033]0;%s\007' % message, self._screen_file) - - def save_console_title(self): - if not self.params.get('consoletitle', False): - return - if self.params.get('simulate', False): - return - if compat_os_name != 'nt' and 'TERM' in os.environ: - # Save the title on stack - self._write_string('\033[22;0t', self._screen_file) - - def restore_console_title(self): - if not self.params.get('consoletitle', False): - return - if self.params.get('simulate', False): - return - if compat_os_name != 'nt' and 'TERM' in os.environ: - # Restore the title from stack - self._write_string('\033[23;0t', self._screen_file) - - def __enter__(self): - self.save_console_title() - return self - - def __exit__(self, *args): - self.restore_console_title() - - if self.params.get('cookiefile') is not None: - self.cookiejar.save(ignore_discard=True, ignore_expires=True) - - def trouble(self, message=None, tb=None): - """Determine action to take when a download problem appears. 
-
-        Depending on whether the downloader has been configured to ignore
-        download errors or not, this method may raise an exception
-        (after printing the message) or merely set a non-zero return code.
-
-        tb, if given, is additional traceback information.
-        """
-        if message is not None:
-            self.to_stderr(message)
-        if self.params.get('verbose'):
-            if tb is None:
-                if sys.exc_info()[0]:  # if .trouble has been called from an except block
-                    tb = ''
-                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
-                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
-                    tb += encode_compat_str(traceback.format_exc())
-                else:
-                    tb_data = traceback.format_list(traceback.extract_stack())
-                    tb = ''.join(tb_data)
-            self.to_stderr(tb)
-        if not self.params.get('ignoreerrors', False):
-            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
-                exc_info = sys.exc_info()[1].exc_info
-            else:
-                exc_info = sys.exc_info()
-            raise DownloadError(message, exc_info)
-        self._download_retcode = 1
-
-    def report_warning(self, message):
-        '''
-        Print the message to stderr; it will be prefixed with 'WARNING:'.
-        If stderr is a tty file, the 'WARNING:' prefix will be colored.
-        '''
-        if self.params.get('logger') is not None:
-            self.params['logger'].warning(message)
-        else:
-            if self.params.get('no_warnings'):
-                return
-            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
-                _msg_header = '\033[0;33mWARNING:\033[0m'
-            else:
-                _msg_header = 'WARNING:'
-            warning_message = '%s %s' % (_msg_header, message)
-            self.to_stderr(warning_message)
-
-    def report_error(self, message, tb=None):
-        '''
-        Do the same as trouble, but prefix the message with 'ERROR:', colored
-        in red if stderr is a tty file.
-        '''
-        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
-            _msg_header = '\033[0;31mERROR:\033[0m'
-        else:
-            _msg_header = 'ERROR:'
-        error_message = '%s %s' % (_msg_header, message)
-        self.trouble(error_message, tb)
-
-    def report_file_already_downloaded(self, file_name):
-        """Report file has already been fully downloaded."""
-        try:
-            self.to_screen('[download] %s has already been downloaded' % file_name)
-        except UnicodeEncodeError:
-            self.to_screen('[download] The file has already been downloaded')
-
-    def prepare_filename(self, info_dict):
-        """Generate the output filename."""
-        try:
-            template_dict = dict(info_dict)
-
-            template_dict['epoch'] = int(time.time())
-            autonumber_size = self.params.get('autonumber_size')
-            if autonumber_size is None:
-                autonumber_size = 5
-            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
-            if template_dict.get('resolution') is None:
-                if template_dict.get('width') and template_dict.get('height'):
-                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
-                elif template_dict.get('height'):
-                    template_dict['resolution'] = '%sp' % template_dict['height']
-                elif template_dict.get('width'):
-                    template_dict['resolution'] = '%dx?'
% template_dict['width'] - - sanitize = lambda k, v: sanitize_filename( - compat_str(v), - restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id' or k.endswith('_id'))) - template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) - for k, v in template_dict.items() - if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) - - outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - - # For fields playlist_index and autonumber convert all occurrences - # of %(field)s to %(field)0Nd for backward compatibility - field_size_compat_map = { - 'playlist_index': len(str(template_dict['n_entries'])), - 'autonumber': autonumber_size, - } - FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s' - mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl) - if mobj: - outtmpl = re.sub( - FIELD_SIZE_COMPAT_RE, - r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], - outtmpl) - - # Missing numeric fields used together with integer presentation types - # in format specification will break the argument substitution since - # string NA placeholder is returned for missing fields. We will patch - # output template for missing fields to meet string presentation type. - for numeric_field in self._NUMERIC_FIELDS: - if numeric_field not in template_dict: - # As of [1] format syntax is: - # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type - # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting - FORMAT_RE = r'''(?x) - (?<!%) - % - \({0}\) # mapping key - (?:[#0\-+ ]+)? # conversion flags (optional) - (?:\d+)? # minimum field width (optional) - (?:\.\d+)? # precision (optional) - [hlL]? # length modifier (optional) - [diouxXeEfFgGcrs%] # conversion type - ''' - outtmpl = re.sub( - FORMAT_RE.format(numeric_field), - r'%({0})s'.format(numeric_field), outtmpl) - - # expand_path translates '%%' into '%' and '$$' into '$' - # correspondingly that is not what we want since we need to keep - # '%%' intact for template dict substitution step. Working around - # with boundary-alike separator hack. - sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) - outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) - - # outtmpl should be expand_path'ed before template dict substitution - # because meta fields may contain env variables we don't want to - # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and - # title "Hello $PATH", we don't want `$PATH` to be expanded. 
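-            # Illustrative aside (made-up values): the substitution below is
-            # plain %-formatting against a defaultdict, so fields missing from
-            # the template dict fall back to the NA placeholder instead of
-            # raising KeyError:
-            #
-            #     import collections
-            #     tmpl = '%(title)s-%(id)s.%(ext)s'
-            #     fields = collections.defaultdict(lambda: 'NA', title='clip')
-            #     tmpl % fields  # -> 'clip-NA.NA'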
- filename = expand_path(outtmpl).replace(sep, '') % template_dict - - # Temporary fix for #4787 - # 'Treat' all problem characters by passing filename through preferredencoding - # to workaround encoding issues with subprocess on python2 @ Windows - if sys.version_info < (3, 0) and sys.platform == 'win32': - filename = encodeFilename(filename, True).decode(preferredencoding()) - return sanitize_path(filename) - except ValueError as err: - self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') - return None - - def _match_entry(self, info_dict, incomplete): - """ Returns None iff the file should be downloaded """ - - video_title = info_dict.get('title', info_dict.get('id', 'video')) - if 'title' in info_dict: - # This can happen when we're just evaluating the playlist - title = info_dict['title'] - matchtitle = self.params.get('matchtitle', False) - if matchtitle: - if not re.search(matchtitle, title, re.IGNORECASE): - return '"' + title + '" title did not match pattern "' + matchtitle + '"' - rejecttitle = self.params.get('rejecttitle', False) - if rejecttitle: - if re.search(rejecttitle, title, re.IGNORECASE): - return '"' + title + '" title matched reject pattern "' + rejecttitle + '"' - date = info_dict.get('upload_date') - if date is not None: - dateRange = self.params.get('daterange', DateRange()) - if date not in dateRange: - return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) - view_count = info_dict.get('view_count') - if view_count is not None: - min_views = self.params.get('min_views') - if min_views is not None and view_count < min_views: - return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) - max_views = self.params.get('max_views') - if max_views is not None and view_count > max_views: - return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) - if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): - return 'Skipping "%s" because it is age restricted' % video_title - if self.in_download_archive(info_dict): - return '%s has already been recorded in archive' % video_title - - if not incomplete: - match_filter = self.params.get('match_filter') - if match_filter is not None: - ret = match_filter(info_dict) - if ret is not None: - return ret - - return None - - @staticmethod - def add_extra_info(info_dict, extra_info): - '''Set the keys from extra_info in info dict if they are missing''' - for key, value in extra_info.items(): - info_dict.setdefault(key, value) - - def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True, force_generic_extractor=False): - """ - Return a list with a dictionary for each video extracted. - - Arguments: - url -- URL to extract - - Keyword arguments: - download -- whether to download videos during extraction - ie_key -- extractor key hint - extra_info -- dictionary containing the extra values to add to each result - process -- whether to resolve all unresolved references (URLs, playlist items), - must be True for download to work. 
-        force_generic_extractor -- force using the generic extractor
-        """
-
-        if not ie_key and force_generic_extractor:
-            ie_key = 'Generic'
-
-        if ie_key:
-            ies = [self.get_info_extractor(ie_key)]
-        else:
-            ies = self._ies
-
-        for ie in ies:
-            if not ie.suitable(url):
-                continue
-
-            ie = self.get_info_extractor(ie.ie_key())
-            if not ie.working():
-                self.report_warning('The program functionality for this site has been marked as broken, '
-                                    'and will probably not work.')
-
-            return self.__extract_info(url, ie, download, extra_info, process)
-        else:
-            self.report_error('no suitable InfoExtractor for URL %s' % url)
-
-    def __handle_extraction_exceptions(func):
-        def wrapper(self, *args, **kwargs):
-            try:
-                return func(self, *args, **kwargs)
-            except GeoRestrictedError as e:
-                msg = e.msg
-                if e.countries:
-                    msg += '\nThis video is available in %s.' % ', '.join(
-                        map(ISO3166Utils.short2full, e.countries))
-                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
-                self.report_error(msg)
-            except ExtractorError as e:  # An error we somewhat expected
-                self.report_error(compat_str(e), e.format_traceback())
-            except MaxDownloadsReached:
-                raise
-            except Exception as e:
-                if self.params.get('ignoreerrors', False):
-                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
-                else:
-                    raise
-        return wrapper
-
-    @__handle_extraction_exceptions
-    def __extract_info(self, url, ie, download, extra_info, process):
-        ie_result = ie.extract(url)
-        if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
-            return
-        if isinstance(ie_result, list):
-            # Backwards compatibility: old IE result format
-            ie_result = {
-                '_type': 'compat_list',
-                'entries': ie_result,
-            }
-        self.add_default_extra_info(ie_result, ie, url)
-        if process:
-            return self.process_ie_result(ie_result, download, extra_info)
-        else:
-            return ie_result
-
-    def add_default_extra_info(self, ie_result, ie, url):
-        self.add_extra_info(ie_result, {
-            'extractor': ie.IE_NAME,
-            'webpage_url': url,
-            'webpage_url_basename': url_basename(url),
-            'extractor_key': ie.ie_key(),
-        })
-
-    def process_ie_result(self, ie_result, download=True, extra_info={}):
-        """
-        Take the result of the ie (may be modified) and resolve all unresolved
-        references (URLs, playlist items).
-
-        It will also download the videos if 'download'.
-        Returns the resolved ie_result.
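-
-        The ie_result shapes dispatched on here look roughly like the
-        following (field values are illustrative, not exhaustive):
-            {'_type': 'video', 'id': ..., 'title': ..., 'formats': [...]}
-            {'_type': 'url', 'url': ..., 'ie_key': 'Youtube'}
-            {'_type': 'url_transparent', 'url': ..., 'title': ...}
-            {'_type': 'playlist', 'title': ..., 'entries': [...]}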
-        """
-        result_type = ie_result.get('_type', 'video')
-
-        if result_type in ('url', 'url_transparent'):
-            ie_result['url'] = sanitize_url(ie_result['url'])
-            extract_flat = self.params.get('extract_flat', False)
-            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
-                    or extract_flat is True):
-                self.__forced_printings(
-                    ie_result, self.prepare_filename(ie_result),
-                    incomplete=True)
-                return ie_result
-
-        if result_type == 'video':
-            self.add_extra_info(ie_result, extra_info)
-            return self.process_video_result(ie_result, download=download)
-        elif result_type == 'url':
-            # We have to add extra_info to the results because it may be
-            # contained in a playlist
-            return self.extract_info(ie_result['url'],
-                                     download,
-                                     ie_key=ie_result.get('ie_key'),
-                                     extra_info=extra_info)
-        elif result_type == 'url_transparent':
-            # Use the information from the embedding page
-            info = self.extract_info(
-                ie_result['url'], ie_key=ie_result.get('ie_key'),
-                extra_info=extra_info, download=False, process=False)
-
-            # extract_info may return None when ignoreerrors is enabled and
-            # extraction failed with an error; don't crash and return early
-            # in this case
-            if not info:
-                return info
-
-            force_properties = dict(
-                (k, v) for k, v in ie_result.items() if v is not None)
-            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
-                if f in force_properties:
-                    del force_properties[f]
-            new_result = info.copy()
-            new_result.update(force_properties)
-
-            # Extracted info may not be a video result (i.e.
-            # info.get('_type', 'video') != 'video') but rather a URL or
-            # url_transparent result. In such cases outer metadata (from
-            # ie_result) should be propagated to the inner one (info).
-            # For this to happen the _type of info should be overridden with
-            # url_transparent. This fixes the issue from
-            # https://github.com/ytdl-org/youtube-dl/pull/11163.
-            if new_result.get('_type') == 'url':
-                new_result['_type'] = 'url_transparent'
-
-            return self.process_ie_result(
-                new_result, download=download, extra_info=extra_info)
-        elif result_type in ('playlist', 'multi_video'):
-            # Protect from infinite recursion due to recursively nested playlists
-            # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
-            webpage_url = ie_result['webpage_url']
-            if webpage_url in self._playlist_urls:
-                self.to_screen(
-                    '[download] Skipping already downloaded playlist: %s'
-                    % (ie_result.get('title') or ie_result.get('id')))
-                return
-
-            self._playlist_level += 1
-            self._playlist_urls.add(webpage_url)
-            try:
-                return self.__process_playlist(ie_result, download)
-            finally:
-                self._playlist_level -= 1
-                if not self._playlist_level:
-                    self._playlist_urls.clear()
-        elif result_type == 'compat_list':
-            self.report_warning(
-                'Extractor %s returned a compat_list result. '
-                'It needs to be updated.'
% ie_result.get('extractor')) - - def _fixup(r): - self.add_extra_info( - r, - { - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - ) - return r - ie_result['entries'] = [ - self.process_ie_result(_fixup(r), download, extra_info) - for r in ie_result['entries'] - ] - return ie_result - else: - raise Exception('Invalid result type: %s' % result_type) - - def __process_playlist(self, ie_result, download): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): - self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) - - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, - 
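-                # The fields below mirror add_default_extra_info, so each
-                # playlist entry inherits the playlist page's extractor metadata: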
'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) - # TODO: skip failed (empty) entries? - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result - - @__handle_extraction_exceptions - def __process_iterable_entry(self, entry, download, extra_info): - return self.process_ie_result( - entry, download=download, extra_info=extra_info) - - def _build_format_filter(self, filter_spec): - " Returns a function to filter the formats according to the filter_spec " - - OPERATORS = { - '<': operator.lt, - '<=': operator.le, - '>': operator.gt, - '>=': operator.ge, - '=': operator.eq, - '!=': operator.ne, - } - operator_rex = re.compile(r'''(?x)\s* - (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* - (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) - $ - ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(filter_spec) - if m: - try: - comparison_value = int(m.group('value')) - except ValueError: - comparison_value = parse_filesize(m.group('value')) - if comparison_value is None: - comparison_value = parse_filesize(m.group('value') + 'B') - if comparison_value is None: - raise ValueError( - 'Invalid value %r in format specification %r' % ( - m.group('value'), filter_spec)) - op = OPERATORS[m.group('op')] - - if not m: - STR_OPERATORS = { - '=': operator.eq, - '^=': lambda attr, value: attr.startswith(value), - '$=': lambda attr, value: attr.endswith(value), - '*=': lambda attr, value: value in attr, - } - str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id|language) - \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)? 
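-            # (?x) mode: e.g. matches specs like  ext=mp4 , vcodec^=avc1 ,
-            # protocol!*=dash  ('!' negates the string comparison; a trailing
-            # '?' makes the filter pass for formats missing the key)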
- \s*(?P<value>[a-zA-Z0-9._-]+) - \s*$ - ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(filter_spec) - if m: - comparison_value = m.group('value') - str_op = STR_OPERATORS[m.group('op')] - if m.group('negation'): - op = lambda attr, value: not str_op(attr, value) - else: - op = str_op - - if not m: - raise ValueError('Invalid filter specification %r' % filter_spec) - - def _filter(f): - actual_value = f.get(m.group('key')) - if actual_value is None: - return m.group('none_inclusive') - return op(actual_value, comparison_value) - return _filter - - def _default_format_spec(self, info_dict, download=True): - - def can_merge(): - merger = FFmpegMergerPP(self) - return merger.available and merger.can_merge() - - def prefer_best(): - if self.params.get('simulate', False): - return False - if not download: - return False - if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': - return True - if info_dict.get('is_live'): - return True - if not can_merge(): - return True - return False - - req_format_list = ['bestvideo+bestaudio', 'best'] - if prefer_best(): - req_format_list.reverse() - return '/'.join(req_format_list) - - def build_format_selector(self, format_spec): - def syntax_error(note, start): - message = ( - 'Invalid format specification: ' - '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) - return SyntaxError(message) - - PICKFIRST = 'PICKFIRST' - MERGE = 'MERGE' - SINGLE = 'SINGLE' - GROUP = 'GROUP' - FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) - - def _parse_filter(tokens): - filter_parts = [] - for type, string, start, _, _ in tokens: - if type == tokenize.OP and string == ']': - return ''.join(filter_parts) - else: - filter_parts.append(string) - - def _remove_unused_ops(tokens): - # Remove operators that we don't use and join them with the surrounding strings - # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' - ALLOWED_OPS = ('/', '+', ',', '(', ')') - last_string, last_start, last_end, last_line = None, None, None, None - for type, string, start, end, line in tokens: - if type == tokenize.OP and string == '[': - if last_string: - yield tokenize.NAME, last_string, last_start, last_end, last_line - last_string = None - yield type, string, start, end, line - # everything inside brackets will be handled by _parse_filter - for type, string, start, end, line in tokens: - yield type, string, start, end, line - if type == tokenize.OP and string == ']': - break - elif type == tokenize.OP and string in ALLOWED_OPS: - if last_string: - yield tokenize.NAME, last_string, last_start, last_end, last_line - last_string = None - yield type, string, start, end, line - elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: - if not last_string: - last_string = string - last_start = start - last_end = end - else: - last_string += string - if last_string: - yield tokenize.NAME, last_string, last_start, last_end, last_line - - def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): - selectors = [] - current_selector = None - for type, string, start, _, _ in tokens: - # ENCODING is only defined in python 3.x - if type == getattr(tokenize, 'ENCODING', None): - continue - elif type in [tokenize.NAME, tokenize.NUMBER]: - current_selector = FormatSelector(SINGLE, string, []) - elif type == tokenize.OP: - if string == ')': - if not inside_group: - # ')' will be handled by the parentheses group - tokens.restore_last_token() - 
break - elif inside_merge and string in ['/', ',']: - tokens.restore_last_token() - break - elif inside_choice and string == ',': - tokens.restore_last_token() - break - elif string == ',': - if not current_selector: - raise syntax_error('"," must follow a format selector', start) - selectors.append(current_selector) - current_selector = None - elif string == '/': - if not current_selector: - raise syntax_error('"/" must follow a format selector', start) - first_choice = current_selector - second_choice = _parse_format_selection(tokens, inside_choice=True) - current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) - elif string == '[': - if not current_selector: - current_selector = FormatSelector(SINGLE, 'best', []) - format_filter = _parse_filter(tokens) - current_selector.filters.append(format_filter) - elif string == '(': - if current_selector: - raise syntax_error('Unexpected "("', start) - group = _parse_format_selection(tokens, inside_group=True) - current_selector = FormatSelector(GROUP, group, []) - elif string == '+': - if inside_merge: - raise syntax_error('Unexpected "+"', start) - video_selector = current_selector - audio_selector = _parse_format_selection(tokens, inside_merge=True) - if not video_selector or not audio_selector: - raise syntax_error('"+" must be between two format selectors', start) - current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) - else: - raise syntax_error('Operator not recognized: "{0}"'.format(string), start) - elif type == tokenize.ENDMARKER: - break - if current_selector: - selectors.append(current_selector) - return selectors - - def _build_selector_function(selector): - if isinstance(selector, list): - fs = [_build_selector_function(s) for s in selector] - - def selector_function(ctx): - for f in fs: - for format in f(ctx): - yield format - return selector_function - elif selector.type == GROUP: - selector_function = _build_selector_function(selector.selector) - elif selector.type == PICKFIRST: - fs = [_build_selector_function(s) for s in selector.selector] - - def selector_function(ctx): - for f in fs: - picked_formats = list(f(ctx)) - if picked_formats: - return picked_formats - return [] - elif selector.type == SINGLE: - format_spec = selector.selector - - def selector_function(ctx): - formats = list(ctx['formats']) - if not formats: - return - if format_spec == 'all': - for f in formats: - yield f - elif format_spec in ['best', 'worst', None]: - format_idx = 0 if format_spec == 'worst' else -1 - audiovideo_formats = [ - f for f in formats - if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] - if audiovideo_formats: - yield audiovideo_formats[format_idx] - # for extractors with incomplete formats (audio only (soundcloud) - # or video only (imgur)) we will fallback to best/worst - # {video,audio}-only format - elif ctx['incomplete_formats']: - yield formats[format_idx] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in formats - if f.get('vcodec') == 'none'] - if audio_formats: - yield audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in formats - if f.get('vcodec') == 'none'] - if audio_formats: - yield audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in formats - if f.get('acodec') == 'none'] - if video_formats: - yield video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in formats - if f.get('acodec') == 'none'] - if video_formats: - yield video_formats[0] - else: - 
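-                    # Fall back to treating the spec as a file extension
-                    # (e.g. 'mp4') or as an exact format_id (e.g. '137')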
extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, formats)) - if matches: - yield matches[-1] - elif selector.type == MERGE: - def _merge(formats_info): - format_1, format_2 = [f['format_id'] for f in formats_info] - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - # Formats must be opposite (video+audio) - if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': - self.report_error( - 'Both formats %s and %s are video-only, you must specify "-f video+audio"' - % (format_1, format_2)) - return - output_ext = ( - formats_info[0]['ext'] - if self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - return { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': formats_info[1].get('acodec'), - 'abr': formats_info[1].get('abr'), - 'ext': output_ext, - } - video_selector, audio_selector = map(_build_selector_function, selector.selector) - - def selector_function(ctx): - for pair in itertools.product( - video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): - yield _merge(pair) - - filters = [self._build_format_filter(f) for f in selector.filters] - - def final_selector(ctx): - ctx_copy = copy.deepcopy(ctx) - for _filter in filters: - ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) - return selector_function(ctx_copy) - return final_selector - - stream = io.BytesIO(format_spec.encode('utf-8')) - try: - tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) - except tokenize.TokenError: - raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) - - class TokenIterator(object): - def __init__(self, tokens): - self.tokens = tokens - self.counter = 0 - - def __iter__(self): - return self - - def __next__(self): - if self.counter >= len(self.tokens): - raise StopIteration() - value = self.tokens[self.counter] - self.counter += 1 - return value - - next = __next__ - - def restore_last_token(self): - self.counter -= 1 - - parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) - return _build_selector_function(parsed_selector) - - def _calc_headers(self, info_dict): - res = std_headers.copy() - - add_headers = info_dict.get('http_headers') - if add_headers: - res.update(add_headers) - - cookies = self._calc_cookies(info_dict) - if cookies: - res['Cookie'] = cookies - - if 'X-Forwarded-For' not in res: - x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') - if x_forwarded_for_ip: - res['X-Forwarded-For'] = x_forwarded_for_ip - - return res - - def _calc_cookies(self, info_dict): - pr = sanitized_Request(info_dict['url']) - 
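-        # The request object is never sent; it only lets the cookiejar
-        # compute the Cookie header that matches this URL.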
self.cookiejar.add_cookie_header(pr) - return pr.get_header('Cookie') - - def process_video_result(self, info_dict, download=True): - assert info_dict.get('_type', 'video') == 'video' - - if 'id' not in info_dict: - raise ExtractorError('Missing "id" field in extractor result') - if 'title' not in info_dict: - raise ExtractorError('Missing "title" field in extractor result') - - def report_force_conversion(field, field_not, conversion): - self.report_warning( - '"%s" field is not %s - forcing %s conversion, there is an error in extractor' - % (field, field_not, conversion)) - - def sanitize_string_field(info, string_field): - field = info.get(string_field) - if field is None or isinstance(field, compat_str): - return - report_force_conversion(string_field, 'a string', 'string') - info[string_field] = compat_str(field) - - def sanitize_numeric_fields(info): - for numeric_field in self._NUMERIC_FIELDS: - field = info.get(numeric_field) - if field is None or isinstance(field, compat_numeric_types): - continue - report_force_conversion(numeric_field, 'numeric', 'int') - info[numeric_field] = int_or_none(field) - - sanitize_string_field(info_dict, 'id') - sanitize_numeric_fields(info_dict) - - if 'playlist' not in info_dict: - # It isn't part of a playlist - info_dict['playlist'] = None - info_dict['playlist_index'] = None - - thumbnails = info_dict.get('thumbnails') - if thumbnails is None: - thumbnail = info_dict.get('thumbnail') - if thumbnail: - info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] - if thumbnails: - thumbnails.sort(key=lambda t: ( - t.get('preference') if t.get('preference') is not None else -1, - t.get('width') if t.get('width') is not None else -1, - t.get('height') if t.get('height') is not None else -1, - t.get('id') if t.get('id') is not None else '', t.get('url'))) - for i, t in enumerate(thumbnails): - t['url'] = sanitize_url(t['url']) - if t.get('width') and t.get('height'): - t['resolution'] = '%dx%d' % (t['width'], t['height']) - if t.get('id') is None: - t['id'] = '%d' % i - - if self.params.get('list_thumbnails'): - self.list_thumbnails(info_dict) - return - - thumbnail = info_dict.get('thumbnail') - if thumbnail: - info_dict['thumbnail'] = sanitize_url(thumbnail) - elif thumbnails: - info_dict['thumbnail'] = thumbnails[-1]['url'] - - if 'display_id' not in info_dict and 'id' in info_dict: - info_dict['display_id'] = info_dict['id'] - - for ts_key, date_key in ( - ('timestamp', 'upload_date'), - ('release_timestamp', 'release_date'), - ): - if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass - - # Auto generate title fields corresponding to the *_number fields when missing - # in order to always have clean titles. This is very common for TV series. 
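-        # (e.g. season_number=2 with no 'season' field yields season = 'Season 2')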
- for field in ('chapter', 'season', 'episode'): - if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): - info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) - - for cc_kind in ('subtitles', 'automatic_captions'): - cc = info_dict.get(cc_kind) - if cc: - for _, subtitle in cc.items(): - for subtitle_format in subtitle: - if subtitle_format.get('url'): - subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if subtitle_format.get('ext') is None: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() - - automatic_captions = info_dict.get('automatic_captions') - subtitles = info_dict.get('subtitles') - - if self.params.get('listsubtitles', False): - if 'automatic_captions' in info_dict: - self.list_subtitles( - info_dict['id'], automatic_captions, 'automatic captions') - self.list_subtitles(info_dict['id'], subtitles, 'subtitles') - return - - info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, automatic_captions) - - # We now pick which formats have to be downloaded - if info_dict.get('formats') is None: - # There's only one format available - formats = [info_dict] - else: - formats = info_dict['formats'] - - if not formats: - raise ExtractorError('No video formats found!') - - def is_wellformed(f): - url = f.get('url') - if not url: - self.report_warning( - '"url" field is missing or empty - skipping format, ' - 'there is an error in extractor') - return False - if isinstance(url, bytes): - sanitize_string_field(f, 'url') - return True - - # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) - - formats_dict = {} - - # We check that all the formats have the format and format_id fields - for i, format in enumerate(formats): - sanitize_string_field(format, 'format_id') - sanitize_numeric_fields(format) - format['url'] = sanitize_url(format['url']) - if not format.get('format_id'): - format['format_id'] = compat_str(i) - else: - # Sanitize format_id from characters used in format selector expression - format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) - format_id = format['format_id'] - if format_id not in formats_dict: - formats_dict[format_id] = [] - formats_dict[format_id].append(format) - - # Make sure all formats have unique format_id - for format_id, ambiguous_formats in formats_dict.items(): - if len(ambiguous_formats) > 1: - for i, format in enumerate(ambiguous_formats): - format['format_id'] = '%s-%d' % (format_id, i) - - for i, format in enumerate(formats): - if format.get('format') is None: - format['format'] = '{id} - {res}{note}'.format( - id=format['format_id'], - res=self.format_resolution(format), - note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', - ) - # Automatically determine file extension if missing - if format.get('ext') is None: - format['ext'] = determine_ext(format['url']).lower() - # Automatically determine protocol if missing (useful for format - # selection purposes) - if format.get('protocol') is None: - format['protocol'] = determine_protocol(format) - # Add HTTP headers, so that external programs can use them from the - # json output - full_format_info = info_dict.copy() - full_format_info.update(format) - format['http_headers'] = self._calc_headers(full_format_info) - # Remove private housekeeping stuff - if '__x_forwarded_for_ip' in info_dict: - del info_dict['__x_forwarded_for_ip'] - - # TODO Central sorting goes 
here - - if formats[0] is not info_dict: - # only set the 'formats' fields if the original info_dict list them - # otherwise we end up with a circular reference, the first (and unique) - # element in the 'formats' field in info_dict is info_dict itself, - # which can't be exported to json - info_dict['formats'] = formats - if self.params.get('listformats'): - self.list_formats(info_dict) - return - - req_format = self.params.get('format') - if req_format is None: - req_format = self._default_format_spec(info_dict, download=download) - if self.params.get('verbose'): - self._write_string('[debug] Default format spec: %s\n' % req_format) - - format_selector = self.build_format_selector(req_format) - - # While in format selection we may need to have an access to the original - # format set in order to calculate some metrics or do some processing. - # For now we need to be able to guess whether original formats provided - # by extractor are incomplete or not (i.e. whether extractor provides only - # video-only or audio-only formats) for proper formats selection for - # extractors with such incomplete formats (see - # https://github.com/ytdl-org/youtube-dl/pull/5556). - # Since formats may be filtered during format selection and may not match - # the original formats the results may be incorrect. Thus original formats - # or pre-calculated metrics should be passed to format selection routines - # as well. - # We will pass a context object containing all necessary additional data - # instead of just formats. - # This fixes incorrect format selection issue (see - # https://github.com/ytdl-org/youtube-dl/issues/10083). - incomplete_formats = ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) - - ctx = { - 'formats': formats, - 'incomplete_formats': incomplete_formats, - } - - formats_to_download = list(format_selector(ctx)) - if not formats_to_download: - raise ExtractorError('requested format not available', - expected=True) - - if download: - if len(formats_to_download) > 1: - self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download))) - for format in formats_to_download: - new_info = dict(info_dict) - new_info.update(format) - self.process_info(new_info) - # We update the info dict with the best quality format (backwards compatibility) - info_dict.update(formats_to_download[-1]) - return info_dict - - def process_subtitles(self, video_id, normal_subtitles, automatic_captions): - """Select the requested subtitles and their format""" - available_subs = {} - if normal_subtitles and self.params.get('writesubtitles'): - available_subs.update(normal_subtitles) - if automatic_captions and self.params.get('writeautomaticsub'): - for lang, cap_info in automatic_captions.items(): - if lang not in available_subs: - available_subs[lang] = cap_info - - if (not self.params.get('writesubtitles') and not - self.params.get('writeautomaticsub') or not - available_subs): - return None - - if self.params.get('allsubtitles', False): - requested_langs = available_subs.keys() - else: - if self.params.get('subtitleslangs', False): - requested_langs = self.params.get('subtitleslangs') - elif 'en' in available_subs: - requested_langs = ['en'] - else: - requested_langs = [list(available_subs.keys())[0]] - - formats_query = self.params.get('subtitlesformat', 'best') - formats_preference = 
formats_query.split('/') if formats_query else [] - subs = {} - for lang in requested_langs: - formats = available_subs.get(lang) - if formats is None: - self.report_warning('%s subtitles not available for %s' % (lang, video_id)) - continue - for ext in formats_preference: - if ext == 'best': - f = formats[-1] - break - matches = list(filter(lambda f: f['ext'] == ext, formats)) - if matches: - f = matches[-1] - break - else: - f = formats[-1] - self.report_warning( - 'No subtitle format found matching "%s" for language %s, ' - 'using %s' % (formats_query, lang, f['ext'])) - subs[lang] = f - return subs - - def __forced_printings(self, info_dict, filename, incomplete): - def print_mandatory(field): - if (self.params.get('force%s' % field, False) - and (not incomplete or info_dict.get(field) is not None)): - self.to_stdout(info_dict[field]) - - def print_optional(field): - if (self.params.get('force%s' % field, False) - and info_dict.get(field) is not None): - self.to_stdout(info_dict[field]) - - print_mandatory('title') - print_mandatory('id') - if self.params.get('forceurl', False) and not incomplete: - if info_dict.get('requested_formats') is not None: - for f in info_dict['requested_formats']: - self.to_stdout(f['url'] + f.get('play_path', '')) - else: - # For RTMP URLs, also include the playpath - self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) - print_optional('thumbnail') - print_optional('description') - if self.params.get('forcefilename', False) and filename is not None: - self.to_stdout(filename) - if self.params.get('forceduration', False) and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - print_mandatory('format') - if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) - - def process_info(self, info_dict): - """Process a single resolved IE result.""" - - assert info_dict.get('_type', 'video') == 'video' - - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() - - # TODO: backward compatibility, to be removed - info_dict['fulltitle'] = info_dict['title'] - - if 'format' not in info_dict: - info_dict['format'] = info_dict['ext'] - - reason = self._match_entry(info_dict, incomplete=False) - if reason is not None: - self.to_screen('[download] ' + reason) - return - - self._num_downloads += 1 - - info_dict['_filename'] = filename = self.prepare_filename(info_dict) - - # Forced printings - self.__forced_printings(info_dict, filename, incomplete=False) - - # Do nothing else if in simulate mode - if self.params.get('simulate', False): - return - - if filename is None: - return - - def ensure_dir_exists(path): - try: - dn = os.path.dirname(path) - if dn and not os.path.exists(dn): - os.makedirs(dn) - return True - except (OSError, IOError) as err: - if isinstance(err, OSError) and err.errno == errno.EEXIST: - return True - self.report_error('unable to create directory ' + error_to_compat_str(err)) - return False - - if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): - return - - if self.params.get('writedescription', False): - descfn = replace_extension(filename, 'description', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): - self.to_screen('[info] Video description is already present') - elif info_dict.get('description') is None: - self.report_warning('There\'s no description to write.') - else: - try: - 
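-                    # An explicit utf-8 encoding below keeps the description
-                    # intact on platforms whose default file encoding cannot
-                    # represent it.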
self.to_screen('[info] Writing video description to: ' + descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: - descfile.write(info_dict['description']) - except (OSError, IOError): - self.report_error('Cannot write description file ' + descfn) - return - - if self.params.get('writeannotations', False): - annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): - self.to_screen('[info] Video annotations are already present') - elif not info_dict.get('annotations'): - self.report_warning('There are no annotations to write.') - else: - try: - self.to_screen('[info] Writing video annotations to: ' + annofn) - with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: - annofile.write(info_dict['annotations']) - except (KeyError, TypeError): - self.report_warning('There are no annotations to write.') - except (OSError, IOError): - self.report_error('Cannot write annotations file: ' + annofn) - return - - subtitles_are_requested = any([self.params.get('writesubtitles', False), - self.params.get('writeautomaticsub')]) - - if subtitles_are_requested and info_dict.get('requested_subtitles'): - # subtitles download errors are already managed as troubles in relevant IE - # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) - for sub_lang, sub_info in subtitles.items(): - sub_format = sub_info['ext'] - sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) - else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - if sub_info.get('data') is not None: - try: - # Use newline='' to prevent conversion of newline characters - # See https://github.com/ytdl-org/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_info['data']) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return - else: - try: - sub_data = ie._request_webpage( - sub_info['url'], info_dict['id'], note=False).read() - with io.open(encodeFilename(sub_filename), 'wb') as subfile: - subfile.write(sub_data) - except (ExtractorError, IOError, OSError, ValueError) as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err))) - continue - - if self.params.get('writeinfojson', False): - infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): - self.to_screen('[info] Video description metadata is already present') - else: - self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn) - try: - write_json_file(self.filter_requested_info(info_dict), infofn) - except (OSError, IOError): - self.report_error('Cannot write metadata to JSON file ' + infofn) - return - - self._write_thumbnails(info_dict, filename) - - if not self.params.get('skip_download', False): - try: - def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - if 
self.params.get('verbose'): - self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) - - if info_dict.get('requested_formats') is not None: - downloaded = [] - success = True - merger = FFmpegMergerPP(self) - if not merger.available: - postprocessors = [] - self.report_warning('You have requested multiple ' - 'formats but ffmpeg or avconv are not installed.' - ' The formats won\'t be merged.') - else: - postprocessors = [merger] - - def compatible_formats(formats): - video, audio = formats - # Check extension - video_ext, audio_ext = video.get('ext'), audio.get('ext') - if video_ext and audio_ext: - COMPATIBLE_EXTS = ( - ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), - ('webm') - ) - for exts in COMPATIBLE_EXTS: - if video_ext in exts and audio_ext in exts: - return True - # TODO: Check acodec/vcodec - return False - - filename_real_ext = os.path.splitext(filename)[1][1:] - filename_wo_ext = ( - os.path.splitext(filename)[0] - if filename_real_ext == info_dict['ext'] - else filename) - requested_formats = info_dict['requested_formats'] - if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): - info_dict['ext'] = 'mkv' - self.report_warning( - 'Requested formats are incompatible for merge and will be merged into mkv.') - # Ensure filename always has a correct extension for successful merge - filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) - if os.path.exists(encodeFilename(filename)): - self.to_screen( - '[download] %s has already been downloaded and ' - 'merged' % filename) - else: - for f in requested_formats: - new_info = dict(info_dict) - new_info.update(f) - fname = prepend_extension( - self.prepare_filename(new_info), - 'f%s' % f['format_id'], new_info['ext']) - if not ensure_dir_exists(fname): - return - downloaded.append(fname) - partial_success = dl(fname, new_info) - success = success and partial_success - info_dict['__postprocessors'] = postprocessors - info_dict['__files_to_merge'] = downloaded - else: - # Just a single file - success = dl(filename, info_dict) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_error('unable to download video data: %s' % error_to_compat_str(err)) - return - except (OSError, IOError) as err: - raise UnavailableVideoError(err) - except (ContentTooShortError, ) as err: - self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) - return - - if success and filename != '-': - # Fixup content - fixup_policy = self.params.get('fixup') - if fixup_policy is None: - fixup_policy = 'detect_or_warn' - - INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' - - stretched_ratio = info_dict.get('stretched_ratio') - if stretched_ratio is not None and stretched_ratio != 1: - if fixup_policy == 'warn': - self.report_warning('%s: Non-uniform pixel ratio (%s)' % ( - info_dict['id'], stretched_ratio)) - elif fixup_policy == 'detect_or_warn': - stretched_pp = FFmpegFixupStretchedPP(self) - if stretched_pp.available: - info_dict.setdefault('__postprocessors', []) - info_dict['__postprocessors'].append(stretched_pp) - else: - self.report_warning( - '%s: Non-uniform pixel ratio (%s). 
%s' - % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - if (info_dict.get('requested_formats') is None - and info_dict.get('container') == 'm4a_dash'): - if fixup_policy == 'warn': - self.report_warning( - '%s: writing DASH m4a. ' - 'Only some players support this container.' - % info_dict['id']) - elif fixup_policy == 'detect_or_warn': - fixup_pp = FFmpegFixupM4aPP(self) - if fixup_pp.available: - info_dict.setdefault('__postprocessors', []) - info_dict['__postprocessors'].append(fixup_pp) - else: - self.report_warning( - '%s: writing DASH m4a. ' - 'Only some players support this container. %s' - % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - if (info_dict.get('protocol') == 'm3u8_native' - or info_dict.get('protocol') == 'm3u8' - and self.params.get('hls_prefer_native')): - if fixup_policy == 'warn': - self.report_warning('%s: malformed AAC bitstream detected.' % ( - info_dict['id'])) - elif fixup_policy == 'detect_or_warn': - fixup_pp = FFmpegFixupM3u8PP(self) - if fixup_pp.available: - info_dict.setdefault('__postprocessors', []) - info_dict['__postprocessors'].append(fixup_pp) - else: - self.report_warning( - '%s: malformed AAC bitstream detected. %s' - % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - try: - self.post_process(filename, info_dict) - except (PostProcessingError) as err: - self.report_error('postprocessing: %s' % str(err)) - return - self.record_download_archive(info_dict) - - def download(self, url_list): - """Download a given list of URLs.""" - outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - if (len(url_list) > 1 - and outtmpl != '-' - and '%' not in outtmpl - and self.params.get('max_downloads') != 1): - raise SameFileError(outtmpl) - - for url in url_list: - try: - # It also downloads the videos - res = self.extract_info( - url, force_generic_extractor=self.params.get('force_generic_extractor', False)) - except UnavailableVideoError: - self.report_error('unable to download video') - except MaxDownloadsReached: - self.to_screen('[info] Maximum number of downloaded files reached.') - raise - else: - if self.params.get('dump_single_json', False): - self.to_stdout(json.dumps(res)) - - return self._download_retcode - - def download_with_info_file(self, info_filename): - with contextlib.closing(fileinput.FileInput( - [info_filename], mode='r', - openhook=fileinput.hook_encoded('utf-8'))) as f: - # FileInput doesn't have a read method, we can't call json.load - info = self.filter_requested_info(json.loads('\n'.join(f))) - try: - self.process_ie_result(info, download=True) - except DownloadError: - webpage_url = info.get('webpage_url') - if webpage_url is not None: - self.report_warning('The info failed to download, trying with "%s"' % webpage_url) - return self.download([webpage_url]) - else: - raise - return self._download_retcode - - @staticmethod - def filter_requested_info(info_dict): - return dict( - (k, v) for k, v in info_dict.items() - if k not in ['requested_formats', 'requested_subtitles']) - - def post_process(self, filename, ie_info): - """Run all the postprocessors on the given file.""" - info = dict(ie_info) - info['filepath'] = filename - pps_chain = [] - if ie_info.get('__postprocessors') is not None: - pps_chain.extend(ie_info['__postprocessors']) - pps_chain.extend(self._pps) - for pp in pps_chain: - files_to_delete = [] - try: - files_to_delete, info = pp.run(info) - except 
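
A Python subtlety in the compatible_formats helper a little further up: ('webm') is not a one-element tuple but the plain string 'webm', so for that entry "video_ext in exts" degrades to a substring test (a proper one-tuple is written ('webm',), which is how later code bases spell it). The check still behaves for exact webm/webm pairs, as a quick demonstration shows:

# ('webm') is just a parenthesised string; a one-element tuple needs a comma.
exts = ('webm')
assert exts == 'webm'
assert 'webm' in exts         # exact match still works...
assert 'web' in exts          # ...but so does any substring
assert 'webm' in ('webm',)    # the intended one-tuple form
assert 'web' not in ('webm',)
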
PostProcessingError as e: - self.report_error(e.msg) - if files_to_delete and not self.params.get('keepvideo', False): - for old_filename in files_to_delete: - self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - try: - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded original file') - - def _make_archive_id(self, info_dict): - video_id = info_dict.get('id') - if not video_id: - return - # Future-proof against any change in case - # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist - if extractor is None: - url = str_or_none(info_dict.get('url')) - if not url: - return - # Try to find matching extractor for the URL and take its ie_key - for ie in self._ies: - if ie.suitable(url): - extractor = ie.ie_key() - break - else: - return - return extractor.lower() + ' ' + video_id - - def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return False - - vid_id = self._make_archive_id(info_dict) - if not vid_id: - return False # Incomplete video information - - try: - with locked_file(fn, 'r', encoding='utf-8') as archive_file: - for line in archive_file: - if line.strip() == vid_id: - return True - except IOError as ioe: - if ioe.errno != errno.ENOENT: - raise - return False - - def record_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return - vid_id = self._make_archive_id(info_dict) - assert vid_id - with locked_file(fn, 'a', encoding='utf-8') as archive_file: - archive_file.write(vid_id + '\n') - - @staticmethod - def format_resolution(format, default='unknown'): - if format.get('vcodec') == 'none': - return 'audio only' - if format.get('resolution') is not None: - return format['resolution'] - if format.get('height') is not None: - if format.get('width') is not None: - res = '%sx%s' % (format['width'], format['height']) - else: - res = '%sp' % format['height'] - elif format.get('width') is not None: - res = '%dx?' 
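
The resolution column printed for each format comes from format_resolution, defined just above and finishing below, which prefers an explicit 'resolution' string and otherwise synthesizes one from width/height. A compact restatement of the branch order, with hypothetical format dicts showing each output shape:

def format_resolution(f, default='unknown'):
    # Same branch order as the static method above; sample dicts are made up.
    if f.get('vcodec') == 'none':
        return 'audio only'
    if f.get('resolution') is not None:
        return f['resolution']
    if f.get('height') is not None:
        if f.get('width') is not None:
            return '%sx%s' % (f['width'], f['height'])
        return '%sp' % f['height']
    if f.get('width') is not None:
        return '%dx?' % f['width']
    return default

assert format_resolution({'vcodec': 'none'}) == 'audio only'
assert format_resolution({'width': 1920, 'height': 1080}) == '1920x1080'
assert format_resolution({'height': 720}) == '720p'
assert format_resolution({'width': 1280}) == '1280x?'
assert format_resolution({}) == 'unknown'
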
% format['width'] - else: - res = default - return res - - def _format_note(self, fdict): - res = '' - if fdict.get('ext') in ['f4f', 'f4m']: - res += '(unsupported) ' - if fdict.get('language'): - if res: - res += ' ' - res += '[%s] ' % fdict['language'] - if fdict.get('format_note') is not None: - res += fdict['format_note'] + ' ' - if fdict.get('tbr') is not None: - res += '%4dk ' % fdict['tbr'] - if fdict.get('container') is not None: - if res: - res += ', ' - res += '%s container' % fdict['container'] - if (fdict.get('vcodec') is not None - and fdict.get('vcodec') != 'none'): - if res: - res += ', ' - res += fdict['vcodec'] - if fdict.get('vbr') is not None: - res += '@' - elif fdict.get('vbr') is not None and fdict.get('abr') is not None: - res += 'video@' - if fdict.get('vbr') is not None: - res += '%4dk' % fdict['vbr'] - if fdict.get('fps') is not None: - if res: - res += ', ' - res += '%sfps' % fdict['fps'] - if fdict.get('acodec') is not None: - if res: - res += ', ' - if fdict['acodec'] == 'none': - res += 'video only' - else: - res += '%-5s' % fdict['acodec'] - elif fdict.get('abr') is not None: - if res: - res += ', ' - res += 'audio' - if fdict.get('abr') is not None: - res += '@%3dk' % fdict['abr'] - if fdict.get('asr') is not None: - res += ' (%5dHz)' % fdict['asr'] - if fdict.get('filesize') is not None: - if res: - res += ', ' - res += format_bytes(fdict['filesize']) - elif fdict.get('filesize_approx') is not None: - if res: - res += ', ' - res += '~' + format_bytes(fdict['filesize_approx']) - return res - - def list_formats(self, info_dict): - formats = info_dict.get('formats', [info_dict]) - table = [ - [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)] - for f in formats - if f.get('preference') is None or f['preference'] >= -1000] - if len(formats) > 1: - table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)' - - header_line = ['format code', 'extension', 'resolution', 'note'] - self.to_screen( - '[info] Available formats for %s:\n%s' % - (info_dict['id'], render_table(header_line, table))) - - def list_thumbnails(self, info_dict): - thumbnails = info_dict.get('thumbnails') - if not thumbnails: - self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) - return - - self.to_screen( - '[info] Thumbnails for %s:' % info_dict['id']) - self.to_screen(render_table( - ['ID', 'width', 'height', 'URL'], - [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) - - def list_subtitles(self, video_id, subtitles, name='subtitles'): - if not subtitles: - self.to_screen('%s has no %s' % (video_id, name)) - return - self.to_screen( - 'Available %s for %s:' % (name, video_id)) - self.to_screen(render_table( - ['Language', 'formats'], - [[lang, ', '.join(f['ext'] for f in reversed(formats))] - for lang, formats in subtitles.items()])) - - def urlopen(self, req): - """ Start an HTTP download """ - if isinstance(req, compat_basestring): - req = sanitized_Request(req) - return self._opener.open(req, timeout=self._socket_timeout) - - def print_debug_header(self): - if not self.params.get('verbose'): - return - - if type('') is not compat_str: - # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326) - self.report_warning( - 'Your Python is broken! 
Update to a newer and supported version') - - stdout_encoding = getattr( - sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__) - encoding_str = ( - '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( - locale.getpreferredencoding(), - sys.getfilesystemencoding(), - stdout_encoding, - self.get_encoding())) - write_string(encoding_str, encoding=None) - - self._write_string('[debug] youtube-dl version ' + __version__ + '\n') - if _LAZY_LOADER: - self._write_string('[debug] Lazy loading extractors enabled' + '\n') - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - self._write_string('[debug] Git HEAD: ' + out + '\n') - except Exception: - try: - sys.exc_clear() - except Exception: - pass - - def python_implementation(): - impl_name = platform.python_implementation() - if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): - return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] - return impl_name - - self._write_string('[debug] Python version %s (%s) - %s\n' % ( - platform.python_version(), python_implementation(), - platform_name())) - - exe_versions = FFmpegPostProcessor.get_versions(self) - exe_versions['rtmpdump'] = rtmpdump_version() - exe_versions['phantomjs'] = PhantomJSwrapper._version() - exe_str = ', '.join( - '%s %s' % (exe, v) - for exe, v in sorted(exe_versions.items()) - if v - ) - if not exe_str: - exe_str = 'none' - self._write_string('[debug] exe versions: %s\n' % exe_str) - - proxy_map = {} - for handler in self._opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') - - if self.params.get('call_home', False): - ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - self._write_string('[debug] Public IP address: %s\n' % ipaddr) - latest_version = self.urlopen( - 'https://yt-dl.org/latest/version').read().decode('utf-8') - if version_tuple(latest_version) > version_tuple(__version__): - self.report_warning( - 'You are using an outdated version (newest version: %s)! ' - 'See https://yt-dl.org/update if you need help updating.' 
% - latest_version) - - def _setup_opener(self): - timeout_val = self.params.get('socket_timeout') - self._socket_timeout = 600 if timeout_val is None else float(timeout_val) - - opts_cookiefile = self.params.get('cookiefile') - opts_proxy = self.params.get('proxy') - - if opts_cookiefile is None: - self.cookiejar = compat_cookiejar.CookieJar() - else: - opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) - if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load(ignore_discard=True, ignore_expires=True) - - cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) - if opts_proxy is not None: - if opts_proxy == '': - proxies = {} - else: - proxies = {'http': opts_proxy, 'https': opts_proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = PerRequestProxyHandler(proxies) - - debuglevel = 1 if self.params.get('debug_printtraffic') else 0 - https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) - ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) - redirect_handler = YoutubeDLRedirectHandler() - data_handler = compat_urllib_request_DataHandler() - - # When passing our own FileHandler instance, build_opener won't add the - # default FileHandler and allows us to disable the file protocol, which - # can be used for malicious purposes (see - # https://github.com/ytdl-org/youtube-dl/issues/8227) - file_handler = compat_urllib_request.FileHandler() - - def file_open(*args, **kwargs): - raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons') - file_handler.file_open = file_open - - opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) - - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) - opener.addheaders = [] - self._opener = opener - - def encode(self, s): - if isinstance(s, bytes): - return s # Already encoded - - try: - return s.encode(self.get_encoding()) - except UnicodeEncodeError as err: - err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' 
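
The opener assembly above blocks the file:// scheme by passing build_opener a FileHandler whose file_open always raises; when a FileHandler instance is supplied, build_opener does not add the default one, so a malicious redirect to file:///etc/passwd cannot read local files (the issue 8227 fix). The same trick in plain Python 3 urllib, without the compat aliases:

import urllib.error
import urllib.request

file_handler = urllib.request.FileHandler()

def file_open(*args, **kwargs):
    raise urllib.error.URLError('file:// scheme is disabled')
# Shadow the method on this instance; build_opener keeps our handler.
file_handler.file_open = file_open

opener = urllib.request.build_opener(file_handler)
try:
    opener.open('file:///etc/passwd')
except urllib.error.URLError as e:
    print('blocked:', e.reason)
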
- raise - - def get_encoding(self): - encoding = self.params.get('encoding') - if encoding is None: - encoding = preferredencoding() - return encoding - - def _write_thumbnails(self, info_dict, filename): - if self.params.get('writethumbnail', False): - thumbnails = info_dict.get('thumbnails') - if thumbnails: - thumbnails = [thumbnails[-1]] - elif self.params.get('write_all_thumbnails', False): - thumbnails = info_dict.get('thumbnails') - else: - return - - if not thumbnails: - # No thumbnails present, so return immediately - return - - for t in thumbnails: - thumb_ext = determine_ext(t['url'], 'jpg') - suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' - thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext')) - - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): - self.to_screen('[%s] %s: Thumbnail %sis already present' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) - else: - self.to_screen('[%s] %s: Downloading thumbnail %s...' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) - try: - uf = self.urlopen(t['url']) - with open(encodeFilename(thumb_filename), 'wb') as thumbf: - shutil.copyfileobj(uf, thumbf) - self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % - (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], error_to_compat_str(err))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py deleted file mode 100644 index a30526589..000000000 --- a/youtube_dl/__init__.py +++ /dev/null @@ -1,478 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from __future__ import unicode_literals - -__license__ = 'CC0-1.0' - -import codecs -import io -import os -import random -import sys - - -from .options import ( - parseOpts, -) -from .compat import ( - compat_getpass, - compat_shlex_split, - workaround_optparse_bug9161, -) -from .utils import ( - DateRange, - decodeOption, - DEFAULT_OUTTMPL, - DownloadError, - expand_path, - match_filter_func, - MaxDownloadsReached, - preferredencoding, - read_batch_urls, - SameFileError, - setproctitle, - std_headers, - write_string, - render_table, -) -from .downloader import ( - FileDownloader, -) -from .extractor import gen_extractors, list_extractors -from .extractor.adobepass import MSO_INFO -from .YoutubeDL import YoutubeDL - - -def _real_main(argv=None): - # Compatibility fixes for Windows - if sys.platform == 'win32': - # https://github.com/ytdl-org/youtube-dl/issues/820 - codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) - - workaround_optparse_bug9161() - - setproctitle('youtube-dl') - - parser, opts, args = parseOpts(argv) - - # Set user agent - if opts.user_agent is not None: - std_headers['User-Agent'] = opts.user_agent - - # Set referer - if opts.referer is not None: - std_headers['Referer'] = opts.referer - - # Custom HTTP headers - if opts.headers is not None: - for h in opts.headers: - if ':' not in h: - parser.error('wrong header formatting, it should be key:value, not "%s"' % h) - key, value = h.split(':', 1) - if opts.verbose: - write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) - std_headers[key] = value - - # Dump user agent - if opts.dump_user_agent: - 
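
The --add-header handling above splits each header on the first ':' only, so values that themselves contain colons (URLs, host:port pairs) survive intact. A sketch of that parsing rule:

def parse_header(h):
    # Mirrors the option check above: exactly one split, on the first ':'.
    if ':' not in h:
        raise ValueError('wrong header formatting, it should be key:value, not %r' % h)
    key, value = h.split(':', 1)
    return key, value

assert parse_header('Referer:https://example.com/a') == ('Referer', 'https://example.com/a')
# A bare split(':') would have cut the URL into three pieces.
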
write_string(std_headers['User-Agent'] + '\n', out=sys.stdout) - sys.exit(0) - - # Batch file verification - batch_urls = [] - if opts.batchfile is not None: - try: - if opts.batchfile == '-': - batchfd = sys.stdin - else: - batchfd = io.open( - expand_path(opts.batchfile), - 'r', encoding='utf-8', errors='ignore') - batch_urls = read_batch_urls(batchfd) - if opts.verbose: - write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') - except IOError: - sys.exit('ERROR: batch file %s could not be read' % opts.batchfile) - all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls - _enc = preferredencoding() - all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] - - if opts.list_extractors: - for ie in list_extractors(opts.age_limit): - write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout) - matchedUrls = [url for url in all_urls if ie.suitable(url)] - for mu in matchedUrls: - write_string(' ' + mu + '\n', out=sys.stdout) - sys.exit(0) - if opts.list_extractor_descriptions: - for ie in list_extractors(opts.age_limit): - if not ie._WORKING: - continue - desc = getattr(ie, 'IE_DESC', ie.IE_NAME) - if desc is False: - continue - if hasattr(ie, 'SEARCH_KEY'): - _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') - _COUNTS = ('', '5', '10', 'all') - desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) - write_string(desc + '\n', out=sys.stdout) - sys.exit(0) - if opts.ap_list_mso: - table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] - write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) - sys.exit(0) - - # Conflicting, missing and erroneous options - if opts.usenetrc and (opts.username is not None or opts.password is not None): - parser.error('using .netrc conflicts with giving username/password') - if opts.password is not None and opts.username is None: - parser.error('account username missing\n') - if opts.ap_password is not None and opts.ap_username is None: - parser.error('TV Provider account username missing\n') - if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): - parser.error('using output template conflicts with using title, video ID or auto number') - if opts.autonumber_size is not None: - if opts.autonumber_size <= 0: - parser.error('auto number size must be positive') - if opts.autonumber_start is not None: - if opts.autonumber_start < 0: - parser.error('auto number start must be positive or 0') - if opts.usetitle and opts.useid: - parser.error('using title conflicts with using video ID') - if opts.username is not None and opts.password is None: - opts.password = compat_getpass('Type account password and press [Return]: ') - if opts.ap_username is not None and opts.ap_password is None: - opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') - if opts.ratelimit is not None: - numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) - if numeric_limit is None: - parser.error('invalid rate limit specified') - opts.ratelimit = numeric_limit - if opts.min_filesize is not None: - numeric_limit = FileDownloader.parse_bytes(opts.min_filesize) - if numeric_limit is None: - parser.error('invalid min_filesize specified') - opts.min_filesize = numeric_limit - 
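
Several of the option checks in this block funnel through FileDownloader.parse_bytes, which turns human-readable sizes such as '50K' or '4.2M' into byte counts and returns None for unparsable input. Roughly, as an approximation of its behaviour rather than the exact implementation:

import re

def parse_bytes(s):
    # Approximate sketch: a number plus an optional binary suffix, 1K = 1024.
    m = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', s)
    if m is None:
        return None
    number = float(m.group(1))
    multiplier = 1024 ** 'bkmgtpezy'.index(m.group(2).lower() or 'b')
    return int(round(number * multiplier))

assert parse_bytes('50K') == 50 * 1024
assert parse_bytes('4.2M') == int(round(4.2 * 1024 * 1024))
assert parse_bytes('nonsense') is None
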
if opts.max_filesize is not None: - numeric_limit = FileDownloader.parse_bytes(opts.max_filesize) - if numeric_limit is None: - parser.error('invalid max_filesize specified') - opts.max_filesize = numeric_limit - if opts.sleep_interval is not None: - if opts.sleep_interval < 0: - parser.error('sleep interval must be positive or 0') - if opts.max_sleep_interval is not None: - if opts.max_sleep_interval < 0: - parser.error('max sleep interval must be positive or 0') - if opts.sleep_interval is None: - parser.error('min sleep interval must be specified, use --min-sleep-interval') - if opts.max_sleep_interval < opts.sleep_interval: - parser.error('max sleep interval must be greater than or equal to min sleep interval') - else: - opts.max_sleep_interval = opts.sleep_interval - if opts.ap_mso and opts.ap_mso not in MSO_INFO: - parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') - - def parse_retries(retries): - if retries in ('inf', 'infinite'): - parsed_retries = float('inf') - else: - try: - parsed_retries = int(retries) - except (TypeError, ValueError): - parser.error('invalid retry count specified') - return parsed_retries - if opts.retries is not None: - opts.retries = parse_retries(opts.retries) - if opts.fragment_retries is not None: - opts.fragment_retries = parse_retries(opts.fragment_retries) - if opts.buffersize is not None: - numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) - if numeric_buffersize is None: - parser.error('invalid buffer size specified') - opts.buffersize = numeric_buffersize - if opts.http_chunk_size is not None: - numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) - if not numeric_chunksize: - parser.error('invalid http chunk size specified') - opts.http_chunk_size = numeric_chunksize - if opts.playliststart <= 0: - raise ValueError('Playlist start must be positive') - if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: - raise ValueError('Playlist end must be greater than playlist start') - if opts.extractaudio: - if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: - parser.error('invalid audio format specified') - if opts.audioquality: - opts.audioquality = opts.audioquality.strip('k').strip('K') - if not opts.audioquality.isdigit(): - parser.error('invalid audio quality specified') - if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: - parser.error('invalid video recode format specified') - if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: - parser.error('invalid subtitle format specified') - - if opts.date is not None: - date = DateRange.day(opts.date) - else: - date = DateRange(opts.dateafter, opts.datebefore) - - # Do not download videos when there are audio-only formats - if opts.extractaudio and not opts.keepvideo and opts.format is None: - opts.format = 'bestaudio/best' - - # --all-sub automatically sets --write-sub if --write-auto-sub is not given - # this was the old behaviour if only --all-sub was given. 
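
parse_retries below is why --retries accepts the literal words 'inf' and 'infinite': they map to float('inf'), which the retry loops can compare against like any other number. For instance:

def parse_retries(retries):
    if retries in ('inf', 'infinite'):
        return float('inf')
    return int(retries)  # the real code turns ValueError into a parser.error

assert parse_retries('10') == 10
assert parse_retries('inf') == float('inf')
assert 3 < parse_retries('infinite')  # any finite attempt count is below it
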
- if opts.allsubtitles and not opts.writeautomaticsub: - opts.writesubtitles = True - - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) - or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') - or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') - or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') - or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') - or (opts.useid and '%(id)s.%(ext)s') - or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') - or DEFAULT_OUTTMPL) - if not os.path.splitext(outtmpl)[1] and opts.extractaudio: - parser.error('Cannot download a video and extract audio into the same' - ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' - ' template'.format(outtmpl)) - - any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json - any_printing = opts.print_json - download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive - - # PostProcessors - postprocessors = [] - if opts.metafromtitle: - postprocessors.append({ - 'key': 'MetadataFromTitle', - 'titleformat': opts.metafromtitle - }) - if opts.extractaudio: - postprocessors.append({ - 'key': 'FFmpegExtractAudio', - 'preferredcodec': opts.audioformat, - 'preferredquality': opts.audioquality, - 'nopostoverwrites': opts.nopostoverwrites, - }) - if opts.recodevideo: - postprocessors.append({ - 'key': 'FFmpegVideoConvertor', - 'preferedformat': opts.recodevideo, - }) - # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and - # FFmpegExtractAudioPP as containers before conversion may not support - # metadata (3gp, webm, etc.) - # And this post-processor should be placed before other metadata - # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of - # extra metadata. By default ffmpeg preserves metadata applicable for both - # source and target containers. From this point the container won't change, - # so metadata can be added here. - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) - if opts.convertsubtitles: - postprocessors.append({ - 'key': 'FFmpegSubtitlesConvertor', - 'format': opts.convertsubtitles, - }) - if opts.embedsubtitles: - postprocessors.append({ - 'key': 'FFmpegEmbedSubtitle', - }) - if opts.embedthumbnail: - already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails - postprocessors.append({ - 'key': 'EmbedThumbnail', - 'already_have_thumbnail': already_have_thumbnail - }) - if not already_have_thumbnail: - opts.writethumbnail = True - # XAttrMetadataPP should be run after post-processors that may change file - # contents - if opts.xattrs: - postprocessors.append({'key': 'XAttrMetadata'}) - # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. - # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. 
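
The outtmpl expression above chains (condition and template) pairs with or: Python's 'and' yields its right operand when the left side is truthy, and 'or' returns the first truthy result, so the first matching option wins and DEFAULT_OUTTMPL is the final fallback. The same idiom in miniature:

# The first truthy (flag and template) pair wins; the literal is the fallback.
usetitle, useid = False, True
outtmpl = ((usetitle and '%(title)s-%(id)s.%(ext)s')
           or (useid and '%(id)s.%(ext)s')
           or '%(title)s-%(id)s.%(ext)s')
assert outtmpl == '%(id)s.%(ext)s'
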
- if opts.exec_cmd: - postprocessors.append({ - 'key': 'ExecAfterDownload', - 'exec_cmd': opts.exec_cmd, - }) - external_downloader_args = None - if opts.external_downloader_args: - external_downloader_args = compat_shlex_split(opts.external_downloader_args) - postprocessor_args = None - if opts.postprocessor_args: - postprocessor_args = compat_shlex_split(opts.postprocessor_args) - match_filter = ( - None if opts.match_filter is None - else match_filter_func(opts.match_filter)) - - ydl_opts = { - 'usenetrc': opts.usenetrc, - 'username': opts.username, - 'password': opts.password, - 'twofactor': opts.twofactor, - 'videopassword': opts.videopassword, - 'ap_mso': opts.ap_mso, - 'ap_username': opts.ap_username, - 'ap_password': opts.ap_password, - 'quiet': (opts.quiet or any_getting or any_printing), - 'no_warnings': opts.no_warnings, - 'forceurl': opts.geturl, - 'forcetitle': opts.gettitle, - 'forceid': opts.getid, - 'forcethumbnail': opts.getthumbnail, - 'forcedescription': opts.getdescription, - 'forceduration': opts.getduration, - 'forcefilename': opts.getfilename, - 'forceformat': opts.getformat, - 'forcejson': opts.dumpjson or opts.print_json, - 'dump_single_json': opts.dump_single_json, - 'simulate': opts.simulate or any_getting, - 'skip_download': opts.skip_download, - 'format': opts.format, - 'listformats': opts.listformats, - 'outtmpl': outtmpl, - 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, - 'autonumber_size': opts.autonumber_size, - 'autonumber_start': opts.autonumber_start, - 'restrictfilenames': opts.restrictfilenames, - 'ignoreerrors': opts.ignoreerrors, - 'force_generic_extractor': opts.force_generic_extractor, - 'ratelimit': opts.ratelimit, - 'nooverwrites': opts.nooverwrites, - 'retries': opts.retries, - 'fragment_retries': opts.fragment_retries, - 'skip_unavailable_fragments': opts.skip_unavailable_fragments, - 'keep_fragments': opts.keep_fragments, - 'buffersize': opts.buffersize, - 'noresizebuffer': opts.noresizebuffer, - 'http_chunk_size': opts.http_chunk_size, - 'continuedl': opts.continue_dl, - 'noprogress': opts.noprogress, - 'progress_with_newline': opts.progress_with_newline, - 'playliststart': opts.playliststart, - 'playlistend': opts.playlistend, - 'playlistreverse': opts.playlist_reverse, - 'playlistrandom': opts.playlist_random, - 'noplaylist': opts.noplaylist, - 'logtostderr': opts.outtmpl == '-', - 'consoletitle': opts.consoletitle, - 'nopart': opts.nopart, - 'updatetime': opts.updatetime, - 'writedescription': opts.writedescription, - 'writeannotations': opts.writeannotations, - 'writeinfojson': opts.writeinfojson, - 'writethumbnail': opts.writethumbnail, - 'write_all_thumbnails': opts.write_all_thumbnails, - 'writesubtitles': opts.writesubtitles, - 'writeautomaticsub': opts.writeautomaticsub, - 'allsubtitles': opts.allsubtitles, - 'listsubtitles': opts.listsubtitles, - 'subtitlesformat': opts.subtitlesformat, - 'subtitleslangs': opts.subtitleslangs, - 'matchtitle': decodeOption(opts.matchtitle), - 'rejecttitle': decodeOption(opts.rejecttitle), - 'max_downloads': opts.max_downloads, - 'prefer_free_formats': opts.prefer_free_formats, - 'verbose': opts.verbose, - 'dump_intermediate_pages': opts.dump_intermediate_pages, - 'write_pages': opts.write_pages, - 'test': opts.test, - 'keepvideo': opts.keepvideo, - 'min_filesize': opts.min_filesize, - 'max_filesize': opts.max_filesize, - 'min_views': opts.min_views, - 'max_views': opts.max_views, - 'daterange': date, - 'cachedir': opts.cachedir, - 'youtube_print_sig_code': opts.youtube_print_sig_code, - 
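
The ydl_opts mapping assembled here is also the embedding API's configuration object: pass it to YoutubeDL and call download(), exactly as _real_main does further down. A minimal embedding sketch (the URL is a placeholder):

from youtube_dl import YoutubeDL

# Minimal embedding sketch; any key accepted in the ydl_opts dict here works.
opts = {
    'format': 'bestaudio/best',
    'outtmpl': '%(title)s-%(id)s.%(ext)s',
}
with YoutubeDL(opts) as ydl:
    ydl.download(['https://example.com/some-video'])  # placeholder URL
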
'age_limit': opts.age_limit, - 'download_archive': download_archive_fn, - 'cookiefile': opts.cookiefile, - 'nocheckcertificate': opts.no_check_certificate, - 'prefer_insecure': opts.prefer_insecure, - 'proxy': opts.proxy, - 'socket_timeout': opts.socket_timeout, - 'bidi_workaround': opts.bidi_workaround, - 'debug_printtraffic': opts.debug_printtraffic, - 'prefer_ffmpeg': opts.prefer_ffmpeg, - 'include_ads': opts.include_ads, - 'default_search': opts.default_search, - 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, - 'encoding': opts.encoding, - 'extract_flat': opts.extract_flat, - 'mark_watched': opts.mark_watched, - 'merge_output_format': opts.merge_output_format, - 'postprocessors': postprocessors, - 'fixup': opts.fixup, - 'source_address': opts.source_address, - 'call_home': opts.call_home, - 'sleep_interval': opts.sleep_interval, - 'max_sleep_interval': opts.max_sleep_interval, - 'external_downloader': opts.external_downloader, - 'list_thumbnails': opts.list_thumbnails, - 'playlist_items': opts.playlist_items, - 'xattr_set_filesize': opts.xattr_set_filesize, - 'match_filter': match_filter, - 'no_color': opts.no_color, - 'ffmpeg_location': opts.ffmpeg_location, - 'hls_prefer_native': opts.hls_prefer_native, - 'hls_use_mpegts': opts.hls_use_mpegts, - 'external_downloader_args': external_downloader_args, - 'postprocessor_args': postprocessor_args, - 'cn_verification_proxy': opts.cn_verification_proxy, - 'geo_verification_proxy': opts.geo_verification_proxy, - 'config_location': opts.config_location, - 'geo_bypass': opts.geo_bypass, - 'geo_bypass_country': opts.geo_bypass_country, - 'geo_bypass_ip_block': opts.geo_bypass_ip_block, - # just for deprecation check - 'autonumber': opts.autonumber if opts.autonumber is True else None, - 'usetitle': opts.usetitle if opts.usetitle is True else None, - } - - with YoutubeDL(ydl_opts) as ydl: - - # Remove cache dir - if opts.rm_cachedir: - ydl.cache.remove() - - # Maybe do nothing - if (len(all_urls) < 1) and (opts.load_info_filename is None): - - ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) - parser.error( - 'You must provide at least one URL.\n' - 'Type youtube-dl --help to see a list of all options.') - - try: - if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) - else: - retcode = ydl.download(all_urls) - except MaxDownloadsReached: - ydl.to_screen('--max-download limit reached, aborting.') - retcode = 101 - - sys.exit(retcode) - - -def main(argv=None): - try: - _real_main(argv) - except DownloadError: - sys.exit(1) - except SameFileError: - sys.exit('ERROR: fixed output name but more than one file to download') - except KeyboardInterrupt: - sys.exit('\nERROR: Interrupted by user') - - -__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py deleted file mode 100755 index 138f5fbec..000000000 --- a/youtube_dl/__main__.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -from __future__ import unicode_literals - -# Execute with -# $ python youtube_dl/__main__.py (2.6+) -# $ python -m youtube_dl (2.7+) - -import sys - -if __package__ is None and not hasattr(sys, 'frozen'): - # direct call of __main__.py - import os.path - path = os.path.realpath(os.path.abspath(__file__)) - sys.path.insert(0, os.path.dirname(os.path.dirname(path))) - -import youtube_dl - -if __name__ == '__main__': - youtube_dl.main() diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py deleted file 
mode 100644 index 461bb6d41..000000000 --- a/youtube_dl/aes.py +++ /dev/null @@ -1,361 +0,0 @@ -from __future__ import unicode_literals - -from math import ceil - -from .compat import compat_b64decode -from .utils import bytes_to_intlist, intlist_to_bytes - -BLOCK_SIZE_BYTES = 16 - - -def aes_ctr_decrypt(data, key, counter): - """ - Decrypt with aes in counter mode - - @param {int[]} data cipher - @param {int[]} key 16/24/32-Byte cipher key - @param {instance} counter Instance whose next_value function (@returns {int[]} 16-Byte block) - returns the next counter block - @returns {int[]} decrypted data - """ - expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - - decrypted_data = [] - for i in range(block_count): - counter_block = counter.next_value() - block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - block += [0] * (BLOCK_SIZE_BYTES - len(block)) - - cipher_counter_block = aes_encrypt(counter_block, expanded_key) - decrypted_data += xor(block, cipher_counter_block) - decrypted_data = decrypted_data[:len(data)] - - return decrypted_data - - -def aes_cbc_decrypt(data, key, iv): - """ - Decrypt with aes in CBC mode - - @param {int[]} data cipher - @param {int[]} key 16/24/32-Byte cipher key - @param {int[]} iv 16-Byte IV - @returns {int[]} decrypted data - """ - expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - - decrypted_data = [] - previous_cipher_block = iv - for i in range(block_count): - block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - block += [0] * (BLOCK_SIZE_BYTES - len(block)) - - decrypted_block = aes_decrypt(block, expanded_key) - decrypted_data += xor(decrypted_block, previous_cipher_block) - previous_cipher_block = block - decrypted_data = decrypted_data[:len(data)] - - return decrypted_data - - -def aes_cbc_encrypt(data, key, iv): - """ - Encrypt with aes in CBC mode. 
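
A property worth noting about the CTR routine above: counter mode only ever runs the block cipher in the encrypt direction, because both encryption and decryption are a XOR against the keystream E(counter_i). The structure, demonstrated with a stub block function standing in for aes_encrypt (it is not AES):

# CTR mode: keystream = E(counter_i); plaintext = ciphertext XOR keystream.
def stub_encrypt_block(block):  # NOT AES; a stand-in purely for structure
    return [(b * 7 + 3) % 256 for b in block]

def ctr_xcrypt(data, counters):
    out = []
    for i, counter in enumerate(counters):
        keystream = stub_encrypt_block(counter)
        block = data[i * 16:(i + 1) * 16]
        out += [x ^ y for x, y in zip(block, keystream)]
    return out

plain = list(range(32))
counters = [[0] * 16, [0] * 15 + [1]]
cipher = ctr_xcrypt(plain, counters)
assert ctr_xcrypt(cipher, counters) == plain  # decrypting is the same operation
assert cipher != plain
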
Using PKCS#7 padding - - @param {int[]} data cleartext - @param {int[]} key 16/24/32-Byte cipher key - @param {int[]} iv 16-Byte IV - @returns {int[]} encrypted data - """ - expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - - encrypted_data = [] - previous_cipher_block = iv - for i in range(block_count): - block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length - mixed_block = xor(block, previous_cipher_block) - - encrypted_block = aes_encrypt(mixed_block, expanded_key) - encrypted_data += encrypted_block - - previous_cipher_block = encrypted_block - - return encrypted_data - - -def key_expansion(data): - """ - Generate key schedule - - @param {int[]} data 16/24/32-Byte cipher key - @returns {int[]} 176/208/240-Byte expanded key - """ - data = data[:] # copy - rcon_iteration = 1 - key_size_bytes = len(data) - expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES - - while len(data) < expanded_key_size_bytes: - temp = data[-4:] - temp = key_schedule_core(temp, rcon_iteration) - rcon_iteration += 1 - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - - for _ in range(3): - temp = data[-4:] - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - - if key_size_bytes == 32: - temp = data[-4:] - temp = sub_bytes(temp) - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - - for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): - temp = data[-4:] - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - data = data[:expanded_key_size_bytes] - - return data - - -def aes_encrypt(data, expanded_key): - """ - Encrypt one block with aes - - @param {int[]} data 16-Byte state - @param {int[]} expanded_key 176/208/240-Byte expanded key - @returns {int[]} 16-Byte cipher - """ - rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 - - data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) - for i in range(1, rounds + 1): - data = sub_bytes(data) - data = shift_rows(data) - if i != rounds: - data = mix_columns(data) - data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) - - return data - - -def aes_decrypt(data, expanded_key): - """ - Decrypt one block with aes - - @param {int[]} data 16-Byte cipher - @param {int[]} expanded_key 176/208/240-Byte expanded key - @returns {int[]} 16-Byte state - """ - rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 - - for i in range(rounds, 0, -1): - data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) - if i != rounds: - data = mix_columns_inv(data) - data = shift_rows_inv(data) - data = sub_bytes_inv(data) - data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) - - return data - - -def aes_decrypt_text(data, password, key_size_bytes): - """ - Decrypt text - - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter - - The cipher key is retrieved by encrypting the first 16 Byte of 'password' - with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) - - Mode of operation is 'counter' - - @param {str} data Base64 encoded string - @param {str,unicode} password Password (will be encoded with utf-8) - @param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit - @returns {str} Decrypted data - """ - NONCE_LENGTH_BYTES = 8 - - data = bytes_to_intlist(compat_b64decode(data)) - password = 
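
The padding line in aes_cbc_encrypt, block += [remaining_length] * remaining_length, is PKCS#7 except in one corner case: input that is already a multiple of 16 bytes gets no padding block at all, whereas strict PKCS#7 appends a full block of value 16 so the padding is always unambiguously removable. The rule in isolation:

BLOCK = 16

def pad_like_above(block):
    # The loop body above: the pad value equals the number of bytes added.
    r = BLOCK - len(block)
    return block + [r] * r

assert pad_like_above([1, 2, 3]) == [1, 2, 3] + [13] * 13
# A full block is left as-is here; strict PKCS#7 would append a whole
# extra block of value 16 so that unpadding never has to guess.
assert pad_like_above(list(range(16))) == list(range(16))
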
bytes_to_intlist(password.encode('utf-8')) - - key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) - key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) - - nonce = data[:NONCE_LENGTH_BYTES] - cipher = data[NONCE_LENGTH_BYTES:] - - class Counter(object): - __value = nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) - - def next_value(self): - temp = self.__value - self.__value = inc(self.__value) - return temp - - decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) - plaintext = intlist_to_bytes(decrypted_data) - - return plaintext - - -RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) -SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, - 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, - 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, - 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, - 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, - 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, - 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, - 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, - 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, - 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, - 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, - 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, - 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, - 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, - 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, - 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) -SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, - 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 
0x5f, - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) -MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1), - (0x1, 0x2, 0x3, 0x1), - (0x1, 0x1, 0x2, 0x3), - (0x3, 0x1, 0x1, 0x2)) -MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9), - (0x9, 0xE, 0xB, 0xD), - (0xD, 0x9, 0xE, 0xB), - (0xB, 0xD, 0x9, 0xE)) -RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, - 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, - 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, - 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, - 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, - 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, - 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, - 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, - 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, - 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, - 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, - 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, - 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, - 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, - 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, - 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) -RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, - 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, - 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, - 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, - 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, - 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, - 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, - 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, - 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, - 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, - 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, - 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, - 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, - 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, - 0x44, 0x11, 0x92, 0xd9, 0x23, 
0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, - 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) - - -def sub_bytes(data): - return [SBOX[x] for x in data] - - -def sub_bytes_inv(data): - return [SBOX_INV[x] for x in data] - - -def rotate(data): - return data[1:] + [data[0]] - - -def key_schedule_core(data, rcon_iteration): - data = rotate(data) - data = sub_bytes(data) - data[0] = data[0] ^ RCON[rcon_iteration] - - return data - - -def xor(data1, data2): - return [x ^ y for x, y in zip(data1, data2)] - - -def rijndael_mul(a, b): - if(a == 0 or b == 0): - return 0 - return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] - - -def mix_column(data, matrix): - data_mixed = [] - for row in range(4): - mixed = 0 - for column in range(4): - # xor is (+) and (-) - mixed ^= rijndael_mul(data[column], matrix[row][column]) - data_mixed.append(mixed) - return data_mixed - - -def mix_columns(data, matrix=MIX_COLUMN_MATRIX): - data_mixed = [] - for i in range(4): - column = data[i * 4: (i + 1) * 4] - data_mixed += mix_column(column, matrix) - return data_mixed - - -def mix_columns_inv(data): - return mix_columns(data, MIX_COLUMN_MATRIX_INV) - - -def shift_rows(data): - data_shifted = [] - for column in range(4): - for row in range(4): - data_shifted.append(data[((column + row) & 0b11) * 4 + row]) - return data_shifted - - -def shift_rows_inv(data): - data_shifted = [] - for column in range(4): - for row in range(4): - data_shifted.append(data[((column - row) & 0b11) * 4 + row]) - return data_shifted - - -def inc(data): - data = data[:] # copy - for i in range(len(data) - 1, -1, -1): - if data[i] == 255: - data[i] = 0 - else: - data[i] = data[i] + 1 - break - return data - - -__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py deleted file mode 100644 index 7bdade1bd..000000000 --- a/youtube_dl/cache.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import unicode_literals - -import errno -import io -import json -import os -import re -import shutil -import traceback - -from .compat import compat_getenv -from .utils import ( - expand_path, - write_json_file, -) - - -class Cache(object): - def __init__(self, ydl): - self._ydl = ydl - - def _get_root_dir(self): - res = self._ydl.params.get('cachedir') - if res is None: - cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') - res = os.path.join(cache_root, 'youtube-dl') - return expand_path(res) - - def _get_cache_fn(self, section, key, dtype): - assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ - 'invalid section %r' % section - assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key - return os.path.join( - self._get_root_dir(), section, '%s.%s' % (key, dtype)) - - @property - def enabled(self): - return self._ydl.params.get('cachedir') is not False - - def store(self, section, key, data, dtype='json'): - assert dtype in ('json',) - - if not self.enabled: - return - - fn = self._get_cache_fn(section, key, dtype) - try: - try: - os.makedirs(os.path.dirname(fn)) - except OSError as ose: - if ose.errno != errno.EEXIST: - raise - write_json_file(data, fn) - except Exception: - tb = traceback.format_exc() - self._ydl.report_warning( - 'Writing cache to %r failed: %s' % (fn, tb)) - - def load(self, section, key, dtype='json', default=None): - assert dtype in ('json',) - - if not self.enabled: - return default - - cache_fn = self._get_cache_fn(section, 
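
rijndael_mul above multiplies in GF(2^8) via the log/exp tables: a times b equals RIJNDAEL_EXP_TABLE[(log a + log b) mod 255]. The same products can be computed from first principles with carry-less multiplication reduced by the AES polynomial 0x11b, which makes the table-driven results easy to spot-check:

def gf_mul(a, b):
    # Russian-peasant multiply in GF(2^8), reducing by 0x11b on overflow.
    p = 0
    while b:
        if b & 1:
            p ^= a
        a <<= 1
        if a & 0x100:
            a ^= 0x11b
        b >>= 1
    return p

assert gf_mul(0x57, 0x83) == 0xc1  # worked example from FIPS-197, section 4.2
assert gf_mul(0x53, 0xca) == 0x01  # {53} and {ca} are multiplicative inverses
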
key, dtype) - try: - try: - with io.open(cache_fn, 'r', encoding='utf-8') as cachef: - return json.load(cachef) - except ValueError: - try: - file_size = os.path.getsize(cache_fn) - except (OSError, IOError) as oe: - file_size = str(oe) - self._ydl.report_warning( - 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) - except IOError: - pass # No cache available - - return default - - def remove(self): - if not self.enabled: - self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)') - return - - cachedir = self._get_root_dir() - if not any((term in cachedir) for term in ('cache', 'tmp')): - raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir) - - self._ydl.to_screen( - 'Removing cache dir %s .' % cachedir, skip_eol=True) - if os.path.exists(cachedir): - self._ydl.to_screen('.', skip_eol=True) - shutil.rmtree(cachedir) - self._ydl.to_screen('.') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py deleted file mode 100644 index 9e45c454b..000000000 --- a/youtube_dl/compat.py +++ /dev/null @@ -1,3060 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import binascii -import collections -import ctypes -import email -import getpass -import io -import itertools -import optparse -import os -import platform -import re -import shlex -import shutil -import socket -import struct -import subprocess -import sys -import xml.etree.ElementTree - - -try: - import urllib.request as compat_urllib_request -except ImportError: # Python 2 - import urllib2 as compat_urllib_request - -try: - import urllib.error as compat_urllib_error -except ImportError: # Python 2 - import urllib2 as compat_urllib_error - -try: - import urllib.parse as compat_urllib_parse -except ImportError: # Python 2 - import urllib as compat_urllib_parse - -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse - -try: - import urllib.response as compat_urllib_response -except ImportError: # Python 2 - import urllib as compat_urllib_response - -try: - import http.cookiejar as compat_cookiejar -except ImportError: # Python 2 - import cookielib as compat_cookiejar - -if sys.version_info[0] == 2: - class compat_cookiejar_Cookie(compat_cookiejar.Cookie): - def __init__(self, version, name, value, *args, **kwargs): - if isinstance(name, compat_str): - name = name.encode() - if isinstance(value, compat_str): - value = value.encode() - compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) -else: - compat_cookiejar_Cookie = compat_cookiejar.Cookie - -try: - import http.cookies as compat_cookies -except ImportError: # Python 2 - import Cookie as compat_cookies - -if sys.version_info[0] == 2: - class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): - def load(self, rawdata): - if isinstance(rawdata, compat_str): - rawdata = str(rawdata) - return super(compat_cookies_SimpleCookie, self).load(rawdata) -else: - compat_cookies_SimpleCookie = compat_cookies.SimpleCookie - -try: - import html.entities as compat_html_entities -except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities - -try: # Python >= 3.3 - compat_html_entities_html5 = compat_html_entities.html5 -except AttributeError: - # Copied from CPython 3.5.1 html/entities.py - 
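
Usage of the Cache class above, for reference: entries are JSON files under <cachedir>/<section>/<key>.json, with the directory defaulting to $XDG_CACHE_HOME/youtube-dl, and section/key names restricted to [a-zA-Z0-9_.-]. A self-contained sketch with a minimal stand-in for the YoutubeDL instance that Cache expects (assuming the youtube_dl package is importable):

import tempfile

from youtube_dl.cache import Cache

class FakeYDL(object):
    # Minimal stand-in exposing the members Cache actually touches.
    params = {'cachedir': tempfile.mkdtemp()}
    def report_warning(self, msg):
        print('WARNING: %s' % msg)
    def to_screen(self, msg, skip_eol=False):
        print(msg)

cache = Cache(FakeYDL())
# Lands in <cachedir>/youtube-sigfuncs/js_abcdef.json
cache.store('youtube-sigfuncs', 'js_abcdef', {'ops': [1, 2, 3]})
assert cache.load('youtube-sigfuncs', 'js_abcdef') == {'ops': [1, 2, 3]}
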
compat_html_entities_html5 = { - 'Aacute': '\xc1', - 'aacute': '\xe1', - 'Aacute;': '\xc1', - 'aacute;': '\xe1', - 'Abreve;': '\u0102', - 'abreve;': '\u0103', - 'ac;': '\u223e', - 'acd;': '\u223f', - 'acE;': '\u223e\u0333', - 'Acirc': '\xc2', - 'acirc': '\xe2', - 'Acirc;': '\xc2', - 'acirc;': '\xe2', - 'acute': '\xb4', - 'acute;': '\xb4', - 'Acy;': '\u0410', - 'acy;': '\u0430', - 'AElig': '\xc6', - 'aelig': '\xe6', - 'AElig;': '\xc6', - 'aelig;': '\xe6', - 'af;': '\u2061', - 'Afr;': '\U0001d504', - 'afr;': '\U0001d51e', - 'Agrave': '\xc0', - 'agrave': '\xe0', - 'Agrave;': '\xc0', - 'agrave;': '\xe0', - 'alefsym;': '\u2135', - 'aleph;': '\u2135', - 'Alpha;': '\u0391', - 'alpha;': '\u03b1', - 'Amacr;': '\u0100', - 'amacr;': '\u0101', - 'amalg;': '\u2a3f', - 'AMP': '&', - 'amp': '&', - 'AMP;': '&', - 'amp;': '&', - 'And;': '\u2a53', - 'and;': '\u2227', - 'andand;': '\u2a55', - 'andd;': '\u2a5c', - 'andslope;': '\u2a58', - 'andv;': '\u2a5a', - 'ang;': '\u2220', - 'ange;': '\u29a4', - 'angle;': '\u2220', - 'angmsd;': '\u2221', - 'angmsdaa;': '\u29a8', - 'angmsdab;': '\u29a9', - 'angmsdac;': '\u29aa', - 'angmsdad;': '\u29ab', - 'angmsdae;': '\u29ac', - 'angmsdaf;': '\u29ad', - 'angmsdag;': '\u29ae', - 'angmsdah;': '\u29af', - 'angrt;': '\u221f', - 'angrtvb;': '\u22be', - 'angrtvbd;': '\u299d', - 'angsph;': '\u2222', - 'angst;': '\xc5', - 'angzarr;': '\u237c', - 'Aogon;': '\u0104', - 'aogon;': '\u0105', - 'Aopf;': '\U0001d538', - 'aopf;': '\U0001d552', - 'ap;': '\u2248', - 'apacir;': '\u2a6f', - 'apE;': '\u2a70', - 'ape;': '\u224a', - 'apid;': '\u224b', - 'apos;': "'", - 'ApplyFunction;': '\u2061', - 'approx;': '\u2248', - 'approxeq;': '\u224a', - 'Aring': '\xc5', - 'aring': '\xe5', - 'Aring;': '\xc5', - 'aring;': '\xe5', - 'Ascr;': '\U0001d49c', - 'ascr;': '\U0001d4b6', - 'Assign;': '\u2254', - 'ast;': '*', - 'asymp;': '\u2248', - 'asympeq;': '\u224d', - 'Atilde': '\xc3', - 'atilde': '\xe3', - 'Atilde;': '\xc3', - 'atilde;': '\xe3', - 'Auml': '\xc4', - 'auml': '\xe4', - 'Auml;': '\xc4', - 'auml;': '\xe4', - 'awconint;': '\u2233', - 'awint;': '\u2a11', - 'backcong;': '\u224c', - 'backepsilon;': '\u03f6', - 'backprime;': '\u2035', - 'backsim;': '\u223d', - 'backsimeq;': '\u22cd', - 'Backslash;': '\u2216', - 'Barv;': '\u2ae7', - 'barvee;': '\u22bd', - 'Barwed;': '\u2306', - 'barwed;': '\u2305', - 'barwedge;': '\u2305', - 'bbrk;': '\u23b5', - 'bbrktbrk;': '\u23b6', - 'bcong;': '\u224c', - 'Bcy;': '\u0411', - 'bcy;': '\u0431', - 'bdquo;': '\u201e', - 'becaus;': '\u2235', - 'Because;': '\u2235', - 'because;': '\u2235', - 'bemptyv;': '\u29b0', - 'bepsi;': '\u03f6', - 'bernou;': '\u212c', - 'Bernoullis;': '\u212c', - 'Beta;': '\u0392', - 'beta;': '\u03b2', - 'beth;': '\u2136', - 'between;': '\u226c', - 'Bfr;': '\U0001d505', - 'bfr;': '\U0001d51f', - 'bigcap;': '\u22c2', - 'bigcirc;': '\u25ef', - 'bigcup;': '\u22c3', - 'bigodot;': '\u2a00', - 'bigoplus;': '\u2a01', - 'bigotimes;': '\u2a02', - 'bigsqcup;': '\u2a06', - 'bigstar;': '\u2605', - 'bigtriangledown;': '\u25bd', - 'bigtriangleup;': '\u25b3', - 'biguplus;': '\u2a04', - 'bigvee;': '\u22c1', - 'bigwedge;': '\u22c0', - 'bkarow;': '\u290d', - 'blacklozenge;': '\u29eb', - 'blacksquare;': '\u25aa', - 'blacktriangle;': '\u25b4', - 'blacktriangledown;': '\u25be', - 'blacktriangleleft;': '\u25c2', - 'blacktriangleright;': '\u25b8', - 'blank;': '\u2423', - 'blk12;': '\u2592', - 'blk14;': '\u2591', - 'blk34;': '\u2593', - 'block;': '\u2588', - 'bne;': '=\u20e5', - 'bnequiv;': '\u2261\u20e5', - 'bNot;': '\u2aed', - 'bnot;': '\u2310', - 'Bopf;': '\U0001d539', 
- 'bopf;': '\U0001d553', - 'bot;': '\u22a5', - 'bottom;': '\u22a5', - 'bowtie;': '\u22c8', - 'boxbox;': '\u29c9', - 'boxDL;': '\u2557', - 'boxDl;': '\u2556', - 'boxdL;': '\u2555', - 'boxdl;': '\u2510', - 'boxDR;': '\u2554', - 'boxDr;': '\u2553', - 'boxdR;': '\u2552', - 'boxdr;': '\u250c', - 'boxH;': '\u2550', - 'boxh;': '\u2500', - 'boxHD;': '\u2566', - 'boxHd;': '\u2564', - 'boxhD;': '\u2565', - 'boxhd;': '\u252c', - 'boxHU;': '\u2569', - 'boxHu;': '\u2567', - 'boxhU;': '\u2568', - 'boxhu;': '\u2534', - 'boxminus;': '\u229f', - 'boxplus;': '\u229e', - 'boxtimes;': '\u22a0', - 'boxUL;': '\u255d', - 'boxUl;': '\u255c', - 'boxuL;': '\u255b', - 'boxul;': '\u2518', - 'boxUR;': '\u255a', - 'boxUr;': '\u2559', - 'boxuR;': '\u2558', - 'boxur;': '\u2514', - 'boxV;': '\u2551', - 'boxv;': '\u2502', - 'boxVH;': '\u256c', - 'boxVh;': '\u256b', - 'boxvH;': '\u256a', - 'boxvh;': '\u253c', - 'boxVL;': '\u2563', - 'boxVl;': '\u2562', - 'boxvL;': '\u2561', - 'boxvl;': '\u2524', - 'boxVR;': '\u2560', - 'boxVr;': '\u255f', - 'boxvR;': '\u255e', - 'boxvr;': '\u251c', - 'bprime;': '\u2035', - 'Breve;': '\u02d8', - 'breve;': '\u02d8', - 'brvbar': '\xa6', - 'brvbar;': '\xa6', - 'Bscr;': '\u212c', - 'bscr;': '\U0001d4b7', - 'bsemi;': '\u204f', - 'bsim;': '\u223d', - 'bsime;': '\u22cd', - 'bsol;': '\\', - 'bsolb;': '\u29c5', - 'bsolhsub;': '\u27c8', - 'bull;': '\u2022', - 'bullet;': '\u2022', - 'bump;': '\u224e', - 'bumpE;': '\u2aae', - 'bumpe;': '\u224f', - 'Bumpeq;': '\u224e', - 'bumpeq;': '\u224f', - 'Cacute;': '\u0106', - 'cacute;': '\u0107', - 'Cap;': '\u22d2', - 'cap;': '\u2229', - 'capand;': '\u2a44', - 'capbrcup;': '\u2a49', - 'capcap;': '\u2a4b', - 'capcup;': '\u2a47', - 'capdot;': '\u2a40', - 'CapitalDifferentialD;': '\u2145', - 'caps;': '\u2229\ufe00', - 'caret;': '\u2041', - 'caron;': '\u02c7', - 'Cayleys;': '\u212d', - 'ccaps;': '\u2a4d', - 'Ccaron;': '\u010c', - 'ccaron;': '\u010d', - 'Ccedil': '\xc7', - 'ccedil': '\xe7', - 'Ccedil;': '\xc7', - 'ccedil;': '\xe7', - 'Ccirc;': '\u0108', - 'ccirc;': '\u0109', - 'Cconint;': '\u2230', - 'ccups;': '\u2a4c', - 'ccupssm;': '\u2a50', - 'Cdot;': '\u010a', - 'cdot;': '\u010b', - 'cedil': '\xb8', - 'cedil;': '\xb8', - 'Cedilla;': '\xb8', - 'cemptyv;': '\u29b2', - 'cent': '\xa2', - 'cent;': '\xa2', - 'CenterDot;': '\xb7', - 'centerdot;': '\xb7', - 'Cfr;': '\u212d', - 'cfr;': '\U0001d520', - 'CHcy;': '\u0427', - 'chcy;': '\u0447', - 'check;': '\u2713', - 'checkmark;': '\u2713', - 'Chi;': '\u03a7', - 'chi;': '\u03c7', - 'cir;': '\u25cb', - 'circ;': '\u02c6', - 'circeq;': '\u2257', - 'circlearrowleft;': '\u21ba', - 'circlearrowright;': '\u21bb', - 'circledast;': '\u229b', - 'circledcirc;': '\u229a', - 'circleddash;': '\u229d', - 'CircleDot;': '\u2299', - 'circledR;': '\xae', - 'circledS;': '\u24c8', - 'CircleMinus;': '\u2296', - 'CirclePlus;': '\u2295', - 'CircleTimes;': '\u2297', - 'cirE;': '\u29c3', - 'cire;': '\u2257', - 'cirfnint;': '\u2a10', - 'cirmid;': '\u2aef', - 'cirscir;': '\u29c2', - 'ClockwiseContourIntegral;': '\u2232', - 'CloseCurlyDoubleQuote;': '\u201d', - 'CloseCurlyQuote;': '\u2019', - 'clubs;': '\u2663', - 'clubsuit;': '\u2663', - 'Colon;': '\u2237', - 'colon;': ':', - 'Colone;': '\u2a74', - 'colone;': '\u2254', - 'coloneq;': '\u2254', - 'comma;': ',', - 'commat;': '@', - 'comp;': '\u2201', - 'compfn;': '\u2218', - 'complement;': '\u2201', - 'complexes;': '\u2102', - 'cong;': '\u2245', - 'congdot;': '\u2a6d', - 'Congruent;': '\u2261', - 'Conint;': '\u222f', - 'conint;': '\u222e', - 'ContourIntegral;': '\u222e', - 'Copf;': '\u2102', - 'copf;': 
'\U0001d554', - 'coprod;': '\u2210', - 'Coproduct;': '\u2210', - 'COPY': '\xa9', - 'copy': '\xa9', - 'COPY;': '\xa9', - 'copy;': '\xa9', - 'copysr;': '\u2117', - 'CounterClockwiseContourIntegral;': '\u2233', - 'crarr;': '\u21b5', - 'Cross;': '\u2a2f', - 'cross;': '\u2717', - 'Cscr;': '\U0001d49e', - 'cscr;': '\U0001d4b8', - 'csub;': '\u2acf', - 'csube;': '\u2ad1', - 'csup;': '\u2ad0', - 'csupe;': '\u2ad2', - 'ctdot;': '\u22ef', - 'cudarrl;': '\u2938', - 'cudarrr;': '\u2935', - 'cuepr;': '\u22de', - 'cuesc;': '\u22df', - 'cularr;': '\u21b6', - 'cularrp;': '\u293d', - 'Cup;': '\u22d3', - 'cup;': '\u222a', - 'cupbrcap;': '\u2a48', - 'CupCap;': '\u224d', - 'cupcap;': '\u2a46', - 'cupcup;': '\u2a4a', - 'cupdot;': '\u228d', - 'cupor;': '\u2a45', - 'cups;': '\u222a\ufe00', - 'curarr;': '\u21b7', - 'curarrm;': '\u293c', - 'curlyeqprec;': '\u22de', - 'curlyeqsucc;': '\u22df', - 'curlyvee;': '\u22ce', - 'curlywedge;': '\u22cf', - 'curren': '\xa4', - 'curren;': '\xa4', - 'curvearrowleft;': '\u21b6', - 'curvearrowright;': '\u21b7', - 'cuvee;': '\u22ce', - 'cuwed;': '\u22cf', - 'cwconint;': '\u2232', - 'cwint;': '\u2231', - 'cylcty;': '\u232d', - 'Dagger;': '\u2021', - 'dagger;': '\u2020', - 'daleth;': '\u2138', - 'Darr;': '\u21a1', - 'dArr;': '\u21d3', - 'darr;': '\u2193', - 'dash;': '\u2010', - 'Dashv;': '\u2ae4', - 'dashv;': '\u22a3', - 'dbkarow;': '\u290f', - 'dblac;': '\u02dd', - 'Dcaron;': '\u010e', - 'dcaron;': '\u010f', - 'Dcy;': '\u0414', - 'dcy;': '\u0434', - 'DD;': '\u2145', - 'dd;': '\u2146', - 'ddagger;': '\u2021', - 'ddarr;': '\u21ca', - 'DDotrahd;': '\u2911', - 'ddotseq;': '\u2a77', - 'deg': '\xb0', - 'deg;': '\xb0', - 'Del;': '\u2207', - 'Delta;': '\u0394', - 'delta;': '\u03b4', - 'demptyv;': '\u29b1', - 'dfisht;': '\u297f', - 'Dfr;': '\U0001d507', - 'dfr;': '\U0001d521', - 'dHar;': '\u2965', - 'dharl;': '\u21c3', - 'dharr;': '\u21c2', - 'DiacriticalAcute;': '\xb4', - 'DiacriticalDot;': '\u02d9', - 'DiacriticalDoubleAcute;': '\u02dd', - 'DiacriticalGrave;': '`', - 'DiacriticalTilde;': '\u02dc', - 'diam;': '\u22c4', - 'Diamond;': '\u22c4', - 'diamond;': '\u22c4', - 'diamondsuit;': '\u2666', - 'diams;': '\u2666', - 'die;': '\xa8', - 'DifferentialD;': '\u2146', - 'digamma;': '\u03dd', - 'disin;': '\u22f2', - 'div;': '\xf7', - 'divide': '\xf7', - 'divide;': '\xf7', - 'divideontimes;': '\u22c7', - 'divonx;': '\u22c7', - 'DJcy;': '\u0402', - 'djcy;': '\u0452', - 'dlcorn;': '\u231e', - 'dlcrop;': '\u230d', - 'dollar;': '$', - 'Dopf;': '\U0001d53b', - 'dopf;': '\U0001d555', - 'Dot;': '\xa8', - 'dot;': '\u02d9', - 'DotDot;': '\u20dc', - 'doteq;': '\u2250', - 'doteqdot;': '\u2251', - 'DotEqual;': '\u2250', - 'dotminus;': '\u2238', - 'dotplus;': '\u2214', - 'dotsquare;': '\u22a1', - 'doublebarwedge;': '\u2306', - 'DoubleContourIntegral;': '\u222f', - 'DoubleDot;': '\xa8', - 'DoubleDownArrow;': '\u21d3', - 'DoubleLeftArrow;': '\u21d0', - 'DoubleLeftRightArrow;': '\u21d4', - 'DoubleLeftTee;': '\u2ae4', - 'DoubleLongLeftArrow;': '\u27f8', - 'DoubleLongLeftRightArrow;': '\u27fa', - 'DoubleLongRightArrow;': '\u27f9', - 'DoubleRightArrow;': '\u21d2', - 'DoubleRightTee;': '\u22a8', - 'DoubleUpArrow;': '\u21d1', - 'DoubleUpDownArrow;': '\u21d5', - 'DoubleVerticalBar;': '\u2225', - 'DownArrow;': '\u2193', - 'Downarrow;': '\u21d3', - 'downarrow;': '\u2193', - 'DownArrowBar;': '\u2913', - 'DownArrowUpArrow;': '\u21f5', - 'DownBreve;': '\u0311', - 'downdownarrows;': '\u21ca', - 'downharpoonleft;': '\u21c3', - 'downharpoonright;': '\u21c2', - 'DownLeftRightVector;': '\u2950', - 'DownLeftTeeVector;': '\u295e', 
- 'DownLeftVector;': '\u21bd', - 'DownLeftVectorBar;': '\u2956', - 'DownRightTeeVector;': '\u295f', - 'DownRightVector;': '\u21c1', - 'DownRightVectorBar;': '\u2957', - 'DownTee;': '\u22a4', - 'DownTeeArrow;': '\u21a7', - 'drbkarow;': '\u2910', - 'drcorn;': '\u231f', - 'drcrop;': '\u230c', - 'Dscr;': '\U0001d49f', - 'dscr;': '\U0001d4b9', - 'DScy;': '\u0405', - 'dscy;': '\u0455', - 'dsol;': '\u29f6', - 'Dstrok;': '\u0110', - 'dstrok;': '\u0111', - 'dtdot;': '\u22f1', - 'dtri;': '\u25bf', - 'dtrif;': '\u25be', - 'duarr;': '\u21f5', - 'duhar;': '\u296f', - 'dwangle;': '\u29a6', - 'DZcy;': '\u040f', - 'dzcy;': '\u045f', - 'dzigrarr;': '\u27ff', - 'Eacute': '\xc9', - 'eacute': '\xe9', - 'Eacute;': '\xc9', - 'eacute;': '\xe9', - 'easter;': '\u2a6e', - 'Ecaron;': '\u011a', - 'ecaron;': '\u011b', - 'ecir;': '\u2256', - 'Ecirc': '\xca', - 'ecirc': '\xea', - 'Ecirc;': '\xca', - 'ecirc;': '\xea', - 'ecolon;': '\u2255', - 'Ecy;': '\u042d', - 'ecy;': '\u044d', - 'eDDot;': '\u2a77', - 'Edot;': '\u0116', - 'eDot;': '\u2251', - 'edot;': '\u0117', - 'ee;': '\u2147', - 'efDot;': '\u2252', - 'Efr;': '\U0001d508', - 'efr;': '\U0001d522', - 'eg;': '\u2a9a', - 'Egrave': '\xc8', - 'egrave': '\xe8', - 'Egrave;': '\xc8', - 'egrave;': '\xe8', - 'egs;': '\u2a96', - 'egsdot;': '\u2a98', - 'el;': '\u2a99', - 'Element;': '\u2208', - 'elinters;': '\u23e7', - 'ell;': '\u2113', - 'els;': '\u2a95', - 'elsdot;': '\u2a97', - 'Emacr;': '\u0112', - 'emacr;': '\u0113', - 'empty;': '\u2205', - 'emptyset;': '\u2205', - 'EmptySmallSquare;': '\u25fb', - 'emptyv;': '\u2205', - 'EmptyVerySmallSquare;': '\u25ab', - 'emsp13;': '\u2004', - 'emsp14;': '\u2005', - 'emsp;': '\u2003', - 'ENG;': '\u014a', - 'eng;': '\u014b', - 'ensp;': '\u2002', - 'Eogon;': '\u0118', - 'eogon;': '\u0119', - 'Eopf;': '\U0001d53c', - 'eopf;': '\U0001d556', - 'epar;': '\u22d5', - 'eparsl;': '\u29e3', - 'eplus;': '\u2a71', - 'epsi;': '\u03b5', - 'Epsilon;': '\u0395', - 'epsilon;': '\u03b5', - 'epsiv;': '\u03f5', - 'eqcirc;': '\u2256', - 'eqcolon;': '\u2255', - 'eqsim;': '\u2242', - 'eqslantgtr;': '\u2a96', - 'eqslantless;': '\u2a95', - 'Equal;': '\u2a75', - 'equals;': '=', - 'EqualTilde;': '\u2242', - 'equest;': '\u225f', - 'Equilibrium;': '\u21cc', - 'equiv;': '\u2261', - 'equivDD;': '\u2a78', - 'eqvparsl;': '\u29e5', - 'erarr;': '\u2971', - 'erDot;': '\u2253', - 'Escr;': '\u2130', - 'escr;': '\u212f', - 'esdot;': '\u2250', - 'Esim;': '\u2a73', - 'esim;': '\u2242', - 'Eta;': '\u0397', - 'eta;': '\u03b7', - 'ETH': '\xd0', - 'eth': '\xf0', - 'ETH;': '\xd0', - 'eth;': '\xf0', - 'Euml': '\xcb', - 'euml': '\xeb', - 'Euml;': '\xcb', - 'euml;': '\xeb', - 'euro;': '\u20ac', - 'excl;': '!', - 'exist;': '\u2203', - 'Exists;': '\u2203', - 'expectation;': '\u2130', - 'ExponentialE;': '\u2147', - 'exponentiale;': '\u2147', - 'fallingdotseq;': '\u2252', - 'Fcy;': '\u0424', - 'fcy;': '\u0444', - 'female;': '\u2640', - 'ffilig;': '\ufb03', - 'fflig;': '\ufb00', - 'ffllig;': '\ufb04', - 'Ffr;': '\U0001d509', - 'ffr;': '\U0001d523', - 'filig;': '\ufb01', - 'FilledSmallSquare;': '\u25fc', - 'FilledVerySmallSquare;': '\u25aa', - 'fjlig;': 'fj', - 'flat;': '\u266d', - 'fllig;': '\ufb02', - 'fltns;': '\u25b1', - 'fnof;': '\u0192', - 'Fopf;': '\U0001d53d', - 'fopf;': '\U0001d557', - 'ForAll;': '\u2200', - 'forall;': '\u2200', - 'fork;': '\u22d4', - 'forkv;': '\u2ad9', - 'Fouriertrf;': '\u2131', - 'fpartint;': '\u2a0d', - 'frac12': '\xbd', - 'frac12;': '\xbd', - 'frac13;': '\u2153', - 'frac14': '\xbc', - 'frac14;': '\xbc', - 'frac15;': '\u2155', - 'frac16;': '\u2159', - 'frac18;': 
'\u215b', - 'frac23;': '\u2154', - 'frac25;': '\u2156', - 'frac34': '\xbe', - 'frac34;': '\xbe', - 'frac35;': '\u2157', - 'frac38;': '\u215c', - 'frac45;': '\u2158', - 'frac56;': '\u215a', - 'frac58;': '\u215d', - 'frac78;': '\u215e', - 'frasl;': '\u2044', - 'frown;': '\u2322', - 'Fscr;': '\u2131', - 'fscr;': '\U0001d4bb', - 'gacute;': '\u01f5', - 'Gamma;': '\u0393', - 'gamma;': '\u03b3', - 'Gammad;': '\u03dc', - 'gammad;': '\u03dd', - 'gap;': '\u2a86', - 'Gbreve;': '\u011e', - 'gbreve;': '\u011f', - 'Gcedil;': '\u0122', - 'Gcirc;': '\u011c', - 'gcirc;': '\u011d', - 'Gcy;': '\u0413', - 'gcy;': '\u0433', - 'Gdot;': '\u0120', - 'gdot;': '\u0121', - 'gE;': '\u2267', - 'ge;': '\u2265', - 'gEl;': '\u2a8c', - 'gel;': '\u22db', - 'geq;': '\u2265', - 'geqq;': '\u2267', - 'geqslant;': '\u2a7e', - 'ges;': '\u2a7e', - 'gescc;': '\u2aa9', - 'gesdot;': '\u2a80', - 'gesdoto;': '\u2a82', - 'gesdotol;': '\u2a84', - 'gesl;': '\u22db\ufe00', - 'gesles;': '\u2a94', - 'Gfr;': '\U0001d50a', - 'gfr;': '\U0001d524', - 'Gg;': '\u22d9', - 'gg;': '\u226b', - 'ggg;': '\u22d9', - 'gimel;': '\u2137', - 'GJcy;': '\u0403', - 'gjcy;': '\u0453', - 'gl;': '\u2277', - 'gla;': '\u2aa5', - 'glE;': '\u2a92', - 'glj;': '\u2aa4', - 'gnap;': '\u2a8a', - 'gnapprox;': '\u2a8a', - 'gnE;': '\u2269', - 'gne;': '\u2a88', - 'gneq;': '\u2a88', - 'gneqq;': '\u2269', - 'gnsim;': '\u22e7', - 'Gopf;': '\U0001d53e', - 'gopf;': '\U0001d558', - 'grave;': '`', - 'GreaterEqual;': '\u2265', - 'GreaterEqualLess;': '\u22db', - 'GreaterFullEqual;': '\u2267', - 'GreaterGreater;': '\u2aa2', - 'GreaterLess;': '\u2277', - 'GreaterSlantEqual;': '\u2a7e', - 'GreaterTilde;': '\u2273', - 'Gscr;': '\U0001d4a2', - 'gscr;': '\u210a', - 'gsim;': '\u2273', - 'gsime;': '\u2a8e', - 'gsiml;': '\u2a90', - 'GT': '>', - 'gt': '>', - 'GT;': '>', - 'Gt;': '\u226b', - 'gt;': '>', - 'gtcc;': '\u2aa7', - 'gtcir;': '\u2a7a', - 'gtdot;': '\u22d7', - 'gtlPar;': '\u2995', - 'gtquest;': '\u2a7c', - 'gtrapprox;': '\u2a86', - 'gtrarr;': '\u2978', - 'gtrdot;': '\u22d7', - 'gtreqless;': '\u22db', - 'gtreqqless;': '\u2a8c', - 'gtrless;': '\u2277', - 'gtrsim;': '\u2273', - 'gvertneqq;': '\u2269\ufe00', - 'gvnE;': '\u2269\ufe00', - 'Hacek;': '\u02c7', - 'hairsp;': '\u200a', - 'half;': '\xbd', - 'hamilt;': '\u210b', - 'HARDcy;': '\u042a', - 'hardcy;': '\u044a', - 'hArr;': '\u21d4', - 'harr;': '\u2194', - 'harrcir;': '\u2948', - 'harrw;': '\u21ad', - 'Hat;': '^', - 'hbar;': '\u210f', - 'Hcirc;': '\u0124', - 'hcirc;': '\u0125', - 'hearts;': '\u2665', - 'heartsuit;': '\u2665', - 'hellip;': '\u2026', - 'hercon;': '\u22b9', - 'Hfr;': '\u210c', - 'hfr;': '\U0001d525', - 'HilbertSpace;': '\u210b', - 'hksearow;': '\u2925', - 'hkswarow;': '\u2926', - 'hoarr;': '\u21ff', - 'homtht;': '\u223b', - 'hookleftarrow;': '\u21a9', - 'hookrightarrow;': '\u21aa', - 'Hopf;': '\u210d', - 'hopf;': '\U0001d559', - 'horbar;': '\u2015', - 'HorizontalLine;': '\u2500', - 'Hscr;': '\u210b', - 'hscr;': '\U0001d4bd', - 'hslash;': '\u210f', - 'Hstrok;': '\u0126', - 'hstrok;': '\u0127', - 'HumpDownHump;': '\u224e', - 'HumpEqual;': '\u224f', - 'hybull;': '\u2043', - 'hyphen;': '\u2010', - 'Iacute': '\xcd', - 'iacute': '\xed', - 'Iacute;': '\xcd', - 'iacute;': '\xed', - 'ic;': '\u2063', - 'Icirc': '\xce', - 'icirc': '\xee', - 'Icirc;': '\xce', - 'icirc;': '\xee', - 'Icy;': '\u0418', - 'icy;': '\u0438', - 'Idot;': '\u0130', - 'IEcy;': '\u0415', - 'iecy;': '\u0435', - 'iexcl': '\xa1', - 'iexcl;': '\xa1', - 'iff;': '\u21d4', - 'Ifr;': '\u2111', - 'ifr;': '\U0001d526', - 'Igrave': '\xcc', - 'igrave': '\xec', - 'Igrave;': 
'\xcc', - 'igrave;': '\xec', - 'ii;': '\u2148', - 'iiiint;': '\u2a0c', - 'iiint;': '\u222d', - 'iinfin;': '\u29dc', - 'iiota;': '\u2129', - 'IJlig;': '\u0132', - 'ijlig;': '\u0133', - 'Im;': '\u2111', - 'Imacr;': '\u012a', - 'imacr;': '\u012b', - 'image;': '\u2111', - 'ImaginaryI;': '\u2148', - 'imagline;': '\u2110', - 'imagpart;': '\u2111', - 'imath;': '\u0131', - 'imof;': '\u22b7', - 'imped;': '\u01b5', - 'Implies;': '\u21d2', - 'in;': '\u2208', - 'incare;': '\u2105', - 'infin;': '\u221e', - 'infintie;': '\u29dd', - 'inodot;': '\u0131', - 'Int;': '\u222c', - 'int;': '\u222b', - 'intcal;': '\u22ba', - 'integers;': '\u2124', - 'Integral;': '\u222b', - 'intercal;': '\u22ba', - 'Intersection;': '\u22c2', - 'intlarhk;': '\u2a17', - 'intprod;': '\u2a3c', - 'InvisibleComma;': '\u2063', - 'InvisibleTimes;': '\u2062', - 'IOcy;': '\u0401', - 'iocy;': '\u0451', - 'Iogon;': '\u012e', - 'iogon;': '\u012f', - 'Iopf;': '\U0001d540', - 'iopf;': '\U0001d55a', - 'Iota;': '\u0399', - 'iota;': '\u03b9', - 'iprod;': '\u2a3c', - 'iquest': '\xbf', - 'iquest;': '\xbf', - 'Iscr;': '\u2110', - 'iscr;': '\U0001d4be', - 'isin;': '\u2208', - 'isindot;': '\u22f5', - 'isinE;': '\u22f9', - 'isins;': '\u22f4', - 'isinsv;': '\u22f3', - 'isinv;': '\u2208', - 'it;': '\u2062', - 'Itilde;': '\u0128', - 'itilde;': '\u0129', - 'Iukcy;': '\u0406', - 'iukcy;': '\u0456', - 'Iuml': '\xcf', - 'iuml': '\xef', - 'Iuml;': '\xcf', - 'iuml;': '\xef', - 'Jcirc;': '\u0134', - 'jcirc;': '\u0135', - 'Jcy;': '\u0419', - 'jcy;': '\u0439', - 'Jfr;': '\U0001d50d', - 'jfr;': '\U0001d527', - 'jmath;': '\u0237', - 'Jopf;': '\U0001d541', - 'jopf;': '\U0001d55b', - 'Jscr;': '\U0001d4a5', - 'jscr;': '\U0001d4bf', - 'Jsercy;': '\u0408', - 'jsercy;': '\u0458', - 'Jukcy;': '\u0404', - 'jukcy;': '\u0454', - 'Kappa;': '\u039a', - 'kappa;': '\u03ba', - 'kappav;': '\u03f0', - 'Kcedil;': '\u0136', - 'kcedil;': '\u0137', - 'Kcy;': '\u041a', - 'kcy;': '\u043a', - 'Kfr;': '\U0001d50e', - 'kfr;': '\U0001d528', - 'kgreen;': '\u0138', - 'KHcy;': '\u0425', - 'khcy;': '\u0445', - 'KJcy;': '\u040c', - 'kjcy;': '\u045c', - 'Kopf;': '\U0001d542', - 'kopf;': '\U0001d55c', - 'Kscr;': '\U0001d4a6', - 'kscr;': '\U0001d4c0', - 'lAarr;': '\u21da', - 'Lacute;': '\u0139', - 'lacute;': '\u013a', - 'laemptyv;': '\u29b4', - 'lagran;': '\u2112', - 'Lambda;': '\u039b', - 'lambda;': '\u03bb', - 'Lang;': '\u27ea', - 'lang;': '\u27e8', - 'langd;': '\u2991', - 'langle;': '\u27e8', - 'lap;': '\u2a85', - 'Laplacetrf;': '\u2112', - 'laquo': '\xab', - 'laquo;': '\xab', - 'Larr;': '\u219e', - 'lArr;': '\u21d0', - 'larr;': '\u2190', - 'larrb;': '\u21e4', - 'larrbfs;': '\u291f', - 'larrfs;': '\u291d', - 'larrhk;': '\u21a9', - 'larrlp;': '\u21ab', - 'larrpl;': '\u2939', - 'larrsim;': '\u2973', - 'larrtl;': '\u21a2', - 'lat;': '\u2aab', - 'lAtail;': '\u291b', - 'latail;': '\u2919', - 'late;': '\u2aad', - 'lates;': '\u2aad\ufe00', - 'lBarr;': '\u290e', - 'lbarr;': '\u290c', - 'lbbrk;': '\u2772', - 'lbrace;': '{', - 'lbrack;': '[', - 'lbrke;': '\u298b', - 'lbrksld;': '\u298f', - 'lbrkslu;': '\u298d', - 'Lcaron;': '\u013d', - 'lcaron;': '\u013e', - 'Lcedil;': '\u013b', - 'lcedil;': '\u013c', - 'lceil;': '\u2308', - 'lcub;': '{', - 'Lcy;': '\u041b', - 'lcy;': '\u043b', - 'ldca;': '\u2936', - 'ldquo;': '\u201c', - 'ldquor;': '\u201e', - 'ldrdhar;': '\u2967', - 'ldrushar;': '\u294b', - 'ldsh;': '\u21b2', - 'lE;': '\u2266', - 'le;': '\u2264', - 'LeftAngleBracket;': '\u27e8', - 'LeftArrow;': '\u2190', - 'Leftarrow;': '\u21d0', - 'leftarrow;': '\u2190', - 'LeftArrowBar;': '\u21e4', - 
'LeftArrowRightArrow;': '\u21c6', - 'leftarrowtail;': '\u21a2', - 'LeftCeiling;': '\u2308', - 'LeftDoubleBracket;': '\u27e6', - 'LeftDownTeeVector;': '\u2961', - 'LeftDownVector;': '\u21c3', - 'LeftDownVectorBar;': '\u2959', - 'LeftFloor;': '\u230a', - 'leftharpoondown;': '\u21bd', - 'leftharpoonup;': '\u21bc', - 'leftleftarrows;': '\u21c7', - 'LeftRightArrow;': '\u2194', - 'Leftrightarrow;': '\u21d4', - 'leftrightarrow;': '\u2194', - 'leftrightarrows;': '\u21c6', - 'leftrightharpoons;': '\u21cb', - 'leftrightsquigarrow;': '\u21ad', - 'LeftRightVector;': '\u294e', - 'LeftTee;': '\u22a3', - 'LeftTeeArrow;': '\u21a4', - 'LeftTeeVector;': '\u295a', - 'leftthreetimes;': '\u22cb', - 'LeftTriangle;': '\u22b2', - 'LeftTriangleBar;': '\u29cf', - 'LeftTriangleEqual;': '\u22b4', - 'LeftUpDownVector;': '\u2951', - 'LeftUpTeeVector;': '\u2960', - 'LeftUpVector;': '\u21bf', - 'LeftUpVectorBar;': '\u2958', - 'LeftVector;': '\u21bc', - 'LeftVectorBar;': '\u2952', - 'lEg;': '\u2a8b', - 'leg;': '\u22da', - 'leq;': '\u2264', - 'leqq;': '\u2266', - 'leqslant;': '\u2a7d', - 'les;': '\u2a7d', - 'lescc;': '\u2aa8', - 'lesdot;': '\u2a7f', - 'lesdoto;': '\u2a81', - 'lesdotor;': '\u2a83', - 'lesg;': '\u22da\ufe00', - 'lesges;': '\u2a93', - 'lessapprox;': '\u2a85', - 'lessdot;': '\u22d6', - 'lesseqgtr;': '\u22da', - 'lesseqqgtr;': '\u2a8b', - 'LessEqualGreater;': '\u22da', - 'LessFullEqual;': '\u2266', - 'LessGreater;': '\u2276', - 'lessgtr;': '\u2276', - 'LessLess;': '\u2aa1', - 'lesssim;': '\u2272', - 'LessSlantEqual;': '\u2a7d', - 'LessTilde;': '\u2272', - 'lfisht;': '\u297c', - 'lfloor;': '\u230a', - 'Lfr;': '\U0001d50f', - 'lfr;': '\U0001d529', - 'lg;': '\u2276', - 'lgE;': '\u2a91', - 'lHar;': '\u2962', - 'lhard;': '\u21bd', - 'lharu;': '\u21bc', - 'lharul;': '\u296a', - 'lhblk;': '\u2584', - 'LJcy;': '\u0409', - 'ljcy;': '\u0459', - 'Ll;': '\u22d8', - 'll;': '\u226a', - 'llarr;': '\u21c7', - 'llcorner;': '\u231e', - 'Lleftarrow;': '\u21da', - 'llhard;': '\u296b', - 'lltri;': '\u25fa', - 'Lmidot;': '\u013f', - 'lmidot;': '\u0140', - 'lmoust;': '\u23b0', - 'lmoustache;': '\u23b0', - 'lnap;': '\u2a89', - 'lnapprox;': '\u2a89', - 'lnE;': '\u2268', - 'lne;': '\u2a87', - 'lneq;': '\u2a87', - 'lneqq;': '\u2268', - 'lnsim;': '\u22e6', - 'loang;': '\u27ec', - 'loarr;': '\u21fd', - 'lobrk;': '\u27e6', - 'LongLeftArrow;': '\u27f5', - 'Longleftarrow;': '\u27f8', - 'longleftarrow;': '\u27f5', - 'LongLeftRightArrow;': '\u27f7', - 'Longleftrightarrow;': '\u27fa', - 'longleftrightarrow;': '\u27f7', - 'longmapsto;': '\u27fc', - 'LongRightArrow;': '\u27f6', - 'Longrightarrow;': '\u27f9', - 'longrightarrow;': '\u27f6', - 'looparrowleft;': '\u21ab', - 'looparrowright;': '\u21ac', - 'lopar;': '\u2985', - 'Lopf;': '\U0001d543', - 'lopf;': '\U0001d55d', - 'loplus;': '\u2a2d', - 'lotimes;': '\u2a34', - 'lowast;': '\u2217', - 'lowbar;': '_', - 'LowerLeftArrow;': '\u2199', - 'LowerRightArrow;': '\u2198', - 'loz;': '\u25ca', - 'lozenge;': '\u25ca', - 'lozf;': '\u29eb', - 'lpar;': '(', - 'lparlt;': '\u2993', - 'lrarr;': '\u21c6', - 'lrcorner;': '\u231f', - 'lrhar;': '\u21cb', - 'lrhard;': '\u296d', - 'lrm;': '\u200e', - 'lrtri;': '\u22bf', - 'lsaquo;': '\u2039', - 'Lscr;': '\u2112', - 'lscr;': '\U0001d4c1', - 'Lsh;': '\u21b0', - 'lsh;': '\u21b0', - 'lsim;': '\u2272', - 'lsime;': '\u2a8d', - 'lsimg;': '\u2a8f', - 'lsqb;': '[', - 'lsquo;': '\u2018', - 'lsquor;': '\u201a', - 'Lstrok;': '\u0141', - 'lstrok;': '\u0142', - 'LT': '<', - 'lt': '<', - 'LT;': '<', - 'Lt;': '\u226a', - 'lt;': '<', - 'ltcc;': '\u2aa6', - 'ltcir;': '\u2a79', - 
'ltdot;': '\u22d6', - 'lthree;': '\u22cb', - 'ltimes;': '\u22c9', - 'ltlarr;': '\u2976', - 'ltquest;': '\u2a7b', - 'ltri;': '\u25c3', - 'ltrie;': '\u22b4', - 'ltrif;': '\u25c2', - 'ltrPar;': '\u2996', - 'lurdshar;': '\u294a', - 'luruhar;': '\u2966', - 'lvertneqq;': '\u2268\ufe00', - 'lvnE;': '\u2268\ufe00', - 'macr': '\xaf', - 'macr;': '\xaf', - 'male;': '\u2642', - 'malt;': '\u2720', - 'maltese;': '\u2720', - 'Map;': '\u2905', - 'map;': '\u21a6', - 'mapsto;': '\u21a6', - 'mapstodown;': '\u21a7', - 'mapstoleft;': '\u21a4', - 'mapstoup;': '\u21a5', - 'marker;': '\u25ae', - 'mcomma;': '\u2a29', - 'Mcy;': '\u041c', - 'mcy;': '\u043c', - 'mdash;': '\u2014', - 'mDDot;': '\u223a', - 'measuredangle;': '\u2221', - 'MediumSpace;': '\u205f', - 'Mellintrf;': '\u2133', - 'Mfr;': '\U0001d510', - 'mfr;': '\U0001d52a', - 'mho;': '\u2127', - 'micro': '\xb5', - 'micro;': '\xb5', - 'mid;': '\u2223', - 'midast;': '*', - 'midcir;': '\u2af0', - 'middot': '\xb7', - 'middot;': '\xb7', - 'minus;': '\u2212', - 'minusb;': '\u229f', - 'minusd;': '\u2238', - 'minusdu;': '\u2a2a', - 'MinusPlus;': '\u2213', - 'mlcp;': '\u2adb', - 'mldr;': '\u2026', - 'mnplus;': '\u2213', - 'models;': '\u22a7', - 'Mopf;': '\U0001d544', - 'mopf;': '\U0001d55e', - 'mp;': '\u2213', - 'Mscr;': '\u2133', - 'mscr;': '\U0001d4c2', - 'mstpos;': '\u223e', - 'Mu;': '\u039c', - 'mu;': '\u03bc', - 'multimap;': '\u22b8', - 'mumap;': '\u22b8', - 'nabla;': '\u2207', - 'Nacute;': '\u0143', - 'nacute;': '\u0144', - 'nang;': '\u2220\u20d2', - 'nap;': '\u2249', - 'napE;': '\u2a70\u0338', - 'napid;': '\u224b\u0338', - 'napos;': '\u0149', - 'napprox;': '\u2249', - 'natur;': '\u266e', - 'natural;': '\u266e', - 'naturals;': '\u2115', - 'nbsp': '\xa0', - 'nbsp;': '\xa0', - 'nbump;': '\u224e\u0338', - 'nbumpe;': '\u224f\u0338', - 'ncap;': '\u2a43', - 'Ncaron;': '\u0147', - 'ncaron;': '\u0148', - 'Ncedil;': '\u0145', - 'ncedil;': '\u0146', - 'ncong;': '\u2247', - 'ncongdot;': '\u2a6d\u0338', - 'ncup;': '\u2a42', - 'Ncy;': '\u041d', - 'ncy;': '\u043d', - 'ndash;': '\u2013', - 'ne;': '\u2260', - 'nearhk;': '\u2924', - 'neArr;': '\u21d7', - 'nearr;': '\u2197', - 'nearrow;': '\u2197', - 'nedot;': '\u2250\u0338', - 'NegativeMediumSpace;': '\u200b', - 'NegativeThickSpace;': '\u200b', - 'NegativeThinSpace;': '\u200b', - 'NegativeVeryThinSpace;': '\u200b', - 'nequiv;': '\u2262', - 'nesear;': '\u2928', - 'nesim;': '\u2242\u0338', - 'NestedGreaterGreater;': '\u226b', - 'NestedLessLess;': '\u226a', - 'NewLine;': '\n', - 'nexist;': '\u2204', - 'nexists;': '\u2204', - 'Nfr;': '\U0001d511', - 'nfr;': '\U0001d52b', - 'ngE;': '\u2267\u0338', - 'nge;': '\u2271', - 'ngeq;': '\u2271', - 'ngeqq;': '\u2267\u0338', - 'ngeqslant;': '\u2a7e\u0338', - 'nges;': '\u2a7e\u0338', - 'nGg;': '\u22d9\u0338', - 'ngsim;': '\u2275', - 'nGt;': '\u226b\u20d2', - 'ngt;': '\u226f', - 'ngtr;': '\u226f', - 'nGtv;': '\u226b\u0338', - 'nhArr;': '\u21ce', - 'nharr;': '\u21ae', - 'nhpar;': '\u2af2', - 'ni;': '\u220b', - 'nis;': '\u22fc', - 'nisd;': '\u22fa', - 'niv;': '\u220b', - 'NJcy;': '\u040a', - 'njcy;': '\u045a', - 'nlArr;': '\u21cd', - 'nlarr;': '\u219a', - 'nldr;': '\u2025', - 'nlE;': '\u2266\u0338', - 'nle;': '\u2270', - 'nLeftarrow;': '\u21cd', - 'nleftarrow;': '\u219a', - 'nLeftrightarrow;': '\u21ce', - 'nleftrightarrow;': '\u21ae', - 'nleq;': '\u2270', - 'nleqq;': '\u2266\u0338', - 'nleqslant;': '\u2a7d\u0338', - 'nles;': '\u2a7d\u0338', - 'nless;': '\u226e', - 'nLl;': '\u22d8\u0338', - 'nlsim;': '\u2274', - 'nLt;': '\u226a\u20d2', - 'nlt;': '\u226e', - 'nltri;': '\u22ea', - 'nltrie;': 
'\u22ec', - 'nLtv;': '\u226a\u0338', - 'nmid;': '\u2224', - 'NoBreak;': '\u2060', - 'NonBreakingSpace;': '\xa0', - 'Nopf;': '\u2115', - 'nopf;': '\U0001d55f', - 'not': '\xac', - 'Not;': '\u2aec', - 'not;': '\xac', - 'NotCongruent;': '\u2262', - 'NotCupCap;': '\u226d', - 'NotDoubleVerticalBar;': '\u2226', - 'NotElement;': '\u2209', - 'NotEqual;': '\u2260', - 'NotEqualTilde;': '\u2242\u0338', - 'NotExists;': '\u2204', - 'NotGreater;': '\u226f', - 'NotGreaterEqual;': '\u2271', - 'NotGreaterFullEqual;': '\u2267\u0338', - 'NotGreaterGreater;': '\u226b\u0338', - 'NotGreaterLess;': '\u2279', - 'NotGreaterSlantEqual;': '\u2a7e\u0338', - 'NotGreaterTilde;': '\u2275', - 'NotHumpDownHump;': '\u224e\u0338', - 'NotHumpEqual;': '\u224f\u0338', - 'notin;': '\u2209', - 'notindot;': '\u22f5\u0338', - 'notinE;': '\u22f9\u0338', - 'notinva;': '\u2209', - 'notinvb;': '\u22f7', - 'notinvc;': '\u22f6', - 'NotLeftTriangle;': '\u22ea', - 'NotLeftTriangleBar;': '\u29cf\u0338', - 'NotLeftTriangleEqual;': '\u22ec', - 'NotLess;': '\u226e', - 'NotLessEqual;': '\u2270', - 'NotLessGreater;': '\u2278', - 'NotLessLess;': '\u226a\u0338', - 'NotLessSlantEqual;': '\u2a7d\u0338', - 'NotLessTilde;': '\u2274', - 'NotNestedGreaterGreater;': '\u2aa2\u0338', - 'NotNestedLessLess;': '\u2aa1\u0338', - 'notni;': '\u220c', - 'notniva;': '\u220c', - 'notnivb;': '\u22fe', - 'notnivc;': '\u22fd', - 'NotPrecedes;': '\u2280', - 'NotPrecedesEqual;': '\u2aaf\u0338', - 'NotPrecedesSlantEqual;': '\u22e0', - 'NotReverseElement;': '\u220c', - 'NotRightTriangle;': '\u22eb', - 'NotRightTriangleBar;': '\u29d0\u0338', - 'NotRightTriangleEqual;': '\u22ed', - 'NotSquareSubset;': '\u228f\u0338', - 'NotSquareSubsetEqual;': '\u22e2', - 'NotSquareSuperset;': '\u2290\u0338', - 'NotSquareSupersetEqual;': '\u22e3', - 'NotSubset;': '\u2282\u20d2', - 'NotSubsetEqual;': '\u2288', - 'NotSucceeds;': '\u2281', - 'NotSucceedsEqual;': '\u2ab0\u0338', - 'NotSucceedsSlantEqual;': '\u22e1', - 'NotSucceedsTilde;': '\u227f\u0338', - 'NotSuperset;': '\u2283\u20d2', - 'NotSupersetEqual;': '\u2289', - 'NotTilde;': '\u2241', - 'NotTildeEqual;': '\u2244', - 'NotTildeFullEqual;': '\u2247', - 'NotTildeTilde;': '\u2249', - 'NotVerticalBar;': '\u2224', - 'npar;': '\u2226', - 'nparallel;': '\u2226', - 'nparsl;': '\u2afd\u20e5', - 'npart;': '\u2202\u0338', - 'npolint;': '\u2a14', - 'npr;': '\u2280', - 'nprcue;': '\u22e0', - 'npre;': '\u2aaf\u0338', - 'nprec;': '\u2280', - 'npreceq;': '\u2aaf\u0338', - 'nrArr;': '\u21cf', - 'nrarr;': '\u219b', - 'nrarrc;': '\u2933\u0338', - 'nrarrw;': '\u219d\u0338', - 'nRightarrow;': '\u21cf', - 'nrightarrow;': '\u219b', - 'nrtri;': '\u22eb', - 'nrtrie;': '\u22ed', - 'nsc;': '\u2281', - 'nsccue;': '\u22e1', - 'nsce;': '\u2ab0\u0338', - 'Nscr;': '\U0001d4a9', - 'nscr;': '\U0001d4c3', - 'nshortmid;': '\u2224', - 'nshortparallel;': '\u2226', - 'nsim;': '\u2241', - 'nsime;': '\u2244', - 'nsimeq;': '\u2244', - 'nsmid;': '\u2224', - 'nspar;': '\u2226', - 'nsqsube;': '\u22e2', - 'nsqsupe;': '\u22e3', - 'nsub;': '\u2284', - 'nsubE;': '\u2ac5\u0338', - 'nsube;': '\u2288', - 'nsubset;': '\u2282\u20d2', - 'nsubseteq;': '\u2288', - 'nsubseteqq;': '\u2ac5\u0338', - 'nsucc;': '\u2281', - 'nsucceq;': '\u2ab0\u0338', - 'nsup;': '\u2285', - 'nsupE;': '\u2ac6\u0338', - 'nsupe;': '\u2289', - 'nsupset;': '\u2283\u20d2', - 'nsupseteq;': '\u2289', - 'nsupseteqq;': '\u2ac6\u0338', - 'ntgl;': '\u2279', - 'Ntilde': '\xd1', - 'ntilde': '\xf1', - 'Ntilde;': '\xd1', - 'ntilde;': '\xf1', - 'ntlg;': '\u2278', - 'ntriangleleft;': '\u22ea', - 'ntrianglelefteq;': '\u22ec', - 
'ntriangleright;': '\u22eb', - 'ntrianglerighteq;': '\u22ed', - 'Nu;': '\u039d', - 'nu;': '\u03bd', - 'num;': '#', - 'numero;': '\u2116', - 'numsp;': '\u2007', - 'nvap;': '\u224d\u20d2', - 'nVDash;': '\u22af', - 'nVdash;': '\u22ae', - 'nvDash;': '\u22ad', - 'nvdash;': '\u22ac', - 'nvge;': '\u2265\u20d2', - 'nvgt;': '>\u20d2', - 'nvHarr;': '\u2904', - 'nvinfin;': '\u29de', - 'nvlArr;': '\u2902', - 'nvle;': '\u2264\u20d2', - 'nvlt;': '<\u20d2', - 'nvltrie;': '\u22b4\u20d2', - 'nvrArr;': '\u2903', - 'nvrtrie;': '\u22b5\u20d2', - 'nvsim;': '\u223c\u20d2', - 'nwarhk;': '\u2923', - 'nwArr;': '\u21d6', - 'nwarr;': '\u2196', - 'nwarrow;': '\u2196', - 'nwnear;': '\u2927', - 'Oacute': '\xd3', - 'oacute': '\xf3', - 'Oacute;': '\xd3', - 'oacute;': '\xf3', - 'oast;': '\u229b', - 'ocir;': '\u229a', - 'Ocirc': '\xd4', - 'ocirc': '\xf4', - 'Ocirc;': '\xd4', - 'ocirc;': '\xf4', - 'Ocy;': '\u041e', - 'ocy;': '\u043e', - 'odash;': '\u229d', - 'Odblac;': '\u0150', - 'odblac;': '\u0151', - 'odiv;': '\u2a38', - 'odot;': '\u2299', - 'odsold;': '\u29bc', - 'OElig;': '\u0152', - 'oelig;': '\u0153', - 'ofcir;': '\u29bf', - 'Ofr;': '\U0001d512', - 'ofr;': '\U0001d52c', - 'ogon;': '\u02db', - 'Ograve': '\xd2', - 'ograve': '\xf2', - 'Ograve;': '\xd2', - 'ograve;': '\xf2', - 'ogt;': '\u29c1', - 'ohbar;': '\u29b5', - 'ohm;': '\u03a9', - 'oint;': '\u222e', - 'olarr;': '\u21ba', - 'olcir;': '\u29be', - 'olcross;': '\u29bb', - 'oline;': '\u203e', - 'olt;': '\u29c0', - 'Omacr;': '\u014c', - 'omacr;': '\u014d', - 'Omega;': '\u03a9', - 'omega;': '\u03c9', - 'Omicron;': '\u039f', - 'omicron;': '\u03bf', - 'omid;': '\u29b6', - 'ominus;': '\u2296', - 'Oopf;': '\U0001d546', - 'oopf;': '\U0001d560', - 'opar;': '\u29b7', - 'OpenCurlyDoubleQuote;': '\u201c', - 'OpenCurlyQuote;': '\u2018', - 'operp;': '\u29b9', - 'oplus;': '\u2295', - 'Or;': '\u2a54', - 'or;': '\u2228', - 'orarr;': '\u21bb', - 'ord;': '\u2a5d', - 'order;': '\u2134', - 'orderof;': '\u2134', - 'ordf': '\xaa', - 'ordf;': '\xaa', - 'ordm': '\xba', - 'ordm;': '\xba', - 'origof;': '\u22b6', - 'oror;': '\u2a56', - 'orslope;': '\u2a57', - 'orv;': '\u2a5b', - 'oS;': '\u24c8', - 'Oscr;': '\U0001d4aa', - 'oscr;': '\u2134', - 'Oslash': '\xd8', - 'oslash': '\xf8', - 'Oslash;': '\xd8', - 'oslash;': '\xf8', - 'osol;': '\u2298', - 'Otilde': '\xd5', - 'otilde': '\xf5', - 'Otilde;': '\xd5', - 'otilde;': '\xf5', - 'Otimes;': '\u2a37', - 'otimes;': '\u2297', - 'otimesas;': '\u2a36', - 'Ouml': '\xd6', - 'ouml': '\xf6', - 'Ouml;': '\xd6', - 'ouml;': '\xf6', - 'ovbar;': '\u233d', - 'OverBar;': '\u203e', - 'OverBrace;': '\u23de', - 'OverBracket;': '\u23b4', - 'OverParenthesis;': '\u23dc', - 'par;': '\u2225', - 'para': '\xb6', - 'para;': '\xb6', - 'parallel;': '\u2225', - 'parsim;': '\u2af3', - 'parsl;': '\u2afd', - 'part;': '\u2202', - 'PartialD;': '\u2202', - 'Pcy;': '\u041f', - 'pcy;': '\u043f', - 'percnt;': '%', - 'period;': '.', - 'permil;': '\u2030', - 'perp;': '\u22a5', - 'pertenk;': '\u2031', - 'Pfr;': '\U0001d513', - 'pfr;': '\U0001d52d', - 'Phi;': '\u03a6', - 'phi;': '\u03c6', - 'phiv;': '\u03d5', - 'phmmat;': '\u2133', - 'phone;': '\u260e', - 'Pi;': '\u03a0', - 'pi;': '\u03c0', - 'pitchfork;': '\u22d4', - 'piv;': '\u03d6', - 'planck;': '\u210f', - 'planckh;': '\u210e', - 'plankv;': '\u210f', - 'plus;': '+', - 'plusacir;': '\u2a23', - 'plusb;': '\u229e', - 'pluscir;': '\u2a22', - 'plusdo;': '\u2214', - 'plusdu;': '\u2a25', - 'pluse;': '\u2a72', - 'PlusMinus;': '\xb1', - 'plusmn': '\xb1', - 'plusmn;': '\xb1', - 'plussim;': '\u2a26', - 'plustwo;': '\u2a27', - 'pm;': '\xb1', - 
'Poincareplane;': '\u210c', - 'pointint;': '\u2a15', - 'Popf;': '\u2119', - 'popf;': '\U0001d561', - 'pound': '\xa3', - 'pound;': '\xa3', - 'Pr;': '\u2abb', - 'pr;': '\u227a', - 'prap;': '\u2ab7', - 'prcue;': '\u227c', - 'prE;': '\u2ab3', - 'pre;': '\u2aaf', - 'prec;': '\u227a', - 'precapprox;': '\u2ab7', - 'preccurlyeq;': '\u227c', - 'Precedes;': '\u227a', - 'PrecedesEqual;': '\u2aaf', - 'PrecedesSlantEqual;': '\u227c', - 'PrecedesTilde;': '\u227e', - 'preceq;': '\u2aaf', - 'precnapprox;': '\u2ab9', - 'precneqq;': '\u2ab5', - 'precnsim;': '\u22e8', - 'precsim;': '\u227e', - 'Prime;': '\u2033', - 'prime;': '\u2032', - 'primes;': '\u2119', - 'prnap;': '\u2ab9', - 'prnE;': '\u2ab5', - 'prnsim;': '\u22e8', - 'prod;': '\u220f', - 'Product;': '\u220f', - 'profalar;': '\u232e', - 'profline;': '\u2312', - 'profsurf;': '\u2313', - 'prop;': '\u221d', - 'Proportion;': '\u2237', - 'Proportional;': '\u221d', - 'propto;': '\u221d', - 'prsim;': '\u227e', - 'prurel;': '\u22b0', - 'Pscr;': '\U0001d4ab', - 'pscr;': '\U0001d4c5', - 'Psi;': '\u03a8', - 'psi;': '\u03c8', - 'puncsp;': '\u2008', - 'Qfr;': '\U0001d514', - 'qfr;': '\U0001d52e', - 'qint;': '\u2a0c', - 'Qopf;': '\u211a', - 'qopf;': '\U0001d562', - 'qprime;': '\u2057', - 'Qscr;': '\U0001d4ac', - 'qscr;': '\U0001d4c6', - 'quaternions;': '\u210d', - 'quatint;': '\u2a16', - 'quest;': '?', - 'questeq;': '\u225f', - 'QUOT': '"', - 'quot': '"', - 'QUOT;': '"', - 'quot;': '"', - 'rAarr;': '\u21db', - 'race;': '\u223d\u0331', - 'Racute;': '\u0154', - 'racute;': '\u0155', - 'radic;': '\u221a', - 'raemptyv;': '\u29b3', - 'Rang;': '\u27eb', - 'rang;': '\u27e9', - 'rangd;': '\u2992', - 'range;': '\u29a5', - 'rangle;': '\u27e9', - 'raquo': '\xbb', - 'raquo;': '\xbb', - 'Rarr;': '\u21a0', - 'rArr;': '\u21d2', - 'rarr;': '\u2192', - 'rarrap;': '\u2975', - 'rarrb;': '\u21e5', - 'rarrbfs;': '\u2920', - 'rarrc;': '\u2933', - 'rarrfs;': '\u291e', - 'rarrhk;': '\u21aa', - 'rarrlp;': '\u21ac', - 'rarrpl;': '\u2945', - 'rarrsim;': '\u2974', - 'Rarrtl;': '\u2916', - 'rarrtl;': '\u21a3', - 'rarrw;': '\u219d', - 'rAtail;': '\u291c', - 'ratail;': '\u291a', - 'ratio;': '\u2236', - 'rationals;': '\u211a', - 'RBarr;': '\u2910', - 'rBarr;': '\u290f', - 'rbarr;': '\u290d', - 'rbbrk;': '\u2773', - 'rbrace;': '}', - 'rbrack;': ']', - 'rbrke;': '\u298c', - 'rbrksld;': '\u298e', - 'rbrkslu;': '\u2990', - 'Rcaron;': '\u0158', - 'rcaron;': '\u0159', - 'Rcedil;': '\u0156', - 'rcedil;': '\u0157', - 'rceil;': '\u2309', - 'rcub;': '}', - 'Rcy;': '\u0420', - 'rcy;': '\u0440', - 'rdca;': '\u2937', - 'rdldhar;': '\u2969', - 'rdquo;': '\u201d', - 'rdquor;': '\u201d', - 'rdsh;': '\u21b3', - 'Re;': '\u211c', - 'real;': '\u211c', - 'realine;': '\u211b', - 'realpart;': '\u211c', - 'reals;': '\u211d', - 'rect;': '\u25ad', - 'REG': '\xae', - 'reg': '\xae', - 'REG;': '\xae', - 'reg;': '\xae', - 'ReverseElement;': '\u220b', - 'ReverseEquilibrium;': '\u21cb', - 'ReverseUpEquilibrium;': '\u296f', - 'rfisht;': '\u297d', - 'rfloor;': '\u230b', - 'Rfr;': '\u211c', - 'rfr;': '\U0001d52f', - 'rHar;': '\u2964', - 'rhard;': '\u21c1', - 'rharu;': '\u21c0', - 'rharul;': '\u296c', - 'Rho;': '\u03a1', - 'rho;': '\u03c1', - 'rhov;': '\u03f1', - 'RightAngleBracket;': '\u27e9', - 'RightArrow;': '\u2192', - 'Rightarrow;': '\u21d2', - 'rightarrow;': '\u2192', - 'RightArrowBar;': '\u21e5', - 'RightArrowLeftArrow;': '\u21c4', - 'rightarrowtail;': '\u21a3', - 'RightCeiling;': '\u2309', - 'RightDoubleBracket;': '\u27e7', - 'RightDownTeeVector;': '\u295d', - 'RightDownVector;': '\u21c2', - 'RightDownVectorBar;': '\u2955', - 
'RightFloor;': '\u230b', - 'rightharpoondown;': '\u21c1', - 'rightharpoonup;': '\u21c0', - 'rightleftarrows;': '\u21c4', - 'rightleftharpoons;': '\u21cc', - 'rightrightarrows;': '\u21c9', - 'rightsquigarrow;': '\u219d', - 'RightTee;': '\u22a2', - 'RightTeeArrow;': '\u21a6', - 'RightTeeVector;': '\u295b', - 'rightthreetimes;': '\u22cc', - 'RightTriangle;': '\u22b3', - 'RightTriangleBar;': '\u29d0', - 'RightTriangleEqual;': '\u22b5', - 'RightUpDownVector;': '\u294f', - 'RightUpTeeVector;': '\u295c', - 'RightUpVector;': '\u21be', - 'RightUpVectorBar;': '\u2954', - 'RightVector;': '\u21c0', - 'RightVectorBar;': '\u2953', - 'ring;': '\u02da', - 'risingdotseq;': '\u2253', - 'rlarr;': '\u21c4', - 'rlhar;': '\u21cc', - 'rlm;': '\u200f', - 'rmoust;': '\u23b1', - 'rmoustache;': '\u23b1', - 'rnmid;': '\u2aee', - 'roang;': '\u27ed', - 'roarr;': '\u21fe', - 'robrk;': '\u27e7', - 'ropar;': '\u2986', - 'Ropf;': '\u211d', - 'ropf;': '\U0001d563', - 'roplus;': '\u2a2e', - 'rotimes;': '\u2a35', - 'RoundImplies;': '\u2970', - 'rpar;': ')', - 'rpargt;': '\u2994', - 'rppolint;': '\u2a12', - 'rrarr;': '\u21c9', - 'Rrightarrow;': '\u21db', - 'rsaquo;': '\u203a', - 'Rscr;': '\u211b', - 'rscr;': '\U0001d4c7', - 'Rsh;': '\u21b1', - 'rsh;': '\u21b1', - 'rsqb;': ']', - 'rsquo;': '\u2019', - 'rsquor;': '\u2019', - 'rthree;': '\u22cc', - 'rtimes;': '\u22ca', - 'rtri;': '\u25b9', - 'rtrie;': '\u22b5', - 'rtrif;': '\u25b8', - 'rtriltri;': '\u29ce', - 'RuleDelayed;': '\u29f4', - 'ruluhar;': '\u2968', - 'rx;': '\u211e', - 'Sacute;': '\u015a', - 'sacute;': '\u015b', - 'sbquo;': '\u201a', - 'Sc;': '\u2abc', - 'sc;': '\u227b', - 'scap;': '\u2ab8', - 'Scaron;': '\u0160', - 'scaron;': '\u0161', - 'sccue;': '\u227d', - 'scE;': '\u2ab4', - 'sce;': '\u2ab0', - 'Scedil;': '\u015e', - 'scedil;': '\u015f', - 'Scirc;': '\u015c', - 'scirc;': '\u015d', - 'scnap;': '\u2aba', - 'scnE;': '\u2ab6', - 'scnsim;': '\u22e9', - 'scpolint;': '\u2a13', - 'scsim;': '\u227f', - 'Scy;': '\u0421', - 'scy;': '\u0441', - 'sdot;': '\u22c5', - 'sdotb;': '\u22a1', - 'sdote;': '\u2a66', - 'searhk;': '\u2925', - 'seArr;': '\u21d8', - 'searr;': '\u2198', - 'searrow;': '\u2198', - 'sect': '\xa7', - 'sect;': '\xa7', - 'semi;': ';', - 'seswar;': '\u2929', - 'setminus;': '\u2216', - 'setmn;': '\u2216', - 'sext;': '\u2736', - 'Sfr;': '\U0001d516', - 'sfr;': '\U0001d530', - 'sfrown;': '\u2322', - 'sharp;': '\u266f', - 'SHCHcy;': '\u0429', - 'shchcy;': '\u0449', - 'SHcy;': '\u0428', - 'shcy;': '\u0448', - 'ShortDownArrow;': '\u2193', - 'ShortLeftArrow;': '\u2190', - 'shortmid;': '\u2223', - 'shortparallel;': '\u2225', - 'ShortRightArrow;': '\u2192', - 'ShortUpArrow;': '\u2191', - 'shy': '\xad', - 'shy;': '\xad', - 'Sigma;': '\u03a3', - 'sigma;': '\u03c3', - 'sigmaf;': '\u03c2', - 'sigmav;': '\u03c2', - 'sim;': '\u223c', - 'simdot;': '\u2a6a', - 'sime;': '\u2243', - 'simeq;': '\u2243', - 'simg;': '\u2a9e', - 'simgE;': '\u2aa0', - 'siml;': '\u2a9d', - 'simlE;': '\u2a9f', - 'simne;': '\u2246', - 'simplus;': '\u2a24', - 'simrarr;': '\u2972', - 'slarr;': '\u2190', - 'SmallCircle;': '\u2218', - 'smallsetminus;': '\u2216', - 'smashp;': '\u2a33', - 'smeparsl;': '\u29e4', - 'smid;': '\u2223', - 'smile;': '\u2323', - 'smt;': '\u2aaa', - 'smte;': '\u2aac', - 'smtes;': '\u2aac\ufe00', - 'SOFTcy;': '\u042c', - 'softcy;': '\u044c', - 'sol;': '/', - 'solb;': '\u29c4', - 'solbar;': '\u233f', - 'Sopf;': '\U0001d54a', - 'sopf;': '\U0001d564', - 'spades;': '\u2660', - 'spadesuit;': '\u2660', - 'spar;': '\u2225', - 'sqcap;': '\u2293', - 'sqcaps;': '\u2293\ufe00', - 'sqcup;': '\u2294', 
- 'sqcups;': '\u2294\ufe00', - 'Sqrt;': '\u221a', - 'sqsub;': '\u228f', - 'sqsube;': '\u2291', - 'sqsubset;': '\u228f', - 'sqsubseteq;': '\u2291', - 'sqsup;': '\u2290', - 'sqsupe;': '\u2292', - 'sqsupset;': '\u2290', - 'sqsupseteq;': '\u2292', - 'squ;': '\u25a1', - 'Square;': '\u25a1', - 'square;': '\u25a1', - 'SquareIntersection;': '\u2293', - 'SquareSubset;': '\u228f', - 'SquareSubsetEqual;': '\u2291', - 'SquareSuperset;': '\u2290', - 'SquareSupersetEqual;': '\u2292', - 'SquareUnion;': '\u2294', - 'squarf;': '\u25aa', - 'squf;': '\u25aa', - 'srarr;': '\u2192', - 'Sscr;': '\U0001d4ae', - 'sscr;': '\U0001d4c8', - 'ssetmn;': '\u2216', - 'ssmile;': '\u2323', - 'sstarf;': '\u22c6', - 'Star;': '\u22c6', - 'star;': '\u2606', - 'starf;': '\u2605', - 'straightepsilon;': '\u03f5', - 'straightphi;': '\u03d5', - 'strns;': '\xaf', - 'Sub;': '\u22d0', - 'sub;': '\u2282', - 'subdot;': '\u2abd', - 'subE;': '\u2ac5', - 'sube;': '\u2286', - 'subedot;': '\u2ac3', - 'submult;': '\u2ac1', - 'subnE;': '\u2acb', - 'subne;': '\u228a', - 'subplus;': '\u2abf', - 'subrarr;': '\u2979', - 'Subset;': '\u22d0', - 'subset;': '\u2282', - 'subseteq;': '\u2286', - 'subseteqq;': '\u2ac5', - 'SubsetEqual;': '\u2286', - 'subsetneq;': '\u228a', - 'subsetneqq;': '\u2acb', - 'subsim;': '\u2ac7', - 'subsub;': '\u2ad5', - 'subsup;': '\u2ad3', - 'succ;': '\u227b', - 'succapprox;': '\u2ab8', - 'succcurlyeq;': '\u227d', - 'Succeeds;': '\u227b', - 'SucceedsEqual;': '\u2ab0', - 'SucceedsSlantEqual;': '\u227d', - 'SucceedsTilde;': '\u227f', - 'succeq;': '\u2ab0', - 'succnapprox;': '\u2aba', - 'succneqq;': '\u2ab6', - 'succnsim;': '\u22e9', - 'succsim;': '\u227f', - 'SuchThat;': '\u220b', - 'Sum;': '\u2211', - 'sum;': '\u2211', - 'sung;': '\u266a', - 'sup1': '\xb9', - 'sup1;': '\xb9', - 'sup2': '\xb2', - 'sup2;': '\xb2', - 'sup3': '\xb3', - 'sup3;': '\xb3', - 'Sup;': '\u22d1', - 'sup;': '\u2283', - 'supdot;': '\u2abe', - 'supdsub;': '\u2ad8', - 'supE;': '\u2ac6', - 'supe;': '\u2287', - 'supedot;': '\u2ac4', - 'Superset;': '\u2283', - 'SupersetEqual;': '\u2287', - 'suphsol;': '\u27c9', - 'suphsub;': '\u2ad7', - 'suplarr;': '\u297b', - 'supmult;': '\u2ac2', - 'supnE;': '\u2acc', - 'supne;': '\u228b', - 'supplus;': '\u2ac0', - 'Supset;': '\u22d1', - 'supset;': '\u2283', - 'supseteq;': '\u2287', - 'supseteqq;': '\u2ac6', - 'supsetneq;': '\u228b', - 'supsetneqq;': '\u2acc', - 'supsim;': '\u2ac8', - 'supsub;': '\u2ad4', - 'supsup;': '\u2ad6', - 'swarhk;': '\u2926', - 'swArr;': '\u21d9', - 'swarr;': '\u2199', - 'swarrow;': '\u2199', - 'swnwar;': '\u292a', - 'szlig': '\xdf', - 'szlig;': '\xdf', - 'Tab;': '\t', - 'target;': '\u2316', - 'Tau;': '\u03a4', - 'tau;': '\u03c4', - 'tbrk;': '\u23b4', - 'Tcaron;': '\u0164', - 'tcaron;': '\u0165', - 'Tcedil;': '\u0162', - 'tcedil;': '\u0163', - 'Tcy;': '\u0422', - 'tcy;': '\u0442', - 'tdot;': '\u20db', - 'telrec;': '\u2315', - 'Tfr;': '\U0001d517', - 'tfr;': '\U0001d531', - 'there4;': '\u2234', - 'Therefore;': '\u2234', - 'therefore;': '\u2234', - 'Theta;': '\u0398', - 'theta;': '\u03b8', - 'thetasym;': '\u03d1', - 'thetav;': '\u03d1', - 'thickapprox;': '\u2248', - 'thicksim;': '\u223c', - 'ThickSpace;': '\u205f\u200a', - 'thinsp;': '\u2009', - 'ThinSpace;': '\u2009', - 'thkap;': '\u2248', - 'thksim;': '\u223c', - 'THORN': '\xde', - 'thorn': '\xfe', - 'THORN;': '\xde', - 'thorn;': '\xfe', - 'Tilde;': '\u223c', - 'tilde;': '\u02dc', - 'TildeEqual;': '\u2243', - 'TildeFullEqual;': '\u2245', - 'TildeTilde;': '\u2248', - 'times': '\xd7', - 'times;': '\xd7', - 'timesb;': '\u22a0', - 'timesbar;': '\u2a31', - 
'timesd;': '\u2a30', - 'tint;': '\u222d', - 'toea;': '\u2928', - 'top;': '\u22a4', - 'topbot;': '\u2336', - 'topcir;': '\u2af1', - 'Topf;': '\U0001d54b', - 'topf;': '\U0001d565', - 'topfork;': '\u2ada', - 'tosa;': '\u2929', - 'tprime;': '\u2034', - 'TRADE;': '\u2122', - 'trade;': '\u2122', - 'triangle;': '\u25b5', - 'triangledown;': '\u25bf', - 'triangleleft;': '\u25c3', - 'trianglelefteq;': '\u22b4', - 'triangleq;': '\u225c', - 'triangleright;': '\u25b9', - 'trianglerighteq;': '\u22b5', - 'tridot;': '\u25ec', - 'trie;': '\u225c', - 'triminus;': '\u2a3a', - 'TripleDot;': '\u20db', - 'triplus;': '\u2a39', - 'trisb;': '\u29cd', - 'tritime;': '\u2a3b', - 'trpezium;': '\u23e2', - 'Tscr;': '\U0001d4af', - 'tscr;': '\U0001d4c9', - 'TScy;': '\u0426', - 'tscy;': '\u0446', - 'TSHcy;': '\u040b', - 'tshcy;': '\u045b', - 'Tstrok;': '\u0166', - 'tstrok;': '\u0167', - 'twixt;': '\u226c', - 'twoheadleftarrow;': '\u219e', - 'twoheadrightarrow;': '\u21a0', - 'Uacute': '\xda', - 'uacute': '\xfa', - 'Uacute;': '\xda', - 'uacute;': '\xfa', - 'Uarr;': '\u219f', - 'uArr;': '\u21d1', - 'uarr;': '\u2191', - 'Uarrocir;': '\u2949', - 'Ubrcy;': '\u040e', - 'ubrcy;': '\u045e', - 'Ubreve;': '\u016c', - 'ubreve;': '\u016d', - 'Ucirc': '\xdb', - 'ucirc': '\xfb', - 'Ucirc;': '\xdb', - 'ucirc;': '\xfb', - 'Ucy;': '\u0423', - 'ucy;': '\u0443', - 'udarr;': '\u21c5', - 'Udblac;': '\u0170', - 'udblac;': '\u0171', - 'udhar;': '\u296e', - 'ufisht;': '\u297e', - 'Ufr;': '\U0001d518', - 'ufr;': '\U0001d532', - 'Ugrave': '\xd9', - 'ugrave': '\xf9', - 'Ugrave;': '\xd9', - 'ugrave;': '\xf9', - 'uHar;': '\u2963', - 'uharl;': '\u21bf', - 'uharr;': '\u21be', - 'uhblk;': '\u2580', - 'ulcorn;': '\u231c', - 'ulcorner;': '\u231c', - 'ulcrop;': '\u230f', - 'ultri;': '\u25f8', - 'Umacr;': '\u016a', - 'umacr;': '\u016b', - 'uml': '\xa8', - 'uml;': '\xa8', - 'UnderBar;': '_', - 'UnderBrace;': '\u23df', - 'UnderBracket;': '\u23b5', - 'UnderParenthesis;': '\u23dd', - 'Union;': '\u22c3', - 'UnionPlus;': '\u228e', - 'Uogon;': '\u0172', - 'uogon;': '\u0173', - 'Uopf;': '\U0001d54c', - 'uopf;': '\U0001d566', - 'UpArrow;': '\u2191', - 'Uparrow;': '\u21d1', - 'uparrow;': '\u2191', - 'UpArrowBar;': '\u2912', - 'UpArrowDownArrow;': '\u21c5', - 'UpDownArrow;': '\u2195', - 'Updownarrow;': '\u21d5', - 'updownarrow;': '\u2195', - 'UpEquilibrium;': '\u296e', - 'upharpoonleft;': '\u21bf', - 'upharpoonright;': '\u21be', - 'uplus;': '\u228e', - 'UpperLeftArrow;': '\u2196', - 'UpperRightArrow;': '\u2197', - 'Upsi;': '\u03d2', - 'upsi;': '\u03c5', - 'upsih;': '\u03d2', - 'Upsilon;': '\u03a5', - 'upsilon;': '\u03c5', - 'UpTee;': '\u22a5', - 'UpTeeArrow;': '\u21a5', - 'upuparrows;': '\u21c8', - 'urcorn;': '\u231d', - 'urcorner;': '\u231d', - 'urcrop;': '\u230e', - 'Uring;': '\u016e', - 'uring;': '\u016f', - 'urtri;': '\u25f9', - 'Uscr;': '\U0001d4b0', - 'uscr;': '\U0001d4ca', - 'utdot;': '\u22f0', - 'Utilde;': '\u0168', - 'utilde;': '\u0169', - 'utri;': '\u25b5', - 'utrif;': '\u25b4', - 'uuarr;': '\u21c8', - 'Uuml': '\xdc', - 'uuml': '\xfc', - 'Uuml;': '\xdc', - 'uuml;': '\xfc', - 'uwangle;': '\u29a7', - 'vangrt;': '\u299c', - 'varepsilon;': '\u03f5', - 'varkappa;': '\u03f0', - 'varnothing;': '\u2205', - 'varphi;': '\u03d5', - 'varpi;': '\u03d6', - 'varpropto;': '\u221d', - 'vArr;': '\u21d5', - 'varr;': '\u2195', - 'varrho;': '\u03f1', - 'varsigma;': '\u03c2', - 'varsubsetneq;': '\u228a\ufe00', - 'varsubsetneqq;': '\u2acb\ufe00', - 'varsupsetneq;': '\u228b\ufe00', - 'varsupsetneqq;': '\u2acc\ufe00', - 'vartheta;': '\u03d1', - 'vartriangleleft;': '\u22b2', - 
'vartriangleright;': '\u22b3', - 'Vbar;': '\u2aeb', - 'vBar;': '\u2ae8', - 'vBarv;': '\u2ae9', - 'Vcy;': '\u0412', - 'vcy;': '\u0432', - 'VDash;': '\u22ab', - 'Vdash;': '\u22a9', - 'vDash;': '\u22a8', - 'vdash;': '\u22a2', - 'Vdashl;': '\u2ae6', - 'Vee;': '\u22c1', - 'vee;': '\u2228', - 'veebar;': '\u22bb', - 'veeeq;': '\u225a', - 'vellip;': '\u22ee', - 'Verbar;': '\u2016', - 'verbar;': '|', - 'Vert;': '\u2016', - 'vert;': '|', - 'VerticalBar;': '\u2223', - 'VerticalLine;': '|', - 'VerticalSeparator;': '\u2758', - 'VerticalTilde;': '\u2240', - 'VeryThinSpace;': '\u200a', - 'Vfr;': '\U0001d519', - 'vfr;': '\U0001d533', - 'vltri;': '\u22b2', - 'vnsub;': '\u2282\u20d2', - 'vnsup;': '\u2283\u20d2', - 'Vopf;': '\U0001d54d', - 'vopf;': '\U0001d567', - 'vprop;': '\u221d', - 'vrtri;': '\u22b3', - 'Vscr;': '\U0001d4b1', - 'vscr;': '\U0001d4cb', - 'vsubnE;': '\u2acb\ufe00', - 'vsubne;': '\u228a\ufe00', - 'vsupnE;': '\u2acc\ufe00', - 'vsupne;': '\u228b\ufe00', - 'Vvdash;': '\u22aa', - 'vzigzag;': '\u299a', - 'Wcirc;': '\u0174', - 'wcirc;': '\u0175', - 'wedbar;': '\u2a5f', - 'Wedge;': '\u22c0', - 'wedge;': '\u2227', - 'wedgeq;': '\u2259', - 'weierp;': '\u2118', - 'Wfr;': '\U0001d51a', - 'wfr;': '\U0001d534', - 'Wopf;': '\U0001d54e', - 'wopf;': '\U0001d568', - 'wp;': '\u2118', - 'wr;': '\u2240', - 'wreath;': '\u2240', - 'Wscr;': '\U0001d4b2', - 'wscr;': '\U0001d4cc', - 'xcap;': '\u22c2', - 'xcirc;': '\u25ef', - 'xcup;': '\u22c3', - 'xdtri;': '\u25bd', - 'Xfr;': '\U0001d51b', - 'xfr;': '\U0001d535', - 'xhArr;': '\u27fa', - 'xharr;': '\u27f7', - 'Xi;': '\u039e', - 'xi;': '\u03be', - 'xlArr;': '\u27f8', - 'xlarr;': '\u27f5', - 'xmap;': '\u27fc', - 'xnis;': '\u22fb', - 'xodot;': '\u2a00', - 'Xopf;': '\U0001d54f', - 'xopf;': '\U0001d569', - 'xoplus;': '\u2a01', - 'xotime;': '\u2a02', - 'xrArr;': '\u27f9', - 'xrarr;': '\u27f6', - 'Xscr;': '\U0001d4b3', - 'xscr;': '\U0001d4cd', - 'xsqcup;': '\u2a06', - 'xuplus;': '\u2a04', - 'xutri;': '\u25b3', - 'xvee;': '\u22c1', - 'xwedge;': '\u22c0', - 'Yacute': '\xdd', - 'yacute': '\xfd', - 'Yacute;': '\xdd', - 'yacute;': '\xfd', - 'YAcy;': '\u042f', - 'yacy;': '\u044f', - 'Ycirc;': '\u0176', - 'ycirc;': '\u0177', - 'Ycy;': '\u042b', - 'ycy;': '\u044b', - 'yen': '\xa5', - 'yen;': '\xa5', - 'Yfr;': '\U0001d51c', - 'yfr;': '\U0001d536', - 'YIcy;': '\u0407', - 'yicy;': '\u0457', - 'Yopf;': '\U0001d550', - 'yopf;': '\U0001d56a', - 'Yscr;': '\U0001d4b4', - 'yscr;': '\U0001d4ce', - 'YUcy;': '\u042e', - 'yucy;': '\u044e', - 'yuml': '\xff', - 'Yuml;': '\u0178', - 'yuml;': '\xff', - 'Zacute;': '\u0179', - 'zacute;': '\u017a', - 'Zcaron;': '\u017d', - 'zcaron;': '\u017e', - 'Zcy;': '\u0417', - 'zcy;': '\u0437', - 'Zdot;': '\u017b', - 'zdot;': '\u017c', - 'zeetrf;': '\u2128', - 'ZeroWidthSpace;': '\u200b', - 'Zeta;': '\u0396', - 'zeta;': '\u03b6', - 'Zfr;': '\u2128', - 'zfr;': '\U0001d537', - 'ZHcy;': '\u0416', - 'zhcy;': '\u0436', - 'zigrarr;': '\u21dd', - 'Zopf;': '\u2124', - 'zopf;': '\U0001d56b', - 'Zscr;': '\U0001d4b5', - 'zscr;': '\U0001d4cf', - 'zwj;': '\u200d', - 'zwnj;': '\u200c', - } - -try: - import http.client as compat_http_client -except ImportError: # Python 2 - import httplib as compat_http_client - -try: - from urllib.error import HTTPError as compat_HTTPError -except ImportError: # Python 2 - from urllib2 import HTTPError as compat_HTTPError - -try: - from urllib.request import urlretrieve as compat_urlretrieve -except ImportError: # Python 2 - from urllib import urlretrieve as compat_urlretrieve - -try: - from html.parser import HTMLParser as compat_HTMLParser 
-except ImportError: # Python 2 - from HTMLParser import HTMLParser as compat_HTMLParser - -try: # Python 2 - from HTMLParser import HTMLParseError as compat_HTMLParseError -except ImportError: # Python <3.4 - try: - from html.parser import HTMLParseError as compat_HTMLParseError - except ImportError: # Python >3.4 - - # HTMLParseError has been deprecated in Python 3.3 and removed in - # Python 3.5. Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exception handling - class compat_HTMLParseError(Exception): - pass - -try: - from subprocess import DEVNULL - compat_subprocess_get_DEVNULL = lambda: DEVNULL -except ImportError: - compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') - -try: - import http.server as compat_http_server -except ImportError: - import BaseHTTPServer as compat_http_server - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - -try: - from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes - from urllib.parse import unquote as compat_urllib_parse_unquote - from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus -except ImportError: # Python 2 - _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') - else re.compile(r'([\x00-\x7f]+)')) - - # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus - # implementations from cpython 3.4.3's stdlib. Python 2's version - # is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244) - - def compat_urllib_parse_unquote_to_bytes(string): - """unquote_to_bytes('abc%20def') -> b'abc def'.""" - # Note: strings are encoded as UTF-8. This is only an issue if it contains - # unescaped non-ASCII characters, which URIs should not. - if not string: - # Is it a string-like object? - string.split - return b'' - if isinstance(string, compat_str): - string = string.encode('utf-8') - bits = string.split(b'%') - if len(bits) == 1: - return string - res = [bits[0]] - append = res.append - for item in bits[1:]: - try: - append(compat_urllib_parse._hextochr[item[:2]]) - append(item[2:]) - except KeyError: - append(b'%') - append(item) - return b''.join(res) - - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - """Replace %xx escapes by their single-character equivalent. The optional - encoding and errors parameters specify how to decode percent-encoded - sequences into Unicode characters, as accepted by the bytes.decode() - method. - By default, percent-encoded sequences are decoded with UTF-8, and invalid - sequences are replaced by a placeholder character. - - unquote('abc%20def') -> 'abc def'. - """ - if '%' not in string: - string.split - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - bits = _asciire.split(string) - res = [bits[0]] - append = res.append - for i in range(1, len(bits), 2): - append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) - append(bits[i + 1]) - return ''.join(res) - - def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'): - """Like unquote(), but also replace plus signs by spaces, as required for - unquoting HTML form values. 
- - unquote_plus('%7e/abc+def') -> '~/abc def' - """ - string = string.replace('+', ' ') - return compat_urllib_parse_unquote(string, encoding, errors) - -try: - from urllib.parse import urlencode as compat_urllib_parse_urlencode -except ImportError: # Python 2 - # Python 2 will choke in urlencode on mixture of byte and unicode strings. - # Possible solutions are to either port it from python 3 with all - # the friends or manually ensure input query contains only byte strings. - # We will stick with latter thus recursively encoding the whole query. - def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): - def encode_elem(e): - if isinstance(e, dict): - e = encode_dict(e) - elif isinstance(e, (list, tuple,)): - list_e = encode_list(e) - e = tuple(list_e) if isinstance(e, tuple) else list_e - elif isinstance(e, compat_str): - e = e.encode(encoding) - return e - - def encode_dict(d): - return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) - - def encode_list(l): - return [encode_elem(e) for e in l] - - return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) - -try: - from urllib.request import DataHandler as compat_urllib_request_DataHandler -except ImportError: # Python < 3.4 - # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py - class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): - def data_open(self, req): - # data URLs as specified in RFC 2397. - # - # ignores POSTed data - # - # syntax: - # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data - # mediatype := [ type "/" subtype ] *( ";" parameter ) - # data := *urlchar - # parameter := attribute "=" value - url = req.get_full_url() - - scheme, data = url.split(':', 1) - mediatype, data = data.split(',', 1) - - # even base64 encoded data URLs might be quoted so unquote in any case: - data = compat_urllib_parse_unquote_to_bytes(data) - if mediatype.endswith(';base64'): - data = binascii.a2b_base64(data) - mediatype = mediatype[:-7] - - if not mediatype: - mediatype = 'text/plain;charset=US-ASCII' - - headers = email.message_from_string( - 'Content-type: %s\nContent-length: %d\n' % (mediatype, len(data))) - - return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) - -try: - compat_basestring = basestring # Python 2 -except NameError: - compat_basestring = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - -try: - from xml.etree.ElementTree import ParseError as compat_xml_parse_error -except ImportError: # Python 2.6 - from xml.parsers.expat import ExpatError as compat_xml_parse_error - - -etree = xml.etree.ElementTree - - -class _TreeBuilder(etree.TreeBuilder): - def doctype(self, name, pubid, system): - pass - - -try: - # xml.etree.ElementTree.Element is a method in Python <=2.6 and - # the following will crash with: - # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types - isinstance(None, xml.etree.ElementTree.Element) - from xml.etree.ElementTree import Element as compat_etree_Element -except TypeError: # Python <=2.6 - from xml.etree.ElementTree import _ElementInterface as compat_etree_Element - -if sys.version_info[0] >= 3: - def compat_etree_fromstring(text): - return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) -else: - # python 2.x tries to encode unicode strings with ascii (see the - # XMLParser._fixtext method) - try: - _etree_iter = etree.Element.iter - except AttributeError: # Python <=2.6 - def _etree_iter(root): - for el in root.findall('*'): - 
yield el - for sub in _etree_iter(el): - yield sub - - # on 2.6 XML doesn't have a parser argument, function copied from CPython - # 2.7 source - def _XML(text, parser=None): - if not parser: - parser = etree.XMLParser(target=_TreeBuilder()) - parser.feed(text) - return parser.close() - - def _element_factory(*args, **kwargs): - el = etree.Element(*args, **kwargs) - for k, v in el.items(): - if isinstance(v, bytes): - el.set(k, v.decode('utf-8')) - return el - - def compat_etree_fromstring(text): - doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory))) - for el in _etree_iter(doc): - if el.text is not None and isinstance(el.text, bytes): - el.text = el.text.decode('utf-8') - return doc - -if hasattr(etree, 'register_namespace'): - compat_etree_register_namespace = etree.register_namespace -else: - def compat_etree_register_namespace(prefix, uri): - """Register a namespace prefix. - The registry is global, and any existing mapping for either the - given prefix or the namespace URI will be removed. - *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and - attributes in this namespace will be serialized with prefix if possible. - ValueError is raised if prefix is reserved or is invalid. - """ - if re.match(r"ns\d+$", prefix): - raise ValueError("Prefix format reserved for internal use") - for k, v in list(etree._namespace_map.items()): - if k == uri or v == prefix: - del etree._namespace_map[k] - etree._namespace_map[uri] = prefix - -if sys.version_info < (2, 7): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - def compat_xpath(xpath): - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - return xpath -else: - compat_xpath = lambda xpath: xpath - -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
- # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, compat_str - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError('bad query field: %r' % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - - -compat_os_name = os._name if os.name == 'java' else os.name - - -if compat_os_name == 'nt': - def compat_shlex_quote(s): - return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') -else: - try: - from shlex import quote as compat_shlex_quote - except ImportError: # Python < 3.3 - def compat_shlex_quote(s): - if re.match(r'^[-_\w./]+$', s): - return s - else: - return "'" + s.replace("'", "'\"'\"'") + "'" - - -try: - args = shlex.split('中文') - assert (isinstance(args, list) - and isinstance(args[0], compat_str) - and args[0] == '中文') - compat_shlex_split = shlex.split -except (AssertionError, UnicodeEncodeError): - # Working around shlex issue with unicode strings on some python 2 - # versions (see http://bugs.python.org/issue1548891) - def compat_shlex_split(s, comments=False, posix=True): - if isinstance(s, compat_str): - s = s.encode('utf-8') - return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) - - -def compat_ord(c): - if type(c) is int: - return c - else: - return ord(c) - - -if sys.version_info >= (3, 0): - compat_getenv = os.getenv - compat_expanduser = os.path.expanduser - - def compat_setenv(key, value, env=os.environ): - env[key] = value -else: - # Environment variables should be decoded with filesystem encoding. - # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) - - def compat_getenv(key, default=None): - from .utils import get_filesystem_encoding - env = os.getenv(key, default) - if env: - env = env.decode(get_filesystem_encoding()) - return env - - def compat_setenv(key, value, env=os.environ): - def encode(v): - from .utils import get_filesystem_encoding - return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v - env[encode(key)] = encode(value) - - # HACK: The default implementations of os.path.expanduser from cpython do not decode - # environment variables with filesystem encoding. We will work around this by - # providing adjusted implementations. - # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib - # for different platforms with correct environment variables decoding. 
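# A quick illustration of why the expanduser overrides below must go through
# compat_getenv (hypothetical value; assumes a UTF-8 filesystem encoding on
# Python 2, where os.environ holds raw byte strings):
#
#   os.environ['HOME'] = b'/home/caf\xc3\xa9'    # bytes as stored by the OS
#   os.getenv('HOME')     -> '/home/caf\xc3\xa9' # still undecoded bytes
#   compat_getenv('HOME') -> u'/home/caf\xe9'    # decoded text ('café')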
- - if compat_os_name == 'posix': - def compat_expanduser(path): - """Expand ~ and ~user constructions. If user or $HOME is unknown, - do nothing.""" - if not path.startswith('~'): - return path - i = path.find('/', 1) - if i < 0: - i = len(path) - if i == 1: - if 'HOME' not in os.environ: - import pwd - userhome = pwd.getpwuid(os.getuid()).pw_dir - else: - userhome = compat_getenv('HOME') - else: - import pwd - try: - pwent = pwd.getpwnam(path[1:i]) - except KeyError: - return path - userhome = pwent.pw_dir - userhome = userhome.rstrip('/') - return (userhome + path[i:]) or '/' - elif compat_os_name in ('nt', 'ce'): - def compat_expanduser(path): - """Expand ~ and ~user constructs. - - If user or $HOME is unknown, do nothing.""" - if path[:1] != '~': - return path - i, n = 1, len(path) - while i < n and path[i] not in '/\\': - i = i + 1 - - if 'HOME' in os.environ: - userhome = compat_getenv('HOME') - elif 'USERPROFILE' in os.environ: - userhome = compat_getenv('USERPROFILE') - elif 'HOMEPATH' not in os.environ: - return path - else: - try: - drive = compat_getenv('HOMEDRIVE') - except KeyError: - drive = '' - userhome = os.path.join(drive, compat_getenv('HOMEPATH')) - - if i != 1: # ~user - userhome = os.path.join(os.path.dirname(userhome), path[1:i]) - - return userhome + path[i:] - else: - compat_expanduser = os.path.expanduser - - -if compat_os_name == 'nt' and sys.version_info < (3, 8): - # os.path.realpath on Windows does not follow symbolic links - # prior to Python 3.8 (see https://bugs.python.org/issue9949) - def compat_realpath(path): - while os.path.islink(path): - path = os.path.abspath(os.readlink(path)) - return path -else: - compat_realpath = os.path.realpath - - -if sys.version_info < (3, 0): - def compat_print(s): - from .utils import preferredencoding - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) -else: - def compat_print(s): - assert isinstance(s, compat_str) - print(s) - - -if sys.version_info < (3, 0) and sys.platform == 'win32': - def compat_getpass(prompt, *args, **kwargs): - if isinstance(prompt, compat_str): - from .utils import preferredencoding - prompt = prompt.encode(preferredencoding()) - return getpass.getpass(prompt, *args, **kwargs) -else: - compat_getpass = getpass.getpass - -try: - compat_input = raw_input -except NameError: # Python 3 - compat_input = input - -# Python < 2.6.5 require kwargs to be bytes -try: - def _testfunc(x): - pass - _testfunc(**{'x': 0}) -except TypeError: - def compat_kwargs(kwargs): - return dict((bytes(k), v) for k, v in kwargs.items()) -else: - compat_kwargs = lambda kwargs: kwargs - - -try: - compat_numeric_types = (int, float, long, complex) -except NameError: # Python 3 - compat_numeric_types = (int, float, complex) - - -try: - compat_integer_types = (int, long) -except NameError: # Python 3 - compat_integer_types = (int, ) - - -if sys.version_info < (2, 7): - def compat_socket_create_connection(address, timeout, source_address=None): - host, port = address - err = None - for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM): - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - sock.settimeout(timeout) - if source_address: - sock.bind(source_address) - sock.connect(sa) - return sock - except socket.error as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise socket.error('getaddrinfo returns an empty list') -else: - compat_socket_create_connection = socket.create_connection - - -# Fix 
https://github.com/ytdl-org/youtube-dl/issues/4223 -# See http://bugs.python.org/issue9161 for what is broken -def workaround_optparse_bug9161(): - op = optparse.OptionParser() - og = optparse.OptionGroup(op, 'foo') - try: - og.add_option('-t') - except TypeError: - real_add_option = optparse.OptionGroup.add_option - - def _compat_add_option(self, *args, **kwargs): - enc = lambda v: ( - v.encode('ascii', 'replace') if isinstance(v, compat_str) - else v) - bargs = [enc(a) for a in args] - bkwargs = dict( - (k, enc(v)) for k, v in kwargs.items()) - return real_add_option(self, *bargs, **bkwargs) - optparse.OptionGroup.add_option = _compat_add_option - - -if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 - compat_get_terminal_size = shutil.get_terminal_size -else: - _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) - - def compat_get_terminal_size(fallback=(80, 24)): - columns = compat_getenv('COLUMNS') - if columns: - columns = int(columns) - else: - columns = None - lines = compat_getenv('LINES') - if lines: - lines = int(lines) - else: - lines = None - - if columns is None or lines is None or columns <= 0 or lines <= 0: - try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - _lines, _columns = map(int, out.split()) - except Exception: - _columns, _lines = _terminal_size(*fallback) - - if columns is None or columns <= 0: - columns = _columns - if lines is None or lines <= 0: - lines = _lines - return _terminal_size(columns, lines) - -try: - itertools.count(start=0, step=1) - compat_itertools_count = itertools.count -except TypeError: # Python 2.6 - def compat_itertools_count(start=0, step=1): - n = start - while True: - yield n - n += step - -if sys.version_info >= (3, 0): - from tokenize import tokenize as compat_tokenize_tokenize -else: - from tokenize import generate_tokens as compat_tokenize_tokenize - - -try: - struct.pack('!I', 0) -except TypeError: - # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument - # See https://bugs.python.org/issue19099 - def compat_struct_pack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.pack(spec, *args) - - def compat_struct_unpack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.unpack(spec, *args) - - class compat_Struct(struct.Struct): - def __init__(self, fmt): - if isinstance(fmt, compat_str): - fmt = fmt.encode('ascii') - super(compat_Struct, self).__init__(fmt) -else: - compat_struct_pack = struct.pack - compat_struct_unpack = struct.unpack - if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8): - class compat_Struct(struct.Struct): - def unpack(self, string): - if not isinstance(string, buffer): # noqa: F821 - string = buffer(string) # noqa: F821 - return super(compat_Struct, self).unpack(string) - else: - compat_Struct = struct.Struct - - -try: - from future_builtins import zip as compat_zip -except ImportError: # not 2.6+ or is 3.x - try: - from itertools import izip as compat_zip # < 2.5 or 3.x - except ImportError: - compat_zip = zip - - -if sys.version_info < (3, 3): - def compat_b64decode(s, *args, **kwargs): - if isinstance(s, compat_str): - s = s.encode('ascii') - return base64.b64decode(s, *args, **kwargs) -else: - compat_b64decode = base64.b64decode - - -if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): - # PyPy2 prior to version 5.4.0 expects byte 
strings as Windows function - # names, see the original PyPy issue [1] and the youtube-dl one [2]. - # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name - # 2. https://github.com/ytdl-org/youtube-dl/pull/4392 - def compat_ctypes_WINFUNCTYPE(*args, **kwargs): - real = ctypes.WINFUNCTYPE(*args, **kwargs) - - def resf(tpl, *args, **kwargs): - funcname, dll = tpl - return real((str(funcname), dll), *args, **kwargs) - - return resf -else: - def compat_ctypes_WINFUNCTYPE(*args, **kwargs): - return ctypes.WINFUNCTYPE(*args, **kwargs) - - -__all__ = [ - 'compat_HTMLParseError', - 'compat_HTMLParser', - 'compat_HTTPError', - 'compat_Struct', - 'compat_b64decode', - 'compat_basestring', - 'compat_chr', - 'compat_cookiejar', - 'compat_cookiejar_Cookie', - 'compat_cookies', - 'compat_cookies_SimpleCookie', - 'compat_ctypes_WINFUNCTYPE', - 'compat_etree_Element', - 'compat_etree_fromstring', - 'compat_etree_register_namespace', - 'compat_expanduser', - 'compat_get_terminal_size', - 'compat_getenv', - 'compat_getpass', - 'compat_html_entities', - 'compat_html_entities_html5', - 'compat_http_client', - 'compat_http_server', - 'compat_input', - 'compat_integer_types', - 'compat_itertools_count', - 'compat_kwargs', - 'compat_numeric_types', - 'compat_ord', - 'compat_os_name', - 'compat_parse_qs', - 'compat_print', - 'compat_realpath', - 'compat_setenv', - 'compat_shlex_quote', - 'compat_shlex_split', - 'compat_socket_create_connection', - 'compat_str', - 'compat_struct_pack', - 'compat_struct_unpack', - 'compat_subprocess_get_DEVNULL', - 'compat_tokenize_tokenize', - 'compat_urllib_error', - 'compat_urllib_parse', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', - 'compat_urllib_request', - 'compat_urllib_request_DataHandler', - 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_xml_parse_error', - 'compat_xpath', - 'compat_zip', - 'workaround_optparse_bug9161', -] diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py deleted file mode 100644 index 2e485df9d..000000000 --- a/youtube_dl/downloader/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import unicode_literals - -from .common import FileDownloader -from .f4m import F4mFD -from .hls import HlsFD -from .http import HttpFD -from .rtmp import RtmpFD -from .dash import DashSegmentsFD -from .rtsp import RtspFD -from .ism import IsmFD -from .external import ( - get_external_downloader, - FFmpegFD, -) - -from ..utils import ( - determine_protocol, -) - -PROTOCOL_MAP = { - 'rtmp': RtmpFD, - 'm3u8_native': HlsFD, - 'm3u8': FFmpegFD, - 'mms': RtspFD, - 'rtsp': RtspFD, - 'f4m': F4mFD, - 'http_dash_segments': DashSegmentsFD, - 'ism': IsmFD, -} - - -def get_suitable_downloader(info_dict, params={}): - """Get the downloader class that can handle the info dict.""" - protocol = determine_protocol(info_dict) - info_dict['protocol'] = protocol - - # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): - # return FFmpegFD - - external_downloader = params.get('external_downloader') - if external_downloader is not None: - ed = get_external_downloader(external_downloader) - if ed.can_download(info_dict): - return ed - - if protocol.startswith('m3u8') and info_dict.get('is_live'): - return FFmpegFD - - if protocol == 'm3u8' and 
params.get('hls_prefer_native') is True: - return HlsFD - - if protocol == 'm3u8_native' and params.get('hls_prefer_native') is False: - return FFmpegFD - - return PROTOCOL_MAP.get(protocol, HttpFD) - - -__all__ = [ - 'get_suitable_downloader', - 'FileDownloader', -] diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py deleted file mode 100644 index 1cdba89cd..000000000 --- a/youtube_dl/downloader/common.py +++ /dev/null @@ -1,391 +0,0 @@ -from __future__ import division, unicode_literals - -import os -import re -import sys -import time -import random - -from ..compat import compat_os_name -from ..utils import ( - decodeArgument, - encodeFilename, - error_to_compat_str, - format_bytes, - shell_quote, - timeconvert, - ) - - -class FileDownloader(object): - """File Downloader class. - - File downloader objects are the ones responsible for downloading the - actual video file and writing it to disk. - - File downloaders accept a lot of parameters. In order not to saturate - the object constructor with arguments, it receives a dictionary of - options instead. - - Available options: - - verbose: Print additional info to stdout. - quiet: Do not print messages to stdout. - ratelimit: Download speed limit, in bytes/sec. - retries: Number of times to retry for HTTP error 5xx - buffersize: Size of download buffer in bytes. - noresizebuffer: Do not automatically resize the download buffer. - continuedl: Try to continue downloads if possible. - noprogress: Do not print the progress bar. - logtostderr: Log messages to stderr instead of stdout. - consoletitle: Display progress in console window's titlebar. - nopart: Do not use temporary .part files. - updatetime: Use the Last-modified header to set output file timestamps. - test: Download only the first bytes to test the downloader. - min_filesize: Skip files smaller than this size - max_filesize: Skip files larger than this size - xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - external_downloader_args: A list of additional command-line arguments for the - external downloader. - hls_use_mpegts: Use the mpegts container for HLS videos. - http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be - useful for bypassing bandwidth throttling imposed by - a webserver (experimental) - - Subclasses of this one must re-define the real_download method.
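    A minimal subclass sketch (illustrative only; a real downloader would
    also write the file and report progress through the helpers defined
    below):

        class NullFD(FileDownloader):
            def real_download(self, filename, info_dict):
                self.report_destination(filename)
                return True  # True on success, False otherwise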
- """ - - _TEST_FILE_SIZE = 10241 - params = None - - def __init__(self, ydl, params): - """Create a FileDownloader object with the given options.""" - self.ydl = ydl - self._progress_hooks = [] - self.params = params - self.add_progress_hook(self.report_progress) - - @staticmethod - def format_seconds(seconds): - (mins, secs) = divmod(seconds, 60) - (hours, mins) = divmod(mins, 60) - if hours > 99: - return '--:--:--' - if hours == 0: - return '%02d:%02d' % (mins, secs) - else: - return '%02d:%02d:%02d' % (hours, mins, secs) - - @staticmethod - def calc_percent(byte_counter, data_len): - if data_len is None: - return None - return float(byte_counter) / float(data_len) * 100.0 - - @staticmethod - def format_percent(percent): - if percent is None: - return '---.-%' - return '%6s' % ('%3.1f%%' % percent) - - @staticmethod - def calc_eta(start, now, total, current): - if total is None: - return None - if now is None: - now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) - - @staticmethod - def format_eta(eta): - if eta is None: - return '--:--' - return FileDownloader.format_seconds(eta) - - @staticmethod - def calc_speed(start, now, bytes): - dif = now - start - if bytes == 0 or dif < 0.001: # One millisecond - return None - return float(bytes) / dif - - @staticmethod - def format_speed(speed): - if speed is None: - return '%10s' % '---b/s' - return '%10s' % ('%s/s' % format_bytes(speed)) - - @staticmethod - def format_retries(retries): - return 'inf' if retries == float('inf') else '%.0f' % retries - - @staticmethod - def best_block_size(elapsed_time, bytes): - new_min = max(bytes / 2.0, 1.0) - new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB - if elapsed_time < 0.001: - return int(new_max) - rate = bytes / elapsed_time - if rate > new_max: - return int(new_max) - if rate < new_min: - return int(new_min) - return int(rate) - - @staticmethod - def parse_bytes(bytestr): - """Parse a string indicating a byte quantity into an integer.""" - matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) - if matchobj is None: - return None - number = float(matchobj.group(1)) - multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) - return int(round(number * multiplier)) - - def to_screen(self, *args, **kargs): - self.ydl.to_screen(*args, **kargs) - - def to_stderr(self, message): - self.ydl.to_screen(message) - - def to_console_title(self, message): - self.ydl.to_console_title(message) - - def trouble(self, *args, **kargs): - self.ydl.trouble(*args, **kargs) - - def report_warning(self, *args, **kargs): - self.ydl.report_warning(*args, **kargs) - - def report_error(self, *args, **kargs): - self.ydl.report_error(*args, **kargs) - - def slow_down(self, start_time, now, byte_counter): - """Sleep if the download speed is over the rate limit.""" - rate_limit = self.params.get('ratelimit') - if rate_limit is None or byte_counter == 0: - return - if now is None: - now = time.time() - elapsed = now - start_time - if elapsed <= 0.0: - return - speed = float(byte_counter) / elapsed - if speed > rate_limit: - sleep_time = float(byte_counter) / rate_limit - elapsed - if sleep_time > 0: - time.sleep(sleep_time) - - def temp_name(self, filename): - """Returns a temporary filename for the given filename.""" - if self.params.get('nopart', False) or filename == '-' or \ - (os.path.exists(encodeFilename(filename)) and not 
os.path.isfile(encodeFilename(filename))): - return filename - return filename + '.part' - - def undo_temp_name(self, filename): - if filename.endswith('.part'): - return filename[:-len('.part')] - return filename - - def ytdl_filename(self, filename): - return filename + '.ytdl' - - def try_rename(self, old_filename, new_filename): - try: - if old_filename == new_filename: - return - os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) - except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % error_to_compat_str(err)) - - def try_utime(self, filename, last_modified_hdr): - """Try to set the last-modified time of the given file.""" - if last_modified_hdr is None: - return - if not os.path.isfile(encodeFilename(filename)): - return - timestr = last_modified_hdr - if timestr is None: - return - filetime = timeconvert(timestr) - if filetime is None: - return filetime - # Ignore obviously invalid dates - if filetime == 0: - return - try: - os.utime(filename, (time.time(), filetime)) - except Exception: - pass - return filetime - - def report_destination(self, filename): - """Report destination filename.""" - self.to_screen('[download] Destination: ' + filename) - - def _report_progress_status(self, msg, is_last_line=False): - fullmsg = '[download] ' + msg - if self.params.get('progress_with_newline', False): - self.to_screen(fullmsg) - else: - if compat_os_name == 'nt': - prev_len = getattr(self, '_report_progress_prev_line_length', - 0) - if prev_len > len(fullmsg): - fullmsg += ' ' * (prev_len - len(fullmsg)) - self._report_progress_prev_line_length = len(fullmsg) - clear_line = '\r' - else: - clear_line = ('\r\x1b[K' if sys.stderr.isatty() else '\r') - self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) - self.to_console_title('youtube-dl ' + msg) - - def report_progress(self, s): - if s['status'] == 'finished': - if self.params.get('noprogress', False): - self.to_screen('[download] Download completed') - else: - msg_template = '100%%' - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template += ' of %(_total_bytes_str)s' - if s.get('elapsed') is not None: - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template += ' in %(_elapsed_str)s' - self._report_progress_status( - msg_template % s, is_last_line=True) - - if self.params.get('noprogress'): - return - - if s['status'] != 'downloading': - return - - if s.get('eta') is not None: - s['_eta_str'] = self.format_eta(s['eta']) - else: - s['_eta_str'] = 'Unknown ETA' - - if s.get('total_bytes') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) - elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) - else: - if s.get('downloaded_bytes') == 0: - s['_percent_str'] = self.format_percent(0) - else: - s['_percent_str'] = 'Unknown %' - - if s.get('speed') is not None: - s['_speed_str'] = self.format_speed(s['speed']) - else: - s['_speed_str'] = 'Unknown speed' - - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' - elif s.get('total_bytes_estimate') is not None: - s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) - msg_template = '%(_percent_str)s of 
~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' - else: - if s.get('downloaded_bytes') is not None: - s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) - if s.get('elapsed'): - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' - else: - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' - else: - msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s' - - self._report_progress_status(msg_template % s) - - def report_resuming_byte(self, resume_len): - """Report attempt to resume at given byte.""" - self.to_screen('[download] Resuming download at byte %s' % resume_len) - - def report_retry(self, err, count, retries): - """Report retry in case of HTTP error 5xx""" - self.to_screen( - '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' - % (error_to_compat_str(err), count, self.format_retries(retries))) - - def report_file_already_downloaded(self, file_name): - """Report file has already been fully downloaded.""" - try: - self.to_screen('[download] %s has already been downloaded' % file_name) - except UnicodeEncodeError: - self.to_screen('[download] The file has already been downloaded') - - def report_unable_to_resume(self): - """Report it was impossible to resume download.""" - self.to_screen('[download] Unable to resume') - - def download(self, filename, info_dict): - """Download to a filename using the info from info_dict. - Return True on success and False otherwise - """ - - nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) - and os.path.exists(encodeFilename(filename)) - ) - - if not hasattr(filename, 'write'): - continuedl_and_exists = ( - self.params.get('continuedl', True) - and os.path.isfile(encodeFilename(filename)) - and not self.params.get('nopart', False) - ) - - # Check file already present - if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): - self.report_file_already_downloaded(filename) - self._hook_progress({ - 'filename': filename, - 'status': 'finished', - 'total_bytes': os.path.getsize(encodeFilename(filename)), - }) - return True - - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) - self.to_screen( - '[download] Sleeping %s seconds...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) - - return self.real_download(filename, info_dict) - - def real_download(self, filename, info_dict): - """Real download process.
Redefine in subclasses.""" - raise NotImplementedError('This method must be implemented by subclasses') - - def _hook_progress(self, status): - for ph in self._progress_hooks: - ph(status) - - def add_progress_hook(self, ph): - # See YoutubeDL.py (search for progress_hooks) for a description of - # this interface - self._progress_hooks.append(ph) - - def _debug_cmd(self, args, exe=None): - if not self.params.get('verbose', False): - return - - str_args = [decodeArgument(a) for a in args] - - if exe is None: - exe = os.path.basename(str_args[0]) - - self.to_screen('[debug] %s command line: %s' % ( - exe, shell_quote(str_args))) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py deleted file mode 100644 index c6d674bc6..000000000 --- a/youtube_dl/downloader/dash.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import unicode_literals - -from .fragment import FragmentFD -from ..compat import compat_urllib_error -from ..utils import ( - DownloadError, - urljoin, -) - - -class DashSegmentsFD(FragmentFD): - """ - Download segments in a DASH manifest - """ - - FD_NAME = 'dashsegments' - - def real_download(self, filename, info_dict): - fragment_base_url = info_dict.get('fragment_base_url') - fragments = info_dict['fragments'][:1] if self.params.get( - 'test', False) else info_dict['fragments'] - - ctx = { - 'filename': filename, - 'total_frags': len(fragments), - } - - self._prepare_and_start_frag_download(ctx) - - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - - frag_index = 0 - for i, fragment in enumerate(fragments): - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - # In DASH, the first segment contains necessary headers to - # generate a valid MP4 file, so always abort for the first segment - fatal = i == 0 or not skip_unavailable_fragments - count = 0 - while count <= fragment_retries: - try: - fragment_url = fragment.get('url') - if not fragment_url: - assert fragment_base_url - fragment_url = urljoin(fragment_base_url, fragment['path']) - success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) - if not success: - return False - self._append_fragment(ctx, frag_content) - break - except compat_urllib_error.HTTPError as err: - # YouTube may often return a 404 HTTP error for a fragment, causing the - # whole download to fail. However, if the same fragment is immediately - # retried with the same request data, this usually succeeds (1-2 attempts - # are usually enough), thus allowing the whole file to be downloaded. - # To be future-proof we will retry all fragments that fail with any - # HTTP error.
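# (Retry accounting for the handler below, as a worked example: with
# fragment_retries=3 a fragment is attempted up to 4 times in total,
# i.e. the initial try plus three retries, before it is either skipped
# or the whole download is aborted, depending on whether it is fatal.)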
- count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - except DownloadError: - # Don't retry fragment if error occurred during HTTP downloading - # itself since it has its own retry settings - if not fatal: - self.report_skip_fragment(frag_index) - break - raise - - if count > fragment_retries: - if not fatal: - self.report_skip_fragment(frag_index) - continue - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py deleted file mode 100644 index c31f8910a..000000000 --- a/youtube_dl/downloader/external.py +++ /dev/null @@ -1,371 +0,0 @@ -from __future__ import unicode_literals - -import os.path -import re -import subprocess -import sys -import time - -from .common import FileDownloader -from ..compat import ( - compat_setenv, - compat_str, -) - -from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS -from ..utils import ( - cli_option, - cli_valueless_option, - cli_bool_option, - cli_configuration_args, - encodeFilename, - encodeArgument, - handle_youtubedl_headers, - check_executable, - is_outdated_version, -) - - -class ExternalFD(FileDownloader): - def real_download(self, filename, info_dict): - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - try: - started = time.time() - retval = self._call_downloader(tmpfilename, info_dict) - except KeyboardInterrupt: - if not info_dict.get('is_live'): - raise - # Live stream downloading cancellation should be considered - # correct and expected termination, so all postprocessing - # should take place - retval = 0 - self.to_screen('[%s] Interrupted by user' % self.get_basename()) - - if retval == 0: - status = { - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - started, - } - if filename != '-': - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) - self.try_rename(tmpfilename, filename) - status.update({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - }) - self._hook_progress(status) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % ( - self.get_basename(), retval)) - return False - - @classmethod - def get_basename(cls): - return cls.__name__[:-2].lower() - - @property - def exe(self): - return self.params.get('external_downloader') - - @classmethod - def available(cls): - return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT]) - - @classmethod - def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') - - @classmethod - def can_download(cls, info_dict): - return cls.available() and cls.supports(info_dict) - - def _option(self, command_option, param): - return cli_option(self.params, command_option, param) - - def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): - return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) - - def _valueless_option(self, command_option, param, expected_value=True): - return cli_valueless_option(self.params, command_option, param, expected_value) - - def _configuration_args(self, default=[]): - return cli_configuration_args(self.params, 'external_downloader_args', default) - - def _call_downloader(self, tmpfilename, info_dict): - """ Either overwrite this
or implement _make_cmd """ - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - p = subprocess.Popen( - cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate() - if p.returncode != 0: - self.to_stderr(stderr.decode('utf-8', 'replace')) - return p.returncode - - -class CurlFD(ExternalFD): - AVAILABLE_OPT = '-V' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '--location', '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') - cmd += self._valueless_option('--silent', 'noprogress') - cmd += self._valueless_option('--verbose', 'verbose') - cmd += self._option('--limit-rate', 'ratelimit') - retry = self._option('--retry', 'retries') - if len(retry) == 2: - if retry[1] in ('inf', 'infinite'): - retry[1] = '2147483647' - cmd += retry - cmd += self._option('--max-filesize', 'max_filesize') - cmd += self._option('--interface', 'source_address') - cmd += self._option('--proxy', 'proxy') - cmd += self._valueless_option('--insecure', 'nocheckcertificate') - cmd += self._configuration_args() - cmd += ['--', info_dict['url']] - return cmd - - def _call_downloader(self, tmpfilename, info_dict): - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - # curl writes the progress to stderr so don't capture it. - p = subprocess.Popen(cmd) - p.communicate() - return p.returncode - - -class AxelFD(ExternalFD): - AVAILABLE_OPT = '-V' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] - cmd += self._configuration_args() - cmd += ['--', info_dict['url']] - return cmd - - -class WgetFD(ExternalFD): - AVAILABLE_OPT = '--version' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._option('--limit-rate', 'ratelimit') - retry = self._option('--tries', 'retries') - if len(retry) == 2: - if retry[1] in ('inf', 'infinite'): - retry[1] = '0' - cmd += retry - cmd += self._option('--bind-address', 'source_address') - cmd += self._option('--proxy', 'proxy') - cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') - cmd += self._configuration_args() - cmd += ['--', info_dict['url']] - return cmd - - -class Aria2cFD(ExternalFD): - AVAILABLE_OPT = '-v' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-c'] - cmd += self._configuration_args([ - '--min-split-size', '1M', '--max-connection-per-server', '4']) - dn = os.path.dirname(tmpfilename) - if dn: - cmd += ['--dir', dn] - cmd += ['--out', os.path.basename(tmpfilename)] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._option('--interface', 'source_address') - cmd += self._option('--all-proxy', 'proxy') - cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') - cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') - cmd += ['--', info_dict['url']] - return cmd - - -class HttpieFD(ExternalFD): - @classmethod - def available(cls): - return check_executable('http', ['--version']) - - def _make_cmd(self, tmpfilename, info_dict): - cmd = ['http', '--download', '--output', 
tmpfilename, info_dict['url']] - for key, val in info_dict['http_headers'].items(): - cmd += ['%s:%s' % (key, val)] - return cmd - - -class FFmpegFD(ExternalFD): - @classmethod - def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') - - @classmethod - def available(cls): - return FFmpegPostProcessor().available - - def _call_downloader(self, tmpfilename, info_dict): - url = info_dict['url'] - ffpp = FFmpegPostProcessor(downloader=self) - if not ffpp.available: - self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') - return False - ffpp.check_version() - - args = [ffpp.executable, '-y'] - - for log_level in ('quiet', 'verbose'): - if self.params.get(log_level, False): - args += ['-loglevel', log_level] - break - - seekable = info_dict.get('_seekable') - if seekable is not None: - # setting -seekable prevents ffmpeg from guessing if the server - # supports seeking(by adding the header `Range: bytes=0-`), which - # can cause problems in some cases - # https://github.com/ytdl-org/youtube-dl/issues/11800#issuecomment-275037127 - # http://trac.ffmpeg.org/ticket/6125#comment:10 - args += ['-seekable', '1' if seekable else '0'] - - args += self._configuration_args() - - # start_time = info_dict.get('start_time') or 0 - # if start_time: - # args += ['-ss', compat_str(start_time)] - # end_time = info_dict.get('end_time') - # if end_time: - # args += ['-t', compat_str(end_time - start_time)] - - if info_dict['http_headers'] and re.match(r'^https?://', url): - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. - headers = handle_youtubedl_headers(info_dict['http_headers']) - args += [ - '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - - env = None - proxy = self.params.get('proxy') - if proxy: - if not re.match(r'^[\da-zA-Z]+://', proxy): - proxy = 'http://%s' % proxy - - if proxy.startswith('socks'): - self.report_warning( - '%s does not support SOCKS proxies. Downloading is likely to fail. ' - 'Consider adding --hls-prefer-native to your command.' 
% self.get_basename()) - - # Since December 2015 ffmpeg supports the -http_proxy option (see - # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) - # We could switch to the following code if we are able to detect version properly - # args += ['-http_proxy', proxy] - env = os.environ.copy() - compat_setenv('HTTP_PROXY', proxy, env=env) - compat_setenv('http_proxy', proxy, env=env) - - protocol = info_dict.get('protocol') - - if protocol == 'rtmp': - player_url = info_dict.get('player_url') - page_url = info_dict.get('page_url') - app = info_dict.get('app') - play_path = info_dict.get('play_path') - tc_url = info_dict.get('tc_url') - flash_version = info_dict.get('flash_version') - live = info_dict.get('rtmp_live', False) - conn = info_dict.get('rtmp_conn') - if player_url is not None: - args += ['-rtmp_swfverify', player_url] - if page_url is not None: - args += ['-rtmp_pageurl', page_url] - if app is not None: - args += ['-rtmp_app', app] - if play_path is not None: - args += ['-rtmp_playpath', play_path] - if tc_url is not None: - args += ['-rtmp_tcurl', tc_url] - if flash_version is not None: - args += ['-rtmp_flashver', flash_version] - if live: - args += ['-rtmp_live', 'live'] - if isinstance(conn, list): - for entry in conn: - args += ['-rtmp_conn', entry] - elif isinstance(conn, compat_str): - args += ['-rtmp_conn', conn] - - args += ['-i', url, '-c', 'copy'] - - if self.params.get('test', False): - args += ['-fs', compat_str(self._TEST_FILE_SIZE)] - - if protocol in ('m3u8', 'm3u8_native'): - if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': - args += ['-f', 'mpegts'] - else: - args += ['-f', 'mp4'] - if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): - args += ['-bsf:a', 'aac_adtstoasc'] - elif protocol == 'rtmp': - args += ['-f', 'flv'] - else: - args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])] - - args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) - - self._debug_cmd(args) - - proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) - try: - retval = proc.wait() - except KeyboardInterrupt: - # subprocess.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') - raise - return retval - - -class AVconvFD(FFmpegFD): - pass - - -_BY_NAME = dict( - (klass.get_basename(), klass) - for name, klass in globals().items() - if name.endswith('FD') and name != 'ExternalFD' -) - - -def list_external_downloaders(): - return sorted(_BY_NAME.keys()) - - -def get_external_downloader(external_downloader): - """ Given the name of the executable, see whether we support the given - downloader.
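    For example (using the downloader classes defined above):

        get_external_downloader('aria2c')   -> Aria2cFD
        get_external_downloader('wget.exe') -> WgetFD  # the .exe is dropped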
""" - # Drop .exe extension on Windows - bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME[bn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py deleted file mode 100644 index 8dd3c2eeb..000000000 --- a/youtube_dl/downloader/f4m.py +++ /dev/null @@ -1,438 +0,0 @@ -from __future__ import division, unicode_literals - -import io -import itertools -import time - -from .fragment import FragmentFD -from ..compat import ( - compat_b64decode, - compat_etree_fromstring, - compat_urlparse, - compat_urllib_error, - compat_urllib_parse_urlparse, - compat_struct_pack, - compat_struct_unpack, -) -from ..utils import ( - fix_xml_ampersands, - xpath_text, -) - - -class DataTruncatedError(Exception): - pass - - -class FlvReader(io.BytesIO): - """ - Reader for Flv files - The file format is documented in https://www.adobe.com/devnet/f4v.html - """ - - def read_bytes(self, n): - data = self.read(n) - if len(data) < n: - raise DataTruncatedError( - 'FlvReader error: need %d bytes while only %d bytes got' % ( - n, len(data))) - return data - - # Utility functions for reading numbers and strings - def read_unsigned_long_long(self): - return compat_struct_unpack('!Q', self.read_bytes(8))[0] - - def read_unsigned_int(self): - return compat_struct_unpack('!I', self.read_bytes(4))[0] - - def read_unsigned_char(self): - return compat_struct_unpack('!B', self.read_bytes(1))[0] - - def read_string(self): - res = b'' - while True: - char = self.read_bytes(1) - if char == b'\x00': - break - res += char - return res - - def read_box_info(self): - """ - Read a box and return the info as a tuple: (box_size, box_type, box_data) - """ - real_size = size = self.read_unsigned_int() - box_type = self.read_bytes(4) - header_end = 8 - if size == 1: - real_size = self.read_unsigned_long_long() - header_end = 16 - return real_size, box_type, self.read_bytes(real_size - header_end) - - def read_asrt(self): - # version - self.read_unsigned_char() - # flags - self.read_bytes(3) - quality_entry_count = self.read_unsigned_char() - # QualityEntryCount - for i in range(quality_entry_count): - self.read_string() - - segment_run_count = self.read_unsigned_int() - segments = [] - for i in range(segment_run_count): - first_segment = self.read_unsigned_int() - fragments_per_segment = self.read_unsigned_int() - segments.append((first_segment, fragments_per_segment)) - - return { - 'segment_run': segments, - } - - def read_afrt(self): - # version - self.read_unsigned_char() - # flags - self.read_bytes(3) - # time scale - self.read_unsigned_int() - - quality_entry_count = self.read_unsigned_char() - # QualitySegmentUrlModifiers - for i in range(quality_entry_count): - self.read_string() - - fragments_count = self.read_unsigned_int() - fragments = [] - for i in range(fragments_count): - first = self.read_unsigned_int() - first_ts = self.read_unsigned_long_long() - duration = self.read_unsigned_int() - if duration == 0: - discontinuity_indicator = self.read_unsigned_char() - else: - discontinuity_indicator = None - fragments.append({ - 'first': first, - 'ts': first_ts, - 'duration': duration, - 'discontinuity_indicator': discontinuity_indicator, - }) - - return { - 'fragments': fragments, - } - - def read_abst(self): - # version - self.read_unsigned_char() - # flags - self.read_bytes(3) - - self.read_unsigned_int() # BootstrapinfoVersion - # Profile,Live,Update,Reserved - flags = self.read_unsigned_char() - live = flags & 0x20 != 0 - # time scale - self.read_unsigned_int() - # CurrentMediaTime 
- self.read_unsigned_long_long() - # SmpteTimeCodeOffset - self.read_unsigned_long_long() - - self.read_string() # MovieIdentifier - server_count = self.read_unsigned_char() - # ServerEntryTable - for i in range(server_count): - self.read_string() - quality_count = self.read_unsigned_char() - # QualityEntryTable - for i in range(quality_count): - self.read_string() - # DrmData - self.read_string() - # MetaData - self.read_string() - - segments_count = self.read_unsigned_char() - segments = [] - for i in range(segments_count): - box_size, box_type, box_data = self.read_box_info() - assert box_type == b'asrt' - segment = FlvReader(box_data).read_asrt() - segments.append(segment) - fragments_run_count = self.read_unsigned_char() - fragments = [] - for i in range(fragments_run_count): - box_size, box_type, box_data = self.read_box_info() - assert box_type == b'afrt' - fragments.append(FlvReader(box_data).read_afrt()) - - return { - 'segments': segments, - 'fragments': fragments, - 'live': live, - } - - def read_bootstrap_info(self): - total_size, box_type, box_data = self.read_box_info() - assert box_type == b'abst' - return FlvReader(box_data).read_abst() - - -def read_bootstrap_info(bootstrap_bytes): - return FlvReader(bootstrap_bytes).read_bootstrap_info() - - -def build_fragments_list(boot_info): - """ Return a list of (segment, fragment) for each fragment in the video """ - res = [] - segment_run_table = boot_info['segments'][0] - fragment_run_entry_table = boot_info['fragments'][0]['fragments'] - first_frag_number = fragment_run_entry_table[0]['first'] - fragments_counter = itertools.count(first_frag_number) - for segment, fragments_count in segment_run_table['segment_run']: - # In some live HDS streams (for example Rai), `fragments_count` is - # abnormal and causing out-of-memory errors. It's OK to change the - # number of fragments for live streams as they are updated periodically - if fragments_count == 4294967295 and boot_info['live']: - fragments_count = 2 - for _ in range(fragments_count): - res.append((segment, next(fragments_counter))) - - if boot_info['live']: - res = res[-2:] - - return res - - -def write_unsigned_int(stream, val): - stream.write(compat_struct_pack('!I', val)) - - -def write_unsigned_int_24(stream, val): - stream.write(compat_struct_pack('!I', val)[1:]) - - -def write_flv_header(stream): - """Writes the FLV header to stream""" - # FLV header - stream.write(b'FLV\x01') - stream.write(b'\x05') - stream.write(b'\x00\x00\x00\x09') - stream.write(b'\x00\x00\x00\x00') - - -def write_metadata_tag(stream, metadata): - """Writes optional metadata tag to stream""" - SCRIPT_TAG = b'\x12' - FLV_TAG_HEADER_LEN = 11 - - if metadata: - stream.write(SCRIPT_TAG) - write_unsigned_int_24(stream, len(metadata)) - stream.write(b'\x00\x00\x00\x00\x00\x00\x00') - stream.write(metadata) - write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) - - -def remove_encrypted_media(media): - return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib - and 'drmAdditionalHeaderSetId' not in e.attrib, - media)) - - -def _add_ns(prop, ver=1): - return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) - - -def get_base_url(manifest): - base_url = xpath_text( - manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() - return base_url - - -class F4mFD(FragmentFD): - """ - A downloader for f4m manifests or AdobeHDS. 
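    The fragments are plain FLV data: real_download() below writes an FLV
    header once, then appends the payload of each fragment's 'mdat' box to
    the output file.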
- """ - - FD_NAME = 'f4m' - - def _get_unencrypted_media(self, doc): - media = doc.findall(_add_ns('media')) - if not media: - self.report_error('No media found') - for e in (doc.findall(_add_ns('drmAdditionalHeader')) - + doc.findall(_add_ns('drmAdditionalHeaderSet'))): - # If id attribute is missing it's valid for all media nodes - # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute - if 'id' not in e.attrib: - self.report_error('Missing ID in f4m DRM') - media = remove_encrypted_media(media) - if not media: - self.report_error('Unsupported DRM') - return media - - def _get_bootstrap_from_url(self, bootstrap_url): - bootstrap = self.ydl.urlopen(bootstrap_url).read() - return read_bootstrap_info(bootstrap) - - def _update_live_fragments(self, bootstrap_url, latest_fragment): - fragments_list = [] - retries = 30 - while (not fragments_list) and (retries > 0): - boot_info = self._get_bootstrap_from_url(bootstrap_url) - fragments_list = build_fragments_list(boot_info) - fragments_list = [f for f in fragments_list if f[1] > latest_fragment] - if not fragments_list: - # Retry after a while - time.sleep(5.0) - retries -= 1 - - if not fragments_list: - self.report_error('Failed to update fragments') - - return fragments_list - - def _parse_bootstrap_node(self, node, base_url): - # Sometimes non empty inline bootstrap info can be specified along - # with bootstrap url attribute (e.g. dummy inline bootstrap info - # contains whitespace characters in [1]). We will prefer bootstrap - # url over inline bootstrap info when present. - # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m - bootstrap_url = node.get('url') - if bootstrap_url: - bootstrap_url = compat_urlparse.urljoin( - base_url, bootstrap_url) - boot_info = self._get_bootstrap_from_url(bootstrap_url) - else: - bootstrap_url = None - bootstrap = compat_b64decode(node.text) - boot_info = read_bootstrap_info(bootstrap) - return boot_info, bootstrap_url - - def real_download(self, filename, info_dict): - man_url = info_dict['url'] - requested_bitrate = info_dict.get('tbr') - self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) - - urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() - # Some manifests may be malformed, e.g. prosiebensat1 generated manifests - # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 - # and https://github.com/ytdl-org/youtube-dl/issues/7823) - manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() - - doc = compat_etree_fromstring(manifest) - formats = [(int(f.attrib.get('bitrate', -1)), f) - for f in self._get_unencrypted_media(doc)] - if requested_bitrate is None or len(formats) == 1: - # get the best format - formats = sorted(formats, key=lambda f: f[0]) - rate, media = formats[-1] - else: - rate, media = list(filter( - lambda f: int(f[0]) == requested_bitrate, formats))[0] - - # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. 
- man_base_url = get_base_url(doc) or man_url - - base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) - bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - boot_info, bootstrap_url = self._parse_bootstrap_node( - bootstrap_node, man_base_url) - live = boot_info['live'] - metadata_node = media.find(_add_ns('metadata')) - if metadata_node is not None: - metadata = compat_b64decode(metadata_node.text) - else: - metadata = None - - fragments_list = build_fragments_list(boot_info) - test = self.params.get('test', False) - if test: - # We only download the first fragment - fragments_list = fragments_list[:1] - total_frags = len(fragments_list) - # For some akamai manifests we'll need to add a query to the fragment url - akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) - - ctx = { - 'filename': filename, - 'total_frags': total_frags, - 'live': live, - } - - self._prepare_frag_download(ctx) - - dest_stream = ctx['dest_stream'] - - if ctx['complete_frags_downloaded_bytes'] == 0: - write_flv_header(dest_stream) - if not live: - write_metadata_tag(dest_stream, metadata) - - base_url_parsed = compat_urllib_parse_urlparse(base_url) - - self._start_frag_download(ctx) - - frag_index = 0 - while fragments_list: - seg_i, frag_i = fragments_list.pop(0) - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - name = 'Seg%d-Frag%d' % (seg_i, frag_i) - query = [] - if base_url_parsed.query: - query.append(base_url_parsed.query) - if akamai_pv: - query.append(akamai_pv.strip(';')) - if info_dict.get('extra_param_to_segment_url'): - query.append(info_dict['extra_param_to_segment_url']) - url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) - try: - success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) - if not success: - return False - reader = FlvReader(down_data) - while True: - try: - _, box_type, box_data = reader.read_box_info() - except DataTruncatedError: - if test: - # In tests, segments may be truncated, and thus - # FlvReader may not be able to parse the whole - # chunk. If so, write the segment as is - # See https://github.com/ytdl-org/youtube-dl/issues/9214 - dest_stream.write(down_data) - break - raise - if box_type == b'mdat': - self._append_fragment(ctx, box_data) - break - except (compat_urllib_error.HTTPError, ) as err: - if live and (err.code == 404 or err.code == 410): - # We didn't keep up with the live window. Continue - # with the next available fragment. 
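# (A 404/410 here means the requested fragment has already dropped out of
# the server-side live window; fragments_list is cleared so that it can be
# refreshed from the bootstrap info further below, skipping ahead to
# fragments that are still available.)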
- msg = 'Fragment %d unavailable' % frag_i - self.report_warning(msg) - fragments_list = [] - else: - raise - - if not fragments_list and not test and live and bootstrap_url: - fragments_list = self._update_live_fragments(bootstrap_url, frag_i) - total_frags += len(fragments_list) - if fragments_list and (fragments_list[0][1] > frag_i + 1): - msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) - self.report_warning(msg) - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py deleted file mode 100644 index 35c76feba..000000000 --- a/youtube_dl/downloader/fragment.py +++ /dev/null @@ -1,279 +0,0 @@ -from __future__ import division, unicode_literals - -import os -import time -import json - -from .common import FileDownloader -from .http import HttpFD -from ..utils import ( - error_to_compat_str, - encodeFilename, - sanitize_open, - sanitized_Request, -) - - -class HttpQuietDownloader(HttpFD): - def to_screen(self, *args, **kargs): - pass - - -class FragmentFD(FileDownloader): - """ - A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). - - Available options: - - fragment_retries: Number of times to retry a fragment for HTTP error (DASH - and hlsnative only) - skip_unavailable_fragments: - Skip unavailable fragments (DASH and hlsnative only) - keep_fragments: Keep downloaded fragments on disk after downloading is - finished - - For each incomplete fragment download youtube-dl keeps on disk a special - bookkeeping file with download state and metadata (in future such files will - be used for any incomplete download handled by youtube-dl). This file is - used to properly handle resuming, check download file consistency and detect - potential errors. The file has a .ytdl extension and represents a standard - JSON file of the following format: - - extractor: - Dictionary of extractor related data. TBD. - - downloader: - Dictionary of downloader related data. May contain following data: - current_fragment: - Dictionary with current (being downloaded) fragment data: - index: 0-based index of current fragment among all fragments - fragment_count: - Total count of fragments - - This feature is experimental and file format may change in future. - """ - - def report_retry_fragment(self, err, frag_index, count, retries): - self.to_screen( - '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...' - % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) - - def report_skip_fragment(self, frag_index): - self.to_screen('[download] Skipping fragment %d...' 
% frag_index) - - def _prepare_url(self, info_dict, url): - headers = info_dict.get('http_headers') - return sanitized_Request(url, None, headers) if headers else url - - def _prepare_and_start_frag_download(self, ctx): - self._prepare_frag_download(ctx) - self._start_frag_download(ctx) - - @staticmethod - def __do_ytdl_file(ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' - - def _read_ytdl_file(self, ctx): - assert 'ytdl_corrupt' not in ctx - stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') - try: - ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] - except Exception: - ctx['ytdl_corrupt'] = True - finally: - stream.close() - - def _write_ytdl_file(self, ctx): - frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') - downloader = { - 'current_fragment': { - 'index': ctx['fragment_index'], - }, - } - if ctx.get('fragment_count') is not None: - downloader['fragment_count'] = ctx['fragment_count'] - frag_index_stream.write(json.dumps({'downloader': downloader})) - frag_index_stream.close() - - def _download_fragment(self, ctx, frag_url, info_dict, headers=None): - fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - fragment_info_dict = { - 'url': frag_url, - 'http_headers': headers or info_dict.get('http_headers'), - } - success = ctx['dl'].download(fragment_filename, fragment_info_dict) - if not success: - return False, None - if fragment_info_dict.get('filetime'): - ctx['fragment_filetime'] = fragment_info_dict.get('filetime') - down, frag_sanitized = sanitize_open(fragment_filename, 'rb') - ctx['fragment_filename_sanitized'] = frag_sanitized - frag_content = down.read() - down.close() - return True, frag_content - - def _append_fragment(self, ctx, frag_content): - try: - ctx['dest_stream'].write(frag_content) - ctx['dest_stream'].flush() - finally: - if self.__do_ytdl_file(ctx): - self._write_ytdl_file(ctx) - if not self.params.get('keep_fragments', False): - os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) - del ctx['fragment_filename_sanitized'] - - def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: - total_frags_str = '%d' % ctx['total_frags'] - ad_frags = ctx.get('ad_frags', 0) - if ad_frags: - total_frags_str += ' (not including %d ad)' % ad_frags - else: - total_frags_str = 'unknown (live)' - self.to_screen( - '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) - self.report_destination(ctx['filename']) - dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': True, - 'quiet': True, - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit'), - 'retries': self.params.get('retries', 0), - 'nopart': self.params.get('nopart', False), - 'test': self.params.get('test', False), - } - ) - tmpfilename = self.temp_name(ctx['filename']) - open_mode = 'wb' - resume_len = 0 - - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - - # Should be initialized before ytdl file check - ctx.update({ - 'tmpfilename': tmpfilename, - 'fragment_index': 0, - }) - - if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): - self._read_ytdl_file(ctx) - is_corrupt = ctx.get('ytdl_corrupt') is True - is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 - if is_corrupt or is_inconsistent: - message = ( - '.ytdl file is corrupt' 
if is_corrupt else - 'Inconsistent state of incomplete fragment download') - self.report_warning( - '%s. Restarting from the beginning...' % message) - ctx['fragment_index'] = resume_len = 0 - if 'ytdl_corrupt' in ctx: - del ctx['ytdl_corrupt'] - self._write_ytdl_file(ctx) - else: - self._write_ytdl_file(ctx) - assert ctx['fragment_index'] == 0 - - dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) - - ctx.update({ - 'dl': dl, - 'dest_stream': dest_stream, - 'tmpfilename': tmpfilename, - # Total complete fragments downloaded so far in bytes - 'complete_frags_downloaded_bytes': resume_len, - }) - - def _start_frag_download(self, ctx): - resume_len = ctx['complete_frags_downloaded_bytes'] - total_frags = ctx['total_frags'] - # This dict stores the download progress, it's updated by the progress - # hook - state = { - 'status': 'downloading', - 'downloaded_bytes': resume_len, - 'fragment_index': ctx['fragment_index'], - 'fragment_count': total_frags, - 'filename': ctx['filename'], - 'tmpfilename': ctx['tmpfilename'], - } - - start = time.time() - ctx.update({ - 'started': start, - # Amount of fragment's bytes downloaded by the time of the previous - # frag progress hook invocation - 'prev_frag_downloaded_bytes': 0, - }) - - def frag_progress_hook(s): - if s['status'] not in ('downloading', 'finished'): - return - - time_now = time.time() - state['elapsed'] = time_now - start - frag_total_bytes = s.get('total_bytes') or 0 - if not ctx['live']: - estimated_size = ( - (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) - / (state['fragment_index'] + 1) * total_frags) - state['total_bytes_estimate'] = estimated_size - - if s['status'] == 'finished': - state['fragment_index'] += 1 - ctx['fragment_index'] = state['fragment_index'] - state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] - ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] - ctx['prev_frag_downloaded_bytes'] = 0 - else: - frag_downloaded_bytes = s['downloaded_bytes'] - state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] - ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes - self._hook_progress(state) - - ctx['dl'].add_progress_hook(frag_progress_hook) - - return start - - def _finish_frag_download(self, ctx): - ctx['dest_stream'].close() - if self.__do_ytdl_file(ctx): - ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) - if os.path.isfile(ytdl_filename): - os.remove(ytdl_filename) - elapsed = time.time() - ctx['started'] - - if ctx['tmpfilename'] == '-': - downloaded_bytes = ctx['complete_frags_downloaded_bytes'] - else: - self.try_rename(ctx['tmpfilename'], ctx['filename']) - if self.params.get('updatetime', True): - filetime = ctx.get('fragment_filetime') - if filetime: - try: - os.utime(ctx['filename'], (time.time(), filetime)) - except Exception: - pass - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) - - self._hook_progress({ - 'downloaded_bytes': downloaded_bytes, - 'total_bytes': downloaded_bytes, - 'filename': ctx['filename'], - 'status': 'finished', - 'elapsed': elapsed, - }) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py deleted file mode 100644 index 7aaebc940..000000000 --- a/youtube_dl/downloader/hls.py +++ /dev/null @@ 
-1,216 +0,0 @@ -from __future__ import unicode_literals - -import re -import binascii -try: - from Crypto.Cipher import AES - can_decrypt_frag = True -except ImportError: - can_decrypt_frag = False - -from .fragment import FragmentFD -from .external import FFmpegFD - -from ..compat import ( - compat_urllib_error, - compat_urlparse, - compat_struct_pack, -) -from ..utils import ( - parse_m3u8_attributes, - update_url_query, -) - - -class HlsFD(FragmentFD): - """ A limited implementation that does not require ffmpeg """ - - FD_NAME = 'hlsnative' - - @staticmethod - def can_download(manifest, info_dict): - UNSUPPORTED_FEATURES = ( - r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] - # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] - - # Live streams heuristic does not always work (e.g. geo restricted to Germany - # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) - # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] - - # This heuristic also is not correct since segments may not be appended as well. - # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite - # no segments will definitely be appended to the end of the playlist. - # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of - # # event media playlists [4] - r'#EXT-X-MAP:', # media initialization [5] - - # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 - # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 - # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 - # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 - # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 - ) - check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] - is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest - check_results.append(can_decrypt_frag or not is_aes128_enc) - check_results.append(not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest)) - check_results.append(not info_dict.get('is_live')) - return all(check_results) - - def real_download(self, filename, info_dict): - man_url = info_dict['url'] - self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) - - urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() - s = urlh.read().decode('utf-8', 'ignore') - - if not self.can_download(s, info_dict): - if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'): - self.report_error('pycrypto not found. 
Please install it.') - return False - self.report_warning( - 'hlsnative has detected features it does not support, ' - 'extraction will be delegated to ffmpeg') - fd = FFmpegFD(self.ydl, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - return fd.real_download(filename, info_dict) - - def is_ad_fragment_start(s): - return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s - or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) - - def is_ad_fragment_end(s): - return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s - or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) - - media_frags = 0 - ad_frags = 0 - ad_frag_next = False - for line in s.splitlines(): - line = line.strip() - if not line: - continue - if line.startswith('#'): - if is_ad_fragment_start(line): - ad_frag_next = True - elif is_ad_fragment_end(line): - ad_frag_next = False - continue - if ad_frag_next: - ad_frags += 1 - continue - media_frags += 1 - - ctx = { - 'filename': filename, - 'total_frags': media_frags, - 'ad_frags': ad_frags, - } - - self._prepare_and_start_frag_download(ctx) - - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - test = self.params.get('test', False) - - extra_query = None - extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') - if extra_param_to_segment_url: - extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) - i = 0 - media_sequence = 0 - decrypt_info = {'METHOD': 'NONE'} - byte_range = {} - frag_index = 0 - ad_frag_next = False - for line in s.splitlines(): - line = line.strip() - if line: - if not line.startswith('#'): - if ad_frag_next: - continue - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - frag_url = ( - line - if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) - if extra_query: - frag_url = update_url_query(frag_url, extra_query) - count = 0 - headers = info_dict.get('http_headers', {}) - if byte_range: - headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) - while count <= fragment_retries: - try: - success, frag_content = self._download_fragment( - ctx, frag_url, info_dict, headers) - if not success: - return False - break - except compat_urllib_error.HTTPError as err: - # Unavailable (possibly temporary) fragments may be served. - # First we try to retry then either skip or abort. - # See https://github.com/ytdl-org/youtube-dl/issues/10165, - # https://github.com/ytdl-org/youtube-dl/issues/10448). - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - if skip_unavailable_fragments: - i += 1 - media_sequence += 1 - self.report_skip_fragment(frag_index) - continue - self.report_error( - 'giving up after %s fragment retries' % fragment_retries) - return False - if decrypt_info['METHOD'] == 'AES-128': - iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( - self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block - # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, - # not what it decrypts to. 
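When an `#EXT-X-KEY` specifies `AES-128` without an explicit `IV`, the code above falls back to `compat_struct_pack('>8xq', media_sequence)`. That is the HLS default: the IV is the 128-bit big-endian media sequence number of the first segment the key applies to. A self-contained restatement with the standard `struct` module:

```python
import struct

def default_hls_iv(media_sequence):
    # '>8xq' = 8 zero pad bytes followed by a signed 64-bit big-endian
    # integer, i.e. the media sequence number widened to 128 bits --
    # the default AES-128 IV when the playlist supplies none.
    return struct.pack('>8xq', media_sequence)

assert default_hls_iv(42) == (42).to_bytes(16, 'big')
```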
- if not test: - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) - self._append_fragment(ctx, frag_content) - # We only download the first fragment during the test - if test: - break - i += 1 - media_sequence += 1 - elif line.startswith('#EXT-X-KEY'): - decrypt_url = decrypt_info.get('URI') - decrypt_info = parse_m3u8_attributes(line[11:]) - if decrypt_info['METHOD'] == 'AES-128': - if 'IV' in decrypt_info: - decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) - if not re.match(r'^https?://', decrypt_info['URI']): - decrypt_info['URI'] = compat_urlparse.urljoin( - man_url, decrypt_info['URI']) - if extra_query: - decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) - if decrypt_url != decrypt_info['URI']: - decrypt_info['KEY'] = None - elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): - media_sequence = int(line[22:]) - elif line.startswith('#EXT-X-BYTERANGE'): - splitted_byte_range = line[17:].split('@') - sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] - byte_range = { - 'start': sub_range_start, - 'end': sub_range_start + int(splitted_byte_range[0]), - } - elif is_ad_fragment_start(line): - ad_frag_next = True - elif is_ad_fragment_end(line): - ad_frag_next = False - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py deleted file mode 100644 index d8ac41dcc..000000000 --- a/youtube_dl/downloader/http.py +++ /dev/null @@ -1,364 +0,0 @@ -from __future__ import unicode_literals - -import errno -import os -import socket -import time -import random -import re - -from .common import FileDownloader -from ..compat import ( - compat_str, - compat_urllib_error, -) -from ..utils import ( - ContentTooShortError, - encodeFilename, - int_or_none, - sanitize_open, - sanitized_Request, - write_xattr, - XAttrMetadataError, - XAttrUnavailableError, -) - - -class HttpFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - - class DownloadContext(dict): - __getattr__ = dict.get - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - - ctx = DownloadContext() - ctx.filename = filename - ctx.tmpfilename = self.temp_name(filename) - ctx.stream = None - - # Do not include the Accept-Encoding header - headers = {'Youtubedl-no-compression': 'True'} - add_headers = info_dict.get('http_headers') - if add_headers: - headers.update(add_headers) - - is_test = self.params.get('test', False) - chunk_size = self._TEST_FILE_SIZE if is_test else ( - info_dict.get('downloader_options', {}).get('http_chunk_size') - or self.params.get('http_chunk_size') or 0) - - ctx.open_mode = 'wb' - ctx.resume_len = 0 - ctx.data_len = None - ctx.block_size = self.params.get('buffersize', 1024) - ctx.start_time = time.time() - ctx.chunk_size = None - - if self.params.get('continuedl', True): - # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) - - ctx.is_resume = ctx.resume_len > 0 - - count = 0 - retries = self.params.get('retries', 0) - - class SucceedDownload(Exception): - pass - - class RetryDownload(Exception): - def __init__(self, source_error): - self.source_error = source_error - - class NextFragment(Exception): - pass - - def set_range(req, start, end): - range_header = 'bytes=%d-' % start - if end: - range_header += compat_str(end) - req.add_header('Range', range_header) 
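`set_range()` above is the entire request-side protocol for both resuming and chunked downloading: an open-ended `bytes=N-` header resumes from byte `N`, while a bounded `bytes=N-M` header (with `M` inclusive, hence the `chunk_size - 1` arithmetic in `establish_connection()` below) fetches one chunk. The same logic as a standalone sketch:

```python
def format_range(start, end=None):
    # Open-ended ranges resume a partial download; bounded ranges drive
    # chunked downloading when http_chunk_size is in effect.
    return 'bytes=%d-%s' % (start, '' if end is None else end)

assert format_range(1024) == 'bytes=1024-'
assert format_range(0, 10485759) == 'bytes=0-10485759'
```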
- - def establish_connection(): - ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) - if not is_test and chunk_size else chunk_size) - if ctx.resume_len > 0: - range_start = ctx.resume_len - if ctx.is_resume: - self.report_resuming_byte(ctx.resume_len) - ctx.open_mode = 'ab' - elif ctx.chunk_size > 0: - range_start = 0 - else: - range_start = None - ctx.is_resume = False - range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None - if range_end and ctx.data_len is not None and range_end >= ctx.data_len: - range_end = ctx.data_len - 1 - has_range = range_start is not None - ctx.has_range = has_range - request = sanitized_Request(url, None, headers) - if has_range: - set_range(request, range_start, range_end) - # Establish connection - try: - try: - ctx.data = self.ydl.urlopen(request) - except (compat_urllib_error.URLError, ) as err: - # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 - reason = getattr(err, 'reason', None) - if isinstance(reason, socket.timeout): - raise RetryDownload(err) - raise err - # When trying to resume, Content-Range HTTP header of response has to be checked - # to match the value of requested Range HTTP header. This is due to a webservers - # that don't support resuming and serve a whole file with no Content-Range - # set in response despite of requested Range (see - # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) - if has_range: - content_range = ctx.data.headers.get('Content-Range') - if content_range: - content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) - # Content-Range is present and matches requested Range, resume is possible - if content_range_m: - if range_start == int(content_range_m.group(1)): - content_range_end = int_or_none(content_range_m.group(2)) - content_len = int_or_none(content_range_m.group(3)) - accept_content_len = ( - # Non-chunked download - not ctx.chunk_size - # Chunked download and requested piece or - # its part is promised to be served - or content_range_end == range_end - or content_len < range_end) - if accept_content_len: - ctx.data_len = content_len - return - # Content-Range is either not present or invalid. Assuming remote webserver is - # trying to send the whole file, resume is not possible, so wiping the local file - # and performing entire redownload - self.report_unable_to_resume() - ctx.resume_len = 0 - ctx.open_mode = 'wb' - ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) - return - except (compat_urllib_error.HTTPError, ) as err: - if err.code == 416: - # Unable to resume (requested range not satisfiable) - try: - # Open the connection again without the range header - ctx.data = self.ydl.urlopen( - sanitized_Request(url, None, headers)) - content_length = ctx.data.info()['Content-Length'] - except (compat_urllib_error.HTTPError, ) as err: - if err.code < 500 or err.code >= 600: - raise - else: - # Examine the reported length - if (content_length is not None - and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): - # The file had already been fully downloaded. - # Explanation to the above condition: in issue #175 it was revealed that - # YouTube sometimes adds or removes a few bytes from the end of the file, - # changing the file size slightly and causing problems for some users. So - # I decided to implement a suggested change and consider the file - # completely downloaded if the file size differs less than 100 bytes from - # the one in the hard drive. 
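The 416 handler above ends with a deliberately fuzzy completeness check: since some servers add or remove a few trailing bytes (the comment cites YouTube and issue #175), the download counts as finished when the local size is within 100 bytes of the reported `Content-Length`. The predicate, extracted as a sketch:

```python
def looks_fully_downloaded(resume_len, content_length, tolerance=100):
    # True when the partial file is within `tolerance` bytes of the
    # server-reported length -- close enough to treat as complete
    # rather than restarting the download from scratch.
    if content_length is None:
        return False
    return resume_len - tolerance < int(content_length) < resume_len + tolerance

assert looks_fully_downloaded(10000, '10001')
assert not looks_fully_downloaded(10000, '20000')
```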
- self.report_file_already_downloaded(ctx.filename) - self.try_rename(ctx.tmpfilename, ctx.filename) - self._hook_progress({ - 'filename': ctx.filename, - 'status': 'finished', - 'downloaded_bytes': ctx.resume_len, - 'total_bytes': ctx.resume_len, - }) - raise SucceedDownload() - else: - # The length does not match, we start the download over - self.report_unable_to_resume() - ctx.resume_len = 0 - ctx.open_mode = 'wb' - return - elif err.code < 500 or err.code >= 600: - # Unexpected HTTP error - raise - raise RetryDownload(err) - except socket.error as err: - if err.errno != errno.ECONNRESET: - # Connection reset is no problem, just retry - raise - raise RetryDownload(err) - - def download(): - data_len = ctx.data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + ctx.resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) - return False - - byte_counter = 0 + ctx.resume_len - block_size = ctx.block_size - start = time.time() - - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - - def retry(e): - to_stdout = ctx.tmpfilename == '-' - if ctx.stream is not None: - if not to_stdout: - ctx.stream.close() - ctx.stream = None - ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) - raise RetryDownload(e) - - while True: - try: - # Download and write - data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter)) - # socket.timeout is a subclass of socket.error but may not have - # errno set - except socket.timeout as e: - retry(e) - except socket.error as e: - # SSLError on python 2 (inherits socket.error) may have - # no errno set but this error message - if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out': - retry(e) - raise - - byte_counter += len(data_block) - - # exit loop when download is finished - if len(data_block) == 0: - break - - # Open destination file just in time - if ctx.stream is None: - try: - ctx.stream, ctx.tmpfilename = sanitize_open( - ctx.tmpfilename, ctx.open_mode) - assert ctx.stream is not None - ctx.filename = self.undo_temp_name(ctx.tmpfilename) - self.report_destination(ctx.filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) - return False - - if self.params.get('xattr_set_filesize', False) and data_len is not None: - try: - write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except 
(XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) - - try: - ctx.stream.write(data_block) - except (IOError, OSError) as err: - self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) - return False - - # Apply rate limit - self.slow_down(start, now, byte_counter - ctx.resume_len) - - # end measuring of one loop run - now = time.time() - after = now - - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if ctx.data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) - - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': byte_counter, - 'total_bytes': ctx.data_len, - 'tmpfilename': ctx.tmpfilename, - 'filename': ctx.filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - ctx.start_time, - }) - - if data_len is not None and byte_counter == data_len: - break - - if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: - ctx.resume_len = byte_counter - # ctx.block_size = block_size - raise NextFragment() - - if ctx.stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if ctx.tmpfilename != '-': - ctx.stream.close() - - if data_len is not None and byte_counter != data_len: - err = ContentTooShortError(byte_counter, int(data_len)) - if count <= retries: - retry(err) - raise err - - self.try_rename(ctx.tmpfilename, ctx.filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) - - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': ctx.filename, - 'status': 'finished', - 'elapsed': time.time() - ctx.start_time, - }) - - return True - - while count <= retries: - try: - establish_connection() - return download() - except RetryDownload as e: - count += 1 - if count <= retries: - self.report_retry(e.source_error, count, retries) - continue - except NextFragment: - continue - except SucceedDownload: - return True - - self.report_error('giving up after %s retries' % retries) - return False diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py deleted file mode 100644 index 1ca666b4a..000000000 --- a/youtube_dl/downloader/ism.py +++ /dev/null @@ -1,259 +0,0 @@ -from __future__ import unicode_literals - -import time -import binascii -import io - -from .fragment import FragmentFD -from ..compat import ( - compat_Struct, - compat_urllib_error, -) - - -u8 = compat_Struct('>B') -u88 = compat_Struct('>Bx') -u16 = compat_Struct('>H') -u1616 = compat_Struct('>Hxx') -u32 = compat_Struct('>I') -u64 = compat_Struct('>Q') - -s88 = compat_Struct('>bx') -s16 = compat_Struct('>h') -s1616 = compat_Struct('>hxx') -s32 = compat_Struct('>i') - -unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) - -TRACK_ENABLED = 0x1 -TRACK_IN_MOVIE = 0x2 -TRACK_IN_PREVIEW = 0x4 - -SELF_CONTAINED = 0x1 - - -def box(box_type, payload): - return u32.pack(8 + len(payload)) + box_type + payload - - -def full_box(box_type, version, flags, payload): - return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload) - - -def 
write_piff_header(stream, params): - track_id = params['track_id'] - fourcc = params['fourcc'] - duration = params['duration'] - timescale = params.get('timescale', 10000000) - language = params.get('language', 'und') - height = params.get('height', 0) - width = params.get('width', 0) - is_audio = width == 0 and height == 0 - creation_time = modification_time = int(time.time()) - - ftyp_payload = b'isml' # major brand - ftyp_payload += u32.pack(1) # minor version - ftyp_payload += b'piff' + b'iso2' # compatible brands - stream.write(box(b'ftyp', ftyp_payload)) # File Type Box - - mvhd_payload = u64.pack(creation_time) - mvhd_payload += u64.pack(modification_time) - mvhd_payload += u32.pack(timescale) - mvhd_payload += u64.pack(duration) - mvhd_payload += s1616.pack(1) # rate - mvhd_payload += s88.pack(1) # volume - mvhd_payload += u16.pack(0) # reserved - mvhd_payload += u32.pack(0) * 2 # reserved - mvhd_payload += unity_matrix - mvhd_payload += u32.pack(0) * 6 # pre defined - mvhd_payload += u32.pack(0xffffffff) # next track id - moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload) # Movie Header Box - - tkhd_payload = u64.pack(creation_time) - tkhd_payload += u64.pack(modification_time) - tkhd_payload += u32.pack(track_id) # track id - tkhd_payload += u32.pack(0) # reserved - tkhd_payload += u64.pack(duration) - tkhd_payload += u32.pack(0) * 2 # reserved - tkhd_payload += s16.pack(0) # layer - tkhd_payload += s16.pack(0) # alternate group - tkhd_payload += s88.pack(1 if is_audio else 0) # volume - tkhd_payload += u16.pack(0) # reserved - tkhd_payload += unity_matrix - tkhd_payload += u1616.pack(width) - tkhd_payload += u1616.pack(height) - trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload) # Track Header Box - - mdhd_payload = u64.pack(creation_time) - mdhd_payload += u64.pack(modification_time) - mdhd_payload += u32.pack(timescale) - mdhd_payload += u64.pack(duration) - mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60)) - mdhd_payload += u16.pack(0) # pre defined - mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box - - hdlr_payload = u32.pack(0) # pre defined - hdlr_payload += b'soun' if is_audio else b'vide' # handler type - hdlr_payload += u32.pack(0) * 3 # reserved - hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name - mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box - - if is_audio: - smhd_payload = s88.pack(0) # balance - smhd_payload += u16.pack(0) # reserved - media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header - else: - vmhd_payload = u16.pack(0) # graphics mode - vmhd_payload += u16.pack(0) * 3 # opcolor - media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header - minf_payload = media_header_box - - dref_payload = u32.pack(1) # entry count - dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'') # Data Entry URL Box - dinf_payload = full_box(b'dref', 0, 0, dref_payload) # Data Reference Box - minf_payload += box(b'dinf', dinf_payload) # Data Information Box - - stsd_payload = u32.pack(1) # entry count - - sample_entry_payload = u8.pack(0) * 6 # reserved - sample_entry_payload += u16.pack(1) # data reference index - if is_audio: - sample_entry_payload += u32.pack(0) * 2 # reserved - sample_entry_payload += u16.pack(params.get('channels', 2)) - sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) - sample_entry_payload += 
u16.pack(0) # pre defined - sample_entry_payload += u16.pack(0) # reserved - sample_entry_payload += u1616.pack(params['sampling_rate']) - - if fourcc == 'AACL': - sample_entry_box = box(b'mp4a', sample_entry_payload) - else: - sample_entry_payload += u16.pack(0) # pre defined - sample_entry_payload += u16.pack(0) # reserved - sample_entry_payload += u32.pack(0) * 3 # pre defined - sample_entry_payload += u16.pack(width) - sample_entry_payload += u16.pack(height) - sample_entry_payload += u1616.pack(0x48) # horiz resolution 72 dpi - sample_entry_payload += u1616.pack(0x48) # vert resolution 72 dpi - sample_entry_payload += u32.pack(0) # reserved - sample_entry_payload += u16.pack(1) # frame count - sample_entry_payload += u8.pack(0) * 32 # compressor name - sample_entry_payload += u16.pack(0x18) # depth - sample_entry_payload += s16.pack(-1) # pre defined - - codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) - if fourcc in ('H264', 'AVC1'): - sps, pps = codec_private_data.split(u32.pack(1))[1:] - avcc_payload = u8.pack(1) # configuration version - avcc_payload += sps[1:4] # avc profile indication + profile compatibility + avc level indication - avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete representation (1) + reserved (11111) + length size minus one - avcc_payload += u8.pack(1) # reserved (0) + number of sps (0000001) - avcc_payload += u16.pack(len(sps)) - avcc_payload += sps - avcc_payload += u8.pack(1) # number of pps - avcc_payload += u16.pack(len(pps)) - avcc_payload += pps - sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record - sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry - stsd_payload += sample_entry_box - - stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box - - stts_payload = u32.pack(0) # entry count - stbl_payload += full_box(b'stts', 0, 0, stts_payload) # Decoding Time to Sample Box - - stsc_payload = u32.pack(0) # entry count - stbl_payload += full_box(b'stsc', 0, 0, stsc_payload) # Sample To Chunk Box - - stco_payload = u32.pack(0) # entry count - stbl_payload += full_box(b'stco', 0, 0, stco_payload) # Chunk Offset Box - - minf_payload += box(b'stbl', stbl_payload) # Sample Table Box - - mdia_payload += box(b'minf', minf_payload) # Media Information Box - - trak_payload += box(b'mdia', mdia_payload) # Media Box - - moov_payload += box(b'trak', trak_payload) # Track Box - - mehd_payload = u64.pack(duration) - mvex_payload = full_box(b'mehd', 1, 0, mehd_payload) # Movie Extends Header Box - - trex_payload = u32.pack(track_id) # track id - trex_payload += u32.pack(1) # default sample description index - trex_payload += u32.pack(0) # default sample duration - trex_payload += u32.pack(0) # default sample size - trex_payload += u32.pack(0) # default sample flags - mvex_payload += full_box(b'trex', 0, 0, trex_payload) # Track Extends Box - - moov_payload += box(b'mvex', mvex_payload) # Movie Extends Box - stream.write(box(b'moov', moov_payload)) # Movie Box - - -def extract_box_data(data, box_sequence): - data_reader = io.BytesIO(data) - while True: - box_size = u32.unpack(data_reader.read(4))[0] - box_type = data_reader.read(4) - if box_type == box_sequence[0]: - box_data = data_reader.read(box_size - 8) - if len(box_sequence) == 1: - return box_data - return extract_box_data(box_data, box_sequence[1:]) - data_reader.seek(box_size - 8, 1) - - -class IsmFD(FragmentFD): - """ - Download segments in a ISM manifest - """ 
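`box()` and `full_box()` above are all the serialization machinery `write_piff_header()` needs: every ISO BMFF box is a 32-bit big-endian size (header included), a four-character type, and a payload, and a "full" box additionally prefixes a one-byte version and 24-bit flags; `extract_box_data()` walks the same layout in reverse. A standalone restatement using plain `struct`:

```python
import struct

def box(box_type, payload):
    # The size field counts the 8-byte header plus the payload.
    return struct.pack('>I', 8 + len(payload)) + box_type + payload

def full_box(box_type, version, flags, payload):
    # FullBox payload = version (1 byte) + flags (low 3 bytes of a
    # 4-byte big-endian pack) + the box-specific payload.
    return box(box_type,
               struct.pack('>B', version) + struct.pack('>I', flags)[1:] + payload)

ftyp = box(b'ftyp', b'isml' + struct.pack('>I', 1) + b'piff' + b'iso2')
assert struct.unpack('>I', ftyp[:4])[0] == len(ftyp) == 24
```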
- - FD_NAME = 'ism' - - def real_download(self, filename, info_dict): - segments = info_dict['fragments'][:1] if self.params.get( - 'test', False) else info_dict['fragments'] - - ctx = { - 'filename': filename, - 'total_frags': len(segments), - } - - self._prepare_and_start_frag_download(ctx) - - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - - track_written = False - frag_index = 0 - for i, segment in enumerate(segments): - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - count = 0 - while count <= fragment_retries: - try: - success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) - if not success: - return False - if not track_written: - tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) - info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] - write_piff_header(ctx['dest_stream'], info_dict['_download_params']) - track_written = True - self._append_fragment(ctx, frag_content) - break - except compat_urllib_error.HTTPError as err: - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - if skip_unavailable_fragments: - self.report_skip_fragment(frag_index) - continue - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py deleted file mode 100644 index fbb7f51b0..000000000 --- a/youtube_dl/downloader/rtmp.py +++ /dev/null @@ -1,214 +0,0 @@ -from __future__ import unicode_literals - -import os -import re -import subprocess -import time - -from .common import FileDownloader -from ..compat import compat_str -from ..utils import ( - check_executable, - encodeFilename, - encodeArgument, - get_exe_version, -) - - -def rtmpdump_version(): - return get_exe_version( - 'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)') - - -class RtmpFD(FileDownloader): - def real_download(self, filename, info_dict): - def run_rtmpdump(args): - start = time.time() - resume_percent = None - resume_downloaded_data_len = None - proc = subprocess.Popen(args, stderr=subprocess.PIPE) - cursor_in_new_line = True - proc_stderr_closed = False - try: - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': speed, - }) - 
cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) - self._hook_progress({ - 'downloaded_bytes': downloaded_data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'status': 'downloading', - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - finally: - proc.wait() - if not cursor_in_new_line: - self.to_screen('') - return proc.returncode - - url = info_dict['url'] - player_url = info_dict.get('player_url') - page_url = info_dict.get('page_url') - app = info_dict.get('app') - play_path = info_dict.get('play_path') - tc_url = info_dict.get('tc_url') - flash_version = info_dict.get('flash_version') - live = info_dict.get('rtmp_live', False) - conn = info_dict.get('rtmp_conn') - protocol = info_dict.get('rtmp_protocol') - real_time = info_dict.get('rtmp_real_time', False) - no_resume = info_dict.get('no_resume', False) - continue_dl = self.params.get('continuedl', True) - - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - test = self.params.get('test', False) - - # Check for rtmpdump first - if not check_executable('rtmpdump', ['-h']): - self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it.') - return False - - # Download using rtmpdump. rtmpdump returns exit code 2 when - # the connection was interrupted and resuming appears to be - # possible. This is part of rtmpdump's normal usage, AFAIK. 
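`run_rtmpdump()` above drives all progress reporting by scraping rtmpdump's stderr, matching lines of the form `<kB> kB / <sec> sec (<percent>%)`; the percentage is absent for live streams, which the code above handles with a second, percent-less pattern. A minimal sketch of that parse on a made-up sample line:

```python
import re

PROGRESS = re.compile(
    r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)')

def parse_progress(line):
    # Returns (downloaded_bytes, percent) for a non-live progress line,
    # or None when the line carries no percentage.
    m = PROGRESS.search(line)
    if not m:
        return None
    return int(float(m.group(1)) * 1024), float(m.group(2))

assert parse_progress('368.524 kB / 17.11 sec (57.9%)') == (377368, 57.9)
```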
- basic_args = [ - 'rtmpdump', '--verbose', '-r', url, - '-o', tmpfilename] - if player_url is not None: - basic_args += ['--swfVfy', player_url] - if page_url is not None: - basic_args += ['--pageUrl', page_url] - if app is not None: - basic_args += ['--app', app] - if play_path is not None: - basic_args += ['--playpath', play_path] - if tc_url is not None: - basic_args += ['--tcUrl', tc_url] - if test: - basic_args += ['--stop', '1'] - if flash_version is not None: - basic_args += ['--flashVer', flash_version] - if live: - basic_args += ['--live'] - if isinstance(conn, list): - for entry in conn: - basic_args += ['--conn', entry] - elif isinstance(conn, compat_str): - basic_args += ['--conn', conn] - if protocol is not None: - basic_args += ['--protocol', protocol] - if real_time: - basic_args += ['--realtime'] - - args = basic_args - if not no_resume and continue_dl and not live: - args += ['--resume'] - if not live and continue_dl: - args += ['--skip', '1'] - - args = [encodeArgument(a) for a in args] - - self._debug_cmd(args, exe='rtmpdump') - - RD_SUCCESS = 0 - RD_FAILED = 1 - RD_INCOMPLETE = 2 - RD_NO_CONNECT = 3 - - started = time.time() - - try: - retval = run_rtmpdump(args) - except KeyboardInterrupt: - if not info_dict.get('is_live'): - raise - retval = RD_SUCCESS - self.to_screen('\n[rtmpdump] Interrupted by user') - - if retval == RD_NO_CONNECT: - self.report_error('[rtmpdump] Could not connect to RTMP server.') - return False - - while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: - prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) - time.sleep(5.0) # This seems to be needed - args = basic_args + ['--resume'] - if retval == RD_FAILED: - args += ['--skip', '1'] - args = [encodeArgument(a) for a in args] - retval = run_rtmpdump(args) - cursize = os.path.getsize(encodeFilename(tmpfilename)) - if prevsize == cursize and retval == RD_FAILED: - break - # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those - if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024: - self.to_screen('[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') - retval = RD_SUCCESS - break - if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - started, - }) - return True - else: - self.to_stderr('\n') - self.report_error('rtmpdump exited with code %d' % retval) - return False diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py deleted file mode 100644 index 939358b2a..000000000 --- a/youtube_dl/downloader/rtsp.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import unicode_literals - -import os -import subprocess - -from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) - - -class RtspFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - if check_executable('mplayer', ['-h']): - args = [ - 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', - '-dumpstream', '-dumpfile', tmpfilename, url] - elif check_executable('mpv', ['-h']): - args = [ - 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] - else: - self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') - return False - - self._debug_cmd(args) - - retval = subprocess.call(args) - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % (args[0], retval)) - return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py deleted file mode 100644 index 18d8dbcd6..000000000 --- a/youtube_dl/extractor/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True -except ImportError: - _LAZY_LOADER = False - from .extractors import * - - _ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) - - -def gen_extractor_classes(): - """ Return a list of supported extractors. - The order does matter; the first extractor matched is the one handling the URL. - """ - return _ALL_CLASSES - - -def gen_extractors(): - """ Return a list of an instance of every supported extractor. - The order does matter; the first extractor matched is the one handling the URL. - """ - return [klass() for klass in gen_extractor_classes()] - - -def list_extractors(age_limit): - """ - Return a list of extractors that are suitable for the given age, - sorted by extractor ID. 
- """ - - return sorted( - filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), - key=lambda ie: ie.IE_NAME.lower()) - - -def get_info_extractor(ie_name): - """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py deleted file mode 100644 index 6637f4f35..000000000 --- a/youtube_dl/extractor/abc.py +++ /dev/null @@ -1,193 +0,0 @@ -from __future__ import unicode_literals - -import hashlib -import hmac -import re -import time - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - js_to_json, - int_or_none, - parse_iso8601, - try_get, - unescapeHTML, - update_url_query, -) - - -class ABCIE(InfoExtractor): - IE_NAME = 'abc.net.au' - _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', - 'md5': 'cb3dd03b18455a661071ee1e28344d9f', - 'info_dict': { - 'id': '5868334', - 'ext': 'mp4', - 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', - 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', - }, - 'skip': 'this video has expired', - }, { - 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', - 'md5': 'db2a5369238b51f9811ad815b69dc086', - 'info_dict': { - 'id': 'NvqvPeNZsHU', - 'ext': 'mp4', - 'upload_date': '20150816', - 'uploader': 'ABC News (Australia)', - 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". 
Read more here: http://ab.co/1Mwc6ef', - 'uploader_id': 'NewsOnABC', - 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', - }, - 'add_ie': ['Youtube'], - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', - 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', - 'info_dict': { - 'id': '6880080', - 'ext': 'mp3', - 'title': 'NAB lifts interest rates, following Westpac and CBA', - 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', - }, - }, { - 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - mobj = re.search( - r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', - webpage) - if mobj is None: - expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) - if expired: - raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True) - raise ExtractorError('Unable to extract video urls') - - urls_info = self._parse_json( - mobj.group('json_data'), video_id, transform_source=js_to_json) - - if not isinstance(urls_info, list): - urls_info = [urls_info] - - if mobj.group('type') == 'YouTube': - return self.playlist_result([ - self.url_result(url_info['url']) for url_info in urls_info]) - - formats = [{ - 'url': url_info['url'], - 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none', - 'width': int_or_none(url_info.get('width')), - 'height': int_or_none(url_info.get('height')), - 'tbr': int_or_none(url_info.get('bitrate')), - 'filesize': int_or_none(url_info.get('filesize')), - } for url_info in urls_info] - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'formats': formats, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - } - - -class ABCIViewIE(InfoExtractor): - IE_NAME = 'abc.net.au:iview' - _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)' - _GEO_COUNTRIES = ['AU'] - - # ABC iview programs are normally available for 14 days only. 
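`_real_extract()` below obtains the iView playback token by signing an auth path with HMAC-SHA256 under a fixed key; the signed path is then fetched and its response used as the `hdnea` query parameter on the stream URLs. A sketch of just the signing step, lifted from the code below (the wrapper name `sign_iview_path` is hypothetical):

```python
import hashlib
import hmac
import time

def sign_iview_path(house_number, ts=None):
    # Hypothetical wrapper around the signing shown below: the path,
    # including timestamp and episode house number, is HMAC-SHA256'd
    # with a fixed key and the digest appended as 'sig'.
    path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format(
        ts or int(time.time()), house_number)
    sig = hmac.new(b'android.content.res.Resources',
                   path.encode('utf-8'), hashlib.sha256).hexdigest()
    return 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig)

print(sign_iview_path('LE1927H001S00', ts=1569445289))
```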
- _TESTS = [{ - 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', - 'md5': '67715ce3c78426b11ba167d875ac6abf', - 'info_dict': { - 'id': 'LE1927H001S00', - 'ext': 'mp4', - 'title': "Series 11 Ep 1", - 'series': "Gruen", - 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', - 'upload_date': '20190925', - 'uploader_id': 'abc1', - 'timestamp': 1569445289, - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video_params = self._download_json( - 'https://iview.abc.net.au/api/programs/' + video_id, video_id) - title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) - stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) - - house_number = video_params.get('episodeHouseNumber') or video_id - path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( - int(time.time()), house_number) - sig = hmac.new( - b'android.content.res.Resources', - path.encode('utf-8'), hashlib.sha256).hexdigest() - token = self._download_webpage( - 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) - - def tokenize_url(url, token): - return update_url_query(url, { - 'hdnea': token, - }) - - for sd in ('720', 'sd', 'sd-low'): - sd_url = try_get( - stream, lambda x: x['streams']['hls'][sd], compat_str) - if not sd_url: - continue - formats = self._extract_m3u8_formats( - tokenize_url(sd_url, token), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - if formats: - break - self._sort_formats(formats) - - subtitles = {} - src_vtt = stream.get('captions', {}).get('src-vtt') - if src_vtt: - subtitles['en'] = [{ - 'url': src_vtt, - 'ext': 'vtt', - }] - - is_live = video_params.get('livestream') == '1' - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'title': title, - 'description': video_params.get('description'), - 'thumbnail': video_params.get('thumbnail'), - 'duration': int_or_none(video_params.get('eventDuration')), - 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), - 'series': unescapeHTML(video_params.get('seriesTitle')), - 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], - 'season_number': int_or_none(self._search_regex( - r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), - 'episode_number': int_or_none(self._search_regex( - r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), - 'episode_id': house_number, - 'uploader_id': video_params.get('channel'), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - } diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py deleted file mode 100644 index 908c83377..000000000 --- a/youtube_dl/extractor/abcnews.py +++ /dev/null @@ -1,158 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .amp import AMPIE -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, - try_get, -) - - -class AbcNewsVideoIE(AMPIE): - IE_NAME = 'abcnews:video' - _VALID_URL = r'''(?x) - https?:// - (?: - abcnews\.go\.com/ - (?: - (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| - video/(?:embed|itemfeed)\?.*?\bid= - )| - fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ - ) - (?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', - 'info_dict': { - 'id': '20411932', - 'ext': 'mp4', - 'display_id': 
'week-exclusive-irans-foreign-minister-zarif', - 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif', - 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', - 'duration': 180, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1380454200, - 'upload_date': '20130929', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://abcnews.go.com/video/embed?id=46979033', - 'only_matching': True, - }, { - 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', - 'only_matching': True, - }, { - 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', - 'only_matching': True, - }, { - 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') - info_dict = self._extract_feed_info( - 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id) - info_dict.update({ - 'id': video_id, - 'display_id': display_id, - }) - return info_dict - - -class AbcNewsIE(InfoExtractor): - IE_NAME = 'abcnews' - _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' - - _TESTS = [{ - # Youtube Embeds - 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', - 'info_dict': { - 'id': '51286501', - 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", - 'description': 'Billingsley went from a child actor to Hollywood power player.', - }, - 'playlist_count': 5, - }, { - 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', - 'info_dict': { - 'id': '38897857', - 'ext': 'mp4', - 'title': 'Justin Timberlake Drops Hints For Secret Single', - 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160505', - 'timestamp': 1462442280, - }, - 'params': { - # m3u8 download - 'skip_download': True, - # The embedded YouTube video is blocked due to copyright issues - 'playlist_items': '1', - }, - 'add_ie': ['AbcNewsVideo'], - }, { - 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', - 'only_matching': True, - }, { - # inline.type == 'video' - 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', - 'only_matching': True, - }] - - def _real_extract(self, url): - story_id = self._match_id(url) - webpage = self._download_webpage(url, story_id) - story = self._parse_json(self._search_regex( - r"window\['__abcnews__'\]\s*=\s*({.+?});", - webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] - article_contents = story.get('articleContents') or {} - - def entries(): - featured_video = story.get('featuredVideo') or {} - feed = try_get(featured_video, lambda x: x['video']['feed']) - if feed: - yield { - '_type': 'url', - 'id': featured_video.get('id'), - 'title': featured_video.get('name'), - 'url': feed, - 'thumbnail': featured_video.get('images'), - 'description': featured_video.get('description'), - 'timestamp': parse_iso8601(featured_video.get('uploadDate')), - 'duration': parse_duration(featured_video.get('duration')), - 'ie_key': AbcNewsVideoIE.ie_key(), - } - - for inline in 
(article_contents.get('inlines') or []): - inline_type = inline.get('type') - if inline_type == 'iframe': - iframe_url = try_get(inline, lambda x: x['attrs']['src']) - if iframe_url: - yield self.url_result(iframe_url) - elif inline_type == 'video': - video_id = inline.get('id') - if video_id: - yield { - '_type': 'url', - 'id': video_id, - 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, - 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), - 'description': inline.get('description'), - 'duration': parse_duration(inline.get('duration')), - 'ie_key': AbcNewsVideoIE.ie_key(), - } - - return self.playlist_result( - entries(), story_id, article_contents.get('headline'), - article_contents.get('subHead')) diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py deleted file mode 100644 index 0bc69a64f..000000000 --- a/youtube_dl/extractor/abcotvs.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - dict_get, - int_or_none, - try_get, -) - - -class ABCOTVSIE(InfoExtractor): - IE_NAME = 'abcotvs' - IE_DESC = 'ABC Owned Television Stations' - _VALID_URL = r'https?://(?P<site>abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P<display_id>[^/]+))?/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', - 'info_dict': { - 'id': '472548', - 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', - 'ext': 'mp4', - 'title': 'East Bay museum celebrates synthesized music', - 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421118520, - 'upload_date': '20150113', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://abc7news.com/472581', - 'only_matching': True, - }, - { - 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', - 'only_matching': True, - }, - ] - _SITE_MAP = { - '6abc': 'wpvi', - 'abc11': 'wtvd', - 'abc13': 'ktrk', - 'abc30': 'kfsn', - 'abc7': 'kabc', - 'abc7chicago': 'wls', - 'abc7news': 'kgo', - 'abc7ny': 'wabc', - } - - def _real_extract(self, url): - site, display_id, video_id = re.match(self._VALID_URL, url).groups() - display_id = display_id or video_id - station = self._SITE_MAP[site] - - data = self._download_json( - 'https://api.abcotvs.com/v2/content', display_id, query={ - 'id': video_id, - 'key': 'otv.web.%s.story' % station, - 'station': station, - })['data'] - video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data - video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) - title = video.get('title') or video['linkText'] - - formats = [] - m3u8_url = video.get('m3u8') - if m3u8_url: - formats = self._extract_m3u8_formats( - video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) - mp4_url = video.get('mp4') - if mp4_url: - formats.append({ - 'abr': 128, - 'format_id': 'https', - 'height': 360, - 'url': mp4_url, - 'width': 640, - }) - self._sort_formats(formats) - - image = video.get('image') or {} - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': dict_get(video, ('description', 'caption'), try_get(video, lambda x: x['meta']['description'])), - 'thumbnail': dict_get(image, ('source', 'dynamicSource')), - 'timestamp': int_or_none(video.get('date')), - 
'duration': int_or_none(video.get('length')), - 'formats': formats, - } - - -class ABCOTVSClipsIE(InfoExtractor): - IE_NAME = 'abcotvs:clips' - _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P<id>\d+)' - _TEST = { - 'url': 'https://clips.abcotvs.com/kabc/video/214814', - 'info_dict': { - 'id': '214814', - 'ext': 'mp4', - 'title': 'SpaceX launch pad explosion destroys rocket, satellite', - 'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b', - 'upload_date': '20160901', - 'timestamp': 1472756695, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0] - title = video_data['title'] - formats = self._extract_m3u8_formats( - video_data['videoURL'].split('?')[0], video_id, 'mp4') - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnailURL'), - 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('pubDate')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py deleted file mode 100644 index b9355a2c8..000000000 --- a/youtube_dl/extractor/acast.py +++ /dev/null @@ -1,126 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - clean_podcast_url, - int_or_none, - parse_iso8601, -) - - -class ACastBaseIE(InfoExtractor): - def _extract_episode(self, episode, show_info): - title = episode['title'] - info = { - 'id': episode['id'], - 'display_id': episode.get('episodeUrl'), - 'url': clean_podcast_url(episode['url']), - 'title': title, - 'description': clean_html(episode.get('description') or episode.get('summary')), - 'thumbnail': episode.get('image'), - 'timestamp': parse_iso8601(episode.get('publishDate')), - 'duration': int_or_none(episode.get('duration')), - 'filesize': int_or_none(episode.get('contentLength')), - 'season_number': int_or_none(episode.get('season')), - 'episode': title, - 'episode_number': int_or_none(episode.get('episode')), - } - info.update(show_info) - return info - - def _extract_show_info(self, show): - return { - 'creator': show.get('author'), - 'series': show.get('title'), - } - - def _call_api(self, path, video_id, query=None): - return self._download_json( - 'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) - - -class ACastIE(ACastBaseIE): - IE_NAME = 'acast' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?:embed|www)\.)?acast\.com/| - play\.acast\.com/s/ - ) - (?P<channel>[^/]+)/(?P<id>[^/#?]+) - ''' - _TESTS = [{ - 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', - 'info_dict': { - 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', - 'ext': 'mp3', - 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', - 'timestamp': 1477346700, - 'upload_date': '20161024', - 'duration': 2766, - 'creator': 'Anton Berg & Martin Johnson', - 'series': 'Spår', - 'episode': '2. 
Raggarmordet - Röster ur det förflutna', - } - }, { - 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', - 'only_matching': True, - }, { - 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', - 'only_matching': True, - }, { - 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel, display_id = re.match(self._VALID_URL, url).groups() - episode = self._call_api( - '%s/episodes/%s' % (channel, display_id), - display_id, {'showInfo': 'true'}) - return self._extract_episode( - episode, self._extract_show_info(episode.get('show') or {})) - - -class ACastChannelIE(ACastBaseIE): - IE_NAME = 'acast:channel' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?acast\.com/| - play\.acast\.com/s/ - ) - (?P<id>[^/#?]+) - ''' - _TESTS = [{ - 'url': 'https://www.acast.com/todayinfocus', - 'info_dict': { - 'id': '4efc5294-5385-4847-98bd-519799ce5786', - 'title': 'Today in Focus', - 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', - }, - 'playlist_mincount': 200, - }, { - 'url': 'http://play.acast.com/s/ft-banking-weekly', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - - def _real_extract(self, url): - show_slug = self._match_id(url) - show = self._call_api(show_slug, show_slug) - show_info = self._extract_show_info(show) - entries = [] - for episode in (show.get('episodes') or []): - entries.append(self._extract_episode(episode, show_info)) - return self.playlist_result( - entries, show.get('id'), show.get('title'), show.get('description')) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py deleted file mode 100644 index 38dca1b0a..000000000 --- a/youtube_dl/extractor/adobepass.py +++ /dev/null @@ -1,1572 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import time -import xml.etree.ElementTree as etree - -from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_urlparse, -) -from ..utils import ( - unescapeHTML, - urlencode_postdata, - unified_timestamp, - ExtractorError, - NO_DEFAULT, -) - - -MSO_INFO = { - 'DTV': { - 'name': 'DIRECTV', - 'username_field': 'username', - 'password_field': 'password', - }, - 'ATT': { - 'name': 'AT&T U-verse', - 'username_field': 'userid', - 'password_field': 'password', - }, - 'ATTOTT': { - 'name': 'DIRECTV NOW', - 'username_field': 'email', - 'password_field': 'loginpassword', - }, - 'Rogers': { - 'name': 'Rogers', - 'username_field': 'UserName', - 'password_field': 'UserPassword', - }, - 'Comcast_SSO': { - 'name': 'Comcast XFINITY', - 'username_field': 'user', - 'password_field': 'passwd', - }, - 'TWC': { - 'name': 'Time Warner Cable | Spectrum', - 'username_field': 'Ecom_User_ID', - 'password_field': 'Ecom_Password', - }, - 'Brighthouse': { - 'name': 'Bright House Networks | Spectrum', - 'username_field': 'j_username', - 'password_field': 'j_password', - }, - 'Charter_Direct': { - 'name': 'Charter Spectrum', - 'username_field': 'IDToken1', - 'password_field': 'IDToken2', - }, - 'Verizon': { - 'name': 'Verizon FiOS', - 'username_field': 'IDToken1', - 'password_field': 'IDToken2', - }, - 'thr030': { - 'name': '3 Rivers Communications' - }, - 'com140': { - 'name': 'Access Montana' - }, - 'acecommunications': { - 'name': 'AcenTek' - }, - 'acm010': { - 'name': 'Acme Communications' - }, - 'ada020': { - 
'name': 'Adams Cable Service' - }, - 'alb020': { - 'name': 'Albany Mutual Telephone' - }, - 'algona': { - 'name': 'Algona Municipal Utilities' - }, - 'allwest': { - 'name': 'All West Communications' - }, - 'all025': { - 'name': 'Allen\'s Communications' - }, - 'spl010': { - 'name': 'Alliance Communications' - }, - 'all070': { - 'name': 'ALLO Communications' - }, - 'alpine': { - 'name': 'Alpine Communications' - }, - 'hun015': { - 'name': 'American Broadband' - }, - 'nwc010': { - 'name': 'American Broadband Missouri' - }, - 'com130-02': { - 'name': 'American Community Networks' - }, - 'com130-01': { - 'name': 'American Warrior Networks' - }, - 'tom020': { - 'name': 'Amherst Telephone/Tomorrow Valley' - }, - 'tvc020': { - 'name': 'Andycable' - }, - 'arkwest': { - 'name': 'Arkwest Communications' - }, - 'art030': { - 'name': 'Arthur Mutual Telephone Company' - }, - 'arvig': { - 'name': 'Arvig' - }, - 'nttcash010': { - 'name': 'Ashland Home Net' - }, - 'astound': { - 'name': 'Astound (now Wave)' - }, - 'dix030': { - 'name': 'ATC Broadband' - }, - 'ara010': { - 'name': 'ATC Communications' - }, - 'she030-02': { - 'name': 'Ayersville Communications' - }, - 'baldwin': { - 'name': 'Baldwin Lightstream' - }, - 'bal040': { - 'name': 'Ballard TV' - }, - 'cit025': { - 'name': 'Bardstown Cable TV' - }, - 'bay030': { - 'name': 'Bay Country Communications' - }, - 'tel095': { - 'name': 'Beaver Creek Cooperative Telephone' - }, - 'bea020': { - 'name': 'Beaver Valley Cable' - }, - 'bee010': { - 'name': 'Bee Line Cable' - }, - 'wir030': { - 'name': 'Beehive Broadband' - }, - 'bra020': { - 'name': 'BELD' - }, - 'bel020': { - 'name': 'Bellevue Municipal Cable' - }, - 'vol040-01': { - 'name': 'Ben Lomand Connect / BLTV' - }, - 'bev010': { - 'name': 'BEVCOMM' - }, - 'big020': { - 'name': 'Big Sandy Broadband' - }, - 'ble020': { - 'name': 'Bledsoe Telephone Cooperative' - }, - 'bvt010': { - 'name': 'Blue Valley Tele-Communications' - }, - 'bra050': { - 'name': 'Brandenburg Telephone Co.' - }, - 'bte010': { - 'name': 'Bristol Tennessee Essential Services' - }, - 'annearundel': { - 'name': 'Broadstripe' - }, - 'btc010': { - 'name': 'BTC Communications' - }, - 'btc040': { - 'name': 'BTC Vision - Nahunta' - }, - 'bul010': { - 'name': 'Bulloch Telephone Cooperative' - }, - 'but010': { - 'name': 'Butler-Bremer Communications' - }, - 'tel160-csp': { - 'name': 'C Spire SNAP' - }, - 'csicable': { - 'name': 'Cable Services Inc.' 
- }, - 'cableamerica': { - 'name': 'CableAmerica' - }, - 'cab038': { - 'name': 'CableSouth Media 3' - }, - 'weh010-camtel': { - 'name': 'Cam-Tel Company' - }, - 'car030': { - 'name': 'Cameron Communications' - }, - 'canbytel': { - 'name': 'Canby Telcom' - }, - 'crt020': { - 'name': 'CapRock Tv' - }, - 'car050': { - 'name': 'Carnegie Cable' - }, - 'cas': { - 'name': 'CAS Cable' - }, - 'casscomm': { - 'name': 'CASSCOMM' - }, - 'mid180-02': { - 'name': 'Catalina Broadband Solutions' - }, - 'cccomm': { - 'name': 'CC Communications' - }, - 'nttccde010': { - 'name': 'CDE Lightband' - }, - 'cfunet': { - 'name': 'Cedar Falls Utilities' - }, - 'dem010-01': { - 'name': 'Celect-Bloomer Telephone Area' - }, - 'dem010-02': { - 'name': 'Celect-Bruce Telephone Area' - }, - 'dem010-03': { - 'name': 'Celect-Citizens Connected Area' - }, - 'dem010-04': { - 'name': 'Celect-Elmwood/Spring Valley Area' - }, - 'dem010-06': { - 'name': 'Celect-Mosaic Telecom' - }, - 'dem010-05': { - 'name': 'Celect-West WI Telephone Area' - }, - 'net010-02': { - 'name': 'Cellcom/Nsight Telservices' - }, - 'cen100': { - 'name': 'CentraCom' - }, - 'nttccst010': { - 'name': 'Central Scott / CSTV' - }, - 'cha035': { - 'name': 'Chaparral CableVision' - }, - 'cha050': { - 'name': 'Chariton Valley Communication Corporation, Inc.' - }, - 'cha060': { - 'name': 'Chatmoss Cablevision' - }, - 'nttcche010': { - 'name': 'Cherokee Communications' - }, - 'che050': { - 'name': 'Chesapeake Bay Communications' - }, - 'cimtel': { - 'name': 'Cim-Tel Cable, LLC.' - }, - 'cit180': { - 'name': 'Citizens Cablevision - Floyd, VA' - }, - 'cit210': { - 'name': 'Citizens Cablevision, Inc.' - }, - 'cit040': { - 'name': 'Citizens Fiber' - }, - 'cit250': { - 'name': 'Citizens Mutual' - }, - 'war040': { - 'name': 'Citizens Telephone Corporation' - }, - 'wat025': { - 'name': 'City Of Monroe' - }, - 'wadsworth': { - 'name': 'CityLink' - }, - 'nor100': { - 'name': 'CL Tel' - }, - 'cla010': { - 'name': 'Clarence Telephone and Cedar Communications' - }, - 'ser060': { - 'name': 'Clear Choice Communications' - }, - 'tac020': { - 'name': 'Click! Cable TV' - }, - 'war020': { - 'name': 'CLICK1.NET' - }, - 'cml010': { - 'name': 'CML Telephone Cooperative Association' - }, - 'cns': { - 'name': 'CNS' - }, - 'com160': { - 'name': 'Co-Mo Connect' - }, - 'coa020': { - 'name': 'Coast Communications' - }, - 'coa030': { - 'name': 'Coaxial Cable TV' - }, - 'mid055': { - 'name': 'Cobalt TV (Mid-State Community TV)' - }, - 'col070': { - 'name': 'Columbia Power & Water Systems' - }, - 'col080': { - 'name': 'Columbus Telephone' - }, - 'nor105': { - 'name': 'Communications 1 Cablevision, Inc.' 
- }, - 'com150': { - 'name': 'Community Cable & Broadband' - }, - 'com020': { - 'name': 'Community Communications Company' - }, - 'coy010': { - 'name': 'commZoom' - }, - 'com025': { - 'name': 'Complete Communication Services' - }, - 'cat020': { - 'name': 'Comporium' - }, - 'com071': { - 'name': 'ComSouth Telesys' - }, - 'consolidatedcable': { - 'name': 'Consolidated' - }, - 'conwaycorp': { - 'name': 'Conway Corporation' - }, - 'coo050': { - 'name': 'Coon Valley Telecommunications Inc' - }, - 'coo080': { - 'name': 'Cooperative Telephone Company' - }, - 'cpt010': { - 'name': 'CP-TEL' - }, - 'cra010': { - 'name': 'Craw-Kan Telephone' - }, - 'crestview': { - 'name': 'Crestview Cable Communications' - }, - 'cross': { - 'name': 'Cross TV' - }, - 'cro030': { - 'name': 'Crosslake Communications' - }, - 'ctc040': { - 'name': 'CTC - Brainerd MN' - }, - 'phe030': { - 'name': 'CTV-Beam - East Alabama' - }, - 'cun010': { - 'name': 'Cunningham Telephone & Cable' - }, - 'dpc010': { - 'name': 'D & P Communications' - }, - 'dak030': { - 'name': 'Dakota Central Telecommunications' - }, - 'nttcdel010': { - 'name': 'Delcambre Telephone LLC' - }, - 'tel160-del': { - 'name': 'Delta Telephone Company' - }, - 'sal040': { - 'name': 'DiamondNet' - }, - 'ind060-dc': { - 'name': 'Direct Communications' - }, - 'doy010': { - 'name': 'Doylestown Cable TV' - }, - 'dic010': { - 'name': 'DRN' - }, - 'dtc020': { - 'name': 'DTC' - }, - 'dtc010': { - 'name': 'DTC Cable (Delhi)' - }, - 'dum010': { - 'name': 'Dumont Telephone Company' - }, - 'dun010': { - 'name': 'Dunkerton Telephone Cooperative' - }, - 'cci010': { - 'name': 'Duo County Telecom' - }, - 'eagle': { - 'name': 'Eagle Communications' - }, - 'weh010-east': { - 'name': 'East Arkansas Cable TV' - }, - 'eatel': { - 'name': 'EATEL Video, LLC' - }, - 'ell010': { - 'name': 'ECTA' - }, - 'emerytelcom': { - 'name': 'Emery Telcom Video LLC' - }, - 'nor200': { - 'name': 'Empire Access' - }, - 'endeavor': { - 'name': 'Endeavor Communications' - }, - 'sun045': { - 'name': 'Enhanced Telecommunications Corporation' - }, - 'mid030': { - 'name': 'enTouch' - }, - 'epb020': { - 'name': 'EPB Smartnet' - }, - 'jea010': { - 'name': 'EPlus Broadband' - }, - 'com065': { - 'name': 'ETC' - }, - 'ete010': { - 'name': 'Etex Communications' - }, - 'fbc-tele': { - 'name': 'F&B Communications' - }, - 'fal010': { - 'name': 'Falcon Broadband' - }, - 'fam010': { - 'name': 'FamilyView CableVision' - }, - 'far020': { - 'name': 'Farmers Mutual Telephone Company' - }, - 'fay010': { - 'name': 'Fayetteville Public Utilities' - }, - 'sal060': { - 'name': 'fibrant' - }, - 'fid010': { - 'name': 'Fidelity Communications' - }, - 'for030': { - 'name': 'FJ Communications' - }, - 'fli020': { - 'name': 'Flint River Communications' - }, - 'far030': { - 'name': 'FMT - Jesup' - }, - 'foo010': { - 'name': 'Foothills Communications' - }, - 'for080': { - 'name': 'Forsyth CableNet' - }, - 'fbcomm': { - 'name': 'Frankfort Plant Board' - }, - 'tel160-fra': { - 'name': 'Franklin Telephone Company' - }, - 'nttcftc010': { - 'name': 'FTC' - }, - 'fullchannel': { - 'name': 'Full Channel, Inc.' - }, - 'gar040': { - 'name': 'Gardonville Cooperative Telephone Association' - }, - 'gbt010': { - 'name': 'GBT Communications, Inc.' - }, - 'tec010': { - 'name': 'Genuine Telecom' - }, - 'clr010': { - 'name': 'Giant Communications' - }, - 'gla010': { - 'name': 'Glasgow EPB' - }, - 'gle010': { - 'name': 'Glenwood Telecommunications' - }, - 'gra060': { - 'name': 'GLW Broadband Inc.' 
- }, - 'goldenwest': { - 'name': 'Golden West Cablevision' - }, - 'vis030': { - 'name': 'Grantsburg Telcom' - }, - 'gpcom': { - 'name': 'Great Plains Communications' - }, - 'gri010': { - 'name': 'Gridley Cable Inc' - }, - 'hbc010': { - 'name': 'H&B Cable Services' - }, - 'hae010': { - 'name': 'Haefele TV Inc.' - }, - 'htc010': { - 'name': 'Halstad Telephone Company' - }, - 'har005': { - 'name': 'Harlan Municipal Utilities' - }, - 'har020': { - 'name': 'Hart Communications' - }, - 'ced010': { - 'name': 'Hartelco TV' - }, - 'hea040': { - 'name': 'Heart of Iowa Communications Cooperative' - }, - 'htc020': { - 'name': 'Hickory Telephone Company' - }, - 'nttchig010': { - 'name': 'Highland Communication Services' - }, - 'hig030': { - 'name': 'Highland Media' - }, - 'spc010': { - 'name': 'Hilliary Communications' - }, - 'hin020': { - 'name': 'Hinton CATV Co.' - }, - 'hometel': { - 'name': 'HomeTel Entertainment, Inc.' - }, - 'hoodcanal': { - 'name': 'Hood Canal Communications' - }, - 'weh010-hope': { - 'name': 'Hope - Prescott Cable TV' - }, - 'horizoncable': { - 'name': 'Horizon Cable TV, Inc.' - }, - 'hor040': { - 'name': 'Horizon Chillicothe Telephone' - }, - 'htc030': { - 'name': 'HTC Communications Co. - IL' - }, - 'htccomm': { - 'name': 'HTC Communications, Inc. - IA' - }, - 'wal005': { - 'name': 'Huxley Communications' - }, - 'imon': { - 'name': 'ImOn Communications' - }, - 'ind040': { - 'name': 'Independence Telecommunications' - }, - 'rrc010': { - 'name': 'Inland Networks' - }, - 'stc020': { - 'name': 'Innovative Cable TV St Croix' - }, - 'car100': { - 'name': 'Innovative Cable TV St Thomas-St John' - }, - 'icc010': { - 'name': 'Inside Connect Cable' - }, - 'int100': { - 'name': 'Integra Telecom' - }, - 'int050': { - 'name': 'Interstate Telecommunications Coop' - }, - 'irv010': { - 'name': 'Irvine Cable' - }, - 'k2c010': { - 'name': 'K2 Communications' - }, - 'kal010': { - 'name': 'Kalida Telephone Company, Inc.' - }, - 'kal030': { - 'name': 'Kalona Cooperative Telephone Company' - }, - 'kmt010': { - 'name': 'KMTelecom' - }, - 'kpu010': { - 'name': 'KPU Telecommunications' - }, - 'kuh010': { - 'name': 'Kuhn Communications, Inc.' - }, - 'lak130': { - 'name': 'Lakeland Communications' - }, - 'lan010': { - 'name': 'Langco' - }, - 'lau020': { - 'name': 'Laurel Highland Total Communications, Inc.' - }, - 'leh010': { - 'name': 'Lehigh Valley Cooperative Telephone' - }, - 'bra010': { - 'name': 'Limestone Cable/Bracken Cable' - }, - 'loc020': { - 'name': 'LISCO' - }, - 'lit020': { - 'name': 'Litestream' - }, - 'tel140': { - 'name': 'LivCom' - }, - 'loc010': { - 'name': 'LocalTel Communications' - }, - 'weh010-longview': { - 'name': 'Longview - Kilgore Cable TV' - }, - 'lon030': { - 'name': 'Lonsdale Video Ventures, LLC' - }, - 'lns010': { - 'name': 'Lost Nation-Elwood Telephone Co.' - }, - 'nttclpc010': { - 'name': 'LPC Connect' - }, - 'lumos': { - 'name': 'Lumos Networks' - }, - 'madison': { - 'name': 'Madison Communications' - }, - 'mad030': { - 'name': 'Madison County Cable Inc.' - }, - 'nttcmah010': { - 'name': 'Mahaska Communication Group' - }, - 'mar010': { - 'name': 'Marne & Elk Horn Telephone Company' - }, - 'mcc040': { - 'name': 'McClure Telephone Co.' - }, - 'mctv': { - 'name': 'MCTV' - }, - 'merrimac': { - 'name': 'Merrimac Communications Ltd.' 
- }, - 'metronet': { - 'name': 'Metronet' - }, - 'mhtc': { - 'name': 'MHTC' - }, - 'midhudson': { - 'name': 'Mid-Hudson Cable' - }, - 'midrivers': { - 'name': 'Mid-Rivers Communications' - }, - 'mid045': { - 'name': 'Midstate Communications' - }, - 'mil080': { - 'name': 'Milford Communications' - }, - 'min030': { - 'name': 'MINET' - }, - 'nttcmin010': { - 'name': 'Minford TV' - }, - 'san040-02': { - 'name': 'Mitchell Telecom' - }, - 'mlg010': { - 'name': 'MLGC' - }, - 'mon060': { - 'name': 'Mon-Cre TVE' - }, - 'mou110': { - 'name': 'Mountain Telephone' - }, - 'mou050': { - 'name': 'Mountain Village Cable' - }, - 'mtacomm': { - 'name': 'MTA Communications, LLC' - }, - 'mtc010': { - 'name': 'MTC Cable' - }, - 'med040': { - 'name': 'MTC Technologies' - }, - 'man060': { - 'name': 'MTCC' - }, - 'mtc030': { - 'name': 'MTCO Communications' - }, - 'mul050': { - 'name': 'Mulberry Telecommunications' - }, - 'mur010': { - 'name': 'Murray Electric System' - }, - 'musfiber': { - 'name': 'MUS FiberNET' - }, - 'mpw': { - 'name': 'Muscatine Power & Water' - }, - 'nttcsli010': { - 'name': 'myEVTV.com' - }, - 'nor115': { - 'name': 'NCC' - }, - 'nor260': { - 'name': 'NDTC' - }, - 'nctc': { - 'name': 'Nebraska Central Telecom, Inc.' - }, - 'nel020': { - 'name': 'Nelsonville TV Cable' - }, - 'nem010': { - 'name': 'Nemont' - }, - 'new075': { - 'name': 'New Hope Telephone Cooperative' - }, - 'nor240': { - 'name': 'NICP' - }, - 'cic010': { - 'name': 'NineStar Connect' - }, - 'nktelco': { - 'name': 'NKTelco' - }, - 'nortex': { - 'name': 'Nortex Communications' - }, - 'nor140': { - 'name': 'North Central Telephone Cooperative' - }, - 'nor030': { - 'name': 'Northland Communications' - }, - 'nor075': { - 'name': 'Northwest Communications' - }, - 'nor125': { - 'name': 'Norwood Light Broadband' - }, - 'net010': { - 'name': 'Nsight Telservices' - }, - 'dur010': { - 'name': 'Ntec' - }, - 'nts010': { - 'name': 'NTS Communications' - }, - 'new045': { - 'name': 'NU-Telecom' - }, - 'nulink': { - 'name': 'NuLink' - }, - 'jam030': { - 'name': 'NVC' - }, - 'far035': { - 'name': 'OmniTel Communications' - }, - 'onesource': { - 'name': 'OneSource Communications' - }, - 'cit230': { - 'name': 'Opelika Power Services' - }, - 'daltonutilities': { - 'name': 'OptiLink' - }, - 'mid140': { - 'name': 'OPTURA' - }, - 'ote010': { - 'name': 'OTEC Communication Company' - }, - 'cci020': { - 'name': 'Packerland Broadband' - }, - 'pan010': { - 'name': 'Panora Telco/Guthrie Center Communications' - }, - 'otter': { - 'name': 'Park Region Telephone & Otter Tail Telcom' - }, - 'mid050': { - 'name': 'Partner Communications Cooperative' - }, - 'fib010': { - 'name': 'Pathway' - }, - 'paulbunyan': { - 'name': 'Paul Bunyan Communications' - }, - 'pem020': { - 'name': 'Pembroke Telephone Company' - }, - 'mck010': { - 'name': 'Peoples Rural Telephone Cooperative' - }, - 'pul010': { - 'name': 'PES Energize' - }, - 'phi010': { - 'name': 'Philippi Communications System' - }, - 'phonoscope': { - 'name': 'Phonoscope Cable' - }, - 'pin070': { - 'name': 'Pine Belt Communications, Inc.' - }, - 'weh010-pine': { - 'name': 'Pine Bluff Cable TV' - }, - 'pin060': { - 'name': 'Pineland Telephone Cooperative' - }, - 'cam010': { - 'name': 'Pinpoint Communications' - }, - 'pio060': { - 'name': 'Pioneer Broadband' - }, - 'pioncomm': { - 'name': 'Pioneer Communications' - }, - 'pioneer': { - 'name': 'Pioneer DTV' - }, - 'pla020': { - 'name': 'Plant TiftNet, Inc.' 
- }, - 'par010': { - 'name': 'PLWC' - }, - 'pro035': { - 'name': 'PMT' - }, - 'vik011': { - 'name': 'Polar Cablevision' - }, - 'pottawatomie': { - 'name': 'Pottawatomie Telephone Co.' - }, - 'premiercomm': { - 'name': 'Premier Communications' - }, - 'psc010': { - 'name': 'PSC' - }, - 'pan020': { - 'name': 'PTCI' - }, - 'qco010': { - 'name': 'QCOL' - }, - 'qua010': { - 'name': 'Quality Cablevision' - }, - 'rad010': { - 'name': 'Radcliffe Telephone Company' - }, - 'car040': { - 'name': 'Rainbow Communications' - }, - 'rai030': { - 'name': 'Rainier Connect' - }, - 'ral010': { - 'name': 'Ralls Technologies' - }, - 'rct010': { - 'name': 'RC Technologies' - }, - 'red040': { - 'name': 'Red River Communications' - }, - 'ree010': { - 'name': 'Reedsburg Utility Commission' - }, - 'mol010': { - 'name': 'Reliance Connects- Oregon' - }, - 'res020': { - 'name': 'Reserve Telecommunications' - }, - 'weh010-resort': { - 'name': 'Resort TV Cable' - }, - 'rld010': { - 'name': 'Richland Grant Telephone Cooperative, Inc.' - }, - 'riv030': { - 'name': 'River Valley Telecommunications Coop' - }, - 'rockportcable': { - 'name': 'Rock Port Cablevision' - }, - 'rsf010': { - 'name': 'RS Fiber' - }, - 'rtc': { - 'name': 'RTC Communication Corp' - }, - 'res040': { - 'name': 'RTC-Reservation Telephone Coop.' - }, - 'rte010': { - 'name': 'RTEC Communications' - }, - 'stc010': { - 'name': 'S&T' - }, - 'san020': { - 'name': 'San Bruno Cable TV' - }, - 'san040-01': { - 'name': 'Santel' - }, - 'sav010': { - 'name': 'SCI Broadband-Savage Communications Inc.' - }, - 'sco050': { - 'name': 'Scottsboro Electric Power Board' - }, - 'scr010': { - 'name': 'Scranton Telephone Company' - }, - 'selco': { - 'name': 'SELCO' - }, - 'she010': { - 'name': 'Shentel' - }, - 'she030': { - 'name': 'Sherwood Mutual Telephone Association, Inc.' - }, - 'ind060-ssc': { - 'name': 'Silver Star Communications' - }, - 'sjoberg': { - 'name': 'Sjoberg\'s Inc.' - }, - 'sou025': { - 'name': 'SKT' - }, - 'sky050': { - 'name': 'SkyBest TV' - }, - 'nttcsmi010': { - 'name': 'Smithville Communications' - }, - 'woo010': { - 'name': 'Solarus' - }, - 'sou075': { - 'name': 'South Central Rural Telephone Cooperative' - }, - 'sou065': { - 'name': 'South Holt Cablevision, Inc.' - }, - 'sou035': { - 'name': 'South Slope Cooperative Communications' - }, - 'spa020': { - 'name': 'Spanish Fork Community Network' - }, - 'spe010': { - 'name': 'Spencer Municipal Utilities' - }, - 'spi005': { - 'name': 'Spillway Communications, Inc.' - }, - 'srt010': { - 'name': 'SRT' - }, - 'cccsmc010': { - 'name': 'St. Maarten Cable TV' - }, - 'sta025': { - 'name': 'Star Communications' - }, - 'sco020': { - 'name': 'STE' - }, - 'uin010': { - 'name': 'STRATA Networks' - }, - 'sum010': { - 'name': 'Sumner Cable TV' - }, - 'pie010': { - 'name': 'Surry TV/PCSI TV' - }, - 'swa010': { - 'name': 'Swayzee Communications' - }, - 'sweetwater': { - 'name': 'Sweetwater Cable Television Co' - }, - 'weh010-talequah': { - 'name': 'Tahlequah Cable TV' - }, - 'tct': { - 'name': 'TCT' - }, - 'tel050': { - 'name': 'Tele-Media Company' - }, - 'com050': { - 'name': 'The Community Agency' - }, - 'thr020': { - 'name': 'Three River' - }, - 'cab140': { - 'name': 'Town & Country Technologies' - }, - 'tra010': { - 'name': 'Trans-Video' - }, - 'tre010': { - 'name': 'Trenton TV Cable Company' - }, - 'tcc': { - 'name': 'Tri County Communications Cooperative' - }, - 'tri025': { - 'name': 'TriCounty Telecom' - }, - 'tri110': { - 'name': 'TrioTel Communications, Inc.' - }, - 'tro010': { - 'name': 'Troy Cablevision, Inc.' 
- }, - 'tsc': { - 'name': 'TSC' - }, - 'cit220': { - 'name': 'Tullahoma Utilities Board' - }, - 'tvc030': { - 'name': 'TV Cable of Rensselaer' - }, - 'tvc015': { - 'name': 'TVC Cable' - }, - 'cab180': { - 'name': 'TVision' - }, - 'twi040': { - 'name': 'Twin Lakes' - }, - 'tvtinc': { - 'name': 'Twin Valley' - }, - 'uis010': { - 'name': 'Union Telephone Company' - }, - 'uni110': { - 'name': 'United Communications - TN' - }, - 'uni120': { - 'name': 'United Services' - }, - 'uss020': { - 'name': 'US Sonet' - }, - 'cab060': { - 'name': 'USA Communications' - }, - 'she005': { - 'name': 'USA Communications/Shellsburg, IA' - }, - 'val040': { - 'name': 'Valley TeleCom Group' - }, - 'val025': { - 'name': 'Valley Telecommunications' - }, - 'val030': { - 'name': 'Valparaiso Broadband' - }, - 'cla050': { - 'name': 'Vast Broadband' - }, - 'sul015': { - 'name': 'Venture Communications Cooperative, Inc.' - }, - 'ver025': { - 'name': 'Vernon Communications Co-op' - }, - 'weh010-vicksburg': { - 'name': 'Vicksburg Video' - }, - 'vis070': { - 'name': 'Vision Communications' - }, - 'volcanotel': { - 'name': 'Volcano Vision, Inc.' - }, - 'vol040-02': { - 'name': 'VolFirst / BLTV' - }, - 'ver070': { - 'name': 'VTel' - }, - 'nttcvtx010': { - 'name': 'VTX1' - }, - 'bci010-02': { - 'name': 'Vyve Broadband' - }, - 'wab020': { - 'name': 'Wabash Mutual Telephone' - }, - 'waitsfield': { - 'name': 'Waitsfield Cable' - }, - 'wal010': { - 'name': 'Walnut Communications' - }, - 'wavebroadband': { - 'name': 'Wave' - }, - 'wav030': { - 'name': 'Waverly Communications Utility' - }, - 'wbi010': { - 'name': 'WBI' - }, - 'web020': { - 'name': 'Webster-Calhoun Cooperative Telephone Association' - }, - 'wes005': { - 'name': 'West Alabama TV Cable' - }, - 'carolinata': { - 'name': 'West Carolina Communications' - }, - 'wct010': { - 'name': 'West Central Telephone Association' - }, - 'wes110': { - 'name': 'West River Cooperative Telephone Company' - }, - 'ani030': { - 'name': 'WesTel Systems' - }, - 'westianet': { - 'name': 'Western Iowa Networks' - }, - 'nttcwhi010': { - 'name': 'Whidbey Telecom' - }, - 'weh010-white': { - 'name': 'White County Cable TV' - }, - 'wes130': { - 'name': 'Wiatel' - }, - 'wik010': { - 'name': 'Wiktel' - }, - 'wil070': { - 'name': 'Wilkes Communications, Inc./RiverStreet Networks' - }, - 'wil015': { - 'name': 'Wilson Communications' - }, - 'win010': { - 'name': 'Windomnet/SMBS' - }, - 'win090': { - 'name': 'Windstream Cable TV' - }, - 'wcta': { - 'name': 'Winnebago Cooperative Telecom Association' - }, - 'wtc010': { - 'name': 'WTC' - }, - 'wil040': { - 'name': 'WTC Communications, Inc.' 
- }, - 'wya010': { - 'name': 'Wyandotte Cable' - }, - 'hin020-02': { - 'name': 'X-Stream Services' - }, - 'xit010': { - 'name': 'XIT Communications' - }, - 'yel010': { - 'name': 'Yelcot Communications' - }, - 'mid180-01': { - 'name': 'yondoo' - }, - 'cou060': { - 'name': 'Zito Media' - }, -} - - -class AdobePassIE(InfoExtractor): - _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' - _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' - _MVPD_CACHE = 'ap-mvpd' - - _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' - - def _download_webpage_handle(self, *args, **kwargs): - headers = self.geo_verification_headers() - headers.update(kwargs.get('headers', {})) - kwargs['headers'] = headers - return super(AdobePassIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - @staticmethod - def _get_mvpd_resource(provider_id, title, guid, rating): - channel = etree.Element('channel') - channel_title = etree.SubElement(channel, 'title') - channel_title.text = provider_id - item = etree.SubElement(channel, 'item') - resource_title = etree.SubElement(item, 'title') - resource_title.text = title - resource_guid = etree.SubElement(item, 'guid') - resource_guid.text = guid - resource_rating = etree.SubElement(item, 'media:rating') - resource_rating.attrib = {'scheme': 'urn:v-chip'} - resource_rating.text = rating - return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>' - - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): - def xml_text(xml_str, tag): - return self._search_regex( - '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) - - def is_expired(token, date_ele): - token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) - return token_expires and token_expires <= int(time.time()) - - def post_form(form_page_res, note, data={}): - form_page, urlh = form_page_res - post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') - if not re.match(r'https?://', post_url): - post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) - form_data = self._hidden_inputs(form_page) - form_data.update(data) - return self._download_webpage_handle( - post_url, video_id, note, data=urlencode_postdata(form_data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - def raise_mvpd_required(): - raise ExtractorError( - 'This video is only available for users of participating TV providers. 
' - 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' - 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) - - def extract_redirect_url(html, url=None, fatal=False): - # TODO: eliminate code duplication with generic extractor and move - # redirection code into _download_webpage_handle - REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' - redirect_url = self._search_regex( - r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, - html, 'meta refresh redirect', - default=NO_DEFAULT if fatal else None, fatal=fatal) - if not redirect_url: - return None - if url: - redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url)) - return redirect_url - - mvpd_headers = { - 'ap_42': 'anonymous', - 'ap_11': 'Linux i686', - 'ap_z': self._USER_AGENT, - 'User-Agent': self._USER_AGENT, - } - - guid = xml_text(resource, 'guid') if '<' in resource else resource - count = 0 - while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token and is_expired(authn_token, 'simpleTokenExpires'): - authn_token = None - if not authn_token: - # TODO add support for other TV Providers - mso_id = self._downloader.params.get('ap_mso') - if not mso_id: - raise_mvpd_required() - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] - - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) - - if mso_id == 'Comcast_SSO': - # Comcast page flow varies by video site and whether you - # are on Comcast's network. - provider_redirect_page, urlh = provider_redirect_page_res - if 'automatically signing you in' in provider_redirect_page: - oauth_redirect_url = self._html_search_regex( - r'window\.location\s*=\s*[\'"]([^\'"]+)', - provider_redirect_page, 'oauth redirect') - self._download_webpage( - oauth_redirect_url, video_id, 'Confirming auto login') - else: - if '<form name="signin"' in provider_redirect_page: - provider_login_page_res = provider_redirect_page_res - elif 'http-equiv="refresh"' in provider_redirect_page: - oauth_redirect_url = extract_redirect_url( - provider_redirect_page, fatal=True) - provider_login_page_res = self._download_webpage_handle( - oauth_redirect_url, video_id, - self._DOWNLOADING_LOGIN_PAGE) - else: - provider_login_page_res = post_form( - provider_redirect_page_res, - self._DOWNLOADING_LOGIN_PAGE) - - mvpd_confirm_page_res = post_form( - provider_login_page_res, 'Logging in', { - mso_info['username_field']: username, - mso_info['password_field']: password, - }) - mvpd_confirm_page, urlh = mvpd_confirm_page_res - if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page: - post_form(mvpd_confirm_page_res, 'Confirming Login') - elif mso_id == 'Verizon': - # In general, if you're connecting from a Verizon-assigned IP, - # you will not actually pass your credentials. - provider_redirect_page, urlh = provider_redirect_page_res - if 'Please wait ...' 
in provider_redirect_page: - saml_redirect_url = self._html_search_regex( - r'self\.parent\.location=(["\'])(?P<url>.+?)\1', - provider_redirect_page, - 'SAML Redirect URL', group='url') - saml_login_page = self._download_webpage( - saml_redirect_url, video_id, - 'Downloading SAML Login Page') - else: - saml_login_page_res = post_form( - provider_redirect_page_res, 'Logging in', { - mso_info['username_field']: username, - mso_info['password_field']: password, - }) - saml_login_page, urlh = saml_login_page_res - if 'Please try again.' in saml_login_page: - raise ExtractorError( - 'We\'re sorry, but either the User ID or Password entered is not correct.') - saml_login_url = self._search_regex( - r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1', - saml_login_page, 'SAML Login URL', group='url') - saml_response_json = self._download_json( - saml_login_url, video_id, 'Downloading SAML Response', - headers={'Content-Type': 'text/xml'}) - self._download_webpage( - saml_response_json['targetValue'], video_id, - 'Confirming Login', data=urlencode_postdata({ - 'SAMLResponse': saml_response_json['SAMLResponse'], - 'RelayState': saml_response_json['RelayState'] - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded' - }) - else: - # Some providers (e.g. DIRECTV NOW) have another meta refresh - # based redirect that should be followed. - provider_redirect_page, urlh = provider_redirect_page_res - provider_refresh_redirect_url = extract_redirect_url( - provider_redirect_page, url=urlh.geturl()) - if provider_refresh_redirect_url: - provider_redirect_page_res = self._download_webpage_handle( - provider_refresh_redirect_url, video_id, - 'Downloading Provider Redirect Page (meta refresh)') - provider_login_page_res = post_form( - provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { - mso_info.get('username_field', 'username'): username, - mso_info.get('password_field', 'password'): password, - }) - if mso_id != 'Rogers': - post_form(mvpd_confirm_page_res, 'Confirming Login') - - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) - if '<pendingLogout' in session: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - authn_token = unescapeHTML(xml_text(session, 'authnToken')) - requestor_info['authn_token'] = authn_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) - - authz_token = requestor_info.get(guid) - if authz_token and is_expired(authz_token, 'simpleTokenTTL'): - authz_token = None - if not authz_token: - authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, - 'Retrieving Authorization Token', data=urlencode_postdata({ - 'resource_id': resource, - 'requestor_id': requestor_id, - 'authentication_token': authn_token, - 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), - 'userMeta': '1', - }), headers=mvpd_headers) - if '<pendingLogout' in authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - if '<error' in authorize: - raise ExtractorError(xml_text(authorize, 'details'), expected=True) - authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) - requestor_info[guid] = authz_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) - - 
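        # Annotation (editorial sketch, not lines from the deleted file): at
        # this point both Adobe Pass tokens sit in the _MVPD_CACHE entry for
        # requestor_id, so subsequent runs can skip the MSO login flow
        # entirely. The cache layout the surrounding code maintains is:
        #
        #     {
        #         'authn_token': '<XML; expiry in simpleTokenExpires>',  # one per requestor
        #         '<resource guid>': '<XML; expiry in simpleTokenTTL>',  # one per title
        #     }
        #
        # Any '<pendingLogout' response above wipes this entry, and the
        # enclosing 'while count < 2' loop retries the whole handshake
        # exactly once before giving up.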
mvpd_headers.update({ - 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), - 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), - }) - - short_authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', - video_id, 'Retrieving Media Token', data=urlencode_postdata({ - 'authz_token': authz_token, - 'requestor_id': requestor_id, - 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), - 'hashed_guid': 'false', - }), headers=mvpd_headers) - if '<pendingLogout' in short_authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - return short_authorize diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py deleted file mode 100644 index 80060f037..000000000 --- a/youtube_dl/extractor/adobetv.py +++ /dev/null @@ -1,288 +0,0 @@ -from __future__ import unicode_literals - -import functools -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - float_or_none, - int_or_none, - ISO639Utils, - OnDemandPagedList, - parse_duration, - str_or_none, - str_to_int, - unified_strdate, -) - - -class AdobeTVBaseIE(InfoExtractor): - def _call_api(self, path, video_id, query, note=None): - return self._download_json( - 'http://tv.adobe.com/api/v4/' + path, - video_id, note, query=query)['data'] - - def _parse_subtitles(self, video_data, url_key): - subtitles = {} - for translation in video_data.get('translations', []): - vtt_path = translation.get(url_key) - if not vtt_path: - continue - lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) - subtitles.setdefault(lang, []).append({ - 'ext': 'vtt', - 'url': vtt_path, - }) - return subtitles - - def _parse_video_data(self, video_data): - video_id = compat_str(video_data['id']) - title = video_data['title'] - - s3_extracted = False - formats = [] - for source in video_data.get('videos', []): - source_url = source.get('url') - if not source_url: - continue - f = { - 'format_id': source.get('quality_level'), - 'fps': int_or_none(source.get('frame_rate')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('video_data_rate')), - 'width': int_or_none(source.get('width')), - 'url': source_url, - } - original_filename = source.get('original_filename') - if original_filename: - if not (f.get('height') and f.get('width')): - mobj = re.search(r'_(\d+)x(\d+)', original_filename) - if mobj: - f.update({ - 'height': int(mobj.group(2)), - 'width': int(mobj.group(1)), - }) - if original_filename.startswith('s3://') and not s3_extracted: - formats.append({ - 'format_id': 'original', - 'preference': 1, - 'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'), - }) - s3_extracted = True - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail'), - 'upload_date': unified_strdate(video_data.get('start_date')), - 'duration': parse_duration(video_data.get('duration')), - 'view_count': str_to_int(video_data.get('playcount')), - 'formats': formats, - 'subtitles': self._parse_subtitles(video_data, 'vtt'), - } - - -class AdobeTVEmbedIE(AdobeTVBaseIE): - IE_NAME = 'adobetv:embed' - _VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P<id>\d+)' - _TEST = { - 'url': 'https://tv.adobe.com/embed/22/4153', - 'md5': 'c8c0461bf04d54574fc2b4d07ac6783a', - 'info_dict': { - 'id': '4153', - 'ext': 'flv', - 'title': 
'Creating Graphics Optimized for BlackBerry', - 'description': 'md5:eac6e8dced38bdaae51cd94447927459', - 'thumbnail': r're:https?://.*\.jpg$', - 'upload_date': '20091109', - 'duration': 377, - 'view_count': int, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_data = self._call_api( - 'episode/' + video_id, video_id, {'disclosure': 'standard'})[0] - return self._parse_video_data(video_data) - - -class AdobeTVIE(AdobeTVBaseIE): - IE_NAME = 'adobetv' - _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)' - - _TEST = { - 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', - 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', - 'info_dict': { - 'id': '10981', - 'ext': 'mp4', - 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', - 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', - 'thumbnail': r're:https?://.*\.jpg$', - 'upload_date': '20110914', - 'duration': 60, - 'view_count': int, - }, - } - - def _real_extract(self, url): - language, show_urlname, urlname = re.match(self._VALID_URL, url).groups() - if not language: - language = 'en' - - video_data = self._call_api( - 'episode/get', urlname, { - 'disclosure': 'standard', - 'language': language, - 'show_urlname': show_urlname, - 'urlname': urlname, - })[0] - return self._parse_video_data(video_data) - - -class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): - _PAGE_SIZE = 25 - - def _fetch_page(self, display_id, query, page): - page += 1 - query['page'] = page - for element_data in self._call_api( - self._RESOURCE, display_id, query, 'Download Page %d' % page): - yield self._process_data(element_data) - - def _extract_playlist_entries(self, display_id, query): - return OnDemandPagedList(functools.partial( - self._fetch_page, display_id, query), self._PAGE_SIZE) - - -class AdobeTVShowIE(AdobeTVPlaylistBaseIE): - IE_NAME = 'adobetv:show' - _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)' - - _TEST = { - 'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', - 'info_dict': { - 'id': '36', - 'title': 'The Complete Picture with Julieanne Kost', - 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', - }, - 'playlist_mincount': 136, - } - _RESOURCE = 'episode' - _process_data = AdobeTVBaseIE._parse_video_data - - def _real_extract(self, url): - language, show_urlname = re.match(self._VALID_URL, url).groups() - if not language: - language = 'en' - query = { - 'disclosure': 'standard', - 'language': language, - 'show_urlname': show_urlname, - } - - show_data = self._call_api( - 'show/get', show_urlname, query)[0] - - return self.playlist_result( - self._extract_playlist_entries(show_urlname, query), - str_or_none(show_data.get('id')), - show_data.get('show_name'), - show_data.get('show_description')) - - -class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): - IE_NAME = 'adobetv:channel' - _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?' 
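    # Annotation (editorial note, not lines from the deleted file): as with
    # AdobeTVShowIE above, listing is lazy. _extract_playlist_entries wraps
    # _fetch_page in an OnDemandPagedList with _PAGE_SIZE = 25, roughly:
    #
    #     OnDemandPagedList(
    #         functools.partial(self._fetch_page, display_id, query), 25)
    #
    # so only the API pages that the requested playlist items actually touch
    # are downloaded; _fetch_page bumps 'page' to a 1-based index before each
    # "Download Page %d" request.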
- - _TEST = { - 'url': 'http://tv.adobe.com/channel/development', - 'info_dict': { - 'id': 'development', - }, - 'playlist_mincount': 96, - } - _RESOURCE = 'show' - - def _process_data(self, show_data): - return self.url_result( - show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id'))) - - def _real_extract(self, url): - language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() - if not language: - language = 'en' - query = { - 'channel_urlname': channel_urlname, - 'language': language, - } - if category_urlname: - query['category_urlname'] = category_urlname - - return self.playlist_result( - self._extract_playlist_entries(channel_urlname, query), - channel_urlname) - - -class AdobeTVVideoIE(AdobeTVBaseIE): - IE_NAME = 'adobetv:video' - _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' - - _TEST = { - # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners - 'url': 'https://video.tv.adobe.com/v/2456/', - 'md5': '43662b577c018ad707a63766462b1e87', - 'info_dict': { - 'id': '2456', - 'ext': 'mp4', - 'title': 'New experience with Acrobat DC', - 'description': 'New experience with Acrobat DC', - 'duration': 248.667, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json(self._search_regex( - r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) - title = video_data['title'] - - formats = [] - sources = video_data.get('sources') or [] - for source in sources: - source_src = source.get('src') - if not source_src: - continue - formats.append({ - 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), - 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), - 'height': int_or_none(source.get('height') or None), - 'tbr': int_or_none(source.get('bitrate') or None), - 'width': int_or_none(source.get('width') or None), - 'url': source_src, - }) - self._sort_formats(formats) - - # For both metadata and downloaded files the duration varies among - # formats. I just pick the max one - duration = max(filter(None, [ - float_or_none(source.get('duration'), scale=1000) - for source in sources])) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('video', {}).get('poster'), - 'duration': duration, - 'subtitles': self._parse_subtitles(video_data, 'vttPath'), - } diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py deleted file mode 100644 index 8d1d9ac7d..000000000 --- a/youtube_dl/extractor/adultswim.py +++ /dev/null @@ -1,202 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .turner import TurnerBaseIE -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - mimetype2ext, - parse_age_limit, - parse_iso8601, - strip_or_none, - try_get, -) - - -class AdultSwimIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?' 
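    # Annotation (editorial note, not lines from the deleted file): the
    # optional (?P<episode_path>...) group lets one extractor cover both page
    # types. With an episode path (e.g. /videos/rick-and-morty/pilot) the
    # GraphQL query built in _real_extract selects getVideoBySlug for a
    # single video; without one (e.g. /videos/attack-on-titan) it selects
    # videos(first:1000) and the method returns a playlist of url_result
    # entries instead.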
- - _TESTS = [{ - 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow', - 'ext': 'mp4', - 'title': 'Rick and Morty - Pilot', - 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', - 'timestamp': 1543294800, - 'upload_date': '20181127', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', - 'info_dict': { - 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', - 'ext': 'mp4', - 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', - 'upload_date': '20080124', - 'timestamp': 1201150800, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': '404 Not Found', - }, { - 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', - 'info_dict': { - 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', - 'ext': 'mp4', - 'title': 'Decker - Inside Decker: A New Hero', - 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', - 'timestamp': 1469480460, - 'upload_date': '20160725', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.adultswim.com/videos/attack-on-titan', - 'info_dict': { - 'id': 'attack-on-titan', - 'title': 'Attack on Titan', - 'description': 'md5:41caa9416906d90711e31dc00cb7db7e', - }, - 'playlist_mincount': 12, - }, { - 'url': 'http://www.adultswim.com/videos/streams/williams-stream', - 'info_dict': { - 'id': 'd8DEBj7QRfetLsRgFnGEyg', - 'ext': 'mp4', - 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'description': 'original programming', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': '404 Not Found', - }] - - def _real_extract(self, url): - show_path, episode_path = re.match(self._VALID_URL, url).groups() - display_id = episode_path or show_path - query = '''query { - getShowBySlug(slug:"%s") { - %%s - } -}''' % show_path - if episode_path: - query = query % '''title - getVideoBySlug(slug:"%s") { - _id - auth - description - duration - episodeNumber - launchDate - mediaID - seasonNumber - poster - title - tvRating - }''' % episode_path - ['getVideoBySlug'] - else: - query = query % '''metaDescription - title - videos(first:1000,sort:["episode_number"]) { - edges { - node { - _id - slug - } - } - }''' - show_data = self._download_json( - 'https://www.adultswim.com/api/search', display_id, - data=json.dumps({'query': query}).encode(), - headers={'Content-Type': 'application/json'})['data']['getShowBySlug'] - if episode_path: - video_data = show_data['getVideoBySlug'] - video_id = video_data['_id'] - episode_title = title = video_data['title'] - series = show_data.get('title') - if series: - title = '%s - %s' % (series, title) - info = { - 'id': video_id, - 'title': title, - 'description': strip_or_none(video_data.get('description')), - 'duration': float_or_none(video_data.get('duration')), - 'formats': [], - 'subtitles': {}, - 'age_limit': parse_age_limit(video_data.get('tvRating')), - 'thumbnail': 
video_data.get('poster'), - 'timestamp': parse_iso8601(video_data.get('launchDate')), - 'series': series, - 'season_number': int_or_none(video_data.get('seasonNumber')), - 'episode': episode_title, - 'episode_number': int_or_none(video_data.get('episodeNumber')), - } - - auth = video_data.get('auth') - media_id = video_data.get('mediaID') - if media_id: - info.update(self._extract_ngtv_info(media_id, { - # CDN_TOKEN_APP_ID from: - # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js - 'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE', - }, { - 'url': url, - 'site_name': 'AdultSwim', - 'auth_required': auth, - })) - - if not auth: - extract_data = self._download_json( - 'https://www.adultswim.com/api/shows/v1/videos/' + video_id, - video_id, query={'fields': 'stream'}, fatal=False) or {} - assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or [] - for asset in assets: - asset_url = asset.get('url') - if not asset_url: - continue - ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) - if ext == 'm3u8': - info['formats'].extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - continue - # info['formats'].extend(self._extract_f4m_formats( - # asset_url, video_id, f4m_id='hds', fatal=False)) - elif ext in ('scc', 'ttml', 'vtt'): - info['subtitles'].setdefault('en', []).append({ - 'url': asset_url, - }) - self._sort_formats(info['formats']) - - return info - else: - entries = [] - for edge in show_data.get('videos', {}).get('edges', []): - video = edge.get('node') or {} - slug = video.get('slug') - if not slug: - continue - entries.append(self.url_result( - 'http://adultswim.com/videos/%s/%s' % (show_path, slug), - 'AdultSwim', video.get('_id'))) - return self.playlist_result( - entries, show_path, show_data.get('title'), - strip_or_none(show_data.get('metaDescription'))) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py deleted file mode 100644 index e55c03fd7..000000000 --- a/youtube_dl/extractor/aenetworks.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .theplatform import ThePlatformIE -from ..utils import ( - ExtractorError, - GeoRestrictedError, - int_or_none, - update_url_query, - urlencode_postdata, -) - - -class AENetworksBaseIE(ThePlatformIE): - _BASE_URL_REGEX = r'''(?x)https?:// - (?:(?:www|play|watch)\.)? 
- (?P<domain> - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/''' - _THEPLATFORM_KEY = 'crazyjava' - _THEPLATFORM_SECRET = 's3cr3t' - _DOMAIN_MAP = { - 'history.com': ('HISTORY', 'history'), - 'aetv.com': ('AETV', 'aetv'), - 'mylifetime.com': ('LIFETIME', 'lifetime'), - 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), - 'fyi.tv': ('FYI', 'fyi'), - 'historyvault.com': (None, 'historyvault'), - 'biography.com': (None, 'biography'), - } - - def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} - if auth: - query['auth'] = auth - TP_SMIL_QUERY = [{ - 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' - }, { - 'assetTypes': 'high_video_s3' - }, { - 'assetTypes': 'high_video_s3', - 'switch': 'hls_high_fastly', - }] - formats = [] - subtitles = {} - last_e = None - for q in TP_SMIL_QUERY: - q.update(query) - m_url = update_url_query(smil_url, q) - m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) - try: - tp_formats, tp_subtitles = self._extract_theplatform_smil( - m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) - except ExtractorError as e: - if isinstance(e, GeoRestrictedError): - raise - last_e = e - continue - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - if last_e and not formats: - raise last_e - self._sort_formats(formats) - return { - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - } - - def _extract_aetn_info(self, domain, filter_key, filter_value, url): - requestor_id, brand = self._DOMAIN_MAP[domain] - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] - title = result['title'] - video_id = result['id'] - media_url = result['publicUrl'] - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - info.update({ - 'title': title, - 'series': result.get('seriesName'), - 'season_number': int_or_none(result.get('tvSeasonNumber')), - 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), - }) - return info - - -class AENetworksIE(AENetworksBaseIE): - IE_NAME = 'aenetworks' - IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id> - shows/[^/]+/season-\d+/episode-\d+| - (?: - (?:movie|special)s/[^/]+| - (?:shows/[^/]+/)?videos - )/[^/?#&]+ - )''' - _TESTS = [{ - 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', - 'info_dict': { - 'id': '22253814', - 'ext': 'mp4', - 'title': 'Winter is Coming', - 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', - 'timestamp': 1338306241, - 'upload_date': '20120529', - 'uploader': 'AENE-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - 'skip': 'This video 
is only available for users of participating TV providers.', - }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'info_dict': { - 'id': '600587331957', - 'ext': 'mp4', - 'title': 'Inlawful Entry', - 'description': 'md5:57c12115a2b384d883fe64ca50529e08', - 'timestamp': 1452634428, - 'upload_date': '20160112', - 'uploader': 'AENE-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', - 'only_matching': True - }, { - 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', - 'only_matching': True - }, { - 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', - 'only_matching': True - }, { - 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', - 'only_matching': True - }, { - 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', - 'only_matching': True - }, { - 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', - 'only_matching': True - }, { - 'url': 'http://www.history.com/videos/history-of-valentines-day', - 'only_matching': True - }, { - 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', - 'only_matching': True - }] - - def _real_extract(self, url): - domain, canonical = re.match(self._VALID_URL, url).groups() - return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) - - -class AENetworksListBaseIE(AENetworksBaseIE): - def _call_api(self, resource, slug, brand, fields): - return self._download_json( - 'https://yoga.appsvcs.aetnd.com/graphql', - slug, query={'brand': brand}, data=urlencode_postdata({ - 'query': '''{ - %s(slug: "%s") { - %s - } -}''' % (resource, slug, fields), - }))['data'][resource] - - def _real_extract(self, url): - domain, slug = re.match(self._VALID_URL, url).groups() - _, brand = self._DOMAIN_MAP[domain] - playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) - base_url = 'http://watch.%s' % domain - - entries = [] - for item in (playlist.get(self._ITEMS_KEY) or []): - doc = self._get_doc(item) - canonical = doc.get('canonical') - if not canonical: - continue - entries.append(self.url_result( - base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) - - description = None - if self._PLAYLIST_DESCRIPTION_KEY: - description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) - - return self.playlist_result( - entries, playlist.get('id'), - playlist.get(self._PLAYLIST_TITLE_KEY), description) - - -class AENetworksCollectionIE(AENetworksListBaseIE): - IE_NAME = 'aenetworks:collection' - _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' - _TESTS = [{ - 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', - 'info_dict': { - 'id': '282', - 'title': 'America The Story of Us', - }, - 'playlist_mincount': 12, - }, { - 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', - 'only_matching': True - }, { - 'url': 'https://www.historyvault.com/collections/mysteryquest', - 'only_matching': True - }] - _RESOURCE = 'list' - _ITEMS_KEY = 'items' - _PLAYLIST_TITLE_KEY = 'display_title' - _PLAYLIST_DESCRIPTION_KEY = None - _FIELDS = '''id - display_title - items { - ... 
on ListVideoItem { - doc { - canonical - id - } - } - }''' - - def _get_doc(self, item): - return item.get('doc') or {} - - -class AENetworksShowIE(AENetworksListBaseIE): - IE_NAME = 'aenetworks:show' - _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' - _TESTS = [{ - 'url': 'http://www.history.com/shows/ancient-aliens', - 'info_dict': { - 'id': 'SERIES1574', - 'title': 'Ancient Aliens', - 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', - }, - 'playlist_mincount': 150, - }] - _RESOURCE = 'series' - _ITEMS_KEY = 'episodes' - _PLAYLIST_TITLE_KEY = 'title' - _PLAYLIST_DESCRIPTION_KEY = 'description' - _FIELDS = '''description - id - title - episodes { - canonical - id - }''' - - def _get_doc(self, item): - return item - - -class HistoryTopicIE(AENetworksBaseIE): - IE_NAME = 'history:topic' - IE_DESC = 'History.com Topic' - _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P<id>[\w+-]+?)-video' - _TESTS = [{ - 'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video', - 'info_dict': { - 'id': '40700995724', - 'ext': 'mp4', - 'title': "History of Valentine’s Day", - 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', - 'timestamp': 1375819729, - 'upload_date': '20130806', - 'uploader': 'AENE-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self.url_result( - 'http://www.history.com/videos/' + display_id, - AENetworksIE.ie_key()) - - -class HistoryPlayerIE(AENetworksBaseIE): - IE_NAME = 'history:player' - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)' - _TESTS = [] - - def _real_extract(self, url): - domain, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_aetn_info(domain, 'id', video_id, url) - - -class BiographyIE(AENetworksBaseIE): - _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', - 'info_dict': { - 'id': '30322987', - 'ext': 'mp4', - 'title': 'Vincent Van Gogh - Full Episode', - 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', - 'timestamp': 1311970571, - 'upload_date': '20110729', - 'uploader': 'AENE-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - player_url = self._search_regex( - r'<phoenix-iframe[^>]+src="(%s)' % HistoryPlayerIE._VALID_URL, - webpage, 'player URL') - return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py deleted file mode 100644 index b56abb1e6..000000000 --- a/youtube_dl/extractor/afreecatv.py +++ /dev/null @@ -1,367 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_xpath -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - url_or_none, - urlencode_postdata, - xpath_text, -) - - -class AfreecaTVIE(InfoExtractor): - IE_NAME = 'afreecatv' - IE_DESC = 'afreecatv.com' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? 
- (?: - /app/(?:index|read_ucc_bbs)\.cgi| - /player/[Pp]layer\.(?:swf|html) - )\?.*?\bnTitleNo=| - vod\.afreecatv\.com/PLAYER/STATION/ - ) - (?P<id>\d+) - ''' - _NETRC_MACHINE = 'afreecatv' - _TESTS = [{ - 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', - 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', - 'info_dict': { - 'id': '36164052', - 'ext': 'mp4', - 'title': '데일리 에이프릴 요정들의 시상식!', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160503', - }, - 'skip': 'Video is gone', - }, { - 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', - 'info_dict': { - 'id': '36153164', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '36153164_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'upload_date': '20160502', - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '36153164_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'upload_date': '20160502', - }, - }], - 'skip': 'Video is gone', - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', - 'info_dict': { - 'id': '18650793', - 'ext': 'mp4', - 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '윈아디', - 'uploader_id': 'badkids', - 'duration': 107, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', - 'info_dict': { - 'id': '10481652', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'duration': 6492, - }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '20160502_c4c62b9d_174361386_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 3601, - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '20160502_39e739bb_174361386_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' 
(part 2)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 2891, - }, - }], - 'params': { - 'skip_download': True, - }, - }, { - # non standard key - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', - 'info_dict': { - 'id': '20170411_BE689A0E_190960999_1_2_h', - 'ext': 'mp4', - 'title': '혼자사는여자집', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': '♥이슬이', - 'uploader_id': 'dasl8121', - 'upload_date': '20170411', - 'duration': 213, - }, - 'params': { - 'skip_download': True, - }, - }, { - # PARTIAL_ADULT - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', - 'info_dict': { - 'id': '20180327_27901457_202289533_1', - 'ext': 'mp4', - 'title': '[생]빨개요♥ (part 1)', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': '[SA]서아', - 'uploader_id': 'bjdyrksu', - 'upload_date': '20180327', - 'duration': 3601, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['adult content'], - }, { - 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', - 'only_matching': True, - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', - 'only_matching': True, - }] - - @staticmethod - def parse_video_key(key): - video_key = {} - m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key) - if m: - video_key['upload_date'] = m.group('upload_date') - video_key['part'] = int(m.group('part')) - return video_key - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_form = { - 'szWork': 'login', - 'szType': 'json', - 'szUid': username, - 'szPassword': password, - 'isSaveId': 'false', - 'szScriptVar': 'oLoginRet', - 'szAction': '', - } - - response = self._download_json( - 'https://login.afreecatv.com/app/LoginAction.php', None, - 'Logging in', data=urlencode_postdata(login_form)) - - _ERRORS = { - -4: 'Your account has been suspended due to a violation of our terms and policies.', - -5: 'https://member.afreecatv.com/app/user_delete_progress.php', - -6: 'https://login.afreecatv.com/membership/changeMember.php', - -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", - -9: 'https://member.afreecatv.com/app/pop_login_block.php', - -11: 'https://login.afreecatv.com/afreeca/second_login.php', - -12: 'https://member.afreecatv.com/app/user_security.php', - 0: 'The username does not exist or you have entered the wrong password.', - -1: 'The username does not exist or you have entered the wrong password.', - -3: 'You have entered your username/password incorrectly.', - -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', - -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', - -32008: 'You have failed to log in. 
Please contact our Help Center.', - } - - result = int_or_none(response.get('RESULT')) - if result != 1: - error = _ERRORS.get(result, 'You have failed to log in.') - raise ExtractorError( - 'Unable to login: %s said: %s' % (self.IE_NAME, error), - expected=True) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - if re.search(r'alert\(["\']This video has been deleted', webpage): - raise ExtractorError( - 'Video %s has been deleted' % video_id, expected=True) - - station_id = self._search_regex( - r'nStationNo\s*=\s*(\d+)', webpage, 'station') - bbs_id = self._search_regex( - r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') - video_id = self._search_regex( - r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - - partial_view = False - for _ in range(2): - query = { - 'nTitleNo': video_id, - 'nStationNo': station_id, - 'nBbsNo': bbs_id, - } - if partial_view: - query['partialView'] = 'SKIP_ADULT' - video_xml = self._download_xml( - 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, 'Downloading video info XML%s' - % (' (skipping adult)' if partial_view else ''), - video_id, headers={ - 'Referer': url, - }, query=query) - - flag = xpath_text(video_xml, './track/flag', 'flag', default=None) - if flag and flag == 'SUCCEED': - break - if flag == 'PARTIAL_ADULT': - self._downloader.report_warning( - 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' - 'Only content suitable for all ages will be downloaded. ' - 'Provide account credentials if you wish to download restricted content.') - partial_view = True - continue - elif flag == 'ADULT': - error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' 
- else: - error = flag - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - else: - raise ExtractorError('Unable to download video info') - - video_element = video_xml.findall(compat_xpath('./track/video'))[-1] - if video_element is None or video_element.text is None: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) - - video_url = video_element.text.strip() - - title = xpath_text(video_xml, './track/title', 'title', fatal=True) - - uploader = xpath_text(video_xml, './track/nickname', 'uploader') - uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') - duration = int_or_none(xpath_text( - video_xml, './track/duration', 'duration')) - thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') - - common_entry = { - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnail': thumbnail, - } - - info = common_entry.copy() - info.update({ - 'id': video_id, - 'title': title, - 'duration': duration, - }) - - if not video_url: - entries = [] - file_elements = video_element.findall(compat_xpath('./file')) - one = len(file_elements) == 1 - for file_num, file_element in enumerate(file_elements, start=1): - file_url = url_or_none(file_element.text) - if not file_url: - continue - key = file_element.get('key', '') - upload_date = self._search_regex( - r'^(\d{8})_', key, 'upload date', default=None) - file_duration = int_or_none(file_element.get('duration')) - format_id = key if key else '%s_%s' % (video_id, file_num) - if determine_ext(file_url) == 'm3u8': - formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', - note='Downloading part %d m3u8 information' % file_num) - else: - formats = [{ - 'url': file_url, - 'format_id': 'http', - }] - if not formats: - continue - self._sort_formats(formats) - file_info = common_entry.copy() - file_info.update({ - 'id': format_id, - 'title': title if one else '%s (part %d)' % (title, file_num), - 'upload_date': upload_date, - 'duration': file_duration, - 'formats': formats, - }) - entries.append(file_info) - entries_info = info.copy() - entries_info.update({ - '_type': 'multi_video', - 'entries': entries, - }) - return entries_info - - info = { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - } - - if determine_ext(video_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - app, playpath = video_url.split('mp4:') - info.update({ - 'url': app, - 'ext': 'flv', - 'play_path': 'mp4:' + playpath, - 'rtmp_live': True, # downloading won't end without this - }) - - return info diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py deleted file mode 100644 index c4f915a3c..000000000 --- a/youtube_dl/extractor/aljazeera.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor - - -class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' - - _TESTS = [{ - 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', - 'info_dict': { - 'id': '3792260579001', - 'ext': 'mp4', - 'title': 'The Slum - Episode 1: Deliverance', - 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline 
of Tondo\'s battle with overcrowding.', - 'uploader_id': '665003303001', - 'timestamp': 1411116829, - 'upload_date': '20140919', - }, - 'add_ie': ['BrightcoveNew'], - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', - 'only_matching': True, - }, { - 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' - - def _real_extract(self, url): - post_type, name = re.match(self._VALID_URL, url).groups() - post_type = { - 'features': 'post', - 'program': 'episode', - 'videos': 'video', - }[post_type.split('/')[0]] - video = self._download_json( - 'https://www.aljazeera.com/graphql', name, query={ - 'operationName': 'SingleArticleQuery', - 'variables': json.dumps({ - 'name': name, - 'postType': post_type, - }), - }, headers={ - 'wp-site': 'aje', - })['data']['article']['video'] - video_id = video['id'] - account_id = video.get('accountId') or '665003303001' - player_id = video.get('playerId') or 'BkeSH5BDb' - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), - 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py deleted file mode 100644 index b8027bbca..000000000 --- a/youtube_dl/extractor/amcnetworks.py +++ /dev/null @@ -1,119 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .theplatform import ThePlatformIE -from ..utils import ( - int_or_none, - parse_age_limit, - try_get, - update_url_query, -) - - -class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', - 'info_dict': { - 'id': '4Lq1dzOnZGt0', - 'ext': 'mp4', - 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", - 'description': "It turns out child stewardesses are very generous with the wine! 
All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", - 'upload_date': '20201120', - 'timestamp': 1605904350, - 'uploader': 'AMCN', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', - 'only_matching': True, - }, { - 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', - 'only_matching': True, - }, { - 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', - 'only_matching': True, - }, { - 'url': 'http://www.ifc.com/movies/chaos', - 'only_matching': True, - }, { - 'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version', - 'only_matching': True, - }, { - 'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention', - 'only_matching': True, - }, { - 'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', - 'only_matching': True, - }, { - 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', - 'only_matching': True, - }] - _REQUESTOR_ID_MAP = { - 'amc': 'AMC', - 'bbcamerica': 'BBCA', - 'ifc': 'IFC', - 'sundancetv': 'SUNDANCE', - 'wetv': 'WETV', - } - - def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() - requestor_id = self._REQUESTOR_ID_MAP[site] - properties = self._download_json( - 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), - display_id)['data']['properties'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - tp_path = 'M_UwQC/media/' + properties['videoPid'] - media_url = 'https://link.theplatform.com/s/' + tp_path - theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - video_id = theplatform_metadata['pid'] - title = theplatform_metadata['title'] - rating = try_get( - theplatform_metadata, lambda x: x['ratings'][0]['rating']) - video_category = properties.get('videoCategory') - if video_category and video_category.endswith('-Auth'): - resource = self._get_mvpd_resource( - requestor_id, title, video_id, rating) - query['auth'] = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - media_url = update_url_query(media_url, query) - formats, subtitles = self._extract_theplatform_smil( - media_url, video_id) - self._sort_formats(formats) - info.update({ - 'id': video_id, - 'subtitles': subtitles, - 'formats': formats, - 'age_limit': parse_age_limit(parse_age_limit(rating)), - }) - ns_keys = theplatform_metadata.get('$xmlns', {}).keys() - if ns_keys: - ns = list(ns_keys)[0] - series = theplatform_metadata.get(ns + '$show') - season_number = int_or_none( - theplatform_metadata.get(ns + '$season')) - episode = theplatform_metadata.get(ns + '$episodeTitle') - episode_number = int_or_none( - theplatform_metadata.get(ns + '$episode')) - if season_number: - title = 'Season %d - %s' % (season_number, title) - if series: - title = '%s - %s' % (series, title) - info.update({ - 'title': title, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - }) - return info diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py deleted file 
mode 100644 index be960c0f9..000000000 --- a/youtube_dl/extractor/americastestkitchen.py +++ /dev/null @@ -1,159 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, - try_get, - unified_strdate, - unified_timestamp, -) - - -class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', - 'md5': 'b861c3e365ac38ad319cfd509c30577f', - 'info_dict': { - 'id': '5b400b9ee338f922cb06450c', - 'title': 'Japanese Suppers', - 'ext': 'mp4', - 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', - 'series': "America's Test Kitchen", - 'season_number': 18, - 'episode': 'Japanese Suppers', - 'episode_number': 15, - }, - 'params': { - 'skip_download': True, - }, - }, { - # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) - 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', - 'md5': '06451608c57651e985a498e69cec17e5', - 'info_dict': { - 'id': '5fbe8c61bda2010001c6763b', - 'title': 'Simple Chicken Dinner', - 'ext': 'mp4', - 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', - 'series': "America's Test Kitchen", - 'season_number': 21, - 'episode': 'Simple Chicken Dinner', - 'episode_number': 3, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', - 'only_matching': True, - }, { - 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', - 'only_matching': True, - }, { - 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', - 'only_matching': True, - }] - - def _real_extract(self, url): - resource_type, video_id = re.match(self._VALID_URL, url).groups() - is_episode = resource_type == 'episode' - if is_episode: - resource_type = 'episodes' - - resource = self._download_json( - 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) - video = resource['video'] if is_episode else resource - episode = resource if is_episode else resource.get('episode') or {} - - return { - '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], - 'ie_key': 'Zype', - 'description': clean_html(video.get('description')), - 'timestamp': unified_timestamp(video.get('publishDate')), - 'release_date': unified_strdate(video.get('publishDate')), - 'episode_number': int_or_none(episode.get('number')), - 'season_number': int_or_none(episode.get('season')), - 'series': try_get(episode, lambda x: x['show']['title']), - 'episode': episode.get('title'), - } - - -class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' - _TESTS = [{ - # ATK Season - 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', - 'info_dict': { - 'id': 'season_1', - 'title': 'Season 1', - }, - 'playlist_count': 13, - 
}, { - # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', - 'info_dict': { - 'id': 'season_12', - 'title': 'Season 12', - }, - 'playlist_count': 13, - }] - - def _real_extract(self, url): - show_name, season_number = re.match(self._VALID_URL, url).groups() - season_number = int(season_number) - - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' - - season = 'Season %d' % season_number - - season_search = self._download_json( - 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ - 'Origin': 'https://www.%s.com' % show_name, - 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', - 'X-Algolia-Application-Id': 'Y1FNZXUI30', - }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, - 'attributesToHighlight': '', - 'hitsPerPage': 1000, - }) - - def entries(): - for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') - if not search_url: - continue - yield { - '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), - 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), - 'title': episode.get('title'), - 'description': episode.get('description'), - 'timestamp': unified_timestamp(episode.get('search_document_date')), - 'season_number': season_number, - 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), - 'ie_key': AmericasTestKitchenIE.ie_key(), - } - - return self.playlist_result( - entries(), 'season_%d' % season_number, season) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py deleted file mode 100644 index b7398563b..000000000 --- a/youtube_dl/extractor/anvato.py +++ /dev/null @@ -1,381 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import hashlib -import json -import random -import re -import time - -from .common import InfoExtractor -from ..aes import aes_encrypt -from ..compat import compat_str -from ..utils import ( - bytes_to_intlist, - determine_ext, - intlist_to_bytes, - int_or_none, - strip_jsonp, - unescapeHTML, - unsmuggle_url, -) - - -def md5_text(s): - if not isinstance(s, compat_str): - s = compat_str(s) - return hashlib.md5(s.encode('utf-8')).hexdigest() - - -class AnvatoIE(InfoExtractor): - _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' - - # Copied from anvplayer.min.js - _ANVACK_TABLE = { - 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', - 'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA', - 'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP', - 'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv', - 'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7', - 'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR', - 'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg', - 'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 
'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto', - 'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY', - 'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh', - 'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK', - 'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D', - 'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad', - 'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp', - 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih', - 'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR', - 'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW', - 'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su', - 'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q', - 'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5', - 'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3', - 'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI', - 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s', - 'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz', - 'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg', - 'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x', - 'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH', - 'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX', - 'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc', - 'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK', - 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7', - 'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C', - 'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e', - 'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1', - 'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re', - 'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51', - 'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho', - 'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': 
'9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9', - 'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH', - 'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F', - 'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo', - 'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR', - 'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa', - 'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk', - 'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ', - 'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ', - 'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m', - 'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b', - 'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3', - 'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK', - 'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', - 'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', - 'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F', - 'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx', - 'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ', - 'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH', - 'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm', - 'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt', - 'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl', - 'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b', - 'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV', - 'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg', - 'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk', - 'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT', - 'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa', - 'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv', - 'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k', - 'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 
'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI', - 'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr', - 'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw', - 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K', - 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH', - 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK', - 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu', - 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', - 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', - 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK', - 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n', - 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD', - 'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk', - 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', - 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', - 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', - 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', - 'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z', - 'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B', - 'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj', - 'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l', - '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P', - 'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A', - 'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V', - 'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z', - 'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9', - 'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e', - 'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D', - 'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d', - 'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ', - 'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V', - 'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe', - 'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP', - '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV', - 'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v', - 'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q', - 'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV', - 'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r', - 'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR', - 'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0', - 
'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl', - 'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923', - '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P', - '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa', - '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V', - 'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5', - 'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ', - 'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye', - 'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o', - 'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e', - 'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z', - 'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R', - '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29', - 'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q', - 'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp', - 'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze', - '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ', - '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa', - '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ', - 'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL', - 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo', - 'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV', - '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa', - 'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y', - '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P', - 'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO', - 'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr', - '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy', - 'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn', - '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj', - 'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29', - 'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V', - 'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5', - 'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy', - 'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e', - '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y', - 'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0', - 'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy', - 'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV', - 'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K', - 'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23', - 'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR', - 'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R', - 'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ', - 'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L', - 'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR', - } - - _MCP_TO_ACCESS_KEY_TABLE = { - 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', - 'lin': 
'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', - 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', - 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', - 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', - 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', - 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', - 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', - 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', - 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', - 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', - 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' - } - - _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' - - _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' - _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' - - _TESTS = [{ - # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 - 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', - 'info_dict': { - 'id': '4465496', - 'ext': 'mp4', - 'title': 'VIDEO: Humpback whale breaches right next to NH boat', - 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', - 'duration': 22, - 'timestamp': 1534855680, - 'upload_date': '20180821', - 'uploader': 'ANV', - }, - 'params': { - 'skip_download': True, - }, - }, { - # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ - 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', - 'only_matching': True, - }] - - def __init__(self, *args, **kwargs): - super(AnvatoIE, self).__init__(*args, **kwargs) - self.__server_time = None - - def _server_time(self, access_key, video_id): - if self.__server_time is not None: - return self.__server_time - - self.__server_time = int(self._download_json( - self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, - note='Fetching server time')['server_time']) - - return self.__server_time - - def _api_prefix(self, access_key): - return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') - - def _get_video_json(self, access_key, video_id): - # See et() in anvplayer.min.js, which is an alias of getVideoJSON() - video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) - server_time = self._server_time(access_key, video_id) - input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) - - auth_secret = intlist_to_bytes(aes_encrypt( - bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) - - video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') - anvrid = md5_text(time.time() * 1000 * random.random())[:30] - api = { - 'anvrid': anvrid, - 'anvts': server_time, - } - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) - - return self._download_json( - video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps({'api': api}).encode('utf-8')) - - def _get_anvato_videos(self, access_key, video_id): - video_data = self._get_video_json(access_key, video_id) - - formats = [] - for 
published_url in video_data['published_urls']: - video_url = published_url['embed_url'] - media_format = published_url.get('format') - ext = determine_ext(video_url) - - if ext == 'smil' or media_format == 'smil': - formats.extend(self._extract_smil_formats(video_url, video_id)) - continue - - tbr = int_or_none(published_url.get('kbps')) - a_format = { - 'url': video_url, - 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), - 'tbr': tbr if tbr != 0 else None, - } - - if media_format == 'm3u8' and tbr is not None: - a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), - 'ext': 'mp4', - }) - elif media_format == 'm3u8-variant' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - elif ext == 'mp3' or media_format == 'mp3': - a_format['vcodec'] = 'none' - else: - a_format.update({ - 'width': int_or_none(published_url.get('width')), - 'height': int_or_none(published_url.get('height')), - }) - formats.append(a_format) - - self._sort_formats(formats) - - subtitles = {} - for caption in video_data.get('captions', []): - a_caption = { - 'url': caption['url'], - 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None - } - subtitles.setdefault(caption['language'], []).append(a_caption) - - return { - 'id': video_id, - 'formats': formats, - 'title': video_data.get('def_title'), - 'description': video_data.get('def_description'), - 'tags': video_data.get('def_tags', '').split(','), - 'categories': video_data.get('categories'), - 'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'), - 'timestamp': int_or_none(video_data.get( - 'ts_published') or video_data.get('ts_added')), - 'uploader': video_data.get('mcp_id'), - 'duration': int_or_none(video_data.get('duration')), - 'subtitles': subtitles, - } - - @staticmethod - def _extract_urls(ie, webpage, video_id): - entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) - if not access_key: - continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) - return entries - - def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json( - self._html_search_regex( - self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), - video_id) - return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video']) - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass({ - 'countries': smuggled_data.get('geo_countries'), - }) - - mobj = re.match(self._VALID_URL, url) - access_key, video_id = mobj.group('access_key_or_mcp', 'id') - if access_key not in self._ANVACK_TABLE: - access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( - access_key) or access_key - return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py deleted file mode 100644 index f6ecb8438..000000000 --- a/youtube_dl/extractor/aol.py +++ 
/dev/null @@ -1,139 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .yahoo import YahooIE -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - int_or_none, - url_or_none, -) - - -class AolIE(YahooIE): - IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' - - _TESTS = [{ - # video with 5min ID - 'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/', - 'md5': '18ef68f48740e86ae94b98da815eec42', - 'info_dict': { - 'id': '518167793', - 'ext': 'mp4', - 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', - 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', - 'timestamp': 1395405060, - 'upload_date': '20140321', - 'uploader': 'Newsy Studio', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - # video with vidible ID - 'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', - 'info_dict': { - 'id': '5707d6b8e4b090497b04f706', - 'ext': 'mp4', - 'title': 'Netflix is Raising Rates', - 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. Veuer’s Carly Figueroa has more.', - 'upload_date': '20160408', - 'timestamp': 1460123280, - 'uploader': 'Veuer', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/', - 'only_matching': True, - }, { - 'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/', - 'only_matching': True, - }, { - 'url': 'aol-video:5707d6b8e4b090497b04f706', - 'only_matching': True, - }, { - 'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/', - 'only_matching': True, - }, { - 'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/', - 'only_matching': True, - }, { - 'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/', - 'only_matching': True, - }, { - 'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/', - 'only_matching': True, - }, { - 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', - 'only_matching': True, - }, { - # Yahoo video - 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - if '-' in video_id: - return self._extract_yahoo_video(video_id, 'us') - - response = self._download_json( - 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, - video_id)['response'] - if response['statusText'] != 'Ok': - raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) - - video_data = response['data'] - formats = [] - m3u8_url = url_or_none(video_data.get('videoMasterPlaylist')) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - for rendition in 
video_data.get('renditions', []): - video_url = url_or_none(rendition.get('url')) - if not video_url: - continue - ext = rendition.get('format') - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - f = { - 'url': video_url, - 'format_id': rendition.get('quality'), - } - mobj = re.search(r'(\d+)x(\d+)', video_url) - if mobj: - f.update({ - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - }) - else: - qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query) - f.update({ - 'width': int_or_none(qs.get('w', [None])[0]), - 'height': int_or_none(qs.get('h', [None])[0]), - }) - formats.append(f) - self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': video_data['title'], - 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('publishDate')), - 'view_count': int_or_none(video_data.get('views')), - 'description': video_data.get('description'), - 'uploader': video_data.get('videoOwner'), - 'formats': formats, - } diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py deleted file mode 100644 index cbc1c0ecb..000000000 --- a/youtube_dl/extractor/apa.py +++ /dev/null @@ -1,95 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - url_or_none, -) - - -class APAIE(InfoExtractor): - _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', - 'md5': '2b12292faeb0a7d930c778c7a5b4759b', - 'info_dict': { - 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', - 'ext': 'mp4', - 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', - 'only_matching': True, - }, { - 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', - 'only_matching': True, - }, { - 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', - webpage)] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, base_url = mobj.group('id', 'base_url') - - webpage = self._download_webpage( - '%s/player/%s' % (base_url, video_id), video_id) - - jwplatform_id = self._search_regex( - r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage, - 'jwplatform id', default=None) - - if jwplatform_id: - return self.url_result( - 'jwplatform:' + jwplatform_id, ie='JWPlatform', - video_id=video_id) - - def extract(field, name=None): - return self._search_regex( - r'\b%s["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % field, - webpage, name or field, default=None, group='value') - - title = extract('title') or video_id - description = extract('description') - thumbnail = extract('poster', 'thumbnail') - - formats = [] - for format_id in ('hls', 'progressive'): - source_url = url_or_none(extract(format_id)) - if not source_url: - continue - ext = determine_ext(source_url) - if ext == 'm3u8': - 
formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - height = int_or_none(self._search_regex( - r'(\d+)\.mp4', source_url, 'height', default=None)) - formats.append({ - 'url': source_url, - 'format_id': format_id, - 'height': height, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py deleted file mode 100644 index a9527e785..000000000 --- a/youtube_dl/extractor/aparat.py +++ /dev/null @@ -1,89 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - get_element_by_id, - int_or_none, - merge_dicts, - mimetype2ext, - url_or_none, -) - - -class AparatIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' - - _TESTS = [{ - 'url': 'http://www.aparat.com/v/wP8On', - 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', - 'info_dict': { - 'id': 'wP8On', - 'ext': 'mp4', - 'title': 'تیم گلکسی 11 - زومیت', - 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', - 'duration': 231, - 'timestamp': 1387394859, - 'upload_date': '20131218', - 'view_count': int, - }, - }, { - # multiple formats - 'url': 'https://www.aparat.com/v/8dflw/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Provides more metadata - webpage = self._download_webpage(url, video_id, fatal=False) - - if not webpage: - webpage = self._download_webpage( - 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) - - options = self._parse_json(self._search_regex( - r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) - - formats = [] - for sources in (options.get('multiSRC') or []): - for item in sources: - if not isinstance(item, dict): - continue - file_url = url_or_none(item.get('src')) - if not file_url: - continue - item_type = item.get('type') - if item_type == 'application/vnd.apple.mpegurl': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - ext = mimetype2ext(item.get('type')) - label = item.get('label') - formats.append({ - 'url': file_url, - 'ext': ext, - 'format_id': 'http-%s' % (label or ext), - 'height': int_or_none(self._search_regex( - r'(\d+)[pP]', label or '', 'height', - default=None)), - }) - self._sort_formats( - formats, field_preference=('height', 'width', 'tbr', 'format_id')) - - info = self._search_json_ld(webpage, video_id, default={}) - - if not info.get('title'): - info['title'] = get_element_by_id('videoTitle', webpage) or \ - self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) - - return merge_dicts(info, { - 'id': video_id, - 'thumbnail': url_or_none(options.get('poster')), - 'duration': int_or_none(options.get('duration')), - 'formats': formats, - }) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py deleted file mode 100644 index a9ef733e0..000000000 --- a/youtube_dl/extractor/appletrailers.py +++ /dev/null @@ -1,283 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - parse_duration, 
- unified_strdate, -) - - -class AppleTrailersIE(InfoExtractor): - IE_NAME = 'appletrailers' - _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' - _TESTS = [{ - 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', - 'info_dict': { - 'id': '5111', - 'title': 'Man of Steel', - }, - 'playlist': [ - { - 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8', - 'info_dict': { - 'id': 'manofsteel-trailer4', - 'ext': 'mov', - 'duration': 111, - 'title': 'Trailer 4', - 'upload_date': '20130523', - 'uploader_id': 'wb', - }, - }, - { - 'md5': 'b8017b7131b721fb4e8d6f49e1df908c', - 'info_dict': { - 'id': 'manofsteel-trailer3', - 'ext': 'mov', - 'duration': 182, - 'title': 'Trailer 3', - 'upload_date': '20130417', - 'uploader_id': 'wb', - }, - }, - { - 'md5': 'd0f1e1150989b9924679b441f3404d48', - 'info_dict': { - 'id': 'manofsteel-trailer', - 'ext': 'mov', - 'duration': 148, - 'title': 'Trailer', - 'upload_date': '20121212', - 'uploader_id': 'wb', - }, - }, - { - 'md5': '5fe08795b943eb2e757fa95cb6def1cb', - 'info_dict': { - 'id': 'manofsteel-teaser', - 'ext': 'mov', - 'duration': 93, - 'title': 'Teaser', - 'upload_date': '20120721', - 'uploader_id': 'wb', - }, - }, - ] - }, { - 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', - 'info_dict': { - 'id': '4489', - 'title': 'Blackthorn', - }, - 'playlist_mincount': 2, - 'expected_warnings': ['Unable to download JSON metadata'], - }, { - # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json - 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', - 'info_dict': { - 'id': '15881', - 'title': 'Kung Fu Panda 3', - }, - 'playlist_mincount': 4, - }, { - 'url': 'http://trailers.apple.com/ca/metropole/autrui/', - 'only_matching': True, - }, { - 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/', - 'only_matching': True, - }] - - _JSON_RE = r'iTunes.playURL\((.*?)\);' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - movie = mobj.group('movie') - uploader_id = mobj.group('company') - - webpage = self._download_webpage(url, movie) - film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') - film_data = self._download_json( - 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, - film_id, fatal=False) - - if film_data: - entries = [] - for clip in film_data.get('clips', []): - clip_title = clip['title'] - - formats = [] - for version, version_data in clip.get('versions', {}).items(): - for size, size_data in version_data.get('sizes', {}).items(): - src = size_data.get('src') - if not src: - continue - formats.append({ - 'format_id': '%s-%s' % (version, size), - 'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src), - 'width': int_or_none(size_data.get('width')), - 'height': int_or_none(size_data.get('height')), - 'language': version[:2], - }) - self._sort_formats(formats) - - entries.append({ - 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), - 'formats': formats, - 'title': clip_title, - 'thumbnail': clip.get('screen') or clip.get('thumb'), - 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), - 'upload_date': unified_strdate(clip.get('posted')), - 'uploader_id': uploader_id, - }) - - page_data = film_data.get('page', {}) - return self.playlist_result(entries, film_id, page_data.get('movie_title')) - - playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') - - def fix_html(s): - s = 
re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) - s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s) - # The ' in the onClick attributes is not escaped, so it couldn't be parsed - # like: http://trailers.apple.com/trailers/wb/gravity/ - - def _clean_json(m): - return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') - s = re.sub(self._JSON_RE, _clean_json, s) - s = '<html>%s</html>' % s - return s - doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - - playlist = [] - for li in doc.findall('./div/ul/li'): - on_click = li.find('.//a').attrib['onClick'] - trailer_info_json = self._search_regex(self._JSON_RE, - on_click, 'trailer info') - trailer_info = json.loads(trailer_info_json) - first_url = trailer_info.get('url') - if not first_url: - continue - title = trailer_info['title'] - video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() - thumbnail = li.find('.//img').attrib['src'] - upload_date = trailer_info['posted'].replace('-', '') - - runtime = trailer_info['runtime'] - m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime) - duration = None - if m: - duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - - trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() - settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) - settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') - - formats = [] - for format in settings['metadata']['sizes']: - # The src is a file pointing to the real video file - format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', format['src']) - formats.append({ - 'url': format_url, - 'format': format['type'], - 'width': int_or_none(format['width']), - 'height': int_or_none(format['height']), - }) - - self._sort_formats(formats) - - playlist.append({ - '_type': 'video', - 'id': video_id, - 'formats': formats, - 'title': title, - 'duration': duration, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'http_headers': { - 'User-Agent': 'QuickTime compatible (youtube-dl)', - }, - }) - - return { - '_type': 'playlist', - 'id': movie, - 'entries': playlist, - } - - -class AppleTrailersSectionIE(InfoExtractor): - IE_NAME = 'appletrailers:section' - _SECTIONS = { - 'justadded': { - 'feed_path': 'just_added', - 'title': 'Just Added', - }, - 'exclusive': { - 'feed_path': 'exclusive', - 'title': 'Exclusive', - }, - 'justhd': { - 'feed_path': 'just_hd', - 'title': 'Just HD', - }, - 'mostpopular': { - 'feed_path': 'most_pop', - 'title': 'Most Popular', - }, - 'moviestudios': { - 'feed_path': 'studios', - 'title': 'Movie Studios', - }, - } - _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) - _TESTS = [{ - 'url': 'http://trailers.apple.com/#section=justadded', - 'info_dict': { - 'title': 'Just Added', - 'id': 'justadded', - }, - 'playlist_mincount': 80, - }, { - 'url': 'http://trailers.apple.com/#section=exclusive', - 'info_dict': { - 'title': 'Exclusive', - 'id': 'exclusive', - }, - 'playlist_mincount': 80, - }, { - 'url': 'http://trailers.apple.com/#section=justhd', - 'info_dict': { - 'title': 'Just HD', - 'id': 'justhd', - }, - 'playlist_mincount': 80, - }, { - 'url': 'http://trailers.apple.com/#section=mostpopular', - 'info_dict': { - 'title': 'Most Popular', - 'id': 'mostpopular', - }, - 'playlist_mincount': 30, - }, { - 'url': 'http://trailers.apple.com/#section=moviestudios', - 'info_dict': { - 'title': 'Movie Studios', - 'id': 'moviestudios', - }, -
'playlist_mincount': 80, - }] - - def _real_extract(self, url): - section = self._match_id(url) - section_data = self._download_json( - 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], - section) - entries = [ - self.url_result('http://trailers.apple.com' + e['location']) - for e in section_data] - return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py deleted file mode 100644 index e42ed5e79..000000000 --- a/youtube_dl/extractor/archiveorg.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - clean_html, - extract_attributes, - unified_strdate, - unified_timestamp, -) - - -class ArchiveOrgIE(InfoExtractor): - IE_NAME = 'archive.org' - IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', - 'md5': '8af1d4cf447933ed3c7f4871162602db', - 'info_dict': { - 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', - 'ext': 'ogg', - 'title': '1968 Demo - FJCC Conference Presentation Reel #1', - 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'creator': 'SRI International', - 'release_date': '19681210', - 'uploader': 'SRI International', - 'timestamp': 1268695290, - 'upload_date': '20100315', - } - }, { - 'url': 'https://archive.org/details/Cops1922', - 'md5': '0869000b4ce265e8ca62738b336b268a', - 'info_dict': { - 'id': 'Cops1922', - 'ext': 'mp4', - 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', - 'timestamp': 1387699629, - 'upload_date': '20131222', - } - }, { - 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', - 'only_matching': True, - }, { - 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://archive.org/embed/' + video_id, video_id) - - playlist = None - play8 = self._search_regex( - r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, - 'playlist', default=None) - if play8: - attrs = extract_attributes(play8) - playlist = attrs.get('value') - if not playlist: - # Old jwplayer fallback - playlist = self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist', default='[]') - jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) - if jwplayer_playlist: - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) - else: - # HTML5 media fallback - info = self._parse_html5_media_entries(url, webpage, video_id)[0] - info['id'] = video_id - - def get_optional(metadata, field): - return metadata.get(field, [None])[0] - - metadata = self._download_json( - 'http://archive.org/details/' + video_id, video_id, query={ - 'output': 'json', - })['metadata'] - info.update({ - 'title': get_optional(metadata, 'title') or info.get('title'), - 'description': clean_html(get_optional(metadata, 'description')), - }) - if info.get('_type') != 'playlist': - creator = get_optional(metadata, 'creator') - info.update({ - 'creator': creator, - 'release_date': unified_strdate(get_optional(metadata, 'date')), - 'uploader': get_optional(metadata, 'publisher') 
or creator, - 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), - 'language': get_optional(metadata, 'language'), - }) - return info diff --git a/youtube_dl/extractor/arcpublishing.py b/youtube_dl/extractor/arcpublishing.py deleted file mode 100644 index ca6a6c4d8..000000000 --- a/youtube_dl/extractor/arcpublishing.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - int_or_none, - parse_iso8601, - try_get, -) - - -class ArcPublishingIE(InfoExtractor): - _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' - _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX - _TESTS = [{ - # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ - 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', - 'only_matching': True, - }, { - # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ - 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', - 'only_matching': True, - }, { - # https://www.actionnewsjax.com/video/live-stream/ - 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', - 'only_matching': True, - }, { - # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ - 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', - 'only_matching': True, - }, { - # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ - 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', - 'only_matching': True, - }, { - # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ - 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', - 'only_matching': True, - }, { - # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ - 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', - 'only_matching': True, - }, { - # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ - 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', - 'only_matching': True, - }, { - # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ - 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', - 'only_matching': True, - }, { - # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ - 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', - 'only_matching': True, - }, { - # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ - 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', - 'only_matching': True, - }, { - # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html - 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', - 'only_matching': True, - }] - _POWA_DEFAULTS = [ - (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), - ([ - 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', - 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', - 'spectator', 'tbt', 'tgam', 'tronc', 
'wapo', 'wweek', - ], 'video-api-cdn.%s.arcpublishing.com/api'), - ] - - @staticmethod - def _extract_urls(webpage): - entries = [] - # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview - for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): - powa = extract_attributes(powa_el) or {} - org = powa.get('data-org') - uuid = powa.get('data-uuid') - if org and uuid: - entries.append('arcpublishing:%s:%s' % (org, uuid)) - return entries - - def _real_extract(self, url): - org, uuid = re.match(self._VALID_URL, url).groups() - for orgs, tmpl in self._POWA_DEFAULTS: - if org in orgs: - base_api_tmpl = tmpl - break - else: - base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' - if org == 'wapo': - org = 'washpost' - video = self._download_json( - 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), - uuid, query={'uuid': uuid})[0] - title = video['headlines']['basic'] - is_live = video.get('status') == 'live' - - urls = [] - formats = [] - for s in video.get('streams', []): - s_url = s.get('url') - if not s_url or s_url in urls: - continue - urls.append(s_url) - stream_type = s.get('stream_type') - if stream_type == 'smil': - smil_formats = self._extract_smil_formats( - s_url, uuid, fatal=False) - for f in smil_formats: - if f['url'].endswith('/cfx/st'): - f['app'] = 'cfx/st' - if not f['play_path'].startswith('mp4:'): - f['play_path'] = 'mp4:' + f['play_path'] - if isinstance(f['tbr'], float): - f['vbr'] = f['tbr'] * 1000 - del f['tbr'] - f['format_id'] = 'rtmp-%d' % f['vbr'] - formats.extend(smil_formats) - elif stream_type in ('ts', 'hls'): - m3u8_formats = self._extract_m3u8_formats( - s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False) - if all([f.get('acodec') == 'none' for f in m3u8_formats]): - continue - for f in m3u8_formats: - if f.get('acodec') == 'none': - f['preference'] = -40 - elif f.get('vcodec') == 'none': - f['preference'] = -50 - height = f.get('height') - if not height: - continue - vbr = self._search_regex( - r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) - if vbr: - f['vbr'] = int(vbr) - formats.extend(m3u8_formats) - else: - vbr = int_or_none(s.get('bitrate')) - formats.append({ - 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, - 'vbr': vbr, - 'width': int_or_none(s.get('width')), - 'height': int_or_none(s.get('height')), - 'filesize': int_or_none(s.get('filesize')), - 'url': s_url, - 'preference': -1, - }) - self._sort_formats( - formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) - - subtitles = {} - for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): - subtitle_url = subtitle.get('url') - if subtitle_url: - subtitles.setdefault('en', []).append({'url': subtitle_url}) - - return { - 'id': uuid, - 'title': self._live_title(title) if is_live else title, - 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), - 'description': try_get(video, lambda x: x['subheadlines']['basic']), - 'formats': formats, - 'duration': int_or_none(video.get('duration'), 100), - 'timestamp': parse_iso8601(video.get('created_date')), - 'subtitles': subtitles, - 'is_live': is_live, - } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py deleted file mode 100644 index d45a9fe52..000000000 --- a/youtube_dl/extractor/ard.py +++ /dev/null @@ -1,452 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import 
re - -from .common import InfoExtractor -from .generic import GenericIE -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_duration, - qualities, - str_or_none, - try_get, - unified_strdate, - unified_timestamp, - update_url_query, - url_or_none, - xpath_text, -) -from ..compat import compat_etree_fromstring - - -class ARDMediathekBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['DE'] - - def _extract_media_info(self, media_info_url, webpage, video_id): - media_info = self._download_json( - media_info_url, video_id, 'Downloading media JSON') - return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) - - def _parse_media_info(self, media_info, video_id, fsk): - formats = self._extract_formats(media_info, video_id) - - if not formats: - if fsk: - raise ExtractorError( - 'This video is only available after 20:00', expected=True) - elif media_info.get('_geoblocked'): - self.raise_geo_restricted( - 'This video is not available due to geoblocking', - countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - subtitles = {} - subtitle_url = media_info.get('_subtitleUrl') - if subtitle_url: - subtitles['de'] = [{ - 'ext': 'ttml', - 'url': subtitle_url, - }] - - return { - 'id': video_id, - 'duration': int_or_none(media_info.get('_duration')), - 'thumbnail': media_info.get('_previewImage'), - 'is_live': media_info.get('_isLive') is True, - 'formats': formats, - 'subtitles': subtitles, - } - - def _extract_formats(self, media_info, video_id): - type_ = media_info.get('_type') - media_array = media_info.get('_mediaArray', []) - formats = [] - for num, media in enumerate(media_array): - for stream in media.get('_mediaStreamArray', []): - stream_urls = stream.get('_stream') - if not stream_urls: - continue - if not isinstance(stream_urls, list): - stream_urls = [stream_urls] - quality = stream.get('_quality') - server = stream.get('_server') - for stream_url in stream_urls: - if not url_or_none(stream_url): - continue - ext = determine_ext(stream_url) - if quality != 'auto' and ext in ('f4m', 'm3u8'): - continue - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(stream_url, { - 'hdcore': '3.1.1', - 'plugin': 'aasp-3.1.1.69.124' - }), video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - if server and server.startswith('rtmp'): - f = { - 'url': server, - 'play_path': stream_url, - 'format_id': 'a%s-rtmp-%s' % (num, quality), - } - else: - f = { - 'url': stream_url, - 'format_id': 'a%s-%s-%s' % (num, ext, quality) - } - m = re.search( - r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', - stream_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - if type_ == 'audio': - f['vcodec'] = 'none' - formats.append(f) - return formats - - -class ARDMediathekIE(ARDMediathekBaseIE): - IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
- - _TESTS = [{ - # available till 26.07.2022 - 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', - 'info_dict': { - 'id': '44726822', - 'ext': 'mp4', - 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', - 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', - 'duration': 1740, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', - 'only_matching': True, - }, { - # audio - 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', - 'only_matching': True, - }, { - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, - }, { - # audio - 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', - 'only_matching': True, - }, { - 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) - - def _real_extract(self, url): - # determine video id from url - m = re.match(self._VALID_URL, url) - - document_id = None - - numid = re.search(r'documentId=([0-9]+)', url) - if numid: - document_id = video_id = numid.group(1) - else: - video_id = m.group('video_id') - - webpage = self._download_webpage(url, video_id) - - ERRORS = ( - ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'), - ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<', - 'Video %s is no longer available'), - ) - - for pattern, message in ERRORS: - if pattern in webpage: - raise ExtractorError(message % video_id, expected=True) - - if re.search(r'[\?&]rss($|[=&])', url): - doc = compat_etree_fromstring(webpage.encode('utf-8')) - if doc.tag == 'rss': - return GenericIE()._extract_rss(url, video_id, doc) - - title = self._og_search_title(webpage, default=None) or self._html_search_regex( - [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', - r'<meta name="dcterms\.title" content="(.*?)"/>', - r'<h4 class="headline">(.*?)</h4>', - r'<title[^>]*>(.*?)</title>'], - webpage, 'title') - description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'dcterms.abstract', webpage, 'description', default=None) - if description is None: - description = self._html_search_meta( - 'description', webpage, 'meta description', default=None) - if description is None: - description = self._html_search_regex( - r'<p\s+class="teasertext">(.+?)</p>', - webpage, 'teaser text', default=None) - - # Thumbnail is sometimes not present. - # It is in the mobile version, but that seems to use a different URL - # structure altogether. 
- thumbnail = self._og_search_thumbnail(webpage, default=None) - - media_streams = re.findall(r'''(?x) - mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* - "([^"]+)"''', webpage) - - if media_streams: - QUALITIES = qualities(['lo', 'hi', 'hq']) - formats = [] - for furl in set(media_streams): - if furl.endswith('.f4m'): - fid = 'f4m' - else: - fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) - fid = fid_m.group(1) if fid_m else None - formats.append({ - 'quality': QUALITIES(fid), - 'format_id': fid, - 'url': furl, - }) - self._sort_formats(formats) - info = { - 'formats': formats, - } - else: # request JSON file - if not document_id: - video_id = self._search_regex( - r'/play/(?:config|media)/(\d+)', webpage, 'media id') - info = self._extract_media_info( - 'http://www.ardmediathek.de/play/media/%s' % video_id, - webpage, video_id) - - info.update({ - 'id': video_id, - 'title': self._live_title(title) if info.get('is_live') else title, - 'description': description, - 'thumbnail': thumbnail, - }) - - return info - - -class ARDIE(InfoExtractor): - _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html' - _TESTS = [{ - # available till 7.01.2022 - 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', - 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', - 'info_dict': { - 'id': 'maischberger-die-woche-video100', - 'display_id': 'maischberger-die-woche-video100', - 'ext': 'mp4', - 'duration': 3687.0, - 'title': 'maischberger. die woche vom 7. Januar 2021', - 'upload_date': '20210107', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html', - 'only_matching': True, - }, { - 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - player_url = mobj.group('mainurl') + '~playerXml.xml' - doc = self._download_xml(player_url, display_id) - video_node = doc.find('./video') - upload_date = unified_strdate(xpath_text( - video_node, './broadcastDate')) - thumbnail = xpath_text(video_node, './/teaserImage//variant/url') - - formats = [] - for a in video_node.findall('.//asset'): - file_name = xpath_text(a, './fileName', default=None) - if not file_name: - continue - format_type = a.attrib.get('type') - format_url = url_or_none(file_name) - if format_url: - ext = determine_ext(file_name) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_type or 'hls', fatal=False)) - continue - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), - display_id, f4m_id=format_type or 'hds', fatal=False)) - continue - f = { - 'format_id': format_type, - 'width': 
int_or_none(xpath_text(a, './frameWidth')), - 'height': int_or_none(xpath_text(a, './frameHeight')), - 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), - 'abr': int_or_none(xpath_text(a, './bitrateAudio')), - 'vcodec': xpath_text(a, './codecVideo'), - 'tbr': int_or_none(xpath_text(a, './totalBitrate')), - } - server_prefix = xpath_text(a, './serverPrefix', default=None) - if server_prefix: - f.update({ - 'url': server_prefix, - 'playpath': file_name, - }) - else: - if not format_url: - continue - f['url'] = format_url - formats.append(f) - self._sort_formats(formats) - - return { - 'id': xpath_text(video_node, './videoId', default=display_id), - 'formats': formats, - 'display_id': display_id, - 'title': video_node.find('./title').text, - 'duration': parse_duration(video_node.find('./duration').text), - 'upload_date': upload_date, - 'thumbnail': thumbnail, - } - - -class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+)' - _TESTS = [{ - 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', - 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', - 'info_dict': { - 'display_id': 'die-robuste-roswita', - 'id': '78566716', - 'title': 'Die robuste Roswita', - 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', - 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', - 'timestamp': 1596658200, - 'upload_date': '20200805', - 'ext': 'mp4', - }, - }, { - 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'only_matching': True, - }, { - 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - player_page = self._download_json( - 'https://api.ardmediathek.de/public-gateway', - video_id, data=json.dumps({ - 'query': '''{ - playerPage(client: "ard", clipId: "%s") { - blockedByFsk - broadcastedOn - maturityContentRating - mediaCollection { - _duration - _geoblocked - _isLive - _mediaArray { - _mediaStreamArray { - _quality - _server - _stream - } - } - _previewImage - _subtitleUrl - _type - } - show { - title - } - synopsis - title - tracking { - atiCustomVars { - contentId - } - } - } -}''' % video_id, - }).encode(), headers={ - 'Content-Type': 'application/json' - })['data']['playerPage'] - title = 
player_page['title'] - content_id = str_or_none(try_get( - player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) - media_collection = player_page.get('mediaCollection') or {} - if not media_collection and content_id: - media_collection = self._download_json( - 'https://www.ardmediathek.de/play/media/' + content_id, - content_id, fatal=False) or {} - info = self._parse_media_info( - media_collection, content_id or video_id, - player_page.get('blockedByFsk')) - age_limit = None - description = player_page.get('synopsis') - maturity_content_rating = player_page.get('maturityContentRating') - if maturity_content_rating: - age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) - if not age_limit and description: - age_limit = int_or_none(self._search_regex( - r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) - info.update({ - 'age_limit': age_limit, - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), - 'series': try_get(player_page, lambda x: x['show']['title']), - }) - return info diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py deleted file mode 100644 index fd46b1c77..000000000 --- a/youtube_dl/extractor/arkena.py +++ /dev/null @@ -1,163 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, - try_get, -) - - -class ArkenaIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - video\.(?:arkena|qbrick)\.com/play2/embed/player\?| - play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) - ) - ''' - _TESTS = [{ - 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', - 'md5': '97f117754e5f3c020f5f26da4a44ebaf', - 'info_dict': { - 'id': 'd8ab4607-00090107-aab86310', - 'ext': 'mp4', - 'title': 'EM_HT20_117_roslund_v2.mp4', - 'timestamp': 1608285912, - 'upload_date': '20201218', - 'duration': 1429.162667, - 'subtitles': { - 'sv': 'count:3', - }, - }, - }, { - 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', - 'only_matching': True, - }, { - 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', - 'only_matching': True, - }, { - 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', - 'only_matching': True, - }, { - 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', - 'only_matching': True, - }, { - 'url': 'http://video.arkena.com/play2/embed/player?accountId=472718&mediaId=35763b3b-00090078-bf604299&pageStyling=styled', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - account_id = mobj.group('account_id') - - # Handle http://video.arkena.com/play2/embed/player URL - if not video_id: - qs = 
compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = qs.get('mediaId', [None])[0] - account_id = qs.get('accountId', [None])[0] - if not video_id or not account_id: - raise ExtractorError('Invalid URL', expected=True) - - media = self._download_json( - 'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id), - video_id, query={ - # https://video.qbrick.com/docs/api/examples/library-api.html - 'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags', - }) - metadata = media.get('metadata') or {} - title = metadata['title'] - - duration = None - formats = [] - thumbnails = [] - subtitles = {} - for resource in media['asset']['resources']: - for rendition in (resource.get('renditions') or []): - rendition_type = rendition.get('type') - for i, link in enumerate(rendition.get('links') or []): - href = link.get('href') - if not href: - continue - if rendition_type == 'image': - thumbnails.append({ - 'filesize': int_or_none(rendition.get('size')), - 'height': int_or_none(rendition.get('height')), - 'id': rendition.get('id'), - 'url': href, - 'width': int_or_none(rendition.get('width')), - }) - elif rendition_type == 'subtitle': - subtitles.setdefault(rendition.get('language') or 'en', []).append({ - 'url': href, - }) - elif rendition_type == 'video': - f = { - 'filesize': int_or_none(rendition.get('size')), - 'format_id': rendition.get('id'), - 'url': href, - } - video = try_get(rendition, lambda x: x['videos'][i], dict) - if video: - if not duration: - duration = float_or_none(video.get('duration')) - f.update({ - 'height': int_or_none(video.get('height')), - 'tbr': int_or_none(video.get('bitrate'), 1000), - 'vcodec': video.get('codec'), - 'width': int_or_none(video.get('width')), - }) - audio = try_get(video, lambda x: x['audios'][0], dict) - if audio: - f.update({ - 'acodec': audio.get('codec'), - 'asr': int_or_none(audio.get('sampleRate')), - }) - formats.append(f) - elif rendition_type == 'index': - mime_type = link.get('mimeType') - if mime_type == 'application/smil+xml': - formats.extend(self._extract_smil_formats( - href, video_id, fatal=False)) - elif mime_type == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif mime_type == 'application/hds+xml': - formats.extend(self._extract_f4m_formats( - href, video_id, f4m_id='hds', fatal=False)) - elif mime_type == 'application/dash+xml': - formats.extend(self._extract_mpd_formats( - href, video_id, mpd_id='dash', fatal=False)) - elif mime_type == 'application/vnd.ms-sstr+xml': - formats.extend(self._extract_ism_formats( - href, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': metadata.get('description'), - 'timestamp': parse_iso8601(media.get('created')), - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'duration': duration, - 'tags': media.get('tags'), - 'formats': formats, - } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py deleted file mode 100644 index 03abdbfaf..000000000 --- a/youtube_dl/extractor/arte.py +++ /dev/null @@ -1,254 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils 
import ( - ExtractorError, - int_or_none, - qualities, - try_get, - unified_strdate, - url_or_none, -) - - -class ArteTVBaseIE(InfoExtractor): - _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' - _API_BASE = 'https://api.arte.tv/api/player/v1' - - -class ArteTVIE(ArteTVBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| - api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) - ) - /(?P<id>\d{6}-\d{3}-[AF]) - ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} - _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', - 'info_dict': { - 'id': '088501-000-A', - 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', - }, - }, { - 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', - 'only_matching': True, - }, { - 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - lang = mobj.group('lang') or mobj.group('lang_2') - - info = self._download_json( - '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) - player_info = info['videoJsonPlayer'] - - vsr = try_get(player_info, lambda x: x['VSR'], dict) - if not vsr: - error = None - if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': - error = try_get( - player_info, lambda x: x['custom_msg']['msg'], compat_str) - if not error: - error = 'Video %s is not available' % (player_info.get('VID') or video_id) - raise ExtractorError(error, expected=True) - - upload_date_str = player_info.get('shootingDate') - if not upload_date_str: - upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - - title = (player_info.get('VTI') or player_info['VID']).strip() - subtitle = player_info.get('VSU', '').strip() - if subtitle: - title += ' - %s' % subtitle - - qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) - - LANGS = { - 'fr': 'F', - 'de': 'A', - 'en': 'E[ANG]', - 'es': 'E[ESP]', - 'it': 'E[ITA]', - 'pl': 'E[POL]', - } - - langcode = LANGS.get(lang, lang) - - formats = [] - for format_id, format_dict in vsr.items(): - f = dict(format_dict) - format_url = url_or_none(f.get('url')) - streamer = f.get('streamer') - if not format_url and not streamer: - continue - versionCode = f.get('versionCode') - l = re.escape(langcode) - - # Language preference from most to least priority - # Reference: section 6.8 of - # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf - PREFERENCES = ( - # original version in requested language, without subtitles - r'VO{0}$'.format(l), - # original version in requested language, with partial subtitles in requested language - r'VO{0}-ST{0}$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'VO{0}-STM{0}$'.format(l), - # non-original (dubbed) version in requested language, without subtitles - r'V{0}$'.format(l), - # non-original (dubbed) version in requested language, with partial subtitles in requested language - r'V{0}-ST{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'V{0}-STM{0}$'.format(l), - # original version in requested language, with partial subtitles in different language - r'VO{0}-ST(?!{0}).+?$'.format(l), - # original version in requested language, 
with subtitles for the deaf and hard-of-hearing in different language - r'VO{0}-STM(?!{0}).+?$'.format(l), - # original version in different language, with partial subtitles in requested language - r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), - # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language - r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), - # original version in different language, without subtitles - r'VO(?:(?!{0}))?$'.format(l), - # original version in different language, with partial subtitles in different language - r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), - # original version in different language, with subtitles for the deaf and hard-of-hearing in different language - r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), - ) - - for pref, p in enumerate(PREFERENCES): - if re.match(p, versionCode): - lang_pref = len(PREFERENCES) - pref - break - else: - lang_pref = -1 - - media_type = f.get('mediaType') - if media_type == 'hls': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for m3u8_format in m3u8_formats: - m3u8_format['language_preference'] = lang_pref - formats.extend(m3u8_formats) - continue - - format = { - 'format_id': format_id, - 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, - 'language_preference': lang_pref, - 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'tbr': int_or_none(f.get('bitrate')), - 'quality': qfunc(f.get('quality')), - } - - if media_type == 'rtmp': - format['url'] = f['streamer'] - format['play_path'] = 'mp4:' + f['url'] - format['ext'] = 'flv' - else: - format['url'] = f['url'] - - formats.append(format) - - self._sort_formats(formats) - - return { - 'id': player_info.get('VID') or video_id, - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - 'formats': formats, - } - - -class ArteTVEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' - _TESTS = [{ - 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', - 'info_dict': { - 'id': '100605-013-A', - 'ext': 'mp4', - 'title': 'United we Stream November Lockdown Edition #13', - 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', - 'upload_date': '20201116', - }, - }, { - 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', - webpage)] - - def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - json_url = qs['json_url'][0] - video_id = ArteTVIE._match_id(json_url) - return self.url_result( - json_url, ie=ArteTVIE.ie_key(), video_id=video_id) - - -class ArteTVPlaylistIE(ArteTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES - _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', - 
'info_dict': { - 'id': 'RC-016954', - 'title': 'Earn a Living', - 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', - }, - 'playlist_mincount': 6, - }, { - 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', - 'only_matching': True, - }] - - def _real_extract(self, url): - lang, playlist_id = re.match(self._VALID_URL, url).groups() - collection = self._download_json( - '%s/collectionData/%s/%s?source=videos' - % (self._API_BASE, lang, playlist_id), playlist_id) - entries = [] - for video in collection['videos']: - if not isinstance(video, dict): - continue - video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) - if not video_url: - continue - video_id = video.get('programId') - entries.append({ - '_type': 'url_transparent', - 'url': video_url, - 'id': video_id, - 'title': video.get('title'), - 'alt_title': video.get('subtitle'), - 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), - 'duration': int_or_none(video.get('durationSeconds')), - 'view_count': int_or_none(video.get('views')), - 'ie_key': ArteTVIE.ie_key(), - }) - title = collection.get('title') - description = collection.get('shortDescription') or collection.get('teaserText') - return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py deleted file mode 100644 index 66ce7c686..000000000 --- a/youtube_dl/extractor/asiancrush.py +++ /dev/null @@ -1,200 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import functools -import re - -from .common import InfoExtractor -from .kaltura import KalturaIE -from ..utils import ( - extract_attributes, - int_or_none, - OnDemandPagedList, - parse_age_limit, - strip_or_none, - try_get, -) - - -class AsianCrushBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' - _KALTURA_KEYS = [ - 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', - 'widescreen_thumbnail_url', 'screencap_widescreen', - ] - _API_SUFFIX = {'retrocrush.tv': '-ott'} - - def _call_api(self, host, endpoint, video_id, query, resource): - return self._download_json( - 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, - 'Downloading %s JSON metadata' % resource, query=query, - headers=self.geo_verification_headers())['objects'] - - def _download_object_data(self, host, object_id, resource): - return self._call_api( - host, 'search', object_id, {'id': object_id}, resource)[0] - - def _get_object_description(self, obj): - return strip_or_none(obj.get('long_description') or obj.get('short_description')) - - def _parse_video_data(self, video): - title = video['name'] - - entry_id, partner_id = [None] * 2 - for k in self._KALTURA_KEYS: - k_url = video.get(k) - if k_url: - mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) - if mobj: - partner_id, entry_id = mobj.groups() - break - - meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] - categories = list(filter(None, [c.get('name') for c in meta_categories])) - - show_info = video.get('show_info') or {} - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, entry_id), - 'ie_key': KalturaIE.ie_key(), - 'id': entry_id, - 'title': title, - 'description': self._get_object_description(video), - 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), - 'categories': categories, - 
'series': show_info.get('show_name'), - 'season_number': int_or_none(show_info.get('season_num')), - 'season_id': show_info.get('season_id'), - 'episode_number': int_or_none(show_info.get('episode_num')), - } - - -class AsianCrushIE(AsianCrushBaseIE): - _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', - 'md5': 'c3b740e48d0ba002a42c0b72857beae6', - 'info_dict': { - 'id': '1_y4tmjm5r', - 'ext': 'mp4', - 'title': 'Women Who Flirt', - 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', - 'timestamp': 1496936429, - 'upload_date': '20170608', - 'uploader_id': 'craig@crifkin.com', - 'age_limit': 13, - 'categories': 'count:5', - 'duration': 5812, - }, - }, { - 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', - 'only_matching': True, - }, { - 'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/', - 'only_matching': True, - }, { - 'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/', - 'only_matching': True, - }, { - 'url': 'https://www.midnightpulp.com/video/010400v/drifters/', - 'only_matching': True, - }, { - 'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/', - 'only_matching': True, - }, { - 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', - 'only_matching': True, - }, { - 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', - 'only_matching': True, - }] - - def _real_extract(self, url): - host, video_id = re.match(self._VALID_URL, url).groups() - - if host == 'cocoro.tv': - webpage = self._download_webpage(url, video_id) - embed_vars = self._parse_json(self._search_regex( - r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', - default='{}'), video_id, fatal=False) or {} - video_id = embed_vars.get('entry_id') or video_id - - video = self._download_object_data(host, video_id, 'video') - return self._parse_video_data(video) - - -class AsianCrushPlaylistIE(AsianCrushBaseIE): - _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', - 'info_dict': { - 'id': '6447', - 'title': 'Fruity Samurai', - 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', - }, - 'playlist_count': 13, - }, { - 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', - 'only_matching': True, - }, { - 'url': 'https://www.midnightpulp.com/series/016375s/mononoke/', - 'only_matching': True, - }, { - 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', - 'only_matching': True, - }, { - 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', - 'only_matching': True, - }] - _PAGE_SIZE = 1000000000 - - def _fetch_page(self, domain, parent_id, page): - videos = self._call_api( - domain, 'getreferencedobjects', parent_id, { - 'max': self._PAGE_SIZE, - 'object_type': 'video', - 'parent_id': parent_id, - 'start': page * self._PAGE_SIZE, - }, 'page %d' % (page + 1)) - for video in videos: - yield self._parse_video_data(video) - - def _real_extract(self, url): - host, playlist_id = re.match(self._VALID_URL, url).groups() - - if host == 'cocoro.tv': - webpage = self._download_webpage(url, playlist_id) - - entries = [] - - for mobj in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, - webpage): - attrs = extract_attributes(mobj.group(0)) - if 
attrs.get('class') == 'clearfix': - entries.append(self.url_result( - mobj.group('url'), ie=AsianCrushIE.ie_key())) - - title = self._html_search_regex( - r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) - if title: - title = re.sub(r'\s*\|\s*.+?$', '', title) - - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, 'description', fatal=False) - else: - show = self._download_object_data(host, playlist_id, 'show') - title = show.get('name') - description = self._get_object_description(show) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, host, playlist_id), - self._PAGE_SIZE) - - return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py deleted file mode 100644 index c2cec9845..000000000 --- a/youtube_dl/extractor/atresplayer.py +++ /dev/null @@ -1,118 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - urlencode_postdata, -) - - -class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})' - _NETRC_MACHINE = 'atresplayer' - _TESTS = [ - { - 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', - 'info_dict': { - 'id': '5d4aa2c57ed1a88fc715a615', - 'ext': 'mp4', - 'title': 'Capítulo 7: Asuntos pendientes', - 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', - 'duration': 3413, - }, - 'params': { - 'format': 'bestvideo', - }, - 'skip': 'This video is only available for registered users' - }, - { - 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', - 'only_matching': True, - }, - { - 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', - 'only_matching': True, - }, - ] - _API_BASE = 'https://api.atresplayer.com/' - - def _real_initialize(self): - self._login() - - def _handle_error(self, e, code): - if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: - error = self._parse_json(e.cause.read(), None) - if error.get('error') == 'required_registered': - self.raise_login_required() - raise ExtractorError(error['error_description'], expected=True) - raise - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - self._request_webpage( - self._API_BASE + 'login', None, 'Downloading login page') - - try: - target_url = self._download_json( - 'https://account.atresmedia.com/api/login', None, - 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded' - }, data=urlencode_postdata({ - 'username': username, - 'password': password, - }))['targetUrl'] - except ExtractorError as e: - self._handle_error(e, 400) - - self._request_webpage(target_url, None, 'Following Target URL') - - def _real_extract(self, url): - display_id, video_id = 
re.match(self._VALID_URL, url).groups() - - try: - episode = self._download_json( - self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) - except ExtractorError as e: - self._handle_error(e, 403) - - title = episode['titulo'] - - formats = [] - for source in episode.get('sources', []): - src = source.get('src') - if not src: - continue - src_type = source.get('type') - if src_type == 'application/vnd.apple.mpegurl': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif src_type == 'application/dash+xml': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - - heartbeat = episode.get('heartbeat') or {} - omniture = episode.get('omniture') or {} - get_meta = lambda x: heartbeat.get(x) or omniture.get(x) - - return { - 'display_id': display_id, - 'id': video_id, - 'title': title, - 'description': episode.get('descripcion'), - 'thumbnail': episode.get('imgPoster'), - 'duration': int_or_none(episode.get('duration')), - 'formats': formats, - 'channel': get_meta('channel'), - 'season': get_meta('season'), - 'episode_number': int_or_none(get_meta('episodeNumber')), - } diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py deleted file mode 100644 index 95e572d70..000000000 --- a/youtube_dl/extractor/atvat.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - unescapeHTML, -) - - -class ATVAtIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)' - _TESTS = [{ - 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/', - 'md5': 'c3b6b975fb3150fc628572939df205f2', - 'info_dict': { - 'id': '1698447', - 'ext': 'mp4', - 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1', - } - }, { - 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(unescapeHTML(self._search_regex( - [r'flashPlayerOptions\s*=\s*(["\'])(?P<json>(?:(?!\1).)+)\1', - r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P<json>[^"]+)"'], - webpage, 'player data', group='json')), - display_id)['config']['initial_video'] - - video_id = video_data['id'] - video_title = video_data['title'] - - parts = [] - for part in video_data.get('parts', []): - part_id = part['id'] - part_title = part['title'] - - formats = [] - for source in part.get('sources', []): - source_url = source.get('src') - if not source_url: - continue - ext = determine_ext(source_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, part_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': source.get('delivery'), - 'url': source_url, - }) - self._sort_formats(formats) - - parts.append({ - 'id': part_id, - 'title': part_title, - 'thumbnail': part.get('preview_image_url'), - 'duration': int_or_none(part.get('duration')), - 'is_live': part.get('is_livestream'), - 'formats': formats, - }) - - return { - '_type': 'multi_video', - 'id': video_id, - 'title': video_title, - 'entries': parts, - } diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py deleted file mode 100644 index 3a7700cd4..000000000 --- a/youtube_dl/extractor/awaan.py +++ /dev/null @@ -1,187 +0,0 @@ -# coding: 
utf-8 -from __future__ import unicode_literals - -import re -import base64 - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_str, -) -from ..utils import ( - int_or_none, - parse_iso8601, - smuggle_url, - unsmuggle_url, - urlencode_postdata, -) - - -class AWAANIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' - - def _real_extract(self, url): - show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() - if video_id and int(video_id) > 0: - return self.url_result( - 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo') - elif season_id and int(season_id) > 0: - return self.url_result(smuggle_url( - 'http://awaan.ae/program/season/%s' % season_id, - {'show_id': show_id}), 'AWAANSeason') - else: - return self.url_result( - 'http://awaan.ae/program/%s' % show_id, 'AWAANSeason') - - -class AWAANBaseIE(InfoExtractor): - def _parse_video_data(self, video_data, video_id, is_live): - title = video_data.get('title_en') or video_data['title_ar'] - img = video_data.get('img') - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, - 'duration': int_or_none(video_data.get('duration')), - 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), - 'is_live': is_live, - 'uploader_id': video_data.get('user_id'), - } - - -class AWAANVideoIE(AWAANBaseIE): - IE_NAME = 'awaan:video' - _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', - 'md5': '5f61c33bfc7794315c671a62d43116aa', - 'info_dict': - { - 'id': '17375', - 'ext': 'mp4', - 'title': 'رحلة العمر : الحلقة 1', - 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', - 'duration': 2041, - 'timestamp': 1227504126, - 'upload_date': '20081124', - 'uploader_id': '71', - }, - }, { - 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_data = self._download_json( - 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, - video_id, headers={'Origin': 'http://awaan.ae'}) - info = self._parse_video_data(video_data, video_id, False) - - embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' 
+ compat_urllib_parse_urlencode({ - 'id': video_data['id'], - 'user_id': video_data['user_id'], - 'signature': video_data['signature'], - 'countries': 'Q0M=', - 'filter': 'DENY', - }) - info.update({ - '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'MangomoloVideo', - }) - return info - - -class AWAANLiveIE(AWAANBaseIE): - IE_NAME = 'awaan:live' - _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' - _TEST = { - 'url': 'http://awaan.ae/live/6/dubai-tv', - 'info_dict': { - 'id': '6', - 'ext': 'mp4', - 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'upload_date': '20150107', - 'timestamp': 1420588800, - 'uploader_id': '71', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - channel_id = self._match_id(url) - - channel_data = self._download_json( - 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, - channel_id, headers={'Origin': 'http://awaan.ae'}) - info = self._parse_video_data(channel_data, channel_id, True) - - embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse_urlencode({ - 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), - 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), - 'signature': channel_data['signature'], - 'countries': 'Q0M=', - 'filter': 'DENY', - }) - info.update({ - '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'MangomoloLive', - }) - return info - - -class AWAANSeasonIE(InfoExtractor): - IE_NAME = 'awaan:season' - _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' - _TEST = { - 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', - 'info_dict': - { - 'id': '7910', - 'title': 'محاضرات الشيخ الشعراوي', - }, - 'playlist_mincount': 27, - } - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - show_id, season_id = re.match(self._VALID_URL, url).groups() - - data = {} - if season_id: - data['season'] = season_id - show_id = smuggled_data.get('show_id') - if show_id is None: - season = self._download_json( - 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, - season_id, headers={'Origin': 'http://awaan.ae'}) - show_id = season['id'] - data['show_id'] = show_id - show = self._download_json( - 'http://admin.mangomolo.com/analytics/index.php/plus/show', - show_id, data=urlencode_postdata(data), headers={ - 'Origin': 'http://awaan.ae', - 'Content-Type': 'application/x-www-form-urlencoded' - }) - if not season_id: - season_id = show['default_season'] - for season in show['seasons']: - if season['id'] == season_id: - title = season.get('title_en') or season['title_ar'] - - entries = [] - for video in show['videos']: - video_id = compat_str(video['id']) - entries.append(self.url_result( - 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id)) - - return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py deleted file mode 100644 index 930266990..000000000 --- a/youtube_dl/extractor/azmedien.py +++ /dev/null @@ -1,66 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from .kaltura import KalturaIE 
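The AZMedienIE extractor that follows resolves an article to a Kaltura entry in two steps: it takes the entry id straight from the page fragment (#video=<entry_id>) when present, and otherwise queries a GraphQL endpoint whose reply nests the id under data.context.mainAsset.video.kaltura.kalturaId, before delegating to KalturaIE. A minimal, self-contained sketch of that handoff, using only the hard-coded partner id and reply shape visible in the extractor below; sample_reply is made-up stand-in data, not a real API response:

    _PARTNER_ID = '1719221'  # Kaltura partner id hard-coded in AZMedienIE below

    def kaltura_spec(fragment_entry_id, api_reply):
        # Mirror _real_extract: prefer the id from the #video= URL fragment,
        # otherwise dig it out of the GraphQL reply.
        entry_id = fragment_entry_id or api_reply['data']['context']['mainAsset']['video']['kaltura']['kalturaId']
        return 'kaltura:%s:%s' % (_PARTNER_ID, entry_id)

    sample_reply = {'data': {'context': {'mainAsset': {'video': {'kaltura': {'kalturaId': '1_anruz3wy'}}}}}}
    assert kaltura_spec(None, sample_reply) == 'kaltura:1719221:1_anruz3wy'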
- - -class AZMedienIE(InfoExtractor): - IE_DESC = 'AZ Medien videos' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P<host> - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - [^/]+/ - (?P<id> - [^/]+-(?P<article_id>\d+) - ) - (?: - \#video= - (?P<kaltura_id> - [_0-9a-z]+ - ) - )? - ''' - - _TESTS = [{ - 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', - 'info_dict': { - 'id': '1_anruz3wy', - 'ext': 'mp4', - 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', - 'uploader_id': 'TVOnline', - 'upload_date': '20180930', - 'timestamp': 1538328802, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', - 'only_matching': True - }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' - _PARTNER_ID = '1719221' - - def _real_extract(self, url): - host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups() - - if not entry_id: - entry_id = self._download_json( - self._API_TEMPL % (host, host.split('.')[0]), display_id, query={ - 'variables': json.dumps({ - 'contextId': 'NewsArticle:' + article_id, - }), - })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] - - return self.url_result( - 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), - ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py deleted file mode 100644 index 234a661d3..000000000 --- a/youtube_dl/extractor/baidu.py +++ /dev/null @@ -1,56 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import unescapeHTML - - -class BaiduVideoIE(InfoExtractor): - IE_DESC = '百度视频' - _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' - _TESTS = [{ - 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', - 'info_dict': { - 'id': '1069', - 'title': '中华小当家 TV版国语', - 'description': 'md5:51be07afe461cf99fa61231421b5397c', - }, - 'playlist_count': 52, - }, { - 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand', - 'info_dict': { - 'id': '11595', - 'title': 're:^奔跑吧兄弟', - 'description': 'md5:1bf88bad6d850930f542d51547c089b8', - }, - 'playlist_mincount': 12, - }] - - def _call_api(self, path, category, playlist_id, note): - return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % ( - path, category, playlist_id), playlist_id, note) - - def _real_extract(self, url): - category, playlist_id = re.match(self._VALID_URL, url).groups() - if category == 'show': - category = 'tvshow' - if category == 'tv': - category = 'tvplay' - - playlist_detail = self._call_api( - 'xqinfo', category, playlist_id, 'Download playlist JSON metadata') - - playlist_title = playlist_detail['title'] - playlist_description = unescapeHTML(playlist_detail.get('intro')) - - episodes_detail = self._call_api( - 'xqsingle', category, playlist_id, 'Download episodes JSON metadata') - - entries = [self.url_result( - episode['url'], video_title=episode['title'] - ) for episode in episodes_detail['videos']] - - return self.playlist_result( - entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py deleted file mode 100644 index 006aab3b4..000000000 --- a/youtube_dl/extractor/bandcamp.py +++ /dev/null @@ -1,391 +0,0 @@ -# 
coding: utf-8 -from __future__ import unicode_literals - -import random -import re -import time - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - KNOWN_EXTENSIONS, - parse_filesize, - str_or_none, - try_get, - update_url_query, - unified_strdate, - unified_timestamp, - url_or_none, - urljoin, -) - - -class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', - 'md5': 'c557841d5e50261777a6585648adf439', - 'info_dict': { - 'id': '1812978515', - 'ext': 'mp3', - 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", - 'duration': 9.8485, - 'uploader': 'youtube-dl "\'/\\ä↭', - 'upload_date': '20121129', - 'timestamp': 1354224127, - }, - '_skip': 'There is a limit of 200 free downloads / month for the test song' - }, { - # free download - 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'info_dict': { - 'id': '2650410135', - 'ext': 'aiff', - 'title': 'Ben Prunty - Lanius (Battle)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Ben Prunty', - 'timestamp': 1396508491, - 'upload_date': '20140403', - 'release_timestamp': 1396483200, - 'release_date': '20140403', - 'duration': 260.877, - 'track': 'Lanius (Battle)', - 'track_number': 1, - 'track_id': '2650410135', - 'artist': 'Ben Prunty', - 'album': 'FTL: Advanced Edition Soundtrack', - }, - }, { - # no free download, mp3 128 - 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', - 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7', - 'info_dict': { - 'id': '2584466013', - 'ext': 'mp3', - 'title': 'Mastodon - Hail to Fire', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Mastodon', - 'timestamp': 1322005399, - 'upload_date': '20111122', - 'release_timestamp': 1076112000, - 'release_date': '20040207', - 'duration': 120.79, - 'track': 'Hail to Fire', - 'track_number': 5, - 'track_id': '2584466013', - 'artist': 'Mastodon', - 'album': 'Call of the Mastodon', - }, - }] - - def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): - return self._parse_json(self._html_search_regex( - r'data-%s=(["\'])({.+?})\1' % attr, webpage, - attr + ' data', group=2), video_id, fatal=fatal) - - def _real_extract(self, url): - title = self._match_id(url) - webpage = self._download_webpage(url, title) - tralbum = self._extract_data_attr(webpage, title) - thumbnail = self._og_search_thumbnail(webpage) - - track_id = None - track = None - track_number = None - duration = None - - formats = [] - track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) - if track_info: - file_ = track_info.get('file') - if isinstance(file_, dict): - for format_id, format_url in file_.items(): - if not url_or_none(format_url): - continue - ext, abr_str = format_id.split('-', 1) - formats.append({ - 'format_id': format_id, - 'url': self._proto_relative_url(format_url, 'http:'), - 'ext': ext, - 'vcodec': 'none', - 'acodec': ext, - 'abr': int_or_none(abr_str), - }) - track = track_info.get('title') - track_id = str_or_none( - track_info.get('track_id') or track_info.get('id')) - track_number = int_or_none(track_info.get('track_num')) - duration = float_or_none(track_info.get('duration')) - - embed = self._extract_data_attr(webpage, title, 'embed', False) - current = tralbum.get('current') or {} - artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') - timestamp = 
unified_timestamp( - current.get('publish_date') or tralbum.get('album_publish_date')) - - download_link = tralbum.get('freeDownloadPage') - if download_link: - track_id = compat_str(tralbum['id']) - - download_webpage = self._download_webpage( - download_link, track_id, 'Downloading free downloads page') - - blob = self._extract_data_attr(download_webpage, track_id, 'blob') - - info = try_get( - blob, (lambda x: x['digital_items'][0], - lambda x: x['download_items'][0]), dict) - if info: - downloads = info.get('downloads') - if isinstance(downloads, dict): - if not track: - track = info.get('title') - if not artist: - artist = info.get('artist') - if not thumbnail: - thumbnail = info.get('thumb_url') - - download_formats = {} - download_formats_list = blob.get('download_formats') - if isinstance(download_formats_list, list): - for f in blob['download_formats']: - name, ext = f.get('name'), f.get('file_extension') - if all(isinstance(x, compat_str) for x in (name, ext)): - download_formats[name] = ext.strip('.') - - for format_id, f in downloads.items(): - format_url = f.get('url') - if not format_url: - continue - # Stat URL generation algorithm is reverse engineered from - # download_*_bundle_*.js - stat_url = update_url_query( - format_url.replace('/download/', '/statdownload/'), { - '.rand': int(time.time() * 1000 * random.random()), - }) - format_id = f.get('encoding_name') or format_id - stat = self._download_json( - stat_url, track_id, 'Downloading %s JSON' % format_id, - transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], - fatal=False) - if not stat: - continue - retry_url = url_or_none(stat.get('retry_url')) - if not retry_url: - continue - formats.append({ - 'url': self._proto_relative_url(retry_url, 'http:'), - 'ext': download_formats.get(format_id), - 'format_id': format_id, - 'format_note': f.get('description'), - 'filesize': parse_filesize(f.get('size_mb')), - 'vcodec': 'none', - }) - - self._sort_formats(formats) - - title = '%s - %s' % (artist, track) if artist else track - - if not duration: - duration = float_or_none(self._html_search_meta( - 'duration', webpage, default=None)) - - return { - 'id': track_id, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': artist, - 'timestamp': timestamp, - 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), - 'duration': duration, - 'track': track, - 'track_number': track_number, - 'track_id': track_id, - 'artist': artist, - 'album': embed.get('album_title'), - 'formats': formats, - } - - -class BandcampAlbumIE(BandcampIE): - IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' - - _TESTS = [{ - 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', - 'playlist': [ - { - 'md5': '39bc1eded3476e927c724321ddf116cf', - 'info_dict': { - 'id': '1353101989', - 'ext': 'mp3', - 'title': 'Blazo - Intro', - 'timestamp': 1311756226, - 'upload_date': '20110727', - 'uploader': 'Blazo', - } - }, - { - 'md5': '1a2c32e2691474643e912cc6cd4bffaa', - 'info_dict': { - 'id': '38097443', - 'ext': 'mp3', - 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', - 'timestamp': 1311757238, - 'upload_date': '20110727', - 'uploader': 'Blazo', - } - }, - ], - 'info_dict': { - 'title': 'Jazz Format Mixtape vol.1', - 'id': 'jazz-format-mixtape-vol-1', - 'uploader_id': 'blazo', - }, - 'params': { - 'playlistend': 2 - }, - 'skip': 'Bandcamp imposes download limits.' 
- }, { - 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave', - 'info_dict': { - 'title': 'Hierophany of the Open Grave', - 'uploader_id': 'nightbringer', - 'id': 'hierophany-of-the-open-grave', - }, - 'playlist_mincount': 9, - }, { - 'url': 'http://dotscale.bandcamp.com', - 'info_dict': { - 'title': 'Loom', - 'id': 'dotscale', - 'uploader_id': 'dotscale', - }, - 'playlist_mincount': 7, - }, { - # with escaped quote in title - 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', - 'info_dict': { - 'title': '"Entropy" EP', - 'uploader_id': 'jstrecords', - 'id': 'entropy-ep', - 'description': 'md5:0ff22959c943622972596062f2f366a5', - }, - 'playlist_mincount': 3, - }, { - # not all tracks have songs - 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague', - 'info_dict': { - 'id': 'we-are-the-plague', - 'title': 'WE ARE THE PLAGUE', - 'uploader_id': 'insulters', - 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', - }, - 'playlist_count': 2, - }] - - @classmethod - def suitable(cls, url): - return (False - if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) - else super(BandcampAlbumIE, cls).suitable(url)) - - def _real_extract(self, url): - uploader_id, album_id = re.match(self._VALID_URL, url).groups() - playlist_id = album_id or uploader_id - webpage = self._download_webpage(url, playlist_id) - tralbum = self._extract_data_attr(webpage, playlist_id) - track_info = tralbum.get('trackinfo') - if not track_info: - raise ExtractorError('The page doesn\'t contain any tracks') - # Only tracks with duration info have songs - entries = [ - self.url_result( - urljoin(url, t['title_link']), BandcampIE.ie_key(), - str_or_none(t.get('track_id') or t.get('id')), t.get('title')) - for t in track_info - if t.get('duration')] - - current = tralbum.get('current') or {} - - return { - '_type': 'playlist', - 'uploader_id': uploader_id, - 'id': playlist_id, - 'title': current.get('title'), - 'description': current.get('about'), - 'entries': entries, - } - - -class BandcampWeeklyIE(BandcampIE): - IE_NAME = 'Bandcamp:weekly' - _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://bandcamp.com/?show=224', - 'md5': 'b00df799c733cf7e0c567ed187dea0fd', - 'info_dict': { - 'id': '224', - 'ext': 'opus', - 'title': 'BC Weekly April 4th 2017 - Magic Moments', - 'description': 'md5:5d48150916e8e02d030623a48512c874', - 'duration': 5829.77, - 'release_date': '20170404', - 'series': 'Bandcamp Weekly', - 'episode': 'Magic Moments', - 'episode_id': '224', - }, - 'params': { - 'format': 'opus-lo', - }, - }, { - 'url': 'https://bandcamp.com/?blah/blah@&show=228', - 'only_matching': True - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) - - blob = self._extract_data_attr(webpage, show_id, 'blob') - - show = blob['bcw_data'][show_id] - - formats = [] - for format_id, format_url in show['audio_stream'].items(): - if not url_or_none(format_url): - continue - for known_ext in KNOWN_EXTENSIONS: - if known_ext in format_id: - ext = known_ext - break - else: - ext = None - formats.append({ - 'format_id': format_id, - 'url': format_url, - 'ext': ext, - 'vcodec': 'none', - }) - self._sort_formats(formats) - - title = show.get('audio_title') or 'Bandcamp Weekly' - subtitle = show.get('subtitle') - if subtitle: - title += ' - %s' % subtitle - - return { - 'id': show_id, - 'title': title, - 'description': show.get('desc') or show.get('short_desc'), - 'duration': 
float_or_none(show.get('audio_duration')), - 'is_live': False, - 'release_date': unified_strdate(show.get('published_date')), - 'series': 'Bandcamp Weekly', - 'episode': show.get('subtitle'), - 'episode_id': show_id, - 'formats': formats - } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py deleted file mode 100644 index 247d982ce..000000000 --- a/youtube_dl/extractor/bbc.py +++ /dev/null @@ -1,1623 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import functools -import itertools -import json -import re - -from .common import InfoExtractor -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_parse_qs, - compat_str, - compat_urllib_parse_urlparse, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - OnDemandPagedList, - clean_html, - dict_get, - float_or_none, - get_element_by_class, - int_or_none, - js_to_json, - parse_duration, - parse_iso8601, - strip_or_none, - try_get, - unescapeHTML, - unified_timestamp, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class BBCCoUkIE(InfoExtractor): - IE_NAME = 'bbc.co.uk' - IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?bbc\.co\.uk/ - (?: - programmes/(?!articles/)| - iplayer(?:/[^/]+)?/(?:episode/|playlist/)| - music/(?:clips|audiovideo/popular)[/#]| - radio/player/| - sounds/play/| - events/[^/]+/play/[^/]+/ - ) - (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) - ''' % _ID_REGEX - - _LOGIN_URL = 'https://account.bbc.com/signin' - _NETRC_MACHINE = 'bbc' - - _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' - _MEDIA_SETS = [ - # Provides HQ HLS streams with even better quality that pc mediaset but fails - # with geolocation in some cases when it's even not geo restricted at all (e.g. - # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. - 'iptv-all', - 'pc', - ] - - _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - - _TESTS = [ - { - 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', - 'info_dict': { - 'id': 'b039d07m', - 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', - 'description': 'The Canadian poet and songwriter reflects on his musical career.', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Man in Black: Series 3: The Printed Name', - 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. 
Stars Ewan Bailey.", - 'duration': 1800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Episode is no longer available on BBC iPlayer Radio', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.', - 'duration': 5100, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', - 'info_dict': { - 'id': 'b03k3pb7', - 'ext': 'flv', - 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", - 'description': '2. Invasion', - 'duration': 3600, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, { - 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', - 'info_dict': { - 'id': 'b04v209v', - 'ext': 'flv', - 'title': 'Pete Tong, The Essential New Tune Special', - 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", - 'duration': 10800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Episode is no longer available on BBC iPlayer Radio', - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', - 'note': 'Audio', - 'info_dict': { - 'id': 'p022h44j', - 'ext': 'flv', - 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', - 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", - 'duration': 227, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', - 'note': 'Video', - 'info_dict': { - 'id': 'p025c103', - 'ext': 'flv', - 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', - 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', - 'duration': 226, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', - 'info_dict': { - 'id': 'p02n76xf', - 'ext': 'flv', - 'title': 'Natural World, 2015-2016: 2. 
Super Powered Owls', - 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', - 'duration': 3540, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'geolocation', - }, { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', - 'info_dict': { - 'id': 'b05zmgw1', - 'ext': 'flv', - 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', - 'title': 'Royal Academy Summer Exhibition', - 'duration': 3540, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'geolocation', - }, { - # iptv-all mediaset fails with geolocation however there is no geo restriction - # for this programme at all - 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', - 'info_dict': { - 'id': 'b06rkms3', - 'ext': 'flv', - 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", - 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Now it\'s really geo-restricted', - }, { - # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147) - 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', - 'info_dict': { - 'id': 'p028bfkj', - 'ext': 'flv', - 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', - 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', - 'note': 'Audio', - 'info_dict': { - 'id': 'm0007jz9', - 'ext': 'mp4', - 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', - 'description': "Live BBC Proms. 
West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", - 'duration': 9840, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', - 'only_matching': True, - }, { - 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', - 'only_matching': True, - }, { - 'url': 'https://www.bbc.co.uk/programmes/m00005xn', - 'only_matching': True, - }, { - 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', - 'only_matching': True, - }] - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading signin page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password, - }) - - post_url = urljoin(self._LOGIN_URL, self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=self._LOGIN_URL, group='url')) - - response, urlh = self._download_webpage_handle( - post_url, None, 'Logging in', data=urlencode_postdata(login_form), - headers={'Referer': self._LOGIN_URL}) - - if self._LOGIN_URL in urlh.geturl(): - error = clean_html(get_element_by_class('form-message', response)) - if error: - raise ExtractorError( - 'Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') - - def _real_initialize(self): - self._login() - - class MediaSelectionError(Exception): - def __init__(self, id): - self.id = id - - def _extract_asx_playlist(self, connection, programme_id): - asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') - return [ref.get('href') for ref in asx.findall('./Entry/ref')] - - def _extract_items(self, playlist): - return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - - def _extract_medias(self, media_selection): - error = media_selection.get('result') - if error: - raise BBCCoUkIE.MediaSelectionError(error) - return media_selection.get('media') or [] - - def _extract_connections(self, media): - return media.get('connection') or [] - - def _get_subtitles(self, media, programme_id): - subtitles = {} - for connection in self._extract_connections(media): - cc_url = url_or_none(connection.get('href')) - if not cc_url: - continue - captions = self._download_xml( - cc_url, programme_id, 'Downloading captions', fatal=False) - if not isinstance(captions, compat_etree_Element): - continue - subtitles['en'] = [ - { - 'url': connection.get('href'), - 'ext': 'ttml', - }, - ] - break - return subtitles - - def _raise_extractor_error(self, media_selection_error): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), - expected=True) - - def _download_media_selector(self, programme_id): - last_exception = None - for media_set in self._MEDIA_SETS: - try: - return self._download_media_selector_url( - self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) - except BBCCoUkIE.MediaSelectionError as e: - if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): - last_exception = e - continue - 
self._raise_extractor_error(e) - self._raise_extractor_error(last_exception) - - def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_json( - url, programme_id, 'Downloading media selection JSON', - expected_status=(403, 404)) - return self._process_media_selector(media_selection, programme_id) - - def _process_media_selector(self, media_selection, programme_id): - formats = [] - subtitles = None - urls = [] - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind in ('video', 'audio'): - bitrate = int_or_none(media.get('bitrate')) - encoding = media.get('encoding') - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - file_size = int_or_none(media.get('media_file_size')) - for connection in self._extract_connections(media): - href = connection.get('href') - if href in urls: - continue - if href: - urls.append(href) - conn_kind = connection.get('kind') - protocol = connection.get('protocol') - supplier = connection.get('supplier') - transfer_format = connection.get('transferFormat') - format_id = supplier or conn_kind or protocol - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, format_id), - }) - elif transfer_format == 'dash': - formats.extend(self._extract_mpd_formats( - href, programme_id, mpd_id=format_id, fatal=False)) - elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) - elif transfer_format == 'hds': - formats.extend(self._extract_f4m_formats( - href, programme_id, f4m_id=format_id, fatal=False)) - else: - if not supplier and bitrate: - format_id += '-%d' % bitrate - fmt = { - 'format_id': format_id, - 'filesize': file_size, - } - if kind == 'video': - fmt.update({ - 'width': width, - 'height': height, - 'tbr': bitrate, - 'vcodec': encoding, - }) - else: - fmt.update({ - 'abr': bitrate, - 'acodec': encoding, - 'vcodec': 'none', - }) - if protocol in ('http', 'https'): - # Direct link - fmt.update({ - 'url': href, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - fmt.update({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - }) - else: - continue - formats.append(fmt) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - return formats, subtitles - - def _download_playlist(self, playlist_id): - try: - playlist = self._download_json( - 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, - playlist_id, 'Downloading playlist JSON') - - version = playlist.get('defaultAvailableVersion') - if version: - smp_config = version['smpConfig'] - title = smp_config['title'] - description = smp_config['summary'] - for item in smp_config['items']: - kind = item['kind'] - if kind not in ('programme', 'radioProgramme'): - continue - programme_id = item.get('vpid') - duration = int_or_none(item.get('duration')) - formats, subtitles = 
self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles - except ExtractorError as ee: - if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): - raise - - # fallback to legacy playlist - return self._process_legacy_playlist(playlist_id) - - def _process_legacy_playlist_url(self, url, display_id): - playlist = self._download_legacy_playlist_url(url, display_id) - return self._extract_from_legacy_playlist(playlist, display_id) - - def _process_legacy_playlist(self, playlist_id): - return self._process_legacy_playlist_url( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) - - def _download_legacy_playlist_url(self, url, playlist_id=None): - return self._download_xml( - url, playlist_id, 'Downloading legacy playlist XML') - - def _extract_from_legacy_playlist(self, playlist, playlist_id): - no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) - if no_items is not None: - reason = no_items.get('reason') - if reason == 'preAvailability': - msg = 'Episode %s is not yet available' % playlist_id - elif reason == 'postAvailability': - msg = 'Episode %s is no longer available' % playlist_id - elif reason == 'noMedia': - msg = 'Episode %s is not currently available' % playlist_id - else: - msg = 'Episode %s is not available: %s' % (playlist_id, reason) - raise ExtractorError(msg, expected=True) - - for item in self._extract_items(playlist): - kind = item.get('kind') - if kind not in ('programme', 'radioProgramme'): - continue - title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text - description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) - description = description_el.text if description_el is not None else None - - def get_programme_id(item): - def get_from_attributes(item): - for p in ('identifier', 'group'): - value = item.get(p) - if value and re.match(r'^[pb][\da-z]{7}$', value): - return value - get_from_attributes(item) - mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) - if mediator is not None: - return get_from_attributes(mediator) - - programme_id = get_programme_id(item) - duration = int_or_none(item.get('duration')) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - else: - formats, subtitles = self._process_media_selector(item, playlist_id) - programme_id = playlist_id - - return programme_id, title, description, duration, formats, subtitles - - def _real_extract(self, url): - group_id = self._match_id(url) - - webpage = self._download_webpage(url, group_id, 'Downloading video page') - - error = self._search_regex( - r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', - webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - - programme_id = None - duration = None - - tviplayer = self._search_regex( - r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', - webpage, 'player', default=None) - - if tviplayer: - player = self._parse_json(tviplayer, group_id).get('player', {}) - duration = int_or_none(player.get('duration')) - programme_id = player.get('vpid') - - if not programme_id: - programme_id = self._search_regex( - r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage, default=None) or self._html_search_regex( - (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', - 
r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title') - description = self._search_regex( - (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', - r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'), - webpage, 'description', default=None) - if not description: - description = self._html_search_meta('description', webpage) - else: - programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - - self._sort_formats(formats) - - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } - - -class BBCIE(BBCCoUkIE): - IE_NAME = 'bbc' - IE_DESC = 'BBC' - _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' - - _MEDIA_SETS = [ - 'mobile-tablet-main', - 'pc', - ] - - _TESTS = [{ - # article with multiple videos embedded with data-playable containing vpids - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', - 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', - }, - 'playlist_count': 2, - }, { - # article with multiple videos embedded with data-playable (more videos) - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - 'description': 'BBC reports and video highlights at the Farnborough Airshow.', - }, - 'playlist_count': 9, - 'skip': 'Save time', - }, { - # article with multiple videos embedded with `new SMP()` - # broken - 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', - 'info_dict': { - 'id': '3662a707-0af9-3149-963f-47bea720b460', - 'title': 'BUGGER', - }, - 'playlist_count': 18, - }, { - # single video embedded with data-playable containing vpid - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'md5:2868290467291b37feda7863f7a83f54', - 'duration': 47, - 'timestamp': 1427219242, - 'upload_date': '20150324', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - # article with single video embedded with data-playable containing XML playlist - # with direct video links as progressiveDownloadUrl (for now these are extracted) - # and playlist with f4m and m3u8 as streamingUrl - 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', - 'info_dict': { - 'id': '150615_telabyad_kentin_cogu', - 'ext': 'mp4', - 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", - 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', - 'timestamp': 1434397334, - 'upload_date': '20150615', - }, - 'params': { - 'skip_download': True, - } - }, { - # single video embedded with data-playable containing XML playlists (regional section) - 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', - 'info_dict': { - 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', - 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', - 'timestamp': 1434713142, - 'upload_date': '20150619', - }, - 'params': { - 'skip_download': True, - } - }, { - # single video from video 
playlist embedded with vxp-playlist-data JSON - 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', - 'info_dict': { - 'id': 'p02w6qjc', - 'ext': 'mp4', - 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', - 'duration': 56, - 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', - }, - 'params': { - 'skip_download': True, - } - }, { - # single video story with digitalData - 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', - 'info_dict': { - 'id': 'p02q6gc4', - 'ext': 'flv', - 'title': 'Sri Lanka’s spicy secret', - 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', - 'timestamp': 1437674293, - 'upload_date': '20150723', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - # single video story without digitalData - 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', - 'info_dict': { - 'id': 'p018zqqg', - 'ext': 'mp4', - 'title': 'Hyundai Santa Fe Sport: Rock star', - 'description': 'md5:b042a26142c4154a6e472933cf20793d', - 'timestamp': 1415867444, - 'upload_date': '20141113', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - # single video embedded with Morph - 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', - 'info_dict': { - 'id': 'p041vhd0', - 'ext': 'mp4', - 'title': "Nigeria v Japan - Men's First Round", - 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', - 'duration': 7980, - 'uploader': 'BBC Sport', - 'uploader_id': 'bbc_sport', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Georestricted to UK', - }, { - # single video with playlist.sxml URL in playlist param - 'url': 'http://www.bbc.com/sport/0/football/33653409', - 'info_dict': { - 'id': 'p02xycnp', - 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', - 'duration': 140, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - # article with multiple videos embedded with playlist.sxml in playlist param - 'url': 'http://www.bbc.com/sport/0/football/34475836', - 'info_dict': { - 'id': '34475836', - 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', - 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', - }, - 'playlist_count': 3, - }, { - # school report article with single video - 'url': 'http://www.bbc.co.uk/schoolreport/35744779', - 'info_dict': { - 'id': '35744779', - 'title': 'School which breaks down barriers in Jerusalem', - }, - 'playlist_count': 1, - }, { - # single video with playlist URL from weather section - 'url': 'http://www.bbc.com/weather/features/33601775', - 'only_matching': True, - }, { - # custom redirection to www.bbc.com - # also, video with window.__INITIAL_DATA__ - 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'info_dict': { - 'id': 'p02xzws1', - 'ext': 'mp4', - 'title': "Pluto may have 'nitrogen glaciers'", - 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', - 'thumbnail': r're:https?://.+/.+\.jpg', - 'timestamp': 1437785037, - 'upload_date': '20150725', - }, - }, { - # single video article 
embedded with data-media-vpid - 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', - 'only_matching': True, - }, { - 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', - 'info_dict': { - 'id': 'p06556y7', - 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', - }, - 'params': { - 'skip_download': True, - } - }, { - # window.__PRELOADED_STATE__ - 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', - 'info_dict': { - 'id': 'b0b9z4vz', - 'ext': 'mp4', - 'title': 'Prom 6: An American in Paris and Turangalila', - 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', - 'uploader': 'Radio 3', - 'uploader_id': 'bbc_radio_three', - }, - }, { - 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', - 'info_dict': { - 'id': 'p06w9tws', - 'ext': 'mp4', - 'title': 'md5:2fabf12a726603193a2879a055f72514', - 'description': 'Learn English words and phrases from this story', - }, - 'add_ie': [BBCCoUkIE.ie_key()], - }, { - # BBC Reel - 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness', - 'info_dict': { - 'id': 'p07c6sb9', - 'ext': 'mp4', - 'title': 'How positive thinking is harming your happiness', - 'alt_title': 'The downsides of positive thinking', - 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', - 'duration': 235, - 'thumbnail': r're:https?://.+/p07c9dsr.jpg', - 'upload_date': '20190604', - 'categories': ['Psychology'], - }, - }] - - @classmethod - def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) - return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) - else super(BBCIE, cls).suitable(url)) - - def _extract_from_media_meta(self, media_meta, video_id): - # Direct links to media in media metadata (e.g. 
- # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml - source_files = media_meta.get('sourceFiles') - if source_files: - return [{ - 'url': f['url'], - 'format_id': format_id, - 'ext': f.get('encoding'), - 'tbr': float_or_none(f.get('bitrate'), 1000), - 'filesize': int_or_none(f.get('filesize')), - } for format_id, f in source_files.items() if f.get('url')], [] - - programme_id = media_meta.get('externalId') - if programme_id: - return self._download_media_selector(programme_id) - - # Process playlist.sxml as legacy playlist - href = media_meta.get('href') - if href: - playlist = self._download_legacy_playlist_url(href) - _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) - return formats, subtitles - - return [], [] - - def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): - programme_id, title, description, duration, formats, subtitles = \ - self._process_legacy_playlist_url(url, playlist_id) - self._sort_formats(formats) - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) - timestamp = json_ld_info.get('timestamp') - - playlist_title = json_ld_info.get('title') - if not playlist_title: - playlist_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'playlist title', default=None) - if playlist_title: - playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() - - playlist_description = json_ld_info.get( - 'description') or self._og_search_description(webpage, default=None) - - if not timestamp: - timestamp = parse_iso8601(self._search_regex( - [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', - r'itemprop="datePublished"[^>]+datetime="([^"]+)"', - r'"datePublished":\s*"([^"]+)'], - webpage, 'date', default=None)) - - entries = [] - - # article with multiple videos embedded with playlist.sxml (e.g. - # http://www.bbc.com/sport/0/football/34475836) - playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) - playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) - if playlists: - entries = [ - self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) - for playlist_url in playlists] - - # news article with multiple videos embedded with data-playable - data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) - if data_playables: - for _, data_playable_json in data_playables: - data_playable = self._parse_json( - unescapeHTML(data_playable_json), playlist_id, fatal=False) - if not data_playable: - continue - settings = data_playable.get('settings', {}) - if settings: - # data-playable with video vpid in settings.playlistObject.items (e.g. 
- # http://www.bbc.com/news/world-us-canada-34473351) - playlist_object = settings.get('playlistObject', {}) - if playlist_object: - items = playlist_object.get('items') - if items and isinstance(items, list): - title = playlist_object['title'] - description = playlist_object.get('summary') - duration = int_or_none(items[0].get('duration')) - programme_id = items[0].get('vpid') - formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) - entries.append({ - 'id': programme_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - }) - else: - # data-playable without vpid but with a playlist.sxml URLs - # in otherSettings.playlist (e.g. - # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) - playlist = data_playable.get('otherSettings', {}).get('playlist', {}) - if playlist: - entry = None - for key in ('streaming', 'progressiveDownload'): - playlist_url = playlist.get('%sUrl' % key) - if not playlist_url: - continue - try: - info = self._extract_from_playlist_sxml( - playlist_url, playlist_id, timestamp) - if not entry: - entry = info - else: - entry['title'] = info['title'] - entry['formats'].extend(info['formats']) - except ExtractorError as e: - # Some playlist URL may fail with 500, at the same time - # the other one may work fine (e.g. - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: - continue - raise - if entry: - self._sort_formats(entry['formats']) - entries.append(entry) - - if entries: - return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) - - # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 - group_id = self._search_regex( - r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, - webpage, 'group id', default=None) - if group_id: - return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % group_id, - ie=BBCCoUkIE.ie_key()) - - # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - programme_id = self._search_regex( - [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, - r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, - r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], - webpage, 'vpid', default=None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) - # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star) - digital_data = self._parse_json( - self._search_regex( - r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), - programme_id, fatal=False) - page_info = digital_data.get('page', {}).get('pageInfo', {}) - title = page_info.get('pageName') or self._og_search_title(webpage) - description = page_info.get('description') or self._og_search_description(webpage) - timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - } - - # bbc reel (e.g. 
https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) - initial_data = self._parse_json(self._html_search_regex( - r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)', - webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False) - if initial_data: - init_data = try_get( - initial_data, lambda x: x['initData']['items'][0], dict) or {} - smp_data = init_data.get('smpData') or {} - clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {} - version_id = clip_data.get('versionID') - if version_id: - title = smp_data['title'] - formats, subtitles = self._download_media_selector(version_id) - self._sort_formats(formats) - image_url = smp_data.get('holdingImageURL') - display_date = init_data.get('displayDate') - topic_title = init_data.get('topicTitle') - - return { - 'id': version_id, - 'title': title, - 'formats': formats, - 'alt_title': init_data.get('shortTitle'), - 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None, - 'description': smp_data.get('summary') or init_data.get('shortSummary'), - 'upload_date': display_date.replace('-', '') if display_date else None, - 'subtitles': subtitles, - 'duration': int_or_none(clip_data.get('duration')), - 'categories': [topic_title] if topic_title else None, - } - - # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one - morph_payload = self._parse_json( - self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', - webpage, 'morph payload', default='{}'), - playlist_id, fatal=False) - if morph_payload: - components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] - for component in components: - if not isinstance(component, dict): - continue - lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) - if not lead_media: - continue - identifiers = lead_media.get('identifiers') - if not identifiers or not isinstance(identifiers, dict): - continue - programme_id = identifiers.get('vpid') or identifiers.get('playablePid') - if not programme_id: - continue - title = lead_media.get('title') or self._og_search_title(webpage) - formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) - description = lead_media.get('summary') - uploader = lead_media.get('masterBrand') - uploader_id = lead_media.get('mid') - duration = None - duration_d = lead_media.get('duration') - if isinstance(duration_d, dict): - duration = parse_duration(dict_get( - duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, - 'subtitles': subtitles, - } - - preload_state = self._parse_json(self._search_regex( - r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) - if preload_state: - current_programme = preload_state.get('programmes', {}).get('current') or {} - programme_id = current_programme.get('id') - if current_programme and programme_id and current_programme.get('type') == 'playable_item': - title = current_programme.get('titles', {}).get('tertiary') or playlist_title - formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) - synopses = 
current_programme.get('synopses') or {} - network = current_programme.get('network') or {} - duration = int_or_none( - current_programme.get('duration', {}).get('value')) - thumbnail = None - image_url = current_programme.get('image_url') - if image_url: - thumbnail = image_url.replace('{recipe}', 'raw') - return { - 'id': programme_id, - 'title': title, - 'description': dict_get(synopses, ('long', 'medium', 'short')), - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': network.get('short_title'), - 'uploader_id': network.get('id'), - 'formats': formats, - 'subtitles': subtitles, - } - - bbc3_config = self._parse_json( - self._search_regex( - r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, - 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) or {} - payload = bbc3_config.get('payload') or {} - if payload: - clip = payload.get('currentClip') or {} - clip_vpid = clip.get('vpid') - clip_title = clip.get('title') - if clip_vpid and clip_title: - formats, subtitles = self._download_media_selector(clip_vpid) - self._sort_formats(formats) - return { - 'id': clip_vpid, - 'title': clip_title, - 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), - 'description': clip.get('description'), - 'duration': parse_duration(clip.get('duration')), - 'formats': formats, - 'subtitles': subtitles, - } - bbc3_playlist = try_get( - payload, lambda x: x['content']['bbcMedia']['playlist'], - dict) - if bbc3_playlist: - playlist_title = bbc3_playlist.get('title') or playlist_title - thumbnail = bbc3_playlist.get('holdingImageURL') - entries = [] - for bbc3_item in bbc3_playlist['items']: - programme_id = bbc3_item.get('versionID') - if not programme_id: - continue - formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) - entries.append({ - 'id': programme_id, - 'title': playlist_title, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - }) - return self.playlist_result( - entries, playlist_id, playlist_title, playlist_description) - - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) - if initial_data: - def parse_media(media): - if not media: - return - for item in (try_get(media, lambda x: x['media']['items'], list) or []): - item_id = item.get('id') - item_title = item.get('title') - if not (item_id and item_title): - continue - formats, subtitles = self._download_media_selector(item_id) - self._sort_formats(formats) - item_desc = None - blocks = try_get(media, lambda x: x['summary']['blocks'], list) - if blocks: - summary = [] - for block in blocks: - text = try_get(block, lambda x: x['model']['text'], compat_str) - if text: - summary.append(text) - if summary: - item_desc = '\n\n'.join(summary) - item_time = None - for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: - if try_get(meta, lambda x: x['label']) == 'Published': - item_time = unified_timestamp(meta.get('timestamp')) - break - entries.append({ - 'id': item_id, - 'title': item_title, - 'thumbnail': item.get('holdingImageUrl'), - 'formats': formats, - 'subtitles': subtitles, - 'timestamp': item_time, - 'description': strip_or_none(item_desc), - }) - for resp in (initial_data.get('data') or {}).values(): - name = resp.get('name') - if name == 'media-experience': - parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) - elif name == 'article': - 
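
(Worth spelling out here: throughout this extractor, deeply nested and only half-trusted JSON is read through youtube-dl's try_get(src, getter, expected_type) helper rather than raw indexing. Below is a rough re-implementation of the idea, simplified in that the real helper in youtube_dl/utils.py also accepts a list of getters.)

def try_get(src, getter, expected_type=None):
    # Apply the getter, swallow lookup errors, and type-check the result.
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None

media = {'summary': {'blocks': [{'model': {'text': 'First paragraph.'}}]}}
assert try_get(media, lambda x: x['metadata']['items'], list) is None  # missing key, no crash
assert try_get(media, lambda x: x['summary']['blocks'], list) == media['summary']['blocks']
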
for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): - if block.get('type') != 'media': - continue - parse_media(block.get('model')) - return self.playlist_result( - entries, playlist_id, playlist_title, playlist_description) - - def extract_all(pattern): - return list(filter(None, map( - lambda s: self._parse_json(s, playlist_id, fatal=False), - re.findall(pattern, webpage)))) - - # Multiple video article (e.g. - # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX - entries = [] - for match in extract_all(r'new\s+SMP\(({.+?})\)'): - embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') - if embed_url and re.match(EMBED_URL, embed_url): - entries.append(embed_url) - entries.extend(re.findall( - r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) - if entries: - return self.playlist_result( - [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], - playlist_id, playlist_title, playlist_description) - - # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) - medias = extract_all(r"data-media-meta='({[^']+})'") - - if not medias: - # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) - media_asset = self._search_regex( - r'mediaAssetPage\.init\(\s*({.+?}), "/', - webpage, 'media asset', default=None) - if media_asset: - media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) - medias = [] - for video in media_asset_page.get('videos', {}).values(): - medias.extend(video.values()) - - if not medias: - # Multiple video playlist with single `now playing` entry (e.g. - # http://www.bbc.com/news/video_and_audio/must_see/33767813) - vxp_playlist = self._parse_json( - self._search_regex( - r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>', - webpage, 'playlist data'), - playlist_id) - playlist_medias = [] - for item in vxp_playlist: - media = item.get('media') - if not media: - continue - playlist_medias.append(media) - # Download single video if found media with asset id matching the video id from URL - if item.get('advert', {}).get('assetId') == playlist_id: - medias = [media] - break - # Fallback to the whole playlist - if not medias: - medias = playlist_medias - - entries = [] - for num, media_meta in enumerate(medias, start=1): - formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) - if not formats: - continue - self._sort_formats(formats) - - video_id = media_meta.get('externalId') - if not video_id: - video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) - - title = media_meta.get('caption') - if not title: - title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) - - duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) - - images = [] - for image in media_meta.get('images', {}).values(): - images.extend(image.values()) - if 'image' in media_meta: - images.append(media_meta['image']) - - thumbnails = [{ - 'url': image.get('href'), - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in images] - - entries.append({ - 'id': video_id, - 'title': title, - 'thumbnails': thumbnails, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - }) - - return self.playlist_result(entries, playlist_id, 
playlist_title, playlist_description) - - -class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' - IE_NAME = 'bbc.co.uk:article' - IE_DESC = 'BBC articles' - - _TEST = { - 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', - 'info_dict': { - 'id': '3jNQLTMrPlYGTBn0WV6M2MS', - 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', - 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', - }, - 'playlist_count': 4, - 'add_ie': ['BBCCoUk'], - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage).strip() - - entries = [self.url_result(programme_url) for programme_url in re.findall( - r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] - - return self.playlist_result(entries, playlist_id, title, description) - - -class BBCCoUkPlaylistBaseIE(InfoExtractor): - def _entries(self, webpage, url, playlist_id): - single_page = 'page' in compat_urlparse.parse_qs( - compat_urlparse.urlparse(url).query) - for page_num in itertools.count(2): - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): - yield self.url_result( - self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - if single_page: - return - next_page = self._search_regex( - r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', - webpage, 'next page url', default=None, group='url') - if not next_page: - break - webpage = self._download_webpage( - compat_urlparse.urljoin(url, next_page), playlist_id, - 'Downloading page %d' % page_num, page_num) - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - title, description = self._extract_title_and_description(webpage) - - return self.playlist_result( - self._entries(webpage, url, playlist_id), - playlist_id, title, description) - - -class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): - _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX - - @staticmethod - def _get_default(episode, key, default_key='default'): - return try_get(episode, lambda x: x[key][default_key]) - - def _get_description(self, data): - synopsis = data.get(self._DESCRIPTION_KEY) or {} - return dict_get(synopsis, ('large', 'medium', 'small')) - - def _fetch_page(self, programme_id, per_page, series_id, page): - elements = self._get_elements(self._call_api( - programme_id, per_page, page + 1, series_id)) - for element in elements: - episode = self._get_episode(element) - episode_id = episode.get('id') - if not episode_id: - continue - thumbnail = None - image = self._get_episode_image(episode) - if image: - thumbnail = image.replace('{recipe}', 'raw') - category = self._get_default(episode, 'labels', 'category') - yield { - '_type': 'url', - 'id': episode_id, - 'title': self._get_episode_field(episode, 'subtitle'), - 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, - 'thumbnail': thumbnail, - 'description': self._get_description(episode), - 'categories': [category] if category else None, - 'series': self._get_episode_field(episode, 'title'), - 'ie_key': 
BBCCoUkIE.ie_key(), - } - - def _real_extract(self, url): - pid = self._match_id(url) - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - series_id = qs.get('seriesId', [None])[0] - page = qs.get('page', [None])[0] - per_page = 36 if page else self._PAGE_SIZE - fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) - entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) - playlist_data = self._get_playlist_data(self._call_api(pid, 1)) - return self.playlist_result( - entries, pid, self._get_playlist_title(playlist_data), - self._get_description(playlist_data)) - - -class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:episodes' - _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' - _TESTS = [{ - 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', - 'info_dict': { - 'id': 'b05rcz9v', - 'title': 'The Disappearance', - 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', - }, - 'playlist_mincount': 8, - }, { - # all seasons - 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', - 'info_dict': { - 'id': 'b094m5t9', - 'title': 'Doctor Foster', - 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', - }, - 'playlist_mincount': 10, - }, { - # explicit season - 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', - 'info_dict': { - 'id': 'b094m5t9', - 'title': 'Doctor Foster', - 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', - }, - 'playlist_mincount': 5, - }, { - # all pages - 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', - 'info_dict': { - 'id': 'm0004c4v', - 'title': 'Beechgrove', - 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', - }, - 'playlist_mincount': 37, - }, { - # explicit page - 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', - 'info_dict': { - 'id': 'm0004c4v', - 'title': 'Beechgrove', - 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', - }, - 'playlist_mincount': 1, - }] - _PAGE_SIZE = 100 - _DESCRIPTION_KEY = 'synopsis' - - def _get_episode_image(self, episode): - return self._get_default(episode, 'image') - - def _get_episode_field(self, episode, field): - return self._get_default(episode, field) - - @staticmethod - def _get_elements(data): - return data['entities']['results'] - - @staticmethod - def _get_episode(element): - return element.get('episode') or {} - - def _call_api(self, pid, per_page, page=1, series_id=None): - variables = { - 'id': pid, - 'page': page, - 'perPage': per_page, - } - if series_id: - variables['sliceId'] = series_id - return self._download_json( - 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ - 'Content-Type': 'application/json' - }, data=json.dumps({ - 'id': '5692d93d5aac8d796a0305e895e61551', - 'variables': variables, - }).encode('utf-8'))['data']['programme'] - - @staticmethod - def _get_playlist_data(data): - return data - - def _get_playlist_title(self, data): - return self._get_default(data, 'title') - - -class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:group' - _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' - _TESTS = [{ - # Available for over a year unlike 30 days for most other programmes - 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', - 'info_dict': { - 'id': 'p02tcc32', - 'title': 'Bohemian Icons', - 'description': 
'md5:683e901041b2fe9ba596f2ab04c4dbe7', - }, - 'playlist_mincount': 10, - }, { - # all pages - 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', - 'info_dict': { - 'id': 'p081d7j7', - 'title': 'Music in Scotland', - 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', - }, - 'playlist_mincount': 47, - }, { - # explicit page - 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', - 'info_dict': { - 'id': 'p081d7j7', - 'title': 'Music in Scotland', - 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', - }, - 'playlist_mincount': 11, - }] - _PAGE_SIZE = 200 - _DESCRIPTION_KEY = 'synopses' - - def _get_episode_image(self, episode): - return self._get_default(episode, 'images', 'standard') - - def _get_episode_field(self, episode, field): - return episode.get(field) - - @staticmethod - def _get_elements(data): - return data['elements'] - - @staticmethod - def _get_episode(element): - return element - - def _call_api(self, pid, per_page, page=1, series_id=None): - return self._download_json( - 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, - pid, query={ - 'page': page, - 'per_page': per_page, - })['group_episodes'] - - @staticmethod - def _get_playlist_data(data): - return data['group'] - - def _get_playlist_title(self, data): - return data.get('title') - - -class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' - _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' - _TESTS = [{ - 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', - 'info_dict': { - 'id': 'b05rcz9v', - 'title': 'The Disappearance - Clips - BBC Four', - 'description': 'French thriller serial about a missing teenager.', - }, - 'playlist_mincount': 7, - }, { - # multipage playlist, explicit page - 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', - 'info_dict': { - 'id': 'b00mfl7n', - 'title': 'Frozen Planet - Clips - BBC One', - 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', - }, - 'playlist_mincount': 24, - }, { - # multipage playlist, all pages - 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', - 'info_dict': { - 'id': 'b00mfl7n', - 'title': 'Frozen Planet - Clips - BBC One', - 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', - }, - 'playlist_mincount': 142, - }, { - 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', - 'only_matching': True, - }] - - def _extract_title_and_description(self, webpage): - title = self._og_search_title(webpage, fatal=False) - description = self._og_search_description(webpage) - return title, description diff --git a/youtube_dl/extractor/beatport.py b/youtube_dl/extractor/beatport.py deleted file mode 100644 index e60709417..000000000 --- a/youtube_dl/extractor/beatport.py +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none - - -class BeatportIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 
'https://beatport.com/track/synesthesia-original-mix/5379371', - 'md5': 'b3c34d8639a2f6a7f734382358478887', - 'info_dict': { - 'id': '5379371', - 'display_id': 'synesthesia-original-mix', - 'ext': 'mp4', - 'title': 'Froxic - Synesthesia (Original Mix)', - }, - }, { - 'url': 'https://beatport.com/track/love-and-war-original-mix/3756896', - 'md5': 'e44c3025dfa38c6577fbaeb43da43514', - 'info_dict': { - 'id': '3756896', - 'display_id': 'love-and-war-original-mix', - 'ext': 'mp3', - 'title': 'Wolfgang Gartner - Love & War (Original Mix)', - }, - }, { - 'url': 'https://beatport.com/track/birds-original-mix/4991738', - 'md5': 'a1fd8e8046de3950fd039304c186c05f', - 'info_dict': { - 'id': '4991738', - 'display_id': 'birds-original-mix', - 'ext': 'mp4', - 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - track_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - playables = self._parse_json( - self._search_regex( - r'window\.Playables\s*=\s*({.+?});', webpage, - 'playables info', flags=re.DOTALL), - track_id) - - track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) - - title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] - if track['mix']: - title += ' (' + track['mix'] + ')' - - formats = [] - for ext, info in track['preview'].items(): - if not info['url']: - continue - fmt = { - 'url': info['url'], - 'ext': ext, - 'format_id': ext, - 'vcodec': 'none', - } - if ext == 'mp3': - fmt['preference'] = 0 - fmt['acodec'] = 'mp3' - fmt['abr'] = 96 - fmt['asr'] = 44100 - elif ext == 'mp4': - fmt['preference'] = 1 - fmt['acodec'] = 'aac' - fmt['abr'] = 96 - fmt['asr'] = 44100 - formats.append(fmt) - self._sort_formats(formats) - - images = [] - for name, info in track['images'].items(): - image_url = info.get('url') - if name == 'dynamic' or not image_url: - continue - image = { - 'id': name, - 'url': image_url, - 'height': int_or_none(info.get('height')), - 'width': int_or_none(info.get('width')), - } - images.append(image) - - return { - 'id': compat_str(track.get('id')) or track_id, - 'display_id': track.get('slug') or display_id, - 'title': title, - 'formats': formats, - 'thumbnails': images, - } diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py deleted file mode 100644 index 5788d13ba..000000000 --- a/youtube_dl/extractor/beeg.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class BeegIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)' - _TESTS = [{ - # api/v6 v1 - 'url': 'http://beeg.com/5416503', - 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', - 'info_dict': { - 'id': '5416503', - 'ext': 'mp4', - 'title': 'Sultry Striptease', - 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', - 'timestamp': 1391813355, - 'upload_date': '20140207', - 'duration': 383, - 'tags': list, - 'age_limit': 18, - } - }, { - # api/v6 v2 - 'url': 'https://beeg.com/1941093077?t=911-1391', - 'only_matching': True, - }, { - # api/v6 v2 w/o t - 'url': 'https://beeg.com/1277207756', - 'only_matching': True, - }, { - 'url': 'https://beeg.porn/video/5416503', - 'only_matching': True, - }, { - 'url': 'https://beeg.porn/5416503', - 'only_matching': True, - }] - - 
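
(The _real_extract that follows these tests resolves Beeg's API version from the video id: longer ids are served by API v2 and may carry a t=<start>-<end> clip range in the URL, which is split into s/e query parameters. A stdlib sketch of just that query construction; the sample URL is one of the test URLs above.)

from urllib.parse import parse_qs, urlparse

def build_api_query(url, video_id):
    # Long ids use API v2 and may carry a clip range; short ids fall back to v1.
    if len(video_id) >= 10:
        query = {'v': 2}
        t = parse_qs(urlparse(url).query).get('t', [''])[0].split('-')
        if len(t) > 1:
            query.update({'s': t[0], 'e': t[1]})
    else:
        query = {'v': 1}
    return query

print(build_api_query('https://beeg.com/1941093077?t=911-1391', '1941093077'))
# {'v': 2, 's': '911', 'e': '1391'}
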
def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - beeg_version = self._search_regex( - r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', - default='1546225636701') - - if len(video_id) >= 10: - query = { - 'v': 2, - } - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - t = qs.get('t', [''])[0].split('-') - if len(t) > 1: - query.update({ - 's': t[0], - 'e': t[1], - }) - else: - query = {'v': 1} - - for api_path in ('', 'api.'): - video = self._download_json( - 'https://%sbeeg.com/api/v6/%s/video/%s' - % (api_path, beeg_version, video_id), video_id, - fatal=api_path == 'api.', query=query) - if video: - break - - formats = [] - for format_id, video_url in video.items(): - if not video_url: - continue - height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) - if not height: - continue - formats.append({ - 'url': self._proto_relative_url( - video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), - 'format_id': format_id, - 'height': int(height), - }) - self._sort_formats(formats) - - title = video['title'] - video_id = compat_str(video.get('id') or video_id) - display_id = video.get('code') - description = video.get('desc') - series = video.get('ps_name') - - timestamp = unified_timestamp(video.get('date')) - duration = int_or_none(video.get('duration')) - - tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'timestamp': timestamp, - 'duration': duration, - 'tags': tags, - 'formats': formats, - 'age_limit': self._rta_search(webpage), - } diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py deleted file mode 100644 index 9bca853b3..000000000 --- a/youtube_dl/extractor/behindkink.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import url_basename - - -class BehindKinkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' - _TEST = { - 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', - 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', - 'info_dict': { - 'id': '37127', - 'ext': 'mp4', - 'title': 'What are you passionate about – Marley Blaze', - 'description': 'md5:aee8e9611b4ff70186f752975d9b94b4', - 'upload_date': '20141205', - 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg', - 'age_limit': 18, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - video_url = self._search_regex( - r'<source src="([^"]+)"', webpage, 'video URL') - video_id = url_basename(video_url).split('_')[0] - upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day') - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), - 'upload_date': upload_date, - 'age_limit': 18, - } diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py deleted file mode 100644 index 9f9de96c6..000000000 --- 
a/youtube_dl/extractor/bellmedia.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class BellMediaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?P<domain> - (?: - ctv| - tsn| - bnn(?:bloomberg)?| - thecomedynetwork| - discovery| - discoveryvelocity| - sciencechannel| - investigationdiscovery| - animalplanet| - bravo| - mtv| - space| - etalk| - marilyn - )\.ca| - (?:much|cp24)\.com - )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' - _TESTS = [{ - 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', - 'md5': '36d3ef559cfe8af8efe15922cd3ce950', - 'info_dict': { - 'id': '1403070', - 'ext': 'flv', - 'title': 'David Cockfield\'s Top Picks', - 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', - 'upload_date': '20180525', - 'timestamp': 1527288600, - }, - }, { - 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', - 'only_matching': True, - }, { - 'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549', - 'only_matching': True, - }, { - 'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654', - 'only_matching': True, - }, { - 'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009', - 'only_matching': True, - }, { - 'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016', - 'only_matching': True, - }, { - 'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6', - 'only_matching': True, - }, { - 'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430', - 'only_matching': True, - }, { - 'url': 'http://www.etalk.ca/video?videoid=663455', - 'only_matching': True, - }, { - 'url': 'https://www.cp24.com/video?clipId=1982548', - 'only_matching': True, - }] - _DOMAINS = { - 'thecomedynetwork': 'comedy', - 'discoveryvelocity': 'discvel', - 'sciencechannel': 'discsci', - 'investigationdiscovery': 'invdisc', - 'animalplanet': 'aniplan', - 'etalk': 'ctv', - 'bnnbloomberg': 'bnn', - 'marilyn': 'ctv_marilyn', - } - - def _real_extract(self, url): - domain, video_id = re.match(self._VALID_URL, url).groups() - domain = domain.split('.')[0] - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id), - 'ie_key': 'NineCNineMedia', - } diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py deleted file mode 100644 index d7ceaa85e..000000000 --- a/youtube_dl/extractor/bet.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor -from ..utils import unified_strdate - - -class BetIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' - _TESTS = [ - { - 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', - 'info_dict': { - 'id': '07e96bd3-8850-3051-b856-271b457f0ab8', - 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', - 'ext': 'flv', - 'title': 'A Conversation With President Obama', - 'description': 'President Obama urges persistence in confronting racism and bias.', - 'duration': 1534, - 'upload_date': '20141208', - 'thumbnail': r're:(?i)^https?://.*\.jpg$', - 'subtitles': { - 'en': 'mincount:2', - } - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - { - 'url': 
'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', - 'info_dict': { - 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9', - 'display_id': 'justice-for-ferguson-a-community-reacts', - 'ext': 'flv', - 'title': 'Justice for Ferguson: A Community Reacts', - 'description': 'A BET News special.', - 'duration': 1696, - 'upload_date': '20141125', - 'thumbnail': r're:(?i)^https?://.*\.jpg$', - 'subtitles': { - 'en': 'mincount:2', - } - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - } - ] - - _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" - - def _get_feed_query(self, uri): - return { - 'uuid': uri, - } - - def _extract_mgid(self, webpage): - return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - mgid = self._extract_mgid(webpage) - videos_info = self._get_videos_info(mgid) - - info_dict = videos_info['entries'][0] - - upload_date = unified_strdate(self._html_search_meta('date', webpage)) - description = self._html_search_meta('description', webpage) - - info_dict.update({ - 'display_id': display_id, - 'description': description, - 'upload_date': upload_date, - }) - - return info_dict diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py deleted file mode 100644 index bff6ea194..000000000 --- a/youtube_dl/extractor/bilibili.py +++ /dev/null @@ -1,451 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - int_or_none, - float_or_none, - parse_iso8601, - smuggle_url, - str_or_none, - strip_jsonp, - unified_timestamp, - unsmuggle_url, - urlencode_postdata, -) - - -class BiliBiliIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|bangumi)\.)? - bilibili\.(?:tv|com)/ - (?: - (?: - video/[aA][vV]| - anime/(?P<anime_id>\d+)/play\# - )(?P<id_bv>\d+)| - video/[bB][vV](?P<id>[^/?#&]+) - ) - ''' - - _TESTS = [{ - 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', - 'info_dict': { - 'id': '1074402', - 'ext': 'flv', - 'title': '【金坷垃】金泡沫', - 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, - 'timestamp': 1398012678, - 'upload_date': '20140420', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': '菊子桑', - 'uploader_id': '156160', - }, - }, { - # Tested in BiliBiliBangumiIE - 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', - 'only_matching': True, - }, { - 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', - 'md5': '3f721ad1e75030cc06faf73587cfec57', - 'info_dict': { - 'id': '100643', - 'ext': 'mp4', - 'title': 'CHAOS;CHILD', - 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', - }, - 'skip': 'Geo-restricted to China', - }, { - # Title with double quotes - 'url': 'http://www.bilibili.com/video/av8903802/', - 'info_dict': { - 'id': '8903802', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 
微博@阿滴英文', - }, - 'playlist': [{ - 'info_dict': { - 'id': '8903802_part1', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, # Test metadata only - }, - }, { - 'info_dict': { - 'id': '8903802_part2', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, # Test metadata only - }, - }] - }, { - # new BV video id format - 'url': 'https://www.bilibili.com/video/BV1JE411F741', - 'only_matching': True, - }] - - _APP_KEY = 'iVGUTjsxvpLeuDCf' - _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' - - def _report_error(self, result): - if 'message' in result: - raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) - elif 'code' in result: - raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) - else: - raise ExtractorError('Can\'t extract Bangumi episode ID') - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_bv') - anime_id = mobj.group('anime_id') - webpage = self._download_webpage(url, video_id) - - if 'anime/' not in url: - cid = self._search_regex( - r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', - default=None - ) or compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', - r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters'))['cid'][0] - else: - if 'no_bangumi_tip' not in smuggled_data: - self.to_screen('Downloading episode %s. 
To download all videos in anime %s, re-run youtube-dl with %s' % ( - video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) - headers = { - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': url - } - headers.update(self.geo_verification_headers()) - - js = self._download_json( - 'http://bangumi.bilibili.com/web_api/get_source', video_id, - data=urlencode_postdata({'episode_id': video_id}), - headers=headers) - if 'result' not in js: - self._report_error(js) - cid = js['result']['cid'] - - headers = { - 'Accept': 'application/json', - 'Referer': url - } - headers.update(self.geo_verification_headers()) - - entries = [] - - RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') - for num, rendition in enumerate(RENDITIONS, start=1): - payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) - sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - - if not video_info: - continue - - if 'durl' not in video_info: - if num < len(RENDITIONS): - continue - self._report_error(video_info) - - for idx, durl in enumerate(video_info['durl']): - formats = [{ - 'url': durl['url'], - 'filesize': int_or_none(durl['size']), - }] - for backup_url in durl.get('backup_url', []): - formats.append({ - 'url': backup_url, - # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url else -3, - }) - - for a_format in formats: - a_format.setdefault('http_headers', {}).update({ - 'Referer': url, - }) - - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (video_id, idx), - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - }) - break - - title = self._html_search_regex( - ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', - group='title') - description = self._html_search_meta('description', webpage) - timestamp = unified_timestamp(self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', - default=None) or self._html_search_meta( - 'uploadDate', webpage, 'timestamp', default=None)) - thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) - - # TODO 'view_count' requires deobfuscating Javascript - info = { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'duration': float_or_none(video_info.get('timelength'), scale=1000), - } - - uploader_mobj = re.search( - r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)', - webpage) - if uploader_mobj: - info.update({ - 'uploader': uploader_mobj.group('name').strip(), - 'uploader_id': uploader_mobj.group('id'), - }) - if not info.get('uploader'): - info['uploader'] = self._html_search_meta( - 'author', webpage, 'uploader', default=None) - - for entry in entries: - entry.update(info) - - if len(entries) == 1: - return entries[0] - else: - for idx, entry in enumerate(entries): - entry['id'] = '%s_part%d' % (video_id, (idx + 1)) - - return { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'description': description, - 'entries': entries, - } - - -class BiliBiliBangumiIE(InfoExtractor): - _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)' - - IE_NAME = 
'bangumi.bilibili.com' - IE_DESC = 'BiliBili番剧' - - _TESTS = [{ - 'url': 'http://bangumi.bilibili.com/anime/1869', - 'info_dict': { - 'id': '1869', - 'title': '混沌武士', - 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', - }, - 'playlist_count': 26, - }, { - 'url': 'http://bangumi.bilibili.com/anime/1869', - 'info_dict': { - 'id': '1869', - 'title': '混沌武士', - 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', - }, - 'playlist': [{ - 'md5': '91da8621454dd58316851c27c68b0c13', - 'info_dict': { - 'id': '40062', - 'ext': 'mp4', - 'title': '混沌武士', - 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...', - 'timestamp': 1414538739, - 'upload_date': '20141028', - 'episode': '疾风怒涛 Tempestuous Temperaments', - 'episode_number': 1, - }, - }], - 'params': { - 'playlist_items': '1', - }, - }] - - @classmethod - def suitable(cls, url): - return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url) - - def _real_extract(self, url): - bangumi_id = self._match_id(url) - - # Sometimes this API returns a JSONP response - season_info = self._download_json( - 'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id, - bangumi_id, transform_source=strip_jsonp)['result'] - - entries = [{ - '_type': 'url_transparent', - 'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}), - 'ie_key': BiliBiliIE.ie_key(), - 'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '), - 'episode': episode.get('index_title'), - 'episode_number': int_or_none(episode.get('index')), - } for episode in season_info['episodes']] - - entries = sorted(entries, key=lambda entry: entry.get('episode_number')) - - return self.playlist_result( - entries, bangumi_id, - season_info.get('bangumi_title'), season_info.get('evaluate')) - - -class BilibiliAudioBaseIE(InfoExtractor): - def _call_api(self, path, sid, query=None): - if not query: - query = {'sid': sid} - return self._download_json( - 'https://www.bilibili.com/audio/music-service-c/web/' + path, - sid, query=query)['data'] - - -class BilibiliAudioIE(BilibiliAudioBaseIE): - _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)' - _TEST = { - 'url': 'https://www.bilibili.com/audio/au1003142', - 'md5': 'fec4987014ec94ef9e666d4d158ad03b', - 'info_dict': { - 'id': '1003142', - 'ext': 'm4a', - 'title': '【tsukimi】YELLOW / 神山羊', - 'artist': 'tsukimi', - 'comment_count': int, - 'description': 'YELLOW的mp3版!', - 'duration': 183, - 'subtitles': { - 'origin': [{ - 'ext': 'lrc', - }], - }, - 'thumbnail': r're:^https?://.+\.jpg', - 'timestamp': 1564836614, - 'upload_date': '20190803', - 'uploader': 'tsukimi-つきみぐー', - 'view_count': int, - }, - } - - def _real_extract(self, url): - au_id = self._match_id(url) - - play_data = self._call_api('url', au_id) - formats = [{ - 'url': play_data['cdns'][0], - 'filesize': int_or_none(play_data.get('size')), - }] - - song = self._call_api('song/info', au_id) - title = song['title'] - statistic = song.get('statistic') or {} - - subtitles = None - lyric = song.get('lyric') - if lyric: - subtitles = { - 'origin': [{ - 'url': lyric, - }] - } - - return { - 'id': au_id, - 'title': title, - 'formats': formats, - 'artist': song.get('author'), - 'comment_count': int_or_none(statistic.get('comment')), - 'description': song.get('intro'), - 'duration': int_or_none(song.get('duration')), - 'subtitles': subtitles, - 'thumbnail': song.get('cover'), - 'timestamp': int_or_none(song.get('passtime')), - 'uploader': song.get('uname'), - 
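
(Stepping back to BiliBiliIE._real_extract above: its playurl request is authenticated by a bare app-key scheme, where the query payload is concatenated with a secret and the MD5 digest is appended as sign. A minimal sketch of that signing step; the app key, secret and rendition string are the ones quoted in the extractor above, while the cid value is a made-up placeholder.)

import hashlib

_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'

def sign_playurl_url(cid, rendition):
    # Payload first, then md5(payload + secret) appended as the sign parameter.
    payload = 'appkey=%s&cid=%s&otype=json&%s' % (_APP_KEY, cid, rendition)
    sign = hashlib.md5((payload + _BILIBILI_KEY).encode('utf-8')).hexdigest()
    return 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign)

print(sign_playurl_url('15778971', 'qn=80&quality=80&type='))
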
'view_count': int_or_none(statistic.get('play')), - } - - -class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): - _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)' - _TEST = { - 'url': 'https://www.bilibili.com/audio/am10624', - 'info_dict': { - 'id': '10624', - 'title': '每日新曲推荐(每日11:00更新)', - 'description': '每天11:00更新,为你推送最新音乐', - }, - 'playlist_count': 19, - } - - def _real_extract(self, url): - am_id = self._match_id(url) - - songs = self._call_api( - 'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data'] - - entries = [] - for song in songs: - sid = str_or_none(song.get('id')) - if not sid: - continue - entries.append(self.url_result( - 'https://www.bilibili.com/audio/au' + sid, - BilibiliAudioIE.ie_key(), sid)) - - if entries: - album_data = self._call_api('menu/info', am_id) or {} - album_title = album_data.get('title') - if album_title: - for entry in entries: - entry['album'] = album_title - return self.playlist_result( - entries, am_id, album_title, album_data.get('intro')) - - return self.playlist_result(entries, am_id) - - -class BiliBiliPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)' - _TEST = { - 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', - 'only_matching': True, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'http://www.bilibili.tv/video/av%s/' % video_id, - ie=BiliBiliIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py deleted file mode 100644 index 0c773e66e..000000000 --- a/youtube_dl/extractor/bitchute.py +++ /dev/null @@ -1,142 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor -from ..utils import ( - orderedSet, - unified_strdate, - urlencode_postdata, -) - - -class BitChuteIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', - 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', - 'info_dict': { - 'id': 'szoMrox2JEI', - 'ext': 'mp4', - 'title': 'Fuck bitches get money', - 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Victoria X Rave', - 'upload_date': '20170813', - }, - }, { - 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', - 'only_matching': True, - }, { - 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', - }) - - title = self._html_search_regex( - (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), - webpage, 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', - default=None) or self._og_search_description(webpage) - - format_urls = [] - for mobj in re.finditer( - r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - format_urls.append(mobj.group('url')) - format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) - - formats = [ - {'url': format_url} - for format_url in orderedSet(format_urls)] - - if not formats: - formats 
= self._parse_html5_media_entries( - url, webpage, video_id)[0]['formats'] - - self._check_formats(formats, video_id) - self._sort_formats(formats) - - description = self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'twitter:image:src', webpage, 'thumbnail') - uploader = self._html_search_regex( - (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', - r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), - webpage, 'uploader', fatal=False) - - upload_date = unified_strdate(self._search_regex( - r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', - webpage, 'upload date', fatal=False)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'formats': formats, - } - - -class BitChuteChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://www.bitchute.com/channel/victoriaxrave/', - 'playlist_mincount': 185, - 'info_dict': { - 'id': 'victoriaxrave', - }, - } - - _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' - - def _entries(self, channel_id): - channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id - offset = 0 - for page_num in itertools.count(1): - data = self._download_json( - '%sextend/' % channel_url, channel_id, - 'Downloading channel page %d' % page_num, - data=urlencode_postdata({ - 'csrfmiddlewaretoken': self._TOKEN, - 'name': '', - 'offset': offset, - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': channel_url, - 'X-Requested-With': 'XMLHttpRequest', - 'Cookie': 'csrftoken=%s' % self._TOKEN, - }) - if data.get('success') is False: - break - html = data.get('html') - if not html: - break - video_ids = re.findall( - r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', - html) - if not video_ids: - break - offset += len(video_ids) - for video_id in video_ids: - yield self.url_result( - 'https://www.bitchute.com/video/%s' % video_id, - ie=BitChuteIE.ie_key(), video_id=video_id) - - def _real_extract(self, url): - channel_id = self._match_id(url) - return self.playlist_result( - self._entries(channel_id), playlist_id=channel_id) diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py deleted file mode 100644 index 6017e8344..000000000 --- a/youtube_dl/extractor/bokecc.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_parse_qs -from ..utils import ExtractorError - - -class BokeCCBaseIE(InfoExtractor): - def _extract_bokecc_formats(self, webpage, video_id, format_id=None): - player_params_str = self._html_search_regex( - r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', - webpage, 'player params', group='query') - - player_params = compat_parse_qs(player_params_str) - - info_xml = self._download_xml( - 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( - player_params['siteid'][0], player_params['vid'][0]), video_id) - - formats = [{ - 'format_id': format_id, - 'url': quality.find('./copy').attrib['playurl'], - 'preference': 
int(quality.attrib['value']), - } for quality in info_xml.findall('./video/quality')] - - self._sort_formats(formats) - - return formats - - -class BokeCCIE(BokeCCBaseIE): - _IE_DESC = 'CC视频' - _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' - - _TESTS = [{ - 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', - 'info_dict': { - 'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', - 'ext': 'flv', - 'title': 'BokeCC Video', - }, - }] - - def _real_extract(self, url): - qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) - if not qs.get('vid') or not qs.get('uid'): - raise ExtractorError('Invalid URL', expected=True) - - video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) - - webpage = self._download_webpage(url, video_id) - - return { - 'id': video_id, - 'title': 'BokeCC Video', # no title provided in the webpage - 'formats': self._extract_bokecc_formats(webpage, video_id), - } diff --git a/youtube_dl/extractor/bongacams.py b/youtube_dl/extractor/bongacams.py deleted file mode 100644 index 180542fbc..000000000 --- a/youtube_dl/extractor/bongacams.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - try_get, - urlencode_postdata, -) - - -class BongaCamsIE(InfoExtractor): - _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)' - _TESTS = [{ - 'url': 'https://de.bongacams.com/azumi-8', - 'only_matching': True, - }, { - 'url': 'https://cn.bongacams.com/azumi-8', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - channel_id = mobj.group('id') - - amf = self._download_json( - 'https://%s/tools/amf.php' % host, channel_id, - data=urlencode_postdata(( - ('method', 'getRoomData'), - ('args[]', channel_id), - ('args[]', 'false'), - )), headers={'X-Requested-With': 'XMLHttpRequest'}) - - server_url = amf['localData']['videoServerUrl'] - - uploader_id = try_get( - amf, lambda x: x['performerData']['username'], compat_str) or channel_id - uploader = try_get( - amf, lambda x: x['performerData']['displayName'], compat_str) - like_count = int_or_none(try_get( - amf, lambda x: x['performerData']['loversCount'])) - - formats = self._extract_m3u8_formats( - '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), - channel_id, 'mp4', m3u8_id='hls', live=True) - self._sort_formats(formats) - - return { - 'id': channel_id, - 'title': self._live_title(uploader or uploader_id), - 'uploader': uploader, - 'uploader_id': uploader_id, - 'like_count': like_count, - 'age_limit': 18, - 'is_live': True, - 'formats': formats, - } diff --git a/youtube_dl/extractor/box.py b/youtube_dl/extractor/box.py deleted file mode 100644 index aae82d1af..000000000 --- a/youtube_dl/extractor/box.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - parse_iso8601, - # try_get, - update_url_query, -) - - -class BoxIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' - _TEST = { - 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', - 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', - 'info_dict': { - 'id': '510727257538', - 'ext': 'mp4', - 'title': 
'Garber St. Louis will be 28th MLS team +scarving.mp4', - 'uploader': 'MLS Video', - 'timestamp': 1566320259, - 'upload_date': '20190820', - 'uploader_id': '235196876', - } - } - - def _real_extract(self, url): - shared_name, file_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, file_id) - request_token = self._parse_json(self._search_regex( - r'Box\.config\s*=\s*({.+?});', webpage, - 'Box config'), file_id)['requestToken'] - access_token = self._download_json( - 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, - 'Downloading token JSON metadata', - data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ - 'Content-Type': 'application/json', - 'X-Request-Token': request_token, - 'X-Box-EndUser-API': 'sharedName=' + shared_name, - })[file_id]['read'] - shared_link = 'https://app.box.com/s/' + shared_name - f = self._download_json( - 'https://api.box.com/2.0/files/' + file_id, file_id, - 'Downloading file JSON metadata', headers={ - 'Authorization': 'Bearer ' + access_token, - 'BoxApi': 'shared_link=' + shared_link, - 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats - }, query={ - 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' - }) - title = f['name'] - - query = { - 'access_token': access_token, - 'shared_link': shared_link - } - - formats = [] - - # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): - # entry_url_template = try_get( - # entry, lambda x: x['content']['url_template']) - # if not entry_url_template: - # continue - # representation = entry.get('representation') - # if representation == 'dash': - # TODO: append query to every fragment URL - # formats.extend(self._extract_mpd_formats( - # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), - # file_id, query=query)) - - authenticated_download_url = f.get('authenticated_download_url') - if authenticated_download_url and f.get('is_download_available'): - formats.append({ - 'ext': f.get('extension') or determine_ext(title), - 'filesize': f.get('size'), - 'format_id': 'download', - 'url': update_url_query(authenticated_download_url, query), - }) - - self._sort_formats(formats) - - creator = f.get('created_by') or {} - - return { - 'id': file_id, - 'title': title, - 'formats': formats, - 'description': f.get('description') or None, - 'uploader': creator.get('name'), - 'timestamp': parse_iso8601(f.get('created_at')), - 'uploader_id': creator.get('id'), - } diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py deleted file mode 100644 index 07833532e..000000000 --- a/youtube_dl/extractor/bpb.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - js_to_json, - determine_ext, -) - - -class BpbIE(InfoExtractor): - IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' - - _TEST = { - 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', - # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 - 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', - 'info_dict': { - 'id': '297', - 'ext': 'mp4', - 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', - 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 
und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<h2 class="white">(.*?)</h2>', webpage, 'title') - video_info_dicts = re.findall( - r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) - - formats = [] - for video_info in video_info_dicts: - video_info = self._parse_json( - video_info, video_id, transform_source=js_to_json, fatal=False) - if not video_info: - continue - video_url = video_info.get('src') - if not video_url: - continue - quality = 'high' if '_high' in video_url else 'low' - formats.append({ - 'url': video_url, - 'preference': 10 if quality == 'high' else 0, - 'format_note': quality, - 'format_id': '%s-%s' % (quality, determine_ext(video_url)), - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': self._og_search_description(webpage), - } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py deleted file mode 100644 index 9bde7f2d8..000000000 --- a/youtube_dl/extractor/br.py +++ /dev/null @@ -1,311 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_duration, - parse_iso8601, - xpath_element, - xpath_text, -) - - -class BRIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk' - _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' - - _TESTS = [ - { - 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', - 'md5': '83a0477cf0b8451027eb566d88b51106', - 'info_dict': { - 'id': '48f656ef-287e-486f-be86-459122db22cc', - 'ext': 'mp4', - 'title': 'Die böse Überraschung', - 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', - 'duration': 180, - 'uploader': 'Reinhard Weber', - 'upload_date': '20150422', - }, - 'skip': '404 not found', - }, - { - 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', - 'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef', - 'info_dict': { - 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', - 'ext': 'flv', - 'title': 'Manfred Schreiber ist tot', - 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', - 'duration': 26, - }, - 'skip': '404 not found', - }, - { - 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', - 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', - 'info_dict': { - 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', - 'ext': 'aac', - 'title': 'Kurzweilig und sehr bewegend', - 'description': 'md5:0351996e3283d64adeb38ede91fac54e', - 'duration': 296, - }, - 'skip': '404 not found', - }, - { - 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', - 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', - 'info_dict': { - 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', - 'ext': 'mp4', - 'title': 'Umweltbewusster Häuslebauer', - 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', - 'duration': 116, - } - }, - { - 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', - 'md5': '23bca295f1650d698f94fc570977dae3', - 'info_dict': { - 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', - 'ext': 'mp4', - 'title': 'Folge 1 - 
Metaphysik', - 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', - 'duration': 893, - 'uploader': 'Eva Maria Steimle', - 'upload_date': '20170208', - } - }, - ] - - def _real_extract(self, url): - base_url, display_id = re.search(self._VALID_URL, url).groups() - page = self._download_webpage(url, display_id) - xml_url = self._search_regex( - r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') - xml = self._download_xml(base_url + xml_url, display_id) - - medias = [] - - for xml_media in xml.findall('video') + xml.findall('audio'): - media_id = xml_media.get('externalId') - media = { - 'id': media_id, - 'title': xpath_text(xml_media, 'title', 'title', True), - 'duration': parse_duration(xpath_text(xml_media, 'duration')), - 'formats': self._extract_formats(xpath_element( - xml_media, 'assets'), media_id), - 'thumbnails': self._extract_thumbnails(xpath_element( - xml_media, 'teaserImage/variants'), base_url), - 'description': xpath_text(xml_media, 'desc'), - 'webpage_url': xpath_text(xml_media, 'permalink'), - 'uploader': xpath_text(xml_media, 'author'), - } - broadcast_date = xpath_text(xml_media, 'broadcastDate') - if broadcast_date: - media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) - medias.append(media) - - if len(medias) > 1: - self._downloader.report_warning( - 'found multiple medias; please ' - 'report this with the video URL to http://yt-dl.org/bug') - if not medias: - raise ExtractorError('No media entries found') - return medias[0] - - def _extract_formats(self, assets, media_id): - formats = [] - for asset in assets.findall('asset'): - format_url = xpath_text(asset, ['downloadUrl', 'url']) - asset_type = asset.get('type') - if asset_type.startswith('HDS'): - formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) - elif asset_type.startswith('HLS'): - formats.extend(self._extract_m3u8_formats( - format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) - else: - format_info = { - 'ext': xpath_text(asset, 'mediaType'), - 'width': int_or_none(xpath_text(asset, 'frameWidth')), - 'height': int_or_none(xpath_text(asset, 'frameHeight')), - 'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), - 'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), - 'vcodec': xpath_text(asset, 'codecVideo'), - 'acodec': xpath_text(asset, 'codecAudio'), - 'container': xpath_text(asset, 'mediaType'), - 'filesize': int_or_none(xpath_text(asset, 'size')), - } - format_url = self._proto_relative_url(format_url) - if format_url: - http_format_info = format_info.copy() - http_format_info.update({ - 'url': format_url, - 'format_id': 'http-%s' % asset_type, - }) - formats.append(http_format_info) - server_prefix = xpath_text(asset, 'serverPrefix') - if server_prefix: - rtmp_format_info = format_info.copy() - rtmp_format_info.update({ - 'url': server_prefix, - 'play_path': xpath_text(asset, 'fileName'), - 'format_id': 'rtmp-%s' % asset_type, - }) - formats.append(rtmp_format_info) - self._sort_formats(formats) - return formats - - def _extract_thumbnails(self, variants, base_url): - thumbnails = [{ - 'url': base_url + xpath_text(variant, 'url'), - 'width': int_or_none(xpath_text(variant, 'width')), - 'height': int_or_none(xpath_text(variant, 'height')), - } for variant in variants.findall('variant') if xpath_text(variant, 'url')] - thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) - return 
thumbnails - - -class BRMediathekIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' - - _TESTS = [{ - 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', - 'md5': 'fdc3d485835966d1622587d08ba632ec', - 'info_dict': { - 'id': 'av:5a1e6a6e8fce6d001871cc8e', - 'ext': 'mp4', - 'title': 'Die Sendung vom 28.11.2017', - 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', - 'timestamp': 1511942766, - 'upload_date': '20171129', - } - }] - - def _real_extract(self, url): - clip_id = self._match_id(url) - - clip = self._download_json( - 'https://proxy-base.master.mango.express/graphql', - clip_id, data=json.dumps({ - "query": """{ - viewer { - clip(id: "%s") { - title - description - duration - createdAt - ageRestriction - videoFiles { - edges { - node { - publicLocation - fileSize - videoProfile { - width - height - bitrate - encoding - } - } - } - } - captionFiles { - edges { - node { - publicLocation - } - } - } - teaserImages { - edges { - node { - imageFiles { - edges { - node { - publicLocation - width - height - } - } - } - } - } - } - } - } -}""" % clip_id}).encode(), headers={ - 'Content-Type': 'application/json', - })['data']['viewer']['clip'] - title = clip['title'] - - formats = [] - for edge in clip.get('videoFiles', {}).get('edges', []): - node = edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - ext = determine_ext(n_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - n_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - video_profile = node.get('videoProfile', {}) - tbr = int_or_none(video_profile.get('bitrate')) - format_id = 'http' - if tbr: - format_id += '-%d' % tbr - formats.append({ - 'format_id': format_id, - 'url': n_url, - 'width': int_or_none(video_profile.get('width')), - 'height': int_or_none(video_profile.get('height')), - 'tbr': tbr, - 'filesize': int_or_none(node.get('fileSize')), - }) - self._sort_formats(formats) - - subtitles = {} - for edge in clip.get('captionFiles', {}).get('edges', []): - node = edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - subtitles.setdefault('de', []).append({ - 'url': n_url, - }) - - thumbnails = [] - for edge in clip.get('teaserImages', {}).get('edges', []): - for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): - node = image_edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - thumbnails.append({ - 'url': n_url, - 'width': int_or_none(node.get('width')), - 'height': int_or_none(node.get('height')), - }) - - return { - 'id': clip_id, - 'title': title, - 'description': clip.get('description'), - 'duration': int_or_none(clip.get('duration')), - 'timestamp': parse_iso8601(clip.get('createdAt')), - 'age_limit': int_or_none(clip.get('ageRestriction')), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - } diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py deleted file mode 100644 index bae2aedce..000000000 --- a/youtube_dl/extractor/bravotv.py +++ /dev/null @@ -1,90 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .adobepass import AdobePassIE -from ..utils import ( - smuggle_url, - update_url_query, - int_or_none, -) - - -class BravoTVIE(AdobePassIE): - _VALID_URL = 
r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', - 'info_dict': { - 'id': 'epL0pmK1kQlT', - 'ext': 'mp4', - 'title': 'The Top Chef Season 16 Winner Is...', - 'description': 'Find out who takes the title of Top Chef!', - 'uploader': 'NBCU-BRAV', - 'upload_date': '20190314', - 'timestamp': 1552591860, - } - }, { - 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, - }, { - 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', - 'only_matching': True, - }] - - def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - settings = self._parse_json(self._search_regex( - r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), - display_id) - info = {} - query = { - 'mbr': 'true', - } - account_pid, release_pid = [None] * 2 - tve = settings.get('ls_tve') - if tve: - query['manifest'] = 'm3u' - mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) - if mobj: - account_pid, tp_path = mobj.groups() - release_pid = tp_path.strip('/').split('/')[-1] - else: - account_pid = 'HNK2IC' - tp_path = release_pid = tve['release_pid'] - if tve.get('entitlement') == 'auth': - adobe_pass = settings.get('tve_adobe_auth', {}) - if site == 'bravotv': - site = 'bravo' - resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId') or site, - tve['title'], release_pid, tve.get('rating')) - query['auth'] = self._extract_mvpd_auth( - url, release_pid, - adobe_pass.get('adobePassRequestorId') or site, resource) - else: - shared_playlist = settings['ls_playlist'] - account_pid = shared_playlist['account_pid'] - metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] - tp_path = release_pid = metadata.get('release_pid') - if not release_pid: - release_pid = metadata['guid'] - tp_path = 'media/guid/2140479951/' + release_pid - info.update({ - 'title': metadata['title'], - 'description': metadata.get('description'), - 'season_number': int_or_none(metadata.get('season_num')), - 'episode_number': int_or_none(metadata.get('episode_num')), - }) - query['switch'] = 'progressive' - info.update({ - '_type': 'url_transparent', - 'id': release_pid, - 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path), - query), {'force_smil_url': True}), - 'ie_key': 'ThePlatform', - }) - return info diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py deleted file mode 100644 index 68c7cf2bb..000000000 --- a/youtube_dl/extractor/breakcom.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..utils import ( - int_or_none, - url_or_none, -) - - -class BreakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' - _TESTS = [{ - 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', - 'info_dict': { - 'id': '2468056', - 'ext': 'mp4', - 'title': 'When Girls Act Like D-Bags', - 'age_limit': 13, - }, - }, 
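# The next test entry covers a Break page whose player is just a YouTube
# embed: _real_extract() detects it with YoutubeIE._extract_url() and hands
# it off via self.url_result(), so none of the Break-specific format
# extraction below runs for that URL.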
{ - # youtube embed - 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', - 'info_dict': { - 'id': 'RrrDLdeL2HQ', - 'ext': 'mp4', - 'title': 'Whale Watching Boat Crashing Into San Diego Dock', - 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', - 'upload_date': '20160331', - 'uploader': 'Steve Holden', - 'uploader_id': 'sdholden07', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, display_id) - - youtube_url = YoutubeIE._extract_url(webpage) - if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - - content = self._parse_json( - self._search_regex( - r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, - 'content'), - display_id) - - formats = [] - for video in content: - video_url = url_or_none(video.get('url')) - if not video_url: - continue - bitrate = int_or_none(self._search_regex( - r'(\d+)_kbps', video_url, 'tbr', default=None)) - formats.append({ - 'url': video_url, - 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'tbr': bitrate, - }) - self._sort_formats(formats) - - title = self._search_regex( - (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') - - def get(key, name): - return int_or_none(self._search_regex( - r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, - default=None)) - - age_limit = get('ratings', 'age limit') - video_id = video_id or get('pid', 'video id') or display_id - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py deleted file mode 100644 index 6022076ac..000000000 --- a/youtube_dl/extractor/brightcove.py +++ /dev/null @@ -1,681 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import re -import struct - -from .adobepass import AdobePassIE -from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_HTTPError, - compat_parse_qs, - compat_urllib_parse_urlparse, - compat_urlparse, - compat_xml_parse_error, -) -from ..utils import ( - clean_html, - extract_attributes, - ExtractorError, - find_xpath_attr, - fix_xml_ampersands, - float_or_none, - int_or_none, - js_to_json, - mimetype2ext, - parse_iso8601, - smuggle_url, - str_or_none, - try_get, - unescapeHTML, - unsmuggle_url, - UnsupportedError, - update_url_query, - url_or_none, -) - - -class BrightcoveLegacyIE(InfoExtractor): - IE_NAME = 'brightcove:legacy' - _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' - - _TESTS = [ - { - # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', - 'md5': '5423e113865d26e40624dce2e4b45d95', - 'note': 'Test Brightcove downloads and detection in GenericIE', - 'info_dict': { - 'id': '2371591881001', - 'ext': 'mp4', - 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', - 'uploader': '8TV', - 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', - 'timestamp': 1368213670, - 
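# 'upload_date' below is the UTC calendar date derived from the 'timestamp'
# above (YoutubeDL fills it in from the timestamp when the extractor does
# not set it); the test harness checks both fields, so they must stay in sync.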
'upload_date': '20130510', - 'uploader_id': '1589608506001', - }, - 'skip': 'The player has been deactivated by the content owner', - }, - { - # From http://medianetwork.oracle.com/video/player/1785452137001 - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', - 'info_dict': { - 'id': '1785452137001', - 'ext': 'flv', - 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', - 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', - 'uploader': 'Oracle', - 'timestamp': 1344975024, - 'upload_date': '20120814', - 'uploader_id': '1460825906', - }, - 'skip': 'video not playable', - }, - { - # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ - 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', - 'info_dict': { - 'id': '2750934548001', - 'ext': 'mp4', - 'title': 'This Bracelet Acts as a Personal Thermostat', - 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', - # 'uploader': 'Mashable', - 'timestamp': 1382041798, - 'upload_date': '20131017', - 'uploader_id': '1130468786001', - }, - }, - { - # test that the default referer works - # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ - 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', - 'info_dict': { - 'id': '2878862109001', - 'ext': 'mp4', - 'title': 'Lost in Motion II', - 'description': 'md5:363109c02998fee92ec02211bd8000df', - 'uploader': 'National Ballet of Canada', - }, - 'skip': 'Video gone', - }, - { - # test flv videos served by akamaihd.net - # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', - # The md5 checksum changes on each download - 'info_dict': { - 'id': '3750436379001', - 'ext': 'flv', - 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', - 'uploader': 'RBTV Old (do not use)', - 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', - 'timestamp': 1409122195, - 'upload_date': '20140827', - 'uploader_id': '710858724001', - }, - 'skip': 'Video gone', - }, - { - # playlist with 'videoList' - # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', - 'info_dict': { - 'title': 'Sealife', - 'id': '3550319591001', - }, - 'playlist_mincount': 7, - 'skip': 'Unsupported URL', - }, - { - # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) - 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', - 'info_dict': { - 'id': 
'1522758701001', - 'title': 'Lesson 08', - }, - 'playlist_mincount': 10, - 'skip': 'Unsupported URL', - }, - { - # playerID inferred from bcpid - # from http://www.un.org/chinese/News/story.asp?NewsID=27724 - 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', - 'only_matching': True, # Tested in GenericIE - } - ] - - @classmethod - def _build_brightcove_url(cls, object_str): - """ - Build a Brightcove url from a xml string containing - <object class="BrightcoveExperience">{params}</object> - """ - - # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553 - object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>', - lambda m: m.group(1) + '/>', object_str) - # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608 - object_str = object_str.replace('<--', '<!--') - # remove namespace to simplify extraction - object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) - object_str = fix_xml_ampersands(object_str) - - try: - object_doc = compat_etree_fromstring(object_str.encode('utf-8')) - except compat_xml_parse_error: - return - - fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') - if fv_el is not None: - flashvars = dict( - (k, v[0]) - for k, v in compat_parse_qs(fv_el.attrib['value']).items()) - else: - flashvars = {} - - data_url = object_doc.attrib.get('data', '') - data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query) - - def find_param(name): - if name in flashvars: - return flashvars[name] - node = find_xpath_attr(object_doc, './param', 'name', name) - if node is not None: - return node.attrib['value'] - return data_url_params.get(name) - - params = {} - - playerID = find_param('playerID') or find_param('playerId') - if playerID is None: - raise ExtractorError('Cannot find player ID') - params['playerID'] = playerID - - playerKey = find_param('playerKey') - # Not all pages define this value - if playerKey is not None: - params['playerKey'] = playerKey - # These fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') - if videoPlayer is not None: - if isinstance(videoPlayer, list): - videoPlayer = videoPlayer[0] - videoPlayer = videoPlayer.strip() - # UUID is also possible for videoPlayer (e.g. - # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd - # or http://www8.hp.com/cn/zh/home.html) - if not (re.match( - r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', - videoPlayer) or videoPlayer.startswith('ref:')): - return None - params['@videoPlayer'] = videoPlayer - linkBase = find_param('linkBaseURL') - if linkBase is not None: - params['linkBaseURL'] = linkBase - return cls._make_brightcove_url(params) - - @classmethod - def _build_brightcove_url_from_js(cls, object_js): - # The layout of JS is as follows: - # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { - # // build Brightcove <object /> XML - # } - m = re.search( - r'''(?x)customBC\.createVideo\( - .*? 
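# (?x) verbose mode: '#' starts a comment inside the pattern, so these notes
# are ignored by the regex engine. Per the JS layout sketched above, the call
# is customBC.createVideo(width, height, playerID, playerKey, videoPlayer,
# VideoRandomID); the non-greedy .*? above swallows the two dimensions and
# the named groups below capture the three values needed to build the URL.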
# skipping width and height - ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID - ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters - # in length, however it's appended to itself - # in places, so truncate - ["\'](?P<videoID>\d+)["\'] # @videoPlayer - ''', object_js) - if m: - return cls._make_brightcove_url(m.groupdict()) - - @classmethod - def _make_brightcove_url(cls, params): - return update_url_query( - 'http://c.brightcove.com/services/viewer/htmlFederated', params) - - @classmethod - def _extract_brightcove_url(cls, webpage): - """Try to extract the brightcove url from the webpage, returns None - if it can't be found - """ - urls = cls._extract_brightcove_urls(webpage) - return urls[0] if urls else None - - @classmethod - def _extract_brightcove_urls(cls, webpage): - """Return a list of all Brightcove URLs from the webpage """ - - url_m = re.search( - r'''(?x) - <meta\s+ - (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+ - content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2 - ''', webpage) - if url_m: - url = unescapeHTML(url_m.group('url')) - # Some sites don't add it, we can't download with this url, for example: - # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ - if 'playerKey' in url or 'videoId' in url or 'idVideo' in url: - return [url] - - matches = re.findall( - r'''(?sx)<object - (?: - [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | - [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ - ).+?>\s*</object>''', - webpage) - if matches: - return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) - - matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) - if matches: - return list(filter(None, [ - cls._build_brightcove_url_from_js(custom_bc) - for custom_bc in matches])) - return [src for _, src in re.findall( - r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - # Change the 'videoId' and others field to '@videoPlayer' - url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url) - # Change bckey (used by bcove.me urls) to playerKey - url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) - mobj = re.match(self._VALID_URL, url) - query_str = mobj.group('query') - query = compat_urlparse.parse_qs(query_str) - - videoPlayer = query.get('@videoPlayer') - if videoPlayer: - # We set the original url as the default 'Referer' header - referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url) - video_id = videoPlayer[0] - if 'playerID' not in query: - mobj = re.search(r'/bcpid(\d+)', url) - if mobj is not None: - query['playerID'] = [mobj.group(1)] - publisher_id = query.get('publisherId') - if publisher_id and publisher_id[0].isdigit(): - publisher_id = publisher_id[0] - if not publisher_id: - player_key = query.get('playerKey') - if player_key and ',' in player_key[0]: - player_key = player_key[0] - else: - player_id = query.get('playerID') - if player_id and player_id[0].isdigit(): - headers = {} - if referer: - headers['Referer'] = referer - player_page = self._download_webpage( - 'http://link.brightcove.com/services/player/bcpid' + player_id[0], - video_id, headers=headers, fatal=False) - if player_page: - player_key = self._search_regex( - r'<param\s+name="playerKey"\s+value="([\w~,-]+)"', - player_page, 'player key', fatal=False) - if 
player_key: - enc_pub_id = player_key.split(',')[1].replace('~', '=') - publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] - if publisher_id: - brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) - if referer: - brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer}) - return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) - # TODO: figure out if it's possible to extract playlistId from playerKey - # elif 'playerKey' in query: - # player_key = query['playerKey'] - # return self._get_playlist_info(player_key[0]) - raise UnsupportedError(url) - - -class BrightcoveNewIE(AdobePassIE): - IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)' - _TESTS = [{ - 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', - 'md5': 'c8100925723840d4b0d243f7025703be', - 'info_dict': { - 'id': '4463358922001', - 'ext': 'mp4', - 'title': 'Meet the man behind Popcorn Time', - 'description': 'md5:eac376a4fe366edc70279bfb681aea16', - 'duration': 165.768, - 'timestamp': 1441391203, - 'upload_date': '20150904', - 'uploader_id': '929656772001', - 'formats': 'mincount:20', - }, - }, { - # with rtmp streams - 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', - 'info_dict': { - 'id': '4279049078001', - 'ext': 'mp4', - 'title': 'Titansgrave: Chapter 0', - 'description': 'Titansgrave: Chapter 0', - 'duration': 1242.058, - 'timestamp': 1433556729, - 'upload_date': '20150606', - 'uploader_id': '4036320279001', - 'formats': 'mincount:39', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - # playlist stream - 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', - 'info_dict': { - 'id': '5718313430001', - 'title': 'No Audio Playlist', - }, - 'playlist_count': 7, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001', - 'only_matching': True, - }, { - # ref: prefixed video id - 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', - 'only_matching': True, - }, { - # non numeric ref: prefixed video id - 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', - 'only_matching': True, - }, { - # unavailable video without message but with error_code - 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(ie, webpage): - urls = BrightcoveNewIE._extract_urls(ie, webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(ie, webpage): - # Reference: - # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag - # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript - # 4. 
http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html - # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player - - entries = [] - - # Look for iframe embeds [1] - for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url if url.startswith('http') else 'http:' + url) - - # Look for <video> tags [2] and embed_in_page embeds [3] - # [2] looks like: - for video, script_tag, account_id, player_id, embed in re.findall( - r'''(?isx) - (<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) - (?:.*? - (<script[^>]+ - src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js - ) - )? - ''', webpage): - attrs = extract_attributes(video) - - # According to examples from [4] it's unclear whether video id - # may be optional and what to do when it is - video_id = attrs.get('data-video-id') - if not video_id: - continue - - account_id = account_id or attrs.get('data-account') - if not account_id: - continue - - player_id = player_id or attrs.get('data-player') or 'default' - embed = embed or attrs.get('data-embed') or 'default' - - bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( - account_id, player_id, embed, video_id) - - # Some brightcove videos may be embedded with video tag only and - # without script tag or any mentioning of brightcove at all. Such - # embeds are considered ambiguous since they are matched based only - # on data-video-id and data-account attributes and in the wild may - # not be brightcove embeds at all. Let's check reconstructed - # brightcove URLs in case of such embeds and only process valid - # ones. By this we ensure there is indeed a brightcove embed. 
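# InfoExtractor._is_valid_url() probes the candidate URL with a plain HTTP
# request and returns False on request errors, so the check below is a cheap
# "does a player actually answer at this address?" test; pages that merely
# reuse the data-video-id/data-account attribute names without hosting a
# Brightcove player are filtered out here.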
- if not script_tag and not ie._is_valid_url( - bc_url, video_id, 'possible brightcove video'): - continue - - entries.append(bc_url) - - return entries - - def _parse_brightcove_metadata(self, json_data, video_id, headers={}): - title = json_data['name'].strip() - - num_drm_sources = 0 - formats = [] - sources = json_data.get('sources') or [] - for source in sources: - container = source.get('container') - ext = mimetype2ext(source.get('type')) - src = source.get('src') - # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if container == 'WVM' or source.get('key_systems'): - num_drm_sources += 1 - continue - elif ext == 'ism': - continue - elif ext == 'm3u8' or container == 'M2TS': - if not src: - continue - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - if not src: - continue - formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) - else: - streaming_src = source.get('streaming_src') - stream_name, app_name = source.get('stream_name'), source.get('app_name') - if not src and not streaming_src and (not stream_name or not app_name): - continue - tbr = float_or_none(source.get('avg_bitrate'), 1000) - height = int_or_none(source.get('height')) - width = int_or_none(source.get('width')) - f = { - 'tbr': tbr, - 'filesize': int_or_none(source.get('size')), - 'container': container, - 'ext': ext or container.lower(), - } - if width == 0 and height == 0: - f.update({ - 'vcodec': 'none', - }) - else: - f.update({ - 'width': width, - 'height': height, - 'vcodec': source.get('codec'), - }) - - def build_format_id(kind): - format_id = kind - if tbr: - format_id += '-%dk' % int(tbr) - if height: - format_id += '-%dp' % height - return format_id - - if src or streaming_src: - f.update({ - 'url': src or streaming_src, - 'format_id': build_format_id('http' if src else 'http-streaming'), - 'source_preference': 0 if src else -1, - }) - else: - f.update({ - 'url': app_name, - 'play_path': stream_name, - 'format_id': build_format_id('rtmp'), - }) - formats.append(f) - - if not formats: - errors = json_data.get('errors') - if errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) - if sources and num_drm_sources == len(sources): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - for f in formats: - f.setdefault('http_headers', {}).update(headers) - - subtitles = {} - for text_track in json_data.get('text_tracks', []): - if text_track.get('kind') != 'captions': - continue - text_track_url = url_or_none(text_track.get('src')) - if not text_track_url: - continue - lang = (str_or_none(text_track.get('srclang')) - or str_or_none(text_track.get('label')) or 'en').lower() - subtitles.setdefault(lang, []).append({ - 'url': text_track_url, - }) - - is_live = False - duration = float_or_none(json_data.get('duration'), 1000) - if duration is not None and duration <= 0: - is_live = True - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': clean_html(json_data.get('description')), - 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), - 'duration': duration, - 'timestamp': parse_iso8601(json_data.get('published_at')), - 'uploader_id': json_data.get('account_id'), - 'formats': formats, - 'subtitles': subtitles, - 'tags': json_data.get('tags', []), - 'is_live': 
is_live, - } - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass({ - 'countries': smuggled_data.get('geo_countries'), - 'ip_blocks': smuggled_data.get('geo_ip_blocks'), - }) - - account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups() - - policy_key_id = '%s_%s' % (account_id, player_id) - policy_key = self._downloader.cache.load('brightcove', policy_key_id) - policy_key_extracted = False - store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) - - def extract_policy_key(): - base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) - config = self._download_json( - base_url + 'config.json', video_id, fatal=False) or {} - policy_key = try_get( - config, lambda x: x['video_cloud']['policy_key']) - if not policy_key: - webpage = self._download_webpage( - base_url + 'index.min.js', video_id) - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - - if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', - webpage, 'policy key', group='pk') - - store_pk(policy_key) - return policy_key - - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = {} - referrer = smuggled_data.get('referrer') - if referrer: - headers.update({ - 'Referer': referrer, - 'Origin': re.search(r'https?://[^/]+', referrer).group(0), - }) - - for _ in range(2): - if not policy_key: - policy_key = extract_policy_key() - policy_key_extracted = True - headers['Accept'] = 'application/json;pk=%s' % policy_key - try: - json_data = self._download_json(api_url, video_id, headers=headers) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - message = json_data.get('message') or json_data['error_code'] - if json_data.get('error_subcode') == 'CLIENT_GEO': - self.raise_geo_restricted(msg=message) - elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: - policy_key = None - store_pk(None) - continue - raise ExtractorError(message, expected=True) - raise - - errors = json_data.get('errors') - if errors and errors[0].get('error_subcode') == 'TVE_AUTH': - custom_fields = json_data['custom_fields'] - tve_token = self._extract_mvpd_auth( - smuggled_data['source_url'], video_id, - custom_fields['bcadobepassrequestorid'], - custom_fields['bcadobepassresourceid']) - json_data = self._download_json( - api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key - }, query={ - 'tveToken': tve_token, - }) - - if content_type == 'playlist': - return self.playlist_result( - [self._parse_brightcove_metadata(vid, vid.get('id'), headers) - for vid in json_data.get('videos', []) if vid.get('id')], - json_data.get('id'), json_data.get('name'), - json_data.get('description')) - - return self._parse_brightcove_metadata( - json_data, video_id, headers=headers) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py deleted file mode 100644 index 0b11bf11f..000000000 --- a/youtube_dl/extractor/byutv.py +++ /dev/null @@ -1,117 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import 
InfoExtractor -from ..utils import ( - determine_ext, - merge_dicts, - parse_duration, - url_or_none, -) - - -class BYUtvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' - _TESTS = [{ - # ooyalaVOD - 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', - 'info_dict': { - 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', - 'display_id': 'studio-c-season-5-episode-5', - 'ext': 'mp4', - 'title': 'Season 5 Episode 5', - 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', - 'thumbnail': r're:^https?://.*', - 'duration': 1486.486, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - # dvr - 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', - 'info_dict': { - 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', - 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', - 'ext': 'mp4', - 'title': 'Pacific vs. BYU (4/12/19)', - 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', - 'duration': 11645, - }, - 'params': { - 'skip_download': True - }, - }, { - 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', - 'only_matching': True, - }, { - 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - video = self._download_json( - 'https://api.byutv.org/api3/catalog/getvideosforcontent', - display_id, query={ - 'contentid': video_id, - 'channel': 'byutv', - 'x-byutv-context': 'web$US', - }, headers={ - 'x-byutv-context': 'web$US', - 'x-byutv-platformkey': 'xsaaw9c7y5', - }) - - ep = video.get('ooyalaVOD') - if ep: - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ep['providerId'], - 'id': video_id, - 'display_id': display_id, - 'title': ep.get('title'), - 'description': ep.get('description'), - 'thumbnail': ep.get('imageThumbnail'), - } - - info = {} - formats = [] - for format_id, ep in video.items(): - if not isinstance(ep, dict): - continue - video_url = url_or_none(ep.get('videoUrl')) - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - }) - merge_dicts(info, { - 'title': ep.get('title'), - 'description': ep.get('description'), - 'thumbnail': ep.get('imageThumbnail'), - 'duration': parse_duration(ep.get('length')), - }) - self._sort_formats(formats) - - return merge_dicts(info, { - 'id': video_id, - 'display_id': display_id, - 'title': display_id, - 'formats': formats, - }) diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py deleted file mode 100644 index cac8fdcba..000000000 --- a/youtube_dl/extractor/c56.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import js_to_json - - -class C56IE(InfoExtractor): - _VALID_URL = 
r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' - IE_NAME = '56.com' - _TESTS = [{ - 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', - 'md5': 'e59995ac63d0457783ea05f93f12a866', - 'info_dict': { - 'id': '93440716', - 'ext': 'flv', - 'title': '网事知多少 第32期:车怒', - 'duration': 283.813, - }, - }, { - 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', - 'md5': '', - 'info_dict': { - 'id': '82247482', - 'title': '爱的诅咒之杜鹃花开', - }, - 'playlist_count': 7, - 'add_ie': ['Sohu'], - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) - text_id = mobj.group('textid') - - webpage = self._download_webpage(url, text_id) - sohu_video_info_str = self._search_regex( - r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) - if sohu_video_info_str: - sohu_video_info = self._parse_json( - sohu_video_info_str, text_id, transform_source=js_to_json) - return self.url_result(sohu_video_info['url'], 'Sohu') - - page = self._download_json( - 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') - - info = page['info'] - - formats = [ - { - 'format_id': f['type'], - 'filesize': int(f['filesize']), - 'url': f['url'] - } for f in info['rfiles'] - ] - self._sort_formats(formats) - - return { - 'id': info['vid'], - 'title': info['Subject'], - 'duration': int(info['duration']) / 1000.0, - 'formats': formats, - 'thumbnail': info.get('bimg') or info.get('img'), - } diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py deleted file mode 100644 index 1eb81b75e..000000000 --- a/youtube_dl/extractor/cammodels.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - url_or_none, -) - - -class CamModelsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.cammodels.com/cam/AutumnKnight/', - 'only_matching': True, - 'age_limit': 18 - }] - - def _real_extract(self, url): - user_id = self._match_id(url) - - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) - - formats = [] - for format_id, format_dict in manifest['formats'].items(): - if not isinstance(format_dict, dict): - continue - encodings = format_dict.get('encodings') - if not isinstance(encodings, list): - continue - vcodec = format_dict.get('videoCodec') - acodec = format_dict.get('audioCodec') - for media in encodings: - if not isinstance(media, dict): - continue - media_url = url_or_none(media.get('location')) - if not media_url: - continue - - format_id_list = [format_id] - height = int_or_none(media.get('videoHeight')) - if height is not None: - format_id_list.append('%dp' % height) - 
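# one youtube-dl format per encoding: its format_id is the manifest key
# joined with the rendition height, e.g. 'hls-720p' when the manifest key
# is 'hls' and videoHeight is 720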
f = { - 'url': media_url, - 'format_id': '-'.join(format_id_list), - 'width': int_or_none(media.get('videoWidth')), - 'height': height, - 'vbr': int_or_none(media.get('videoKbps')), - 'abr': int_or_none(media.get('audioKbps')), - 'fps': int_or_none(media.get('fps')), - 'vcodec': vcodec, - 'acodec': acodec, - } - if 'rtmp' in format_id: - f['ext'] = 'flv' - elif 'hls' in format_id: - f.update({ - 'ext': 'mp4', - # hls skips fragments, preferring rtmp - 'preference': -1, - }) - else: - continue - formats.append(f) - self._sort_formats(formats) - - return { - 'id': user_id, - 'title': self._live_title(user_id), - 'is_live': True, - 'formats': formats, - 'age_limit': 18 - } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py deleted file mode 100644 index b3be3bdcf..000000000 --- a/youtube_dl/extractor/camtube.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class CamTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', - 'info_dict': { - 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', - 'display_id': 'minafay-030618-1136-chaturbate-female', - 'ext': 'mp4', - 'title': 'minafay-030618-1136-chaturbate-female', - 'duration': 1274, - 'timestamp': 1528018608, - 'upload_date': '20180603', - 'age_limit': 18 - }, - 'params': { - 'skip_download': True, - }, - }] - - _API_BASE = 'https://api.camtube.co' - - def _real_extract(self, url): - display_id = self._match_id(url) - - token = self._download_json( - '%s/rpc/session/new' % self._API_BASE, display_id, - 'Downloading session token')['token'] - - self._set_cookie('api.camtube.co', 'session', token) - - video = self._download_json( - '%s/recordings/%s' % (self._API_BASE, display_id), display_id, - headers={'Referer': url}) - - video_id = video['uuid'] - timestamp = unified_timestamp(video.get('createdAt')) - duration = int_or_none(video.get('duration')) - view_count = int_or_none(video.get('viewCount')) - like_count = int_or_none(video.get('likeCount')) - creator = video.get('stageName') - - formats = [{ - 'url': '%s/recordings/%s/manifest.m3u8' - % (self._API_BASE, video_id), - 'format_id': 'hls', - 'ext': 'mp4', - 'protocol': 'm3u8_native', - }] - - return { - 'id': video_id, - 'display_id': display_id, - 'title': display_id, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'creator': creator, - 'formats': formats, - 'age_limit': 18 - } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py deleted file mode 100644 index 51c11cb7e..000000000 --- a/youtube_dl/extractor/canalplus.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - # ExtractorError, - # HEADRequest, - int_or_none, - qualities, - unified_strdate, -) - - -class CanalplusIE(InfoExtractor): - IE_DESC = 'mycanal.fr and piwiplus.fr' - _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' - _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' - _SITE_ID_MAP = { - 'mycanal': 'cplus', - 'piwiplus': 'teletoon', - } - - # Only works for direct mp4 URLs - 
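# (_GEO_COUNTRIES enables youtube-dl's X-Forwarded-For based geo bypass;
# per the note above it only seems to help for the progressive MP4 URLs,
# not for the HLS/HDS variants, which are checked server-side)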
_GEO_COUNTRIES = ['FR'] - - _TESTS = [{ - 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', - 'info_dict': { - 'id': '1397061', - 'display_id': 'lolywood', - 'ext': 'mp4', - 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', - 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', - 'upload_date': '20160602', - }, - }, { - # geo restricted, bypassed - 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', - 'info_dict': { - 'id': '1108190', - 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger', - 'ext': 'mp4', - 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe', - 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', - 'upload_date': '20140724', - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }] - - def _real_extract(self, url): - site, display_id, video_id = re.match(self._VALID_URL, url).groups() - - site_id = self._SITE_ID_MAP[site] - - info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) - video_data = self._download_json(info_url, video_id, 'Downloading video JSON') - - if isinstance(video_data, list): - video_data = [video for video in video_data if video.get('ID') == video_id][0] - media = video_data['MEDIA'] - infos = video_data['INFOS'] - - preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) - - # _, fmt_url = next(iter(media['VIDEOS'].items())) - # if '/geo' in fmt_url.lower(): - # response = self._request_webpage( - # HEADRequest(fmt_url), video_id, - # 'Checking if the video is georestricted') - # if '/blocage' in response.geturl(): - # raise ExtractorError( - # 'The video is not available in your country', - # expected=True) - - formats = [] - for format_id, format_url in media['VIDEOS'].items(): - if not format_url: - continue - if format_id == 'HLS': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) - elif format_id == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js - 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', - 'format_id': format_id, - 'preference': preference(format_id), - }) - self._sort_formats(formats) - - thumbnails = [{ - 'id': image_id, - 'url': image_url, - } for image_id, image_url in media.get('images', {}).items()] - - titrage = infos['TITRAGE'] - - return { - 'id': video_id, - 'display_id': display_id, - 'title': '%s - %s' % (titrage['TITRE'], - titrage['SOUS_TITRE']), - 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')), - 'thumbnails': thumbnails, - 'description': infos.get('DESCRIPTION'), - 'duration': int_or_none(infos.get('DURATION')), - 'view_count': int_or_none(infos.get('NB_VUES')), - 'like_count': int_or_none(infos.get('NB_LIKES')), - 'comment_count': int_or_none(infos.get('NB_COMMENTS')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py deleted file mode 100644 index eefbab241..000000000 --- a/youtube_dl/extractor/canvas.py +++ /dev/null @@ -1,384 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from .gigya import GigyaBaseIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - clean_html, - extract_attributes, - float_or_none, - get_element_by_class, - int_or_none, - 
merge_dicts, - str_or_none, - strip_or_none, - url_or_none, -) - - -class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '68993eda72ef62386a15ea2cf3c93107', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', - 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.04, - }, - 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], - }, { - 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'only_matching': True, - }] - _GEO_BYPASS = False - _HLS_ENTRY_PROTOCOLS_MAP = { - 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', - } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site_id, video_id = mobj.group('site_id'), mobj.group('id') - - data = None - if site_id != 'vrtvideo': - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json'}) - token = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, - }, expected_status=400) - if not data.get('title'): - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - title = data['title'] - description = data.get('description') - - formats = [] - for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) - if not format_url or not format_type: - continue - format_type = format_type.upper() - if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False)) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id=format_type, fatal=False)) - elif format_type == 'HSS': - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - self._sort_formats(formats) - - subtitles = {} - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') 
== 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } - - -class CanvasEenIE(InfoExtractor): - IE_DESC = 'canvas.be and een.be' - _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ed66976748d12350b118455979cca293', - 'info_dict': { - 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'flv', - 'title': 'De afspraak veilt voor de Warmste Week', - 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 49.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # with subtitles - 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', - 'info_dict': { - 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', - 'display_id': 'pieter-0167', - 'ext': 'mp4', - 'title': 'Pieter 0167', - 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2553.08, - 'subtitles': { - 'nl': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Pagina niet gevonden', - }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', - 'info_dict': { - 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', - 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site_id, display_id = mobj.group('site_id'), mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(self._search_regex( - r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None)) - - video_id = self._html_search_regex( - r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - } - - -class VrtNUIE(GigyaBaseIE): - IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' - _TESTS = [{ - # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', - 'info_dict': { - 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', - 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', - 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', - 'duration': 1457.04, - 'thumbnail': r're:^https?://.*\.jpg$', - 'series': 'Postbus X', - 'season': 'Seizoen 1989', - 'season_number': 1989, - 'episode': 'De 
zwarte weduwe', - 'episode_number': 1, - 'timestamp': 1595822400, - 'upload_date': '20200727', - }, - 'skip': 'This video is only available for registered users', - 'params': { - 'username': '<snip>', - 'password': '<snip>', - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'params': { - 'username': '<snip>', - 'password': '<snip>', - }, - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], - }] - _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' - _CONTEXT_ID = 'R3595707040' - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - auth_data = { - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - } - - auth_info = self._gigya_login(auth_data) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - while login_attempt <= 3: - try: - # When requesting a token, no actual token is returned, but the - # necessary cookies are set. - self._request_webpage( - 'https://token.vrt.be', - None, note='Requesting a token', errnote='Could not get a token', - headers={ - 'Content-Type': 'application/json', - 'Referer': 'https://www.vrt.be/vrtnu/', - }, - data=json.dumps({ - 'uid': auth_info['UID'], - 'uidsig': auth_info['UIDSignature'], - 'ts': auth_info['signatureTimestamp'], - 'email': auth_info['profile']['email'], - }).encode('utf-8')) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - attrs = extract_attributes(self._search_regex( - r'(<nui-media[^>]+>)', webpage, 'media element')) - video_id = attrs['videoid'] - publication_id = attrs.get('publicationid') - if publication_id: - video_id = publication_id + '$' + video_id - - page = (self._parse_json(self._search_regex( - r'digitalData\s*=\s*({.+?});', webpage, 'digial data', - default='{}'), video_id, fatal=False) or {}).get('page') or {} - - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts(info, { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'season_number': int_or_none(page.get('episode_season')), - }) - - -class DagelijkseKostIE(InfoExtractor): - IE_DESC = 'dagelijksekost.een.be' - _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'md5': '30bfffc323009a3e5f689bef6efa2365', - 'info_dict': { - 'id': 
'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'display_id': 'hachis-parmentier-met-witloof', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 283.02, - }, - 'expected_warnings': ['is not a supported codec'], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(get_element_by_class( - 'dish-metadata__title', webpage - ) or self._html_search_meta( - 'twitter:title', webpage)) - - description = clean_html(get_element_by_class( - 'dish-description', webpage) - ) or self._html_search_meta( - ('description', 'twitter:description', 'og:description'), - webpage) - - video_id = self._html_search_regex( - r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - } diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py deleted file mode 100644 index fd5ec6033..000000000 --- a/youtube_dl/extractor/cbc.py +++ /dev/null @@ -1,497 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import json -import re -from xml.sax.saxutils import escape - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) -from ..utils import ( - js_to_json, - smuggle_url, - try_get, - xpath_text, - xpath_element, - xpath_with_ns, - find_xpath_attr, - orderedSet, - parse_duration, - parse_iso8601, - parse_age_limit, - strip_or_none, - int_or_none, - ExtractorError, -) - - -class CBCIE(InfoExtractor): - IE_NAME = 'cbc.ca' - _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' - _TESTS = [{ - # with mediaId - 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', - 'md5': '97e24d09672fc4cf56256d6faa6c25bc', - 'info_dict': { - 'id': '2682904050', - 'ext': 'mp4', - 'title': 'Don Cherry – All-Stars', - 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', - 'timestamp': 1454463000, - 'upload_date': '20160203', - 'uploader': 'CBCC-NEW', - }, - 'skip': 'Geo-restricted to Canada', - }, { - # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com - 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', - 'md5': '162adfa070274b144f4fdc3c3b8207db', - 'info_dict': { - 'id': '2414435309', - 'ext': 'mp4', - 'title': '22 Minutes Update: What Not To Wear Quebec', - 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", - 'upload_date': '20131025', - 'uploader': 'CBCC-NEW', - 'timestamp': 1382717907, - }, - }, { - # with clipId, feed only available via tpfeed.cbc.ca - 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', - 'md5': '0274a90b51a9b4971fe005c63f592f12', - 'info_dict': { - 'id': '2487345465', - 'ext': 'mp4', - 'title': 'Robin Williams freestyles on 90 Minutes Live', - 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', - 'upload_date': '19780210', - 'uploader': 'CBCC-NEW', - 'timestamp': 255977160, - }, - }, { - # multiple iframes 
- 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', - 'playlist': [{ - 'md5': '377572d0b49c4ce0c9ad77470e0b96b4', - 'info_dict': { - 'id': '2680832926', - 'ext': 'mp4', - 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', - 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', - 'upload_date': '20160201', - 'timestamp': 1454342820, - 'uploader': 'CBCC-NEW', - }, - }, { - 'md5': '415a0e3f586113894174dfb31aa5bb1a', - 'info_dict': { - 'id': '2658915080', - 'ext': 'mp4', - 'title': 'Fly like an eagle!', - 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', - 'upload_date': '20150315', - 'timestamp': 1426443984, - 'uploader': 'CBCC-NEW', - }, - }], - 'skip': 'Geo-restricted to Canada', - }, { - # multiple CBC.APP.Caffeine.initInstance(...) - 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', - 'info_dict': { - 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', - 'id': 'dog-indoor-exercise-winter-1.3928238', - 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', - }, - 'playlist_mincount': 6, - }] - - @classmethod - def suitable(cls, url): - return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) - - def _extract_player_init(self, player_init, display_id): - player_info = self._parse_json(player_init, display_id, js_to_json) - media_id = player_info.get('mediaId') - if not media_id: - clip_id = player_info['clipId'] - feed = self._download_json( - 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, - clip_id, fatal=False) - if feed: - media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) - if not media_id: - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] - return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) - entries = [ - self._extract_player_init(player_init, display_id) - for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] - media_ids = [] - for media_id_re in ( - r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', - r'<div[^>]+\bid=["\']player-(\d+)', - r'guid["\']\s*:\s*["\'](\d+)'): - media_ids.extend(re.findall(media_id_re, webpage)) - entries.extend([ - self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - for media_id in orderedSet(media_ids)]) - return self.playlist_result( - entries, display_id, strip_or_none(title), - self._og_search_description(webpage)) - - -class CBCPlayerIE(InfoExtractor): - IE_NAME = 'cbc.ca:player' - _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.cbc.ca/player/play/2683190193', - 'md5': '64d25f841ddf4ddb28a235338af32e2c', - 'info_dict': { - 'id': '2683190193', - 'ext': 'mp4', - 'title': 'Gerry Runs a Sweat Shop', - 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', 
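# _extract_player_init above resolves the ThePlatform media id through a
# fallback chain: the player JSON's mediaId, then the first guid in the
# tpfeed.cbc.ca feed, then the feed.theplatform.com entry id. The chain as a
# reusable helper (resolver lambdas and the sample guid are illustrative):
def first_result(*resolvers):
    """Return the first truthy value produced by the resolvers, else None."""
    for resolve in resolvers:
        try:
            value = resolve()
        except (KeyError, IndexError, TypeError):
            continue
        if value:
            return value
    return None

assert first_result(
    lambda: None,  # e.g. player_info.get('mediaId') came back empty
    lambda: {'entries': [{'guid': '2682904050'}]}['entries'][0]['guid'],
) == '2682904050'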
- 'timestamp': 1455071400, - 'upload_date': '20160210', - 'uploader': 'CBCC-NEW', - }, - 'skip': 'Geo-restricted to Canada', - }, { - # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ - 'url': 'http://www.cbc.ca/player/play/2657631896', - 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', - 'info_dict': { - 'id': '2657631896', - 'ext': 'mp3', - 'title': 'CBC Montreal is organizing its first ever community hackathon!', - 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', - 'timestamp': 1425704400, - 'upload_date': '20150307', - 'uploader': 'CBCC-NEW', - }, - }, { - 'url': 'http://www.cbc.ca/player/play/2164402062', - 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', - 'info_dict': { - 'id': '2164402062', - 'ext': 'mp4', - 'title': 'Cancer survivor four times over', - 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', - 'timestamp': 1320410746, - 'upload_date': '20111104', - 'uploader': 'CBCC-NEW', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { - 'force_smil_url': True - }), - 'id': video_id, - } - - -class CBCWatchBaseIE(InfoExtractor): - _device_id = None - _device_token = None - _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' - _NS_MAP = { - 'media': 'http://search.yahoo.com/mrss/', - 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', - } - _GEO_COUNTRIES = ['CA'] - _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' - _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' - _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' - _NETRC_MACHINE = 'cbcwatch' - - def _signature(self, email, password): - data = json.dumps({ - 'email': email, - 'password': password, - }).encode() - headers = {'content-type': 'application/json'} - query = {'apikey': self._API_KEY} - resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) - access_token = resp['access_token'] - - # token - query = { - 'access_token': access_token, - 'apikey': self._API_KEY, - 'jwtapp': 'jwt', - } - resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) - return resp['signature'] - - def _call_api(self, path, video_id): - url = path if path.startswith('http') else self._API_BASE_URL + path - for _ in range(2): - try: - result = self._download_xml(url, video_id, headers={ - 'X-Clearleap-DeviceId': self._device_id, - 'X-Clearleap-DeviceToken': self._device_token, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - # Device token has expired, re-acquiring device token - self._register_device() - continue - raise - error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') - if error_message: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) - return result - - def _real_initialize(self): - if self._valid_device_token(): - return - device = self._downloader.cache.load( - 'cbcwatch', self._cache_device_key()) or {} - self._device_id, self._device_token = device.get('id'), device.get('token') - if self._valid_device_token(): - return - 
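# The initialization around this point is a cache-or-register pattern: reuse
# the device id/token persisted in youtube-dl's cache when both are present,
# otherwise fall through to _register_device(), which mints new credentials
# and stores them. The same shape in isolation (a sketch; the cache object is
# assumed to expose load/store as used above, and the names are illustrative):
#
#   def get_device(cache, register):
#       device = cache.load('cbcwatch', 'device') or {}
#       if device.get('id') and device.get('token'):
#           return device['id'], device['token']
#       device = register()                  # POSTs to device/register
#       cache.store('cbcwatch', 'device', device)
#       return device['id'], device['token']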
self._register_device() - - def _valid_device_token(self): - return self._device_id and self._device_token - - def _cache_device_key(self): - email, _ = self._get_login_info() - return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' - - def _register_device(self): - result = self._download_xml( - self._API_BASE_URL + 'device/register', - None, 'Acquiring device token', - data=b'<device><type>web</type></device>') - self._device_id = xpath_text(result, 'deviceId', fatal=True) - email, password = self._get_login_info() - if email and password: - signature = self._signature(email, password) - data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format( - escape(signature), escape(self._device_id)).encode() - url = self._API_BASE_URL + 'device/login' - result = self._download_xml( - url, None, data=data, - headers={'content-type': 'application/xml'}) - self._device_token = xpath_text(result, 'token', fatal=True) - else: - self._device_token = xpath_text(result, 'deviceToken', fatal=True) - self._downloader.cache.store( - 'cbcwatch', self._cache_device_key(), { - 'id': self._device_id, - 'token': self._device_token, - }) - - def _parse_rss_feed(self, rss): - channel = xpath_element(rss, 'channel', fatal=True) - - def _add_ns(path): - return xpath_with_ns(path, self._NS_MAP) - - entries = [] - for item in channel.findall('item'): - guid = xpath_text(item, 'guid', fatal=True) - title = xpath_text(item, 'title', fatal=True) - - media_group = xpath_element(item, _add_ns('media:group'), fatal=True) - content = xpath_element(media_group, _add_ns('media:content'), fatal=True) - content_url = content.attrib['url'] - - thumbnails = [] - for thumbnail in media_group.findall(_add_ns('media:thumbnail')): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail.get('profile'), - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - timestamp = None - release_date = find_xpath_attr( - item, _add_ns('media:credit'), 'role', 'releaseDate') - if release_date is not None: - timestamp = parse_iso8601(release_date.text) - - entries.append({ - '_type': 'url_transparent', - 'url': content_url, - 'id': guid, - 'title': title, - 'description': xpath_text(item, 'description'), - 'timestamp': timestamp, - 'duration': int_or_none(content.get('duration')), - 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), - 'episode': xpath_text(item, _add_ns('clearleap:episode')), - 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), - 'series': xpath_text(item, _add_ns('clearleap:series')), - 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), - 'thumbnails': thumbnails, - 'ie_key': 'CBCWatchVideo', - }) - - return self.playlist_result( - entries, xpath_text(channel, 'guid'), - xpath_text(channel, 'title'), - xpath_text(channel, 'description')) - - -class CBCWatchVideoIE(CBCWatchBaseIE): - IE_NAME = 'cbc.ca:watch:video' - _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TEST = { - # geo-restricted to Canada, bypassable - 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', - 'only_matching': True, - } - - def 
_real_extract(self, url): - video_id = self._match_id(url) - result = self._call_api(url, video_id) - - m3u8_url = xpath_text(result, 'url', fatal=True) - formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) - if len(formats) < 2: - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - for f in formats: - format_id = f.get('format_id') - if format_id.startswith('AAC'): - f['acodec'] = 'aac' - elif format_id.startswith('AC3'): - f['acodec'] = 'ac-3' - self._sort_formats(formats) - - info = { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - rss = xpath_element(result, 'rss') - if rss: - info.update(self._parse_rss_feed(rss)['entries'][0]) - del info['url'] - del info['_type'] - del info['ie_key'] - return info - - -class CBCWatchIE(CBCWatchBaseIE): - IE_NAME = 'cbc.ca:watch' - _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' - _TESTS = [{ - # geo-restricted to Canada, bypassable - 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', - 'info_dict': { - 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', - 'ext': 'mp4', - 'title': 'Customer (Dis)Service', - 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', - 'upload_date': '20160219', - 'timestamp': 1455840000, - }, - 'params': { - # m3u8 download - 'skip_download': True, - 'format': 'bestvideo', - }, - }, { - # geo-restricted to Canada, bypassable - 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', - 'info_dict': { - 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', - 'title': 'Arthur', - 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', - }, - 'playlist_mincount': 30, - }, { - 'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - rss = self._call_api('web/browse/' + video_id, video_id) - return self._parse_rss_feed(rss) - - -class CBCOlympicsIE(InfoExtractor): - IE_NAME = 'cbc.ca:olympics' - _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._hidden_inputs(webpage)['videoId'] - video_doc = self._download_xml( - 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) - title = xpath_text(video_doc, 'title', fatal=True) - is_live = xpath_text(video_doc, 'kind') == 'Live' - if is_live: - title = self._live_title(title) - - formats = [] - for video_source in video_doc.findall('videoSources/videoSource'): - uri = xpath_text(video_source, 'uri') - if not uri: - continue - tokenize = self._download_json( - 'https://olympics.cbc.ca/api/api-akamai/tokenize', - video_id, data=json.dumps({ - 'VideoSource': uri, - }).encode(), headers={ - 'Content-Type': 'application/json', - 'Referer': url, - # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js - 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie - }, fatal=False) - if not tokenize: - continue - content_url = tokenize['ContentUrl'] - video_source_format = video_source.get('format') - if video_source_format == 'IIS': - 
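# The m3u8 handling above first rewrites the stream URL so the playlist
# filename matches its parent directory (typically the master playlist with
# more renditions), and falls back to the original URL when the rewrite yields
# fewer than two formats. The rewrite itself, shown standalone:
import re

def to_master_playlist(m3u8_url):
    # '/dir/whatever.m3u8' -> '/dir/dir.m3u8'
    return re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url)

assert (to_master_playlist('https://cdn.example/abc/low.m3u8')
        == 'https://cdn.example/abc/abc.m3u8')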
formats.extend(self._extract_ism_formats( - content_url, video_id, ism_id=video_source_format, fatal=False)) - else: - formats.extend(self._extract_m3u8_formats( - content_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id=video_source_format, fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': xpath_text(video_doc, 'description'), - 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), - 'duration': parse_duration(xpath_text(video_doc, 'duration')), - 'formats': formats, - 'is_live': is_live, - } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py deleted file mode 100644 index c79e55a75..000000000 --- a/youtube_dl/extractor/cbs.py +++ /dev/null @@ -1,115 +0,0 @@ -from __future__ import unicode_literals - -from .theplatform import ThePlatformFeedIE -from ..utils import ( - ExtractorError, - int_or_none, - find_xpath_attr, - xpath_element, - xpath_text, - update_url_query, -) - - -class CBSBaseIE(ThePlatformFeedIE): - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - subtitles = {} - for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: - cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k) - if cc_e is not None: - cc_url = cc_e.get('value') - if cc_url: - subtitles.setdefault(subtitles_lang, []).append({ - 'ext': ext, - 'url': cc_url, - }) - return subtitles - - -class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' - - _TESTS = [{ - 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', - 'info_dict': { - 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', - 'ext': 'mp4', - 'title': 'Connect Chat feat. Garth Brooks', - 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. 
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - 'duration': 1495, - 'timestamp': 1385585425, - 'upload_date': '20131127', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - '_skip': 'Blocked outside the US', - }, { - 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', - 'only_matching': True, - }, { - 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', - 'only_matching': True, - }, { - 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', - 'only_matching': True, - }] - - def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): - items_data = self._download_xml( - 'http://can.cbs.com/thunder/player/videoPlayerService.php', - content_id, query={'partner': site, 'contentId': content_id}) - video_data = xpath_element(items_data, './/item') - title = xpath_text(video_data, 'videoTitle', 'title', True) - tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) - tp_release_url = 'http://link.theplatform.com/s/' + tp_path - - asset_types = [] - subtitles = {} - formats = [] - last_e = None - for item in items_data.findall('.//item'): - asset_type = xpath_text(item, 'assetType') - if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type: - continue - asset_types.append(asset_type) - query = { - 'mbr': 'true', - 'assetTypes': asset_type, - } - if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'): - query['formats'] = 'MPEG4,M3U' - elif asset_type in ('RTMP', 'WIFI', '3G'): - query['formats'] = 'MPEG4,FLV' - try: - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query(tp_release_url, query), content_id, - 'Downloading %s SMIL data' % asset_type) - except ExtractorError as e: - last_e = e - continue - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - if last_e and not formats: - raise last_e - self._sort_formats(formats) - - info = self._extract_theplatform_metadata(tp_path, content_id) - info.update({ - 'id': content_id, - 'title': title, - 'series': xpath_text(video_data, 'seriesTitle'), - 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), - 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), - 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), - 'thumbnail': xpath_text(video_data, 'previewImageURL'), - 'formats': formats, - 'subtitles': subtitles, - }) - return info - - def _real_extract(self, url): - content_id = self._match_id(url) - return self._extract_video_info(content_id) diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py deleted file mode 100644 index 6596e98a6..000000000 --- a/youtube_dl/extractor/cbsinteractive.py +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .cbs import CBSIE -from ..utils import int_or_none - - -class CBSInteractiveIE(CBSIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' - _TESTS = [{ - 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', - 'info_dict': { - 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', - 'display_id': 'hands-on-with-microsofts-windows-8-1-update', - 'ext': 'mp4', - 'title': 'Hands-on with Microsoft Windows 8.1 
Update', - 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', - 'uploader': 'Sarah Mitroff', - 'duration': 70, - 'timestamp': 1396479627, - 'upload_date': '20140402', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', - 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', - 'info_dict': { - 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', - 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', - 'ext': 'mp4', - 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', - 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', - 'uploader': 'Ashley Esqueda', - 'duration': 1482, - 'timestamp': 1433289889, - 'upload_date': '20150603', - }, - }, { - 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', - 'info_dict': { - 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', - 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', - 'ext': 'mp4', - 'title': 'Video: Keeping Android smartphones and tablets secure', - 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', - 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', - 'uploader': 'Adrian Kingsley-Hughes', - 'duration': 731, - 'timestamp': 1449129925, - 'upload_date': '20151203', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', - 'only_matching': True, - }] - - MPX_ACCOUNTS = { - 'cnet': 2198311517, - 'zdnet': 2387448114, - } - - def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - - data_json = self._html_search_regex( - r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", - webpage, 'data json') - data = self._parse_json(data_json, display_id) - vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] - - video_id = vdata['mpxRefId'] - - title = vdata['title'] - author = vdata.get('author') - if author: - uploader = '%s %s' % (author['firstName'], author['lastName']) - uploader_id = author.get('id') - else: - uploader = None - uploader_id = None - - info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) - info.update({ - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'duration': int_or_none(vdata.get('duration')), - 'uploader': uploader, - 'uploader_id': uploader_id, - }) - return info diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py deleted file mode 100644 index a891c9a55..000000000 --- a/youtube_dl/extractor/cbssports.py +++ /dev/null @@ -1,113 +0,0 @@ -from __future__ import unicode_literals - -import re - -# from .cbs import CBSBaseIE -from .common import InfoExtractor -from ..utils import ( - int_or_none, - try_get, -) - - -# class CBSSportsEmbedIE(CBSBaseIE): -class CBSSportsEmbedIE(InfoExtractor): - IE_NAME = 'cbssports:embed' - _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? 
- (?: - ids%3D(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| - pcid%3D(?P<pcid>\d+) - )''' - _TESTS = [{ - 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', - 'only_matching': True, - }, { - 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', - 'only_matching': True, - }] - - # def _extract_video_info(self, filter_query, video_id): - # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) - - def _real_extract(self, url): - uuid, pcid = re.match(self._VALID_URL, url).groups() - query = {'id': uuid} if uuid else {'pcid': pcid} - video = self._download_json( - 'https://www.cbssports.com/api/content/video/', - uuid or pcid, query=query)[0] - video_id = video['id'] - title = video['title'] - metadata = video.get('metaData') or {} - # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) - # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) - - formats = self._extract_m3u8_formats( - metadata['files'][0]['url'], video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(formats) - - image = video.get('image') - thumbnails = None - if image: - image_path = image.get('path') - if image_path: - thumbnails = [{ - 'url': image_path, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - 'filesize': int_or_none(image.get('size')), - }] - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': video.get('description'), - 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), - 'duration': int_or_none(metadata.get('duration')), - } - - -class CBSSportsBaseIE(InfoExtractor): - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - iframe_url = self._search_regex( - r'<iframe[^>]+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', - webpage, 'embed url') - return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) - - -class CBSSportsIE(CBSSportsBaseIE): - IE_NAME = 'cbssports' - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', - 'info_dict': { - 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', - 'ext': 'mp4', - 'title': 'Cover 3: Stanford Spring Gleaning', - 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', - 'timestamp': 1617218398, - 'upload_date': '20210331', - 'duration': 502, - }, - }] - - -class TwentyFourSevenSportsIE(CBSSportsBaseIE): - IE_NAME = '247sports' - _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P<id>\d+)' - _TESTS = [{ - 'url': 
'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', - 'info_dict': { - 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', - 'ext': 'mp4', - 'title': '2021 QB Jake Garcia senior highlights through five games', - 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', - 'timestamp': 1607114223, - 'upload_date': '20201204', - 'duration': 208, - }, - }] diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py deleted file mode 100644 index e6ae49352..000000000 --- a/youtube_dl/extractor/ccma.py +++ /dev/null @@ -1,155 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import calendar -import datetime -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - extract_timezone, - int_or_none, - parse_duration, - parse_resolution, - try_get, - url_or_none, -) - - -class CCMAIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', - 'md5': '7296ca43977c8ea4469e719c609b0871', - 'info_dict': { - 'id': '5630208', - 'ext': 'mp4', - 'title': 'L\'espot de La Marató de TV3', - 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1478608140, - 'upload_date': '20161108', - 'age_limit': 0, - } - }, { - 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', - 'md5': 'fa3e38f269329a278271276330261425', - 'info_dict': { - 'id': '943685', - 'ext': 'mp3', - 'title': 'El Consell de Savis analitza el derbi', - 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20170512', - 'timestamp': 1494622500, - 'vcodec': 'none', - 'categories': ['Esports'], - } - }, { - 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', - 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', - 'info_dict': { - 'id': '6031387', - 'ext': 'mp4', - 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', - 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', - 'timestamp': 1582577700, - 'upload_date': '20200224', - 'subtitles': 'mincount:4', - 'age_limit': 16, - 'series': 'Crims', - } - }] - - def _real_extract(self, url): - media_type, media_id = re.match(self._VALID_URL, url).groups() - - media = self._download_json( - 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ - 'media': media_type, - 'idint': media_id, - }) - - formats = [] - media_url = media['media']['url'] - if isinstance(media_url, list): - for format_ in media_url: - format_url = url_or_none(format_.get('file')) - if not format_url: - continue - label = format_.get('label') - f = parse_resolution(label) - f.update({ - 'url': format_url, - 'format_id': label, - }) - formats.append(f) - else: - formats.append({ - 'url': media_url, - 'vcodec': 'none' if media_type == 'audio' else None, - }) - self._sort_formats(formats) - - informacio = media['informacio'] - title = informacio['titol'] - durada = informacio.get('durada') or {} - duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - tematica = try_get(informacio, lambda x: x['tematica']['text']) - - timestamp = None - data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) - try: - timezone, data_utc = extract_timezone(data_utc) - timestamp = calendar.timegm((datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) - 
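# The timestamp logic above parses the feed's naive local datetime, subtracts
# the timedelta returned by utils.extract_timezone(), and converts the result
# with calendar.timegm. A stdlib-only sketch of the same conversion — note the
# feed's unusual %Y-%d-%m (year-day-month) field order is kept from the code
# above; the sample values are illustrative:
import calendar
import datetime

def to_epoch(data_utc, offset):
    """data_utc: naive '%Y-%d-%mT%H:%M:%S' string; offset: timedelta from UTC."""
    dt = datetime.datetime.strptime(data_utc, '%Y-%d-%mT%H:%M:%S')
    return calendar.timegm((dt - offset).timetuple())

# 2016-11-08 13:49:00 at UTC+01:00 -> 2016-11-08 12:49:00 UTC
assert to_epoch('2016-08-11T13:49:00', datetime.timedelta(hours=1)) == 1478609340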
except TypeError: - pass - - subtitles = {} - subtitols = media.get('subtitols') or [] - if isinstance(subtitols, dict): - subtitols = [subtitols] - for st in subtitols: - sub_url = st.get('url') - if sub_url: - subtitles.setdefault( - st.get('iso') or st.get('text') or 'ca', []).append({ - 'url': sub_url, - }) - - thumbnails = [] - imatges = media.get('imatges', {}) - if imatges: - thumbnail_url = imatges.get('url') - if thumbnail_url: - thumbnails = [{ - 'url': thumbnail_url, - 'width': int_or_none(imatges.get('amplada')), - 'height': int_or_none(imatges.get('alcada')), - }] - - age_limit = None - codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) - if codi_etic: - codi_etic_s = codi_etic.split('_') - if len(codi_etic_s) == 2: - if codi_etic_s[1] == 'TP': - age_limit = 0 - else: - age_limit = int_or_none(codi_etic_s[1]) - - return { - 'id': media_id, - 'title': title, - 'description': clean_html(informacio.get('descripcio')), - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'formats': formats, - 'age_limit': age_limit, - 'alt_title': informacio.get('titol_complet'), - 'episode_number': int_or_none(informacio.get('capitol')), - 'categories': [tematica] if tematica else None, - 'series': informacio.get('programa'), - } diff --git a/youtube_dl/extractor/cctv.py b/youtube_dl/extractor/cctv.py deleted file mode 100644 index c76f361c6..000000000 --- a/youtube_dl/extractor/cctv.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - float_or_none, - try_get, - unified_timestamp, -) - - -class CCTVIE(InfoExtractor): - IE_DESC = '央视网' - _VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P<id>[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)' - _TESTS = [{ - # fo.addVariable("videoCenterId","id") - 'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml', - 'md5': 'd61ec00a493e09da810bf406a078f691', - 'info_dict': { - 'id': '5ecdbeab623f4973b40ff25f18b174e8', - 'ext': 'mp4', - 'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)', - 'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95', - 'duration': 98, - 'uploader': 'songjunjie', - 'timestamp': 1455279956, - 'upload_date': '20160212', - }, - }, { - # var guid = "id" - 'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml', - 'info_dict': { - 'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae', - 'ext': 'mp4', - 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)', - 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。', - 'duration': 37, - 'uploader': 'shujun', - 'timestamp': 1454677291, - 'upload_date': '20160205', - }, - 'params': { - 'skip_download': True, - }, - }, { - # changePlayer('id') - 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml', - 'info_dict': { - 'id': '4bb9bb4db7a6471ba85fdeda5af0381e', - 'ext': 'mp4', - 'title': 'NHnews008 ANNUAL POLITICAL SEASON', - 'description': 'Four Comprehensives', - 'duration': 60, - 'uploader': 'zhangyunlei', - 'timestamp': 1425385521, - 'upload_date': '20150303', - }, - 'params': { - 'skip_download': True, - }, - }, { - # loadvideo('id') - 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml', - 'info_dict': { - 'id': 'b15f009ff45c43968b9af583fc2e04b2', - 'ext': 'mp4', - 'title': 'Путь,усыпанный космеями Серия 1', - 'description': 'Путь, усыпанный космеями', - 'duration': 2645, - 'uploader': 
'renxue', - 'timestamp': 1477479241, - 'upload_date': '20161026', - }, - 'params': { - 'skip_download': True, - }, - }, { - # var initMyAray = 'id' - 'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml', - 'info_dict': { - 'id': 'a194cfa7f18c426b823d876668325946', - 'ext': 'mp4', - 'title': '小泽征尔音乐塾 音乐梦想无国界', - 'duration': 2173, - 'timestamp': 1369248264, - 'upload_date': '20130522', - }, - 'params': { - 'skip_download': True, - }, - }, { - # var ids = ["id"] - 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', - 'info_dict': { - 'id': 'a8606119a4884588a79d81c02abecc16', - 'ext': 'mp3', - 'title': '来自维也纳的新年贺礼', - 'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7', - 'duration': 1578, - 'uploader': 'djy', - 'timestamp': 1482942419, - 'upload_date': '20161228', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml', - 'only_matching': True, - }, { - 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44', - 'only_matching': True, - }, { - 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', - 'only_matching': True, - }, { - 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', - 'only_matching': True, - }, { - 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_id = self._search_regex( - [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', - r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', - r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', - r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', - r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', - r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'], - webpage, 'video id') - - data = self._download_json( - 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id, - query={ - 'pid': video_id, - 'url': url, - 'idl': 32, - 'idlr': 32, - 'modifyed': 'false', - }) - - title = data['title'] - - formats = [] - - video = data.get('video') - if isinstance(video, dict): - for quality, chapters_key in enumerate(('lowChapters', 'chapters')): - video_url = try_get( - video, lambda x: x[chapters_key][0]['url'], compat_str) - if video_url: - formats.append({ - 'url': video_url, - 'format_id': 'http', - 'quality': quality, - 'preference': -1, - }) - - hls_url = try_get(data, lambda x: x['hls_url'], compat_str) - if hls_url: - hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - - self._sort_formats(formats) - - uploader = data.get('editer_name') - description = self._html_search_meta( - 'description', webpage, default=None) - timestamp = unified_timestamp(data.get('f_pgmtime')) - duration = float_or_none(try_get(video, lambda x: x['totalLength'])) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py deleted file mode 100644 index e1b391937..000000000 --- a/youtube_dl/extractor/cda.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import codecs -import re - -from .common import InfoExtractor -from ..compat import ( - 
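# The id extraction above hands _search_regex a list of patterns — one per
# known player-embedding style — and takes the first capture that matches.
# A stdlib-only equivalent of that first-match-wins lookup:
import re

def search_first(patterns, text):
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            return m.group(1)
    return None

page = 'var guid = "5ecdbeab623f4973b40ff25f18b174e8";'
assert search_first(
    [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)',
     r'changePlayer\s*\(\s*["\']([\da-fA-F]+)'],
    page) == '5ecdbeab623f4973b40ff25f18b174e8'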
compat_chr, - compat_ord, - compat_urllib_parse_unquote, -) -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - merge_dicts, - multipart_encode, - parse_duration, - random_birthday, - urljoin, -) - - -class CDAIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' - _BASE_URL = 'http://www.cda.pl/' - _TESTS = [{ - 'url': 'http://www.cda.pl/video/5749950c', - 'md5': '6f844bf51b15f31fae165365707ae970', - 'info_dict': { - 'id': '5749950c', - 'ext': 'mp4', - 'height': 720, - 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', - 'description': 'md5:269ccd135d550da90d1662651fcb9772', - 'thumbnail': r're:^https?://.*\.jpg$', - 'average_rating': float, - 'duration': 39, - 'age_limit': 0, - } - }, { - 'url': 'http://www.cda.pl/video/57413289', - 'md5': 'a88828770a8310fc00be6c95faf7f4d5', - 'info_dict': { - 'id': '57413289', - 'ext': 'mp4', - 'title': 'Lądowanie na lotnisku na Maderze', - 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'crash404', - 'view_count': int, - 'average_rating': float, - 'duration': 137, - 'age_limit': 0, - } - }, { - # Age-restricted - 'url': 'http://www.cda.pl/video/1273454c4', - 'info_dict': { - 'id': '1273454c4', - 'ext': 'mp4', - 'title': 'Bronson (2008) napisy HD 1080p', - 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', - 'height': 1080, - 'uploader': 'boniek61', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 5554, - 'age_limit': 18, - 'view_count': int, - 'average_rating': float, - }, - }, { - 'url': 'http://ebd.cda.pl/0x0/5749950c', - 'only_matching': True, - }] - - def _download_age_confirm_page(self, url, video_id, *args, **kwargs): - form_data = random_birthday('rok', 'miesiac', 'dzien') - form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) - data, content_type = multipart_encode(form_data) - return self._download_webpage( - urljoin(url, '/a/validatebirth'), video_id, *args, - data=data, headers={ - 'Referer': url, - 'Content-Type': content_type, - }, **kwargs) - - def _real_extract(self, url): - video_id = self._match_id(url) - self._set_cookie('cda.pl', 'cda.player', 'html5') - webpage = self._download_webpage( - self._BASE_URL + '/video/' + video_id, video_id) - - if 'Ten film jest dostępny dla użytkowników premium' in webpage: - raise ExtractorError('This video is only available for premium users.', expected=True) - - if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): - self.raise_geo_restricted() - - need_confirm_age = False - if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', - webpage, 'birthday validate form', default=None): - webpage = self._download_age_confirm_page( - url, video_id, note='Confirming age') - need_confirm_age = True - - formats = [] - - uploader = self._search_regex(r'''(?x) - <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> - (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? 
- <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> - ''', webpage, 'uploader', default=None, group='uploader') - view_count = self._search_regex( - r'Odsłony:(?:\s| )*([0-9]+)', webpage, - 'view_count', default=None) - average_rating = self._search_regex( - (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', - r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, - group='rating_value') - - info_dict = { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'uploader': uploader, - 'view_count': int_or_none(view_count), - 'average_rating': float_or_none(average_rating), - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats, - 'duration': None, - 'age_limit': 18 if need_confirm_age else 0, - } - - info = self._search_json_ld(webpage, video_id, default={}) - - # Source: https://www.cda.pl/js/player.js?t=1606154898 - def decrypt_file(a): - for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): - a = a.replace(p, '') - a = compat_urllib_parse_unquote(a) - b = [] - for c in a: - f = compat_ord(c) - b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) - a = ''.join(b) - a = a.replace('.cda.mp4', '') - for p in ('.2cda.pl', '.3cda.pl'): - a = a.replace(p, '.cda.pl') - if '/upstream' in a: - a = a.replace('/upstream', '.mp4/upstream') - return 'https://' + a - return 'https://' + a + '.mp4' - - def extract_format(page, version): - json_str = self._html_search_regex( - r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, - '%s player_json' % version, fatal=False, group='player_data') - if not json_str: - return - player_data = self._parse_json( - json_str, '%s player_data' % version, fatal=False) - if not player_data: - return - video = player_data.get('video') - if not video or 'file' not in video: - self.report_warning('Unable to extract %s version information' % version) - return - if video['file'].startswith('uggc'): - video['file'] = codecs.decode(video['file'], 'rot_13') - if video['file'].endswith('adc.mp4'): - video['file'] = video['file'].replace('adc.mp4', '.mp4') - elif not video['file'].startswith('http'): - video['file'] = decrypt_file(video['file']) - f = { - 'url': video['file'], - } - m = re.search( - r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p', - page) - if m: - f.update({ - 'format_id': m.group('format_id'), - 'height': int(m.group('height')), - }) - info_dict['formats'].append(f) - if not info_dict['duration']: - info_dict['duration'] = parse_duration(video.get('duration')) - - extract_format(webpage, 'default') - - for href, resolution in re.findall( - r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', - webpage): - if need_confirm_age: - handler = self._download_age_confirm_page - else: - handler = self._download_webpage - - webpage = handler( - urljoin(self._BASE_URL, href), video_id, - 'Downloading %s version information' % resolution, fatal=False) - if not webpage: - # Manually report warning because empty page is returned when - # invalid version is requested. 
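# The character remapping in decrypt_file above — chr(33 + (f + 14) % 94) for
# printable f in 33..126 — is arithmetically ROT47, a self-inverse rotation
# over the 94 printable ASCII code points, applied after marker stripping and
# percent-decoding (the separate 'uggc' case is plain rot13 of 'http',
# handled with codecs above). Standalone, with a round-trip check:
def rot47(text):
    return ''.join(
        chr(33 + (ord(c) + 14) % 94) if 33 <= ord(c) <= 126 else c
        for c in text)

sample = 'abc.example/video.mp4'  # illustrative, not a real obfuscated value
assert rot47(rot47(sample)) == sample  # rotation by 47 twice is the identity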
- self.report_warning('Unable to download %s version information' % resolution) - continue - - extract_format(webpage, resolution) - - self._sort_formats(formats) - - return merge_dicts(info_dict, info) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py deleted file mode 100644 index 7cb4efb74..000000000 --- a/youtube_dl/extractor/ceskatelevize.py +++ /dev/null @@ -1,289 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - float_or_none, - sanitized_Request, - unescapeHTML, - update_url_query, - urlencode_postdata, - USER_AGENTS, -) - - -class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', - 'info_dict': { - 'id': '61924494877028507', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Bonus 01 - En', - 'description': 'English Subtittles', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 81.3, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # live stream - 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', - 'info_dict': { - 'id': 402, - 'ext': 'mp4', - 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': True, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Georestricted to Czech Republic', - }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' 
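# Below, the client playlist is requested once per user agent (default, then
# Safari) — apparently to coax out the Flash/HLS stream variants — and on the
# second pass the new formats are merged index-by-index into the entries
# collected on the first. That merge pattern in isolation (sample format ids
# are illustrative):
def merge_format_passes(first_pass, second_pass):
    for entry, extra in zip(first_pass, second_pass):
        entry['formats'].extend(extra['formats'])
    return first_pass

merged = merge_format_passes(
    [{'id': 'a', 'formats': [{'format_id': 'dash-1'}]}],
    [{'id': 'a', 'formats': [{'format_id': 'hls-1'}]}])
assert [f['format_id'] for f in merged[0]['formats']] == ['dash-1', 'hls-1']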
- if '%s</p>' % NOT_AVAILABLE_STRING in webpage: - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - - type_ = None - episode_id = None - - playlist = self._parse_json( - self._search_regex( - r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', - default='{}'), playlist_id) - if playlist: - type_ = playlist.get('type') - episode_id = playlist.get('id') - - if not type_: - type_ = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', - webpage, 'type') - if not episode_id: - episode_id = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', - webpage, 'episode_id') - - data = { - 'playlist[0][type]': type_, - 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, - 'requestSource': 'iVysilani', - } - - entries = [] - - for user_agent in (None, USER_AGENTS['Safari']): - req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', - data=urlencode_postdata(data)) - - req.add_header('Content-type', 'application/x-www-form-urlencoded') - req.add_header('x-addr', '127.0.0.1') - req.add_header('X-Requested-With', 'XMLHttpRequest') - if user_agent: - req.add_header('User-Agent', user_agent) - req.add_header('Referer', url) - - playlistpage = self._download_json(req, playlist_id, fatal=False) - - if not playlistpage: - continue - - playlist_url = playlistpage['url'] - if playlist_url == 'error_region': - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - - req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) - req.add_header('Referer', url) - - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - - playlist = self._download_json(req, playlist_id, fatal=False) - if not playlist: - continue - - playlist = playlist.get('playlist') - if not isinstance(playlist, list): - continue - - playlist_len = len(playlist) - - for num, item in enumerate(playlist): - is_live = item.get('type') == 'LIVE' - formats = [] - for format_id, stream_url in item.get('streamUrls', {}).items(): - if 'drmOnly=true' in stream_url: - continue - if 'playerType=flash' in stream_url: - stream_formats = self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', 'm3u8_native', - m3u8_id='hls-%s' % format_id, fatal=False) - else: - stream_formats = self._extract_mpd_formats( - stream_url, playlist_id, - mpd_id='dash-%s' % format_id, fatal=False) - # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031 - if format_id == 'audioDescription': - for f in stream_formats: - f['source_preference'] = -10 - formats.extend(stream_formats) - - if user_agent and len(entries) == playlist_len: - entries[num]['formats'].extend(formats) - continue - - item_id = item.get('id') or item['assetId'] - title = item['title'] - - duration = float_or_none(item.get('duration')) - thumbnail = item.get('previewImageUrl') - - subtitles = {} - if item.get('type') == 'VOD': - subs = item.get('subtitles') - if subs: - subtitles = self.extract_subtitles(episode_id, subs) - - if playlist_len == 1: - final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) - else: - final_title = '%s (%s)' % (playlist_title, title) - - entries.append({ - 'id': item_id, - 'title': final_title, - 'description': playlist_description if playlist_len == 1 else None, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - 
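# In the stream loop above, audio-description renditions are kept but demoted
# with a strongly negative source_preference, so format sorting leaves them
# last instead of discarding them. The same idea standalone (matching on the
# format id here is an illustrative simplification):
def demote_formats(formats, keyword='audioDescription', penalty=-10):
    for f in formats:
        if keyword in (f.get('format_id') or ''):
            f['source_preference'] = penalty
    return formats

fmts = demote_formats([{'format_id': 'hls-main-720'},
                       {'format_id': 'hls-audioDescription-720'}])
assert fmts[1]['source_preference'] == -10 and 'source_preference' not in fmts[0]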
}) - - for e in entries: - self._sort_formats(e['formats']) - - return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) - - def _get_subtitles(self, episode_id, subs): - original_subtitles = self._download_webpage( - subs[0]['url'], episode_id, 'Downloading subtitles') - srt_subs = self._fix_subtitles(original_subtitles) - return { - 'cs': [{ - 'ext': 'srt', - 'data': srt_subs, - }] - } - - @staticmethod - def _fix_subtitles(subtitles): - """ Convert millisecond-based subtitles to SRT """ - - def _msectotimecode(msec): - """ Helper utility to convert milliseconds to timecode """ - components = [] - for divider in [1000, 60, 60, 100]: - components.append(msec % divider) - msec //= divider - return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components) - - def _fix_subtitle(subtitle): - for line in subtitle.splitlines(): - m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line) - if m: - yield m.group(1) - start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) - yield '{0} --> {1}'.format(start, stop) - else: - yield line - - return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py deleted file mode 100644 index 09cacf6d3..000000000 --- a/youtube_dl/extractor/channel9.py +++ /dev/null @@ -1,262 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - parse_iso8601, - qualities, - unescapeHTML, -) - - -class Channel9IE(InfoExtractor): - IE_DESC = 'Channel 9' - IE_NAME = 'channel9' - _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' - - _TESTS = [{ - 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - 'md5': '32083d4eaf1946db6d454313f44510ca', - 'info_dict': { - 'id': '6c413323-383a-49dc-88f9-a22800cab024', - 'ext': 'wmv', - 'title': 'Developer Kick-Off Session: Stuff We Love', - 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', - 
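The divider chain in _msectotimecode peels milliseconds, seconds, minutes and hours off in turn and formats them back to front. A self-contained copy of that logic with one worked case:

    def msec_to_timecode(msec):
        # Same logic as _msectotimecode above: successive modulo/floor
        # division by 1000 (ms), 60 (s), 60 (min) and 100 (hour cap)
        components = []
        for divider in (1000, 60, 60, 100):
            components.append(msec % divider)
            msec //= divider
        return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

    assert msec_to_timecode(5025300) == '01:23:45,300'  # 1 h 23 min 45.3 s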
'duration': 4576, - 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1377717420, - 'upload_date': '20130828', - 'session_code': 'KOS002', - 'session_room': 'Arena 1A', - 'session_speakers': 'count:5', - }, - }, { - 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', - 'info_dict': { - 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', - 'ext': 'wmv', - 'title': 'Self-service BI with Power BI - nuclear testing', - 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', - 'duration': 1540, - 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1386381991, - 'upload_date': '20131207', - 'authors': ['Mike Wilmot'], - }, - }, { - # low quality mp4 is best - 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', - 'info_dict': { - 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', - 'ext': 'mp4', - 'title': 'Ranges for the Standard Library', - 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', - 'duration': 5646, - 'thumbnail': r're:https?://.*\.jpg', - 'upload_date': '20150930', - 'timestamp': 1443640735, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', - 'info_dict': { - 'id': 'Events/DEVintersection/DEVintersection-2016', - 'title': 'DEVintersection 2016 Orlando Sessions', - }, - 'playlist_mincount': 14, - }, { - 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', - 'only_matching': True, - }, { - 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', - 'only_matching': True, - }] - - _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', - webpage) - - def _extract_list(self, video_id, rss_url=None): - if not rss_url: - rss_url = self._RSS_URL % video_id - rss = self._download_xml(rss_url, video_id, 'Downloading RSS') - entries = [self.url_result(session_url.text, 'Channel9') - for session_url in rss.findall('./channel/item/link')] - title_text = rss.find('./channel/title').text - return self.playlist_result(entries, video_id, title_text) - - def _real_extract(self, url): - content_path, rss = re.match(self._VALID_URL, url).groups() - - if rss: - return self._extract_list(content_path, url) - - webpage = self._download_webpage( - url, content_path, 'Downloading web page') - - episode_data = self._search_regex( - r"data-episode='([^']+)'", webpage, 'episode data', default=None) - if episode_data: - episode_data = self._parse_json(unescapeHTML( - episode_data), content_path) - content_id = episode_data['contentId'] - is_session = '/Sessions(' in episode_data['api'] - content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' - if is_session: - content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' - else: - content_url += 'Authors,Body&$expand=Authors' - content_data = self._download_json(content_url, content_id) - title = content_data['Title'] - - QUALITIES = ( - 'mp3', - 'wmv', 'mp4', - 'wmv-low', 'mp4-low', - 'wmv-mid', 'mp4-mid', - 'wmv-high', 'mp4-high', - ) - - quality_key = qualities(QUALITIES) - - def quality(quality_id, format_url): - return 
(len(QUALITIES) if '_Source.' in format_url - else quality_key(quality_id)) - - formats = [] - urls = set() - - SITE_QUALITIES = { - 'MP3': 'mp3', - 'MP4': 'mp4', - 'Low Quality WMV': 'wmv-low', - 'Low Quality MP4': 'mp4-low', - 'Mid Quality WMV': 'wmv-mid', - 'Mid Quality MP4': 'mp4-mid', - 'High Quality WMV': 'wmv-high', - 'High Quality MP4': 'mp4-high', - } - - formats_select = self._search_regex( - r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, - 'formats select', default=None) - if formats_select: - for mobj in re.finditer( - r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', - formats_select): - format_url = mobj.group('url') - if format_url in urls: - continue - urls.add(format_url) - format_id = mobj.group('format') - quality_id = SITE_QUALITIES.get(format_id, format_id) - formats.append({ - 'url': format_url, - 'format_id': quality_id, - 'quality': quality(quality_id, format_url), - 'vcodec': 'none' if quality_id == 'mp3' else None, - }) - - API_QUALITIES = { - 'VideoMP4Low': 'mp4-low', - 'VideoWMV': 'wmv-mid', - 'VideoMP4Medium': 'mp4-mid', - 'VideoMP4High': 'mp4-high', - 'VideoWMVHQ': 'wmv-hq', - } - - for format_id, q in API_QUALITIES.items(): - q_url = content_data.get(format_id) - if not q_url or q_url in urls: - continue - urls.add(q_url) - formats.append({ - 'url': q_url, - 'format_id': q, - 'quality': quality(q, q_url), - }) - - self._sort_formats(formats) - - slides = content_data.get('Slides') - zip_file = content_data.get('ZipFile') - - if not formats and not slides and not zip_file: - raise ExtractorError( - 'None of recording, slides or zip are available for %s' % content_path) - - subtitles = {} - for caption in content_data.get('Captions', []): - caption_url = caption.get('Url') - if not caption_url: - continue - subtitles.setdefault(caption.get('Language', 'en'), []).append({ - 'url': caption_url, - 'ext': 'vtt', - }) - - common = { - 'id': content_id, - 'title': title, - 'description': clean_html(content_data.get('Description') or content_data.get('Body')), - 'thumbnail': content_data.get('VideoPlayerPreviewImage'), - 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), - 'timestamp': parse_iso8601(content_data.get('PublishedDate')), - 'avg_rating': int_or_none(content_data.get('Rating')), - 'rating_count': int_or_none(content_data.get('RatingCount')), - 'view_count': int_or_none(content_data.get('Views')), - 'comment_count': int_or_none(content_data.get('CommentCount')), - 'subtitles': subtitles, - } - if is_session: - speakers = [] - for s in content_data.get('Speakers', []): - speaker_name = s.get('FullName') - if not speaker_name: - continue - speakers.append(speaker_name) - - common.update({ - 'session_code': content_data.get('Code'), - 'session_room': content_data.get('Room'), - 'session_speakers': speakers, - }) - else: - authors = [] - for a in content_data.get('Authors', []): - author_name = a.get('DisplayName') - if not author_name: - continue - authors.append(author_name) - common['authors'] = authors - - contents = [] - - if slides: - d = common.copy() - d.update({'title': title + '-Slides', 'url': slides}) - contents.append(d) - - if zip_file: - d = common.copy() - d.update({'title': title + '-Zip', 'url': zip_file}) - contents.append(d) - - if formats: - d = common.copy() - d.update({'title': title, 'formats': formats}) - contents.append(d) - return self.playlist_result(contents) - else: - return self._extract_list(content_path) diff --git a/youtube_dl/extractor/chilloutzone.py 
b/youtube_dl/extractor/chilloutzone.py deleted file mode 100644 index 5aac21299..000000000 --- a/youtube_dl/extractor/chilloutzone.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_b64decode -from ..utils import ( - clean_html, - ExtractorError -) - - -class ChilloutzoneIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html' - _TESTS = [{ - 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', - 'md5': 'a76f3457e813ea0037e5244f509e66d1', - 'info_dict': { - 'id': 'enemene-meck-alle-katzen-weg', - 'ext': 'mp4', - 'title': 'Enemene Meck - Alle Katzen weg', - 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', - }, - }, { - 'note': 'Video hosted at YouTube', - 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', - 'info_dict': { - 'id': '1YVQaAgHyRU', - 'ext': 'mp4', - 'title': '16 Photos Taken 1 Second Before Disaster', - 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', - 'uploader': 'BuzzFeedVideo', - 'uploader_id': 'BuzzFeedVideo', - 'upload_date': '20131105', - }, - }, { - 'note': 'Video hosted at Vimeo', - 'url': 'http://www.chilloutzone.net/video/icon-blending.html', - 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', - 'info_dict': { - 'id': '85523671', - 'ext': 'mp4', - 'title': 'The Sunday Times - Icons', - 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', - 'uploader': 'Us', - 'uploader_id': 'usfilms', - 'upload_date': '20140131' - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - base64_video_info = self._html_search_regex( - r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') - video_info_dict = json.loads(decoded_video_info) - - # get video information from dict - video_url = video_info_dict['mediaUrl'] - description = clean_html(video_info_dict.get('description')) - title = video_info_dict['title'] - native_platform = video_info_dict['nativePlatform'] - native_video_id = video_info_dict['nativeVideoId'] - source_priority = video_info_dict['sourcePriority'] - - # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) - if native_platform is None: - youtube_url = YoutubeIE._extract_url(webpage) - if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - - # Non Fallback: Decide to use native source (e.g. 
youtube or vimeo) or - # the own CDN - if source_priority == 'native': - if native_platform == 'youtube': - return self.url_result(native_video_id, ie='Youtube') - if native_platform == 'vimeo': - return self.url_result( - 'http://vimeo.com/' + native_video_id, ie='Vimeo') - - if not video_url: - raise ExtractorError('No video found') - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': description, - } diff --git a/youtube_dl/extractor/cinemax.py b/youtube_dl/extractor/cinemax.py deleted file mode 100644 index 7f89d33de..000000000 --- a/youtube_dl/extractor/cinemax.py +++ /dev/null @@ -1,29 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .hbo import HBOBaseIE - - -class CinemaxIE(HBOBaseIE): - _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P<path>[^/]+/video/[0-9a-z-]+-(?P<id>\d+))' - _TESTS = [{ - 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', - 'md5': '82e0734bba8aa7ef526c9dd00cf35a05', - 'info_dict': { - 'id': '20126903', - 'ext': 'mp4', - 'title': 'S1 Ep 1: Recap', - }, - 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], - }, { - 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed', - 'only_matching': True, - }] - - def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id) - info['id'] = video_id - return info diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py deleted file mode 100644 index da404e4dc..000000000 --- a/youtube_dl/extractor/ciscolive.py +++ /dev/null @@ -1,151 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - clean_html, - float_or_none, - int_or_none, - try_get, - urlencode_postdata, -) - - -class CiscoLiveBaseIE(InfoExtractor): - # These appear to be constant across all Cisco Live presentations - # and are not tied to any user session or event - RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' - RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' - RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' - - HEADERS = { - 'Origin': 'https://ciscolive.cisco.com', - 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, - 'rfWidgetId': RAINFOCUS_WIDGET_ID, - } - - def _call_api(self, ep, rf_id, query, referrer, note=None): - headers = self.HEADERS.copy() - headers['Referer'] = referrer - return self._download_json( - self.RAINFOCUS_API_URL % ep, rf_id, note=note, - data=urlencode_postdata(query), headers=headers) - - def _parse_rf_item(self, rf_item): - event_name = rf_item.get('eventName') - title = rf_item['title'] - description = clean_html(rf_item.get('abstract')) - presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) - bc_id = rf_item['videos'][0]['url'] - bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id - duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) - location = try_get(rf_item, lambda x: x['times'][0]['room']) - - if duration: - duration = duration * 60 - - return { - '_type': 'url_transparent', - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - 'title': title, - 'description': description, - 'duration': duration, - 
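_parse_rf_item returns a url_transparent result: extraction is handed off to the Brightcove extractor, while the RainFocus metadata gathered here overrides whatever Brightcove reports. A reduced sketch of that shape (helper name illustrative; the API reports session length in minutes, hence the multiplication by 60 above):

    BRIGHTCOVE_URL = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s'

    def brightcove_entry(bc_id, title, minutes=None):
        return {
            '_type': 'url_transparent',   # defer extraction, keep this metadata
            'ie_key': 'BrightcoveNew',
            'url': BRIGHTCOVE_URL % bc_id,
            'title': title,
            'duration': minutes * 60 if minutes else None,
        }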
'creator': presenter_name, - 'location': location, - 'series': event_name, - } - - -class CiscoLiveSessionIE(CiscoLiveBaseIE): - _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P<id>[^/?&]+)' - _TESTS = [{ - 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', - 'md5': 'c98acf395ed9c9f766941c70f5352e22', - 'info_dict': { - 'id': '5803694304001', - 'ext': 'mp4', - 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', - 'description': 'md5:ec4a436019e09a918dec17714803f7cc', - 'timestamp': 1530305395, - 'upload_date': '20180629', - 'uploader_id': '5647924234001', - 'location': '16B Mezz.', - }, - }, { - 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU', - 'only_matching': True, - }, { - 'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS', - 'only_matching': True, - }] - - def _real_extract(self, url): - rf_id = self._match_id(url) - rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) - return self._parse_rf_item(rf_result['items'][0]) - - -class CiscoLiveSearchIE(CiscoLiveBaseIE): - _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)' - _TESTS = [{ - 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', - 'info_dict': { - 'title': 'Search query', - }, - 'playlist_count': 5, - }, { - 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', - 'only_matching': True, - }, { - 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) - - @staticmethod - def _check_bc_id_exists(rf_item): - return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None - - def _entries(self, query, url): - query['size'] = 50 - query['from'] = 0 - for page_num in itertools.count(1): - results = self._call_api( - 'search', None, query, url, - 'Downloading search JSON page %d' % page_num) - sl = try_get(results, lambda x: x['sectionList'][0], dict) - if sl: - results = sl - items = results.get('items') - if not items or not isinstance(items, list): - break - for item in items: - if not isinstance(item, dict): - continue - if not self._check_bc_id_exists(item): - continue - yield self._parse_rf_item(item) - size = int_or_none(results.get('size')) - if size is not None: - query['size'] = size - total = int_or_none(results.get('total')) - if total is not None and query['from'] + query['size'] > total: - break - query['from'] += query['size'] - - def _real_extract(self, url): - query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - query['type'] = 'session' - return self.playlist_result( - self._entries(query, url), playlist_title='Search query') diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py deleted file mode 100644 index 505bdbe16..000000000 --- a/youtube_dl/extractor/cjsw.py 
+++ /dev/null @@ -1,72 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - unescapeHTML, -) - - -class CJSWIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', - 'md5': 'cee14d40f1e9433632c56e3d14977120', - 'info_dict': { - 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', - 'ext': 'mp3', - 'title': 'Freshly Squeezed – Episode June 20, 2017', - 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', - 'series': 'Freshly Squeezed', - 'episode_id': '20170620', - }, - }, { - # no description - 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - program, episode_id = mobj.group('program', 'id') - audio_id = '%s/%s' % (program, episode_id) - - webpage = self._download_webpage(url, episode_id) - - title = unescapeHTML(self._search_regex( - (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', - r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), - webpage, 'title', group='title')) - - audio_url = self._search_regex( - r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'audio url', group='url') - - audio_id = self._search_regex( - r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', - audio_url, 'audio id', default=audio_id) - - formats = [{ - 'url': audio_url, - 'ext': determine_ext(audio_url, 'mp3'), - 'vcodec': 'none', - }] - - description = self._html_search_regex( - r'<p>(?P<description>.+?)</p>', webpage, 'description', - default=None) - series = self._search_regex( - r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, - 'series', default=program, group='name') - - return { - 'id': audio_id, - 'title': title, - 'description': description, - 'formats': formats, - 'series': series, - 'episode_id': episode_id, - } diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py deleted file mode 100644 index 06d04de13..000000000 --- a/youtube_dl/extractor/clyp.py +++ /dev/null @@ -1,82 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - float_or_none, - unified_timestamp, -) - - -class ClypIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' - _TESTS = [{ - 'url': 'https://clyp.it/ojz2wfah', - 'md5': '1d4961036c41247ecfdcc439c0cddcbb', - 'info_dict': { - 'id': 'ojz2wfah', - 'ext': 'mp3', - 'title': 'Krisson80 - bits wip wip', - 'description': '#Krisson80BitsWipWip #chiptune\n#wip', - 'duration': 263.21, - 'timestamp': 1443515251, - 'upload_date': '20150929', - }, - }, { - 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', - 'info_dict': { - 'id': 'b04p1odi', - 'ext': 'mp3', - 'title': 'GJ! 
(Reward Edit)', - 'description': 'Metal Resistance (THE ONE edition)', - 'duration': 177.789, - 'timestamp': 1528241278, - 'upload_date': '20180605', - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - audio_id = self._match_id(url) - - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - token = qs.get('token', [None])[0] - - query = {} - if token: - query['token'] = token - - metadata = self._download_json( - 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) - - formats = [] - for secure in ('', 'Secure'): - for ext in ('Ogg', 'Mp3'): - format_id = '%s%s' % (secure, ext) - format_url = metadata.get('%sUrl' % format_id) - if format_url: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'vcodec': 'none', - }) - self._sort_formats(formats) - - title = metadata['Title'] - description = metadata.get('Description') - duration = float_or_none(metadata.get('Duration')) - timestamp = unified_timestamp(metadata.get('DateCreated')) - - return { - 'id': audio_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py deleted file mode 100644 index e701fbeab..000000000 --- a/youtube_dl/extractor/cmt.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .mtv import MTVIE - - -class CMTIE(MTVIE): - IE_NAME = 'cmt.com' - _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' - - _TESTS = [{ - 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', - 'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2', - 'info_dict': { - 'id': '989124', - 'ext': 'mp4', - 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', - 'description': 'Blame It All On My Roots', - }, - 'skip': 'Video not available', - }, { - 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', - 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', - 'info_dict': { - 'id': '1504699', - 'ext': 'mp4', - 'title': 'Still The King Ep. 
109 in 3 Minutes', - 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.', - 'timestamp': 1469421000.0, - 'upload_date': '20160725', - }, - }, { - 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', - 'only_matching': True, - }, { - 'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501', - 'only_matching': True, - }, { - 'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes', - 'only_matching': True, - }] - - def _extract_mgid(self, webpage): - mgid = self._search_regex( - r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', - webpage, 'mgid', group='mgid', default=None) - if not mgid: - mgid = self._extract_triforce_mgid(webpage) - return mgid - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) - return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py deleted file mode 100644 index 7b9f4536a..000000000 --- a/youtube_dl/extractor/cnbc.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import smuggle_url - - -class CNBCIE(InfoExtractor): - _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://video.cnbc.com/gallery/?video=3000503714', - 'info_dict': { - 'id': '3000503714', - 'ext': 'mp4', - 'title': 'Fighting zombies is big business', - 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', - 'timestamp': 1459332000, - 'upload_date': '20160330', - 'uploader': 'NBCU-CNBC', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, - {'force_smil_url': True}), - 'id': video_id, - } - - -class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' - _TEST = { - 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', - 'info_dict': { - 'id': '7000031301', - 'ext': 'mp4', - 'title': "Trump: I don't necessarily agree with raising rates", - 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', - 'timestamp': 1531958400, - 'upload_date': '20180719', - 'uploader': 'NBCU-CNBC', - }, - 'params': { - 'skip_download': True, - }, - } - - def _real_extract(self, url): - path, display_id = re.match(self._VALID_URL, url).groups() - video_id = self._download_json( - 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ - 'query': '''{ - page(path: "%s") { - vcpsId - } -}''' % path, - })['data']['page']['vcpsId'] - return self.url_result( - 'http://video.cnbc.com/gallery/?video=%d' % video_id, - CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py deleted file mode 100644 index 2d950fa05..000000000 --- a/youtube_dl/extractor/cnn.py +++ /dev/null @@ -1,147 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .turner import TurnerBaseIE -from ..utils import url_basename - - -class CNNIE(TurnerBaseIE): - _VALID_URL = 
r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' - - _TESTS = [{ - 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', - 'md5': '3e6121ea48df7e2259fe73a0628605c4', - 'info_dict': { - 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', - 'ext': 'mp4', - 'title': 'Nadal wins 8th French Open title', - 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', - 'duration': 135, - 'upload_date': '20130609', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', - 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', - 'info_dict': { - 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', - 'ext': 'mp4', - 'title': "Student's epic speech stuns new freshmen", - 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", - 'upload_date': '20130821', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', - 'md5': 'f14d02ebd264df951feb2400e2c25a1b', - 'info_dict': { - 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', - 'ext': 'mp4', - 'title': 'Nashville Ep. 1: Hand crafted skateboards', - 'description': 'md5:e7223a503315c9f150acac52e76de086', - 'upload_date': '20141222', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', - 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', - 'info_dict': { - 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', - 'ext': 'mp4', - 'title': '5 stunning stats about Netflix', - 'description': 'Did you know that Netflix has more than 80 million members? 
Here are five facts about the online video distributor that you probably didn\'t know.', - 'upload_date': '20160819', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', - 'only_matching': True, - }, { - 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', - 'only_matching': True, - }, { - 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', - 'only_matching': True, - }] - - _CONFIG = { - # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml - 'edition': { - 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', - 'media_src': 'http://pmd.cdn.turner.com/cnn/big', - }, - # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml - 'money': { - 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', - 'media_src': 'http://ht3.cdn.turner.com/money/big', - }, - } - - def _extract_timestamp(self, video_data): - # TODO: fix timestamp extraction - return None - - def _real_extract(self, url): - sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() - if sub_domain not in ('money', 'edition'): - sub_domain = 'edition' - config = self._CONFIG[sub_domain] - return self._extract_cvp_info( - config['data_src'] % path, page_title, { - 'default': { - 'media_src': config['media_src'], - }, - 'f4m': { - 'host': 'cnn-vh.akamaihd.net', - }, - }) - - -class CNNBlogsIE(InfoExtractor): - _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' - _TEST = { - 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', - 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', - 'info_dict': { - 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', - 'ext': 'mp4', - 'title': 'Criminalizing journalism?', - 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', - 'upload_date': '20140209', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } - - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return self.url_result(cnn_url, CNNIE.ie_key()) - - -class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' - _TEST = { - 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', - 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', - 'info_dict': { - 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', - 'ext': 'mp4', - 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', - 'upload_date': '20141221', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } - - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py deleted file mode 100644 index 1bfa912be..000000000 --- a/youtube_dl/extractor/comedycentral.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor - - -class 
ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', - 'md5': 'b8acb347177c680ff18a292aa2166f80', - 'info_dict': { - 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', - 'ext': 'mp4', - 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', - 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', - 'timestamp': 1598670000, - 'upload_date': '20200829', - }, - }, { - 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', - 'only_matching': True, - }, { - 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', - 'only_matching': True, - }] - - -class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' - _TESTS = [{ - 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', - 'info_dict': { - 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', - 'ext': 'mp4', - 'title': 'Josh Investigates', - 'description': 'Steht uns das Ende der Welt bevor?', - }, - }] - _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - _GEO_COUNTRIES = ['DE'] - - def _get_feed_query(self, uri): - return { - 'accountOverride': 'intl.mtvi.com', - 'arcEp': 'web.cc.tv', - 'ep': 'b9032c3a', - 'imageEp': 'web.cc.tv', - 'mgid': uri, - } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py deleted file mode 100644 index 797c35fd5..000000000 --- a/youtube_dl/extractor/common.py +++ /dev/null @@ -1,3064 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import datetime -import hashlib -import json -import netrc -import os -import random -import re -import socket -import ssl -import sys -import time -import math - -from ..compat import ( - compat_cookiejar_Cookie, - compat_cookies_SimpleCookie, - compat_etree_Element, - compat_etree_fromstring, - compat_getpass, - compat_integer_types, - compat_http_client, - compat_os_name, - compat_str, - compat_urllib_error, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, - compat_xml_parse_error, -) -from ..downloader.f4m import ( - get_base_url, - remove_encrypted_media, -) -from ..utils import ( - NO_DEFAULT, - age_restricted, - base_url, - bug_reports_message, - clean_html, - compiled_regex_type, - determine_ext, - determine_protocol, - dict_get, - error_to_compat_str, - ExtractorError, - extract_attributes, - fix_xml_ampersands, - float_or_none, - GeoRestrictedError, - GeoUtils, - int_or_none, - js_to_json, - JSON_LD_RE, - mimetype2ext, - orderedSet, - parse_bitrate, - parse_codecs, - parse_duration, - parse_iso8601, - parse_m3u8_attributes, - parse_resolution, - RegexNotFoundError, - sanitized_Request, - sanitize_filename, - str_or_none, - str_to_int, - strip_or_none, - unescapeHTML, - unified_strdate, - unified_timestamp, - update_Request, - update_url_query, - urljoin, - url_basename, - url_or_none, - xpath_element, - xpath_text, - xpath_with_ns, -) - - -class InfoExtractor(object): - """Information Extractor class. 
- - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the YoutubeDL. The YoutubeDL processes this - information possibly downloading the video to the file system, among - other possible outcomes. - - The type field determines the type of the result. - By far the most common value (and the default if _type is missing) is - "video", which indicates a single video. - - For a video, the dictionaries must include the following fields: - - id: Video identifier. - title: Video title, unescaped. - - Additionally, it must contain either a formats entry or a url one: - - formats: A list of dictionaries for each format available, ordered - from worst to best quality. - - Potential fields: - * url The mandatory URL representing the media: - for plain file media - HTTP URL of this file, - for RTMP - RTMP URL, - for HLS - URL of the M3U8 media playlist, - for HDS - URL of the F4M manifest, - for DASH - - HTTP URL to plain file media (in case of - unfragmented media) - - URL of the MPD manifest or base URL - representing the media if MPD manifest - is parsed from a string (in case of - fragmented media) - for MSS - URL of the ISM manifest. - * manifest_url - The URL of the manifest file in case of - fragmented media: - for HLS - URL of the M3U8 master playlist, - for HDS - URL of the F4M manifest, - for DASH - URL of the MPD manifest, - for MSS - URL of the ISM manifest. - * ext Will be calculated from URL if missing - * format A human-readable description of the format - ("mp4 container with h264/opus"). - Calculated from the format_id, width, height. - and format_note fields if missing. - * format_id A short description of the format - ("mp4_h264_opus" or "19"). - Technically optional, but strongly recommended. - * format_note Additional info about the format - ("3D" or "DASH video") - * width Width of the video, if known - * height Height of the video, if known - * resolution Textual description of width and height - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s - * acodec Name of the audio codec in use - * asr Audio sampling rate in Hertz - * vbr Average video bitrate in KBit/s - * fps Frame rate - * vcodec Name of the video codec in use - * container Name of the container format - * filesize The number of bytes, if known in advance - * filesize_approx An estimate for the number of bytes - * player_url SWF Player URL (used for rtmpdump). - * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". - * fragment_base_url - Base URL for fragments. Each fragment's path - value (if present) will be relative to - this URL. - * fragments A list of fragments of a fragmented media. - Each fragment entry must contain either an url - or a path. If an url is present it should be - considered by a client. Otherwise both path and - fragment_base_url must be present. Here is - the list of all potential fields: - * "url" - fragment's URL - * "path" - fragment's path relative to - fragment_base_url - * "duration" (optional, int or float) - * "filesize" (optional, int) - * preference Order number of this format. If this field is - present and not None, the formats get sorted - by this field, regardless of all other values. 
- -1 for default (order by other properties), - -2 or smaller for less than default. - < -1000 to hide the format (if there is - another one which is strictly better) - * language Language code, e.g. "de" or "en-US". - * language_preference Is this in the language mentioned in - the URL? - 10 if it's what the URL is about, - -1 for default (don't know), - -10 otherwise, other values reserved for now. - * quality Order number of the video quality of this - format, irrespective of the file format. - -1 for default (order by other properties), - -2 or smaller for less than default. - * source_preference Order number for this video source - (quality takes higher priority) - -1 for default (order by other properties), - -2 or smaller for less than default. - * http_headers A dictionary of additional HTTP headers - to add to the request. - * stretched_ratio If given and not 1, indicates that the - video's pixels are not square. - width : height ratio as float. - * no_resume The server does not support resuming the - (HTTP or RTMP) download. Boolean. - * downloader_options A dictionary of downloader options as - described in FileDownloader - - url: Final video URL. - ext: Video filename extension. - format: The video format, defaults to ext (used for --get-format) - player_url: SWF Player URL (used for rtmpdump). - - The following fields are optional: - - alt_title: A secondary title of the video. - display_id An alternative identifier for the video, not necessarily - unique, but available before title. Typically, id is - something like "4234987", title "Dancing naked mole rats", - and display_id "dancing-naked-mole-rats" - thumbnails: A list of dictionaries, with the following entries: - * "id" (optional, string) - Thumbnail format ID - * "url" - * "preference" (optional, int) - quality of the image - * "width" (optional, int) - * "height" (optional, int) - * "resolution" (optional, string "{width}x{height}", - deprecated) - * "filesize" (optional, int) - thumbnail: Full URL to a video thumbnail image. - description: Full video description. - uploader: Full name of the video uploader. - license: License name the video is licensed under. - creator: The creator of the video. - release_timestamp: UNIX timestamp of the moment the video was released. - release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available - (uploaded). - upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. - uploader_id: Nickname or id of the video uploader. - uploader_url: Full URL to a personal webpage of the video uploader. - channel: Full name of the channel the video is uploaded on. - Note that channel fields may or may not repeat uploader - fields. This depends on a particular extractor. - channel_id: Id of the channel. - channel_url: Full URL to a channel webpage. - location: Physical location where the video was filmed. - subtitles: The available subtitles as a dictionary in the format - {tag: subformats}. "tag" is usually a language code, and - "subformats" is a list sorted from lower to higher - preference, each element is a dictionary with the "ext" - entry and one of: - * "data": The subtitles file contents - * "url": A URL pointing to the subtitles file - "ext" will be calculated from URL if missing - automatic_captions: Like 'subtitles', used by the YoutubeIE for - automatically generated captions - duration: Length of the video in seconds, as an integer or float. 
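A minimal, hypothetical entry that satisfies the contract described so far, reusing the docstring's own example title (URLs are placeholders): the two formats are listed worst to best, and subtitles map a language tag to subformats sorted from lower to higher preference:

    info = {
        'id': '4234987',
        'title': 'Dancing naked mole rats',
        'formats': [
            # worst first ...
            {'url': 'https://example.com/lo.mp4', 'format_id': 'lo',
             'height': 360, 'tbr': 700},
            # ... best last
            {'url': 'https://example.com/hi.mp4', 'format_id': 'hi',
             'height': 1080, 'tbr': 4400},
        ],
        'subtitles': {
            'en': [{'ext': 'vtt', 'url': 'https://example.com/subs.vtt'}],
        },
    }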
- view_count: How many users have watched the video on the platform. - like_count: Number of positive ratings of the video - dislike_count: Number of negative ratings of the video - repost_count: Number of reposts of the video - average_rating: Average rating give by users, the scale used depends on the webpage - comment_count: Number of comments on the video - comments: A list of comments, each with one or more of the following - properties (all but one of text or html optional): - * "author" - human-readable name of the comment author - * "author_id" - user ID of the comment author - * "id" - Comment ID - * "html" - Comment as HTML - * "text" - Plain text of the comment - * "timestamp" - UNIX timestamp of comment - * "parent" - ID of the comment this one is replying to. - Set to "root" to indicate that this is a - comment to the original video. - age_limit: Age restriction for the video, as an integer (years) - webpage_url: The URL to the video webpage, if given to youtube-dl it - should allow to get the same result again. (It will be set - by YoutubeDL if it's missing) - categories: A list of categories that the video falls in, for example - ["Sports", "Berlin"] - tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] - is_live: True, False, or None (=unknown). Whether this video is a - live stream that goes on instead of a fixed-length video. - start_time: Time in seconds where the reproduction should start, as - specified in the URL. - end_time: Time in seconds where the reproduction should end, as - specified in the URL. - chapters: A list of dictionaries, with the following entries: - * "start_time" - The start time of the chapter in seconds - * "end_time" - The end time of the chapter in seconds - * "title" (optional, string) - - The following fields should only be used when the video belongs to some logical - chapter or section: - - chapter: Name or title of the chapter the video belongs to. - chapter_number: Number of the chapter the video belongs to, as an integer. - chapter_id: Id of the chapter the video belongs to, as a unicode string. - - The following fields should only be used when the video is an episode of some - series, programme or podcast: - - series: Title of the series or programme the video episode belongs to. - season: Title of the season the video episode belongs to. - season_number: Number of the season the video episode belongs to, as an integer. - season_id: Id of the season the video episode belongs to, as a unicode string. - episode: Title of the video episode. Unlike mandatory video title field, - this field should denote the exact title of the video episode - without any kind of decoration. - episode_number: Number of the video episode within a season, as an integer. - episode_id: Id of the video episode, as a unicode string. - - The following fields should only be used when the media is a track or a part of - a music album: - - track: Title of the track. - track_number: Number of the track within an album or a disc, as an integer. - track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), - as a unicode string. - artist: Artist(s) of the track. - genre: Genre(s) of the track. - album: Title of the album the track belongs to. - album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). - album_artist: List of all artists appeared on the album (e.g. - "Ash Borer / Fell Voices" or "Various Artists", useful for splits - and compilations). 
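The chapters field above, spelled out as a hypothetical value (times in seconds, title optional):

    chapters = [
        {'start_time': 0.0, 'end_time': 61.5, 'title': 'Cold open'},
        {'start_time': 61.5, 'end_time': 300.0},  # 'title' may be omitted
    ]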
- disc_number: Number of the disc or other physical medium the track belongs to, - as an integer. - release_year: Year (YYYY) when the album was released. - - Unless mentioned otherwise, the fields should be Unicode strings. - - Unless mentioned otherwise, None is equivalent to absence of information. - - - _type "playlist" indicates multiple videos. - There must be a key "entries", which is a list, an iterable, or a PagedList - object, each element of which is a valid dictionary by this specification. - - Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url", "duration" attributes with the same semantics - as videos (see above). - - - _type "multi_video" indicates that there are multiple videos that - form a single show, for examples multiple acts of an opera or TV episode. - It must have an entries key like a playlist and contain all the keys - required for a video at the same time. - - - _type "url" indicates that the video must be extracted from another - location, possibly by a different extractor. Its only required key is: - "url" - the next URL to extract. - The key "ie_key" can be set to the class name (minus the trailing "IE", - e.g. "Youtube") if the extractor class is known in advance. - Additionally, the dictionary may have any properties of the resolved entity - known in advance, for example "title" if the title of the referred video is - known ahead of time. - - - _type "url_transparent" entities have the same specification as "url", but - indicate that the given additional information is more precise than the one - associated with the resolved URL. - This is useful when a site employs a video service that hosts the video and - its technical metadata, but that video service does not embed a useful - title, description etc. - - - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. - - _GEO_BYPASS attribute may be set to False in order to disable - geo restriction bypass mechanisms for a particular extractor. - Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. - - _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted - countries for this extractor. One of these countries will be used by - geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. - - _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted - IP blocks in CIDR notation for this extractor. One of these IP blocks - will be used by geo restriction bypass mechanism similarly - to _GEO_COUNTRIES. - - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ - - _ready = False - _downloader = None - _x_forwarded_for_ip = None - _GEO_BYPASS = True - _GEO_COUNTRIES = None - _GEO_IP_BLOCKS = None - _WORKING = True - - def __init__(self, downloader=None): - """Constructor. 
Receives an optional downloader.""" - self._ready = False - self._x_forwarded_for_ip = None - self.set_downloader(downloader) - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - - # This does not use has/getattr intentionally - we want to know whether - # we have cached the regexp for *this* class, whereas getattr would also - # match the superclass - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) is not None - - @classmethod - def _match_id(cls, url): - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - m = cls._VALID_URL_RE.match(url) - assert m - return compat_str(m.group('id')) - - @classmethod - def working(cls): - """Getter method for _WORKING.""" - return cls._WORKING - - def initialize(self): - """Initializes an instance (authentication, etc).""" - self._initialize_geo_bypass({ - 'countries': self._GEO_COUNTRIES, - 'ip_blocks': self._GEO_IP_BLOCKS, - }) - if not self._ready: - self._real_initialize() - self._ready = True - - def _initialize_geo_bypass(self, geo_bypass_context): - """ - Initialize geo restriction bypass mechanism. - - This method is used to initialize geo bypass mechanism based on faking - X-Forwarded-For HTTP header. A random country from provided country list - is selected and a random IP belonging to this country is generated. This - IP will be passed as X-Forwarded-For HTTP header in all subsequent - HTTP requests. - - This method will be used for initial geo bypass mechanism initialization - during the instance initialization with _GEO_COUNTRIES and - _GEO_IP_BLOCKS. - - You may also manually call it from extractor's code if geo bypass - information is not available beforehand (e.g. obtained during - extraction) or due to some other reason. In this case you should pass - this information in geo bypass context passed as first argument. It may - contain following fields: - - countries: List of geo unrestricted countries (similar - to _GEO_COUNTRIES) - ip_blocks: List of geo unrestricted IP blocks in CIDR notation - (similar to _GEO_IP_BLOCKS) - - """ - if not self._x_forwarded_for_ip: - - # Geo bypass mechanism is explicitly disabled by user - if not self._downloader.params.get('geo_bypass', True): - return - - if not geo_bypass_context: - geo_bypass_context = {} - - # Backward compatibility: previously _initialize_geo_bypass - # expected a list of countries, some 3rd party code may still use - # it this way - if isinstance(geo_bypass_context, (list, tuple)): - geo_bypass_context = { - 'countries': geo_bypass_context, - } - - # The whole point of geo bypass mechanism is to fake IP - # as X-Forwarded-For HTTP header based on some IP block or - # country code. - - # Path 1: bypassing based on IP block in CIDR notation - - # Explicit IP block specified by user, use it right away - # regardless of whether extractor is geo bypassable or not - ip_block = self._downloader.params.get('geo_bypass_ip_block', None) - - # Otherwise use random IP block from geo bypass context but only - # if extractor is known as geo bypassable - if not ip_block: - ip_blocks = geo_bypass_context.get('ip_blocks') - if self._GEO_BYPASS and ip_blocks: - ip_block = random.choice(ip_blocks) - - if ip_block: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] Using fake IP %s as X-Forwarded-For.' 
- % self._x_forwarded_for_ip) - return - - # Path 2: bypassing based on country code - - # Explicit country code specified by user, use it right away - # regardless of whether extractor is geo bypassable or not - country = self._downloader.params.get('geo_bypass_country', None) - - # Otherwise use random country code from geo bypass context but - # only if extractor is known as geo bypassable - if not country: - countries = geo_bypass_context.get('countries') - if self._GEO_BYPASS and countries: - country = random.choice(countries) - - if country: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] Using fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country.upper())) - - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - try: - for _ in range(2): - try: - self.initialize() - ie_result = self._real_extract(url) - if self._x_forwarded_for_ip: - ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip - return ie_result - except GeoRestrictedError as e: - if self.__maybe_fake_ip_and_retry(e.countries): - continue - raise - except ExtractorError: - raise - except compat_http_client.IncompleteRead as e: - raise ExtractorError('A network error has occurred.', cause=e, expected=True) - except (KeyError, StopIteration) as e: - raise ExtractorError('An extractor error has occurred.', cause=e) - - def __maybe_fake_ip_and_retry(self, countries): - if (not self._downloader.params.get('geo_bypass_country', None) - and self._GEO_BYPASS - and self._downloader.params.get('geo_bypass', True) - and not self._x_forwarded_for_ip - and countries): - country_code = random.choice(countries) - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) - if self._x_forwarded_for_ip: - self.report_warning( - 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) - return True - return False - - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader - - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass - - def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass - - @classmethod - def ie_key(cls): - """A string for getting the InfoExtractor with get_info_extractor""" - return compat_str(cls.__name__[:-2]) - - @property - def IE_NAME(self): - return compat_str(type(self).__name__[:-2]) - - @staticmethod - def __can_accept_status_code(err, expected_status): - assert isinstance(err, compat_urllib_error.HTTPError) - if expected_status is None: - return False - if isinstance(expected_status, compat_integer_types): - return err.code == expected_status - elif isinstance(expected_status, (list, tuple)): - return err.code in expected_status - elif callable(expected_status): - return expected_status(err.code) is True - else: - assert False - - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): - """ - Return the response handle. - - See _download_webpage docstring for arguments specification. 
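The geo bypass path above fakes X-Forwarded-For with an address drawn from a CIDR block (for country codes, a block registered to that country is picked first). A rough standalone equivalent of what GeoUtils.random_ipv4 presumably does for an explicit block, an assumption since that helper lives outside this file:

    import random
    import socket
    import struct

    def random_ipv4_from_block(cidr):
        # e.g. '198.51.100.0/24' -> a random address inside that block
        addr, prefix = cidr.split('/')
        host_bits = 32 - int(prefix)
        base = struct.unpack('!I', socket.inet_aton(addr))[0]
        mask = (1 << host_bits) - 1
        return socket.inet_ntoa(
            struct.pack('!I', (base & ~mask & 0xFFFFFFFF) | random.randint(0, mask)))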
- """ - if note is None: - self.report_download_webpage(video_id) - elif note is not False: - if video_id is None: - self.to_screen('%s' % (note,)) - else: - self.to_screen('%s: %s' % (video_id, note)) - - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. - if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - - if isinstance(url_or_request, compat_urllib_request.Request): - url_or_request = update_Request( - url_or_request, data=data, headers=headers, query=query) - else: - if query: - url_or_request = update_url_query(url_or_request, query) - if data is not None or headers: - url_or_request = sanitized_Request(url_or_request, data, headers) - exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] - if hasattr(ssl, 'CertificateError'): - exceptions.append(ssl.CertificateError) - try: - return self._downloader.urlopen(url_or_request) - except tuple(exceptions) as err: - if isinstance(err, compat_urllib_error.HTTPError): - if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of <https://bugs.python.org/issue15002> - # introduced in Python 3.4.1. - err.fp._error = err - return err.fp - - if errnote is False: - return False - if errnote is None: - errnote = 'Unable to download webpage' - - errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) - if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) - else: - self._downloader.report_warning(errmsg) - return False - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): - """ - Return a tuple (page content as string, URL handle). - - See _download_webpage docstring for arguments specification. - """ - # Strip hashes from the URL (#1038) - if isinstance(url_or_request, (compat_str, str)): - url_or_request = url_or_request.partition('#')[0] - - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) - if urlh is False: - assert not fatal - return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) - return (content, urlh) - - @staticmethod - def _guess_encoding_from_content(content_type, webpage_bytes): - m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) - if m: - encoding = m.group(1) - else: - m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', - webpage_bytes[:1024]) - if m: - encoding = m.group(1).decode('ascii') - elif webpage_bytes.startswith(b'\xff\xfe'): - encoding = 'utf-16' - else: - encoding = 'utf-8' - - return encoding - - def __check_blocked(self, content): - first_block = content[:512] - if ('<title>Access to this site is blocked</title>' in content - and 'Websense' in first_block): - msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
-    def __check_blocked(self, content):
-        first_block = content[:512]
-        if ('<title>Access to this site is blocked</title>' in content
-                and 'Websense' in first_block):
-            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
-            blocked_iframe = self._html_search_regex(
-                r'<iframe src="([^"]+)"', content,
-                'Websense information URL', default=None)
-            if blocked_iframe:
-                msg += ' Visit %s for more details' % blocked_iframe
-            raise ExtractorError(msg, expected=True)
-        if '<title>The URL you requested has been blocked</title>' in first_block:
-            msg = (
-                'Access to this webpage has been blocked by Indian censorship. '
-                'Use a VPN or proxy server (with --proxy) to route around it.')
-            block_msg = self._html_search_regex(
-                r'</h1><p>(.*?)</p>',
-                content, 'block message', default=None)
-            if block_msg:
-                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
-            raise ExtractorError(msg, expected=True)
-        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
-                and 'blocklist.rkn.gov.ru' in content):
-            raise ExtractorError(
-                'Access to this webpage has been blocked by decision of the Russian government. '
-                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
-                expected=True)
-
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
-        if not encoding:
-            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
-        if self._downloader.params.get('dump_intermediate_pages', False):
-            self.to_screen('Dumping request to ' + urlh.geturl())
-            dump = base64.b64encode(webpage_bytes).decode('ascii')
-            self._downloader.to_screen(dump)
-        if self._downloader.params.get('write_pages', False):
-            basen = '%s_%s' % (video_id, urlh.geturl())
-            if len(basen) > 240:
-                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
-                basen = basen[:240 - len(h)] + h
-            raw_filename = basen + '.dump'
-            filename = sanitize_filename(raw_filename, restricted=True)
-            self.to_screen('Saving request to ' + filename)
-            # Working around MAX_PATH limitation on Windows (see
-            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
-            if compat_os_name == 'nt':
-                absfilepath = os.path.abspath(filename)
-                if len(absfilepath) > 259:
-                    filename = '\\\\?\\' + absfilepath
-            with open(filename, 'wb') as outf:
-                outf.write(webpage_bytes)
-
-        try:
-            content = webpage_bytes.decode(encoding, 'replace')
-        except LookupError:
-            content = webpage_bytes.decode('utf-8', 'replace')
-
-        self.__check_blocked(content)
-
-        return content
-
-    def _download_webpage(
-            self, url_or_request, video_id, note=None, errnote=None,
-            fatal=True, tries=1, timeout=5, encoding=None, data=None,
-            headers={}, query={}, expected_status=None):
-        """
-        Return the data of the page as a string.
-
-        Arguments:
-        url_or_request -- plain text URL as a string or
-            a compat_urllib_request.Request object
-        video_id -- Video/playlist/item identifier (string)
-
-        Keyword arguments:
-        note -- note printed before downloading (string)
-        errnote -- note printed in case of an error (string)
-        fatal -- flag denoting whether error should be considered fatal,
-            i.e. whether it should cause ExtractorError to be raised,
-            otherwise a warning will be reported and extraction continued
-        tries -- number of tries
-        timeout -- sleep interval between tries
-        encoding -- encoding for page content decoding, guessed automatically
-            when not explicitly specified
-        data -- POST data (bytes)
-        headers -- HTTP headers (dict)
-        query -- URL query (dict)
-        expected_status -- allows accepting failed HTTP requests (non-2xx
-            status code) by explicitly specifying a set of accepted status
-            codes. Can be any of the following entities:
-                - an integer type specifying an exact failed status code to
-                  accept
-                - a list or a tuple of integer types specifying a list of
-                  failed status codes to accept
-                - a callable accepting an actual failed status code and
-                  returning True if it should be accepted
-            Note that this argument does not affect success status codes (2xx)
-            which are always accepted.
-        """
-
-        success = False
-        try_count = 0
-        while success is False:
-            try:
-                res = self._download_webpage_handle(
-                    url_or_request, video_id, note, errnote, fatal,
-                    encoding=encoding, data=data, headers=headers, query=query,
-                    expected_status=expected_status)
-                success = True
-            except compat_http_client.IncompleteRead as e:
-                try_count += 1
-                if try_count >= tries:
-                    raise e
-                self._sleep(timeout, video_id)
-        if res is False:
-            return res
-        else:
-            content, _ = res
-            return content
-
-    def _download_xml_handle(
-            self, url_or_request, video_id, note='Downloading XML',
-            errnote='Unable to download XML', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={},
-            expected_status=None):
-        """
-        Return a tuple (XML as a compat_etree_Element, URL handle).
-
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_webpage_handle(
-            url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        if res is False:
-            return res
-        xml_string, urlh = res
-        return self._parse_xml(
-            xml_string, video_id, transform_source=transform_source,
-            fatal=fatal), urlh
-
-    def _download_xml(
-            self, url_or_request, video_id,
-            note='Downloading XML', errnote='Unable to download XML',
-            transform_source=None, fatal=True, encoding=None,
-            data=None, headers={}, query={}, expected_status=None):
-        """
-        Return the XML as a compat_etree_Element.
-
-        See _download_webpage docstring for arguments specification.
-        """
-        res = self._download_xml_handle(
-            url_or_request, video_id, note=note, errnote=errnote,
-            transform_source=transform_source, fatal=fatal, encoding=encoding,
-            data=data, headers=headers, query=query,
-            expected_status=expected_status)
-        return res if res is False else res[0]
-
-    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
-        if transform_source:
-            xml_string = transform_source(xml_string)
-        try:
-            return compat_etree_fromstring(xml_string.encode('utf-8'))
-        except compat_xml_parse_error as ve:
-            errmsg = '%s: Failed to parse XML ' % video_id
-            if fatal:
-                raise ExtractorError(errmsg, cause=ve)
-            else:
-                self.report_warning(errmsg + str(ve))
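The _handle variants above return the payload together with the URL handle, which is how extractors observe redirects; plain _download_webpage discards it. A hypothetical snippet inside an InfoExtractor subclass (URL pattern and page contents invented):

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Tolerate a 404 page that still contains usable markup
        webpage, urlh = self._download_webpage_handle(
            url, video_id, expected_status=404)
        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'url': urlh.geturl(),  # post-redirect, canonical URL
        }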
- """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - json_string, urlh = res - return self._parse_json( - json_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_json( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return the JSON object as a dict. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): - if transform_source: - json_string = transform_source(json_string) - try: - return json.loads(json_string) - except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def report_warning(self, msg, video_id=None): - idstr = '' if video_id is None else '%s: ' % video_id - self._downloader.report_warning( - '[%s] %s%s' % (self.IE_NAME, idstr, msg)) - - def to_screen(self, msg): - """Print msg to screen, prefixing it with '[ie_name]'""" - self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) - - def report_extraction(self, id_or_name): - """Report information extraction.""" - self.to_screen('%s: Extracting information' % id_or_name) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self.to_screen('%s: Downloading webpage' % video_id) - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self.to_screen('Confirming age') - - def report_login(self): - """Report attempt to log in.""" - self.to_screen('Logging in') - - @staticmethod - def raise_login_required(msg='This video is only available for registered users'): - raise ExtractorError( - '%s. Use --username and --password or --netrc to provide account credentials.' 
-    @staticmethod
-    def raise_login_required(msg='This video is only available for registered users'):
-        raise ExtractorError(
-            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
-            expected=True)
-
-    @staticmethod
-    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
-        raise GeoRestrictedError(msg, countries=countries)
-
-    # Methods for following #608
-    @staticmethod
-    def url_result(url, ie=None, video_id=None, video_title=None):
-        """Returns a URL that points to a page that should be processed"""
-        # TODO: ie should be the class used for getting the info
-        video_info = {'_type': 'url',
-                      'url': url,
-                      'ie_key': ie}
-        if video_id is not None:
-            video_info['id'] = video_id
-        if video_title is not None:
-            video_info['title'] = video_title
-        return video_info
-
-    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
-        urls = orderedSet(
-            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
-            for m in matches)
-        return self.playlist_result(
-            urls, playlist_id=playlist_id, playlist_title=playlist_title)
-
-    @staticmethod
-    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
-        """Returns a playlist"""
-        video_info = {'_type': 'playlist',
-                      'entries': entries}
-        if playlist_id:
-            video_info['id'] = playlist_id
-        if playlist_title:
-            video_info['title'] = playlist_title
-        if playlist_description:
-            video_info['description'] = playlist_description
-        return video_info
-
-    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
-        """
-        Perform a regex search on the given string, using a single pattern or
-        a list of patterns, returning the first matching group.
-        In case of failure, return a default value, report a WARNING or raise
-        a RegexNotFoundError, depending on fatal, specifying the field name.
-        """
-        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
-            mobj = re.search(pattern, string, flags)
-        else:
-            for p in pattern:
-                mobj = re.search(p, string, flags)
-                if mobj:
-                    break
-
-        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
-            _name = '\033[0;34m%s\033[0m' % name
-        else:
-            _name = name
-
-        if mobj:
-            if group is None:
-                # return the first matching group
-                return next(g for g in mobj.groups() if g is not None)
-            else:
-                return mobj.group(group)
-        elif default is not NO_DEFAULT:
-            return default
-        elif fatal:
-            raise RegexNotFoundError('Unable to extract %s' % _name)
-        else:
-            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
-            return None
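How url_result and playlist_result compose in practice; a hedged sketch of a playlist extractor (the regex, URL pattern and IDs are invented, and the module-level re import of the surrounding file is assumed):

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        entries = [
            self.url_result('https://example.com/watch/%s' % vid, video_id=vid)
            for vid in re.findall(r'data-video-id="(\d+)"', webpage)]
        return self.playlist_result(
            entries, playlist_id=playlist_id,
            playlist_title=self._og_search_title(webpage, fatal=False))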
- """ - res = self._search_regex(pattern, string, name, default, fatal, flags, group) - if res: - return clean_html(res).strip() - else: - return res - - def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None - netrc_machine = netrc_machine or self._NETRC_MACHINE - - if self._downloader.params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) - - return username, password - - def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): - """ - Get the login info as (username, password) - First look for the manually specified credentials using username_option - and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. - If there's no info available, return (None, None) - """ - if self._downloader is None: - return (None, None) - - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get(username_option) is not None: - username = downloader_params[username_option] - password = downloader_params[password_option] - else: - username, password = self._get_netrc_login_info(netrc_machine) - - return username, password - - def _get_tfa_info(self, note='two-factor verification code'): - """ - Get the two-factor authentication info - TODO - asking the user will be required for sms/phone verify - currently just uses the command line option - If there's no info available, return None - """ - if self._downloader is None: - return None - downloader_params = self._downloader.params - - if downloader_params.get('twofactor') is not None: - return downloader_params['twofactor'] - - return compat_getpass('Type %s and press [Return]: ' % note) - - # Helper functions for extracting OpenGraph info - @staticmethod - def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' - % {'prop': re.escape(prop)}) - template = r'<meta[^>]+?%s[^>]+?%s' - return [ - template % (property_re, content_re), - template % (content_re, property_re), - ] - - @staticmethod - def _meta_regex(prop): - return r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) - - def _og_search_property(self, prop, html, name=None, **kargs): - if not isinstance(prop, (list, tuple)): - prop = [prop] - if name is None: - name = 'OpenGraph %s' % prop[0] - og_regexes = [] - for p in prop: - og_regexes.extend(self._og_regexes(p)) - escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) - if escaped is None: - return None - return unescapeHTML(escaped) - - def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) - - def _og_search_description(self, html, **kargs): - return self._og_search_property('description', html, fatal=False, **kargs) - - def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) - - def 
-    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-        regexes = self._og_regexes('video') + self._og_regexes('video:url')
-        if secure:
-            regexes = self._og_regexes('video:secure_url') + regexes
-        return self._html_search_regex(regexes, html, name, **kargs)
-
-    def _og_search_url(self, html, **kargs):
-        return self._og_search_property('url', html, **kargs)
-
-    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
-        if not isinstance(name, (list, tuple)):
-            name = [name]
-        if display_name is None:
-            display_name = name[0]
-        return self._html_search_regex(
-            [self._meta_regex(n) for n in name],
-            html, display_name, fatal=fatal, group='content', **kwargs)
-
-    def _dc_search_uploader(self, html):
-        return self._html_search_meta('dc.creator', html, 'uploader')
-
-    def _rta_search(self, html):
-        # See http://www.rtalabel.org/index.php?content=howtofaq#single
-        if re.search(r'(?ix)<meta\s+name="rating"\s+'
-                     r'     content="RTA-5042-1996-1400-1577-RTA"',
-                     html):
-            return 18
-        return 0
-
-    def _media_rating_search(self, html):
-        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
-        rating = self._html_search_meta('rating', html)
-
-        if not rating:
-            return None
-
-        RATING_TABLE = {
-            'safe for kids': 0,
-            'general': 8,
-            '14 years': 14,
-            'mature': 17,
-            'restricted': 19,
-        }
-        return RATING_TABLE.get(rating.lower())
-
-    def _family_friendly_search(self, html):
-        # See http://schema.org/VideoObject
-        family_friendly = self._html_search_meta(
-            'isFamilyFriendly', html, default=None)
-
-        if not family_friendly:
-            return None
-
-        RATING_TABLE = {
-            '1': 0,
-            'true': 0,
-            '0': 18,
-            'false': 18,
-        }
-        return RATING_TABLE.get(family_friendly.lower())
-
-    def _twitter_search_player(self, html):
-        return self._html_search_meta('twitter:player', html,
-                                      'twitter card player')
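And the meta-based rating helpers, under the same ie assumption; each maps page metadata to an age_limit value:

    ie._family_friendly_search('<meta name="isFamilyFriendly" content="true">')  # -> 0
    ie._media_rating_search('<meta name="rating" content="mature">')             # -> 17
    ie._rta_search('<meta name="rating" content="RTA-5042-1996-1400-1577-RTA">') # -> 18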
-    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
-        json_ld_list = list(re.finditer(JSON_LD_RE, html))
-        default = kwargs.get('default', NO_DEFAULT)
-        # JSON-LD may be malformed and thus `fatal` should be respected.
-        # At the same time `default` may be passed that assumes `fatal=False`
-        # for _search_regex. Let's simulate the same behavior here as well.
-        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
-        json_ld = []
-        for mobj in json_ld_list:
-            json_ld_item = self._parse_json(
-                mobj.group('json_ld'), video_id, fatal=fatal)
-            if not json_ld_item:
-                continue
-            if isinstance(json_ld_item, dict):
-                json_ld.append(json_ld_item)
-            elif isinstance(json_ld_item, (list, tuple)):
-                json_ld.extend(json_ld_item)
-        if json_ld:
-            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
-        if json_ld:
-            return json_ld
-        if default is not NO_DEFAULT:
-            return default
-        elif fatal:
-            raise RegexNotFoundError('Unable to extract JSON-LD')
-        else:
-            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
-            return {}
-
-    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
-        if isinstance(json_ld, compat_str):
-            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
-        if not json_ld:
-            return {}
-        info = {}
-        if not isinstance(json_ld, (list, tuple, dict)):
-            return info
-        if isinstance(json_ld, dict):
-            json_ld = [json_ld]
-
-        INTERACTION_TYPE_MAP = {
-            'CommentAction': 'comment',
-            'AgreeAction': 'like',
-            'DisagreeAction': 'dislike',
-            'LikeAction': 'like',
-            'DislikeAction': 'dislike',
-            'ListenAction': 'view',
-            'WatchAction': 'view',
-            'ViewAction': 'view',
-        }
-
-        def extract_interaction_type(e):
-            interaction_type = e.get('interactionType')
-            if isinstance(interaction_type, dict):
-                interaction_type = interaction_type.get('@type')
-            return str_or_none(interaction_type)
-
-        def extract_interaction_statistic(e):
-            interaction_statistic = e.get('interactionStatistic')
-            if isinstance(interaction_statistic, dict):
-                interaction_statistic = [interaction_statistic]
-            if not isinstance(interaction_statistic, list):
-                return
-            for is_e in interaction_statistic:
-                if not isinstance(is_e, dict):
-                    continue
-                if is_e.get('@type') != 'InteractionCounter':
-                    continue
-                interaction_type = extract_interaction_type(is_e)
-                if not interaction_type:
-                    continue
-                # For the interaction count some sites provide a string instead
-                # of an integer (as per spec) with non-digit characters (e.g. ","),
-                # so extract the count with the more relaxed str_to_int
-                interaction_count = str_to_int(is_e.get('userInteractionCount'))
-                if interaction_count is None:
-                    continue
-                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
-                if not count_kind:
-                    continue
-                count_key = '%s_count' % count_kind
-                if info.get(count_key) is not None:
-                    continue
-                info[count_key] = interaction_count
-
-        def extract_video_object(e):
-            assert e['@type'] == 'VideoObject'
-            author = e.get('author')
-            info.update({
-                'url': url_or_none(e.get('contentUrl')),
-                'title': unescapeHTML(e.get('name')),
-                'description': unescapeHTML(e.get('description')),
-                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
-                'duration': parse_duration(e.get('duration')),
-                'timestamp': unified_timestamp(e.get('uploadDate')),
-                # author can be an instance of the 'Organization' or 'Person' types.
-                # Both types can have a 'name' property (inherited from the 'Thing'
-                # type); however, some websites use the 'Text' type instead. [1]
-                # 1. https://schema.org/VideoObject
-                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
-                'filesize': float_or_none(e.get('contentSize')),
-                'tbr': int_or_none(e.get('bitrate')),
-                'width': int_or_none(e.get('width')),
-                'height': int_or_none(e.get('height')),
-                'view_count': int_or_none(e.get('interactionCount')),
-            })
-            extract_interaction_statistic(e)
-
-        for e in json_ld:
-            if '@context' in e:
-                item_type = e.get('@type')
-                if expected_type is not None and expected_type != item_type:
-                    continue
-                if item_type in ('TVEpisode', 'Episode'):
-                    episode_name = unescapeHTML(e.get('name'))
-                    info.update({
-                        'episode': episode_name,
-                        'episode_number': int_or_none(e.get('episodeNumber')),
-                        'description': unescapeHTML(e.get('description')),
-                    })
-                    if not info.get('title') and episode_name:
-                        info['title'] = episode_name
-                    part_of_season = e.get('partOfSeason')
-                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
-                        info.update({
-                            'season': unescapeHTML(part_of_season.get('name')),
-                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
-                        })
-                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
-                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
-                        info['series'] = unescapeHTML(part_of_series.get('name'))
-                elif item_type == 'Movie':
-                    info.update({
-                        'title': unescapeHTML(e.get('name')),
-                        'description': unescapeHTML(e.get('description')),
-                        'duration': parse_duration(e.get('duration')),
-                        'timestamp': unified_timestamp(e.get('dateCreated')),
-                    })
-                elif item_type in ('Article', 'NewsArticle'):
-                    info.update({
-                        'timestamp': parse_iso8601(e.get('datePublished')),
-                        'title': unescapeHTML(e.get('headline')),
-                        'description': unescapeHTML(e.get('articleBody')),
-                    })
-                elif item_type == 'VideoObject':
-                    extract_video_object(e)
-                    if expected_type is None:
-                        continue
-                    else:
-                        break
-                video = e.get('video')
-                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
-                    extract_video_object(video)
-                if expected_type is None:
-                    continue
-                else:
-                    break
-        return dict((k, v) for k, v in info.items() if v is not None)
-
-    @staticmethod
-    def _hidden_inputs(html):
-        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
-        hidden_inputs = {}
-        for input in re.findall(r'(?i)(<input[^>]+>)', html):
-            attrs = extract_attributes(input)
-            if not attrs:
-                continue
-            if attrs.get('type') not in ('hidden', 'submit'):
-                continue
-            name = attrs.get('name') or attrs.get('id')
-            value = attrs.get('value')
-            if name and value is not None:
-                hidden_inputs[name] = value
-        return hidden_inputs
-
-    def _form_hidden_inputs(self, form_id, html):
-        form = self._search_regex(
-            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
-            html, '%s form' % form_id, group='form')
-        return self._hidden_inputs(form)
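What _search_json_ld consumes and what _json_ld folds out of it, as a hedged example (the JSON-LD block is invented but schema.org-shaped; same ie assumption as above):

    html = '''<script type="application/ld+json">
    {"@context": "https://schema.org", "@type": "VideoObject",
     "name": "Example video", "uploadDate": "2020-01-01",
     "duration": "PT1M30S", "contentUrl": "https://example.com/v.mp4"}
    </script>'''
    info = ie._search_json_ld(html, 'example-id', expected_type='VideoObject')
    # -> {'url': 'https://example.com/v.mp4', 'title': 'Example video',
    #     'duration': 90, 'timestamp': 1577836800}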
-    def _sort_formats(self, formats, field_preference=None):
-        if not formats:
-            raise ExtractorError('No video formats found')
-
-        for f in formats:
-            # Automatically determine tbr when missing based on abr and vbr (improves
-            # formats sorting in some cases)
-            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
-                f['tbr'] = f['abr'] + f['vbr']
-
-        def _formats_key(f):
-            # TODO remove the following workaround
-            from ..utils import determine_ext
-            if not f.get('ext') and 'url' in f:
-                f['ext'] = determine_ext(f['url'])
-
-            if isinstance(field_preference, (list, tuple)):
-                return tuple(
-                    f.get(field)
-                    if f.get(field) is not None
-                    else ('' if field == 'format_id' else -1)
-                    for field in field_preference)
-
-            preference = f.get('preference')
-            if preference is None:
-                preference = 0
-                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
-                    preference -= 0.5
-
-            protocol = f.get('protocol') or determine_protocol(f)
-            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
-
-            if f.get('vcodec') == 'none':  # audio only
-                preference -= 50
-                if self._downloader.params.get('prefer_free_formats'):
-                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
-                else:
-                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
-                ext_preference = 0
-                try:
-                    audio_ext_preference = ORDER.index(f['ext'])
-                except ValueError:
-                    audio_ext_preference = -1
-            else:
-                if f.get('acodec') == 'none':  # video only
-                    preference -= 40
-                if self._downloader.params.get('prefer_free_formats'):
-                    ORDER = ['flv', 'mp4', 'webm']
-                else:
-                    ORDER = ['webm', 'flv', 'mp4']
-                try:
-                    ext_preference = ORDER.index(f['ext'])
-                except ValueError:
-                    ext_preference = -1
-                audio_ext_preference = 0
-
-            return (
-                preference,
-                f.get('language_preference') if f.get('language_preference') is not None else -1,
-                f.get('quality') if f.get('quality') is not None else -1,
-                f.get('tbr') if f.get('tbr') is not None else -1,
-                f.get('filesize') if f.get('filesize') is not None else -1,
-                f.get('vbr') if f.get('vbr') is not None else -1,
-                f.get('height') if f.get('height') is not None else -1,
-                f.get('width') if f.get('width') is not None else -1,
-                proto_preference,
-                ext_preference,
-                f.get('abr') if f.get('abr') is not None else -1,
-                audio_ext_preference,
-                f.get('fps') if f.get('fps') is not None else -1,
-                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
-                f.get('source_preference') if f.get('source_preference') is not None else -1,
-                f.get('format_id') if f.get('format_id') is not None else '',
-            )
-        formats.sort(key=_formats_key)
-
-    def _check_formats(self, formats, video_id):
-        if formats:
-            formats[:] = filter(
-                lambda f: self._is_valid_url(
-                    f['url'], video_id,
-                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
-                formats)
-
-    @staticmethod
-    def _remove_duplicate_formats(formats):
-        format_urls = set()
-        unique_formats = []
-        for f in formats:
-            if f['url'] not in format_urls:
-                format_urls.add(f['url'])
-                unique_formats.append(f)
-        formats[:] = unique_formats
-
-    def _is_valid_url(self, url, video_id, item='video', headers={}):
-        url = self._proto_relative_url(url, scheme='http:')
-        # For now assume non HTTP(S) URLs always valid
-        if not (url.startswith('http://') or url.startswith('https://')):
-            return True
-        try:
-            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
-            return True
-        except ExtractorError as e:
-            self.to_screen(
-                '%s: %s URL is invalid, skipping: %s'
-                % (video_id, item, error_to_compat_str(e.cause)))
-            return False
-
-    def http_scheme(self):
-        """ Either "http:" or "https:", depending on the user's preferences """
-        return (
-            'http:'
-            if self._downloader.params.get('prefer_insecure', False)
-            else 'https:')
-
-    def _proto_relative_url(self, url, scheme=None):
-        if url is None:
-            return url
-        if url.startswith('//'):
-            if scheme is None:
-                scheme = self.http_scheme()
-            return scheme + url
-        else:
-            return url
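The net effect of _formats_key, as a hedged demonstration (worst first, best last; ie as above, with default downloader params): audio-only entries are pushed down by the -50 penalty, and among the rest height decides here.

    formats = [
        {'format_id': '720p', 'url': 'https://example.com/720.mp4', 'ext': 'mp4', 'height': 720},
        {'format_id': 'audio', 'url': 'https://example.com/a.m4a', 'ext': 'm4a', 'vcodec': 'none', 'abr': 128},
        {'format_id': '360p', 'url': 'https://example.com/360.mp4', 'ext': 'mp4', 'height': 360},
    ]
    ie._sort_formats(formats)
    [f['format_id'] for f in formats]  # -> ['audio', '360p', '720p']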
-    def _sleep(self, timeout, video_id, msg_template=None):
-        if msg_template is None:
-            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
-        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
-        self.to_screen(msg)
-        time.sleep(timeout)
-
-    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
-        manifest = self._download_xml(
-            manifest_url, video_id, 'Downloading f4m manifest',
-            'Unable to download f4m manifest',
-            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
-            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source,
-            fatal=fatal, data=data, headers=headers, query=query)
-
-        if manifest is False:
-            return []
-
-        return self._parse_f4m_formats(
-            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
-            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
-
-    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
-                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                           fatal=True, m3u8_id=None):
-        if not isinstance(manifest, compat_etree_Element) and not fatal:
-            return []
-
-        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
-        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
-        if akamai_pv is not None and ';' in akamai_pv.text:
-            playerVerificationChallenge = akamai_pv.text.split(';')[0]
-            if playerVerificationChallenge.strip() != '':
-                return []
-
-        formats = []
-        manifest_version = '1.0'
-        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
-        if not media_nodes:
-            manifest_version = '2.0'
-            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
-        # Remove unsupported DRM protected media from the final formats
-        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
-        media_nodes = remove_encrypted_media(media_nodes)
-        if not media_nodes:
-            return formats
-
-        manifest_base_url = get_base_url(manifest)
-
-        bootstrap_info = xpath_element(
-            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
-            'bootstrap info', default=None)
-
-        vcodec = None
-        mime_type = xpath_text(
-            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
-            'base URL', default=None)
-        if mime_type and mime_type.startswith('audio/'):
-            vcodec = 'none'
-
-        for i, media_el in enumerate(media_nodes):
-            tbr = int_or_none(media_el.attrib.get('bitrate'))
-            width = int_or_none(media_el.attrib.get('width'))
-            height = int_or_none(media_el.attrib.get('height'))
-            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
-            # If <bootstrapInfo> is present, the specified f4m is a
-            # stream-level manifest, and only set-level manifests may refer to
-            # external resources. See section 11.4 and section 4 of F4M spec
-            if bootstrap_info is None:
-                media_url = None
-                # @href is introduced in 2.0, see section 11.6 of F4M spec
-                if manifest_version == '2.0':
-                    media_url = media_el.attrib.get('href')
-                if media_url is None:
-                    media_url = media_el.attrib.get('url')
-                if not media_url:
-                    continue
-                manifest_url = (
-                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
-                # If media_url is itself a f4m manifest, do the recursive extraction,
-                # since the bitrates in the parent manifest (this one) and the media_url
-                # manifest may differ, leading to an inability to resolve the format by
-                # requested bitrate in the f4m downloader
-                ext = determine_ext(manifest_url)
-                if ext == 'f4m':
-                    f4m_formats = self._extract_f4m_formats(
-                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
-                        transform_source=transform_source, fatal=fatal)
-                    # Sometimes a stream-level manifest contains a single media entry that
-                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
-                    # At the same time the parent's media entry in the set-level manifest may
-                    # contain it. We will copy it from the parent in such cases.
-                    if len(f4m_formats) == 1:
-                        f = f4m_formats[0]
-                        f.update({
-                            'tbr': f.get('tbr') or tbr,
-                            'width': f.get('width') or width,
-                            'height': f.get('height') or height,
-                            'format_id': f.get('format_id') if not tbr else format_id,
-                            'vcodec': vcodec,
-                        })
-                    formats.extend(f4m_formats)
-                    continue
-                elif ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        manifest_url, video_id, 'mp4', preference=preference,
-                        m3u8_id=m3u8_id, fatal=fatal))
-                    continue
-            formats.append({
-                'format_id': format_id,
-                'url': manifest_url,
-                'manifest_url': manifest_url,
-                'ext': 'flv' if bootstrap_info is not None else None,
-                'protocol': 'f4m',
-                'tbr': tbr,
-                'width': width,
-                'height': height,
-                'vcodec': vcodec,
-                'preference': preference,
-            })
-        return formats
-
-    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
-        return {
-            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
-            'url': m3u8_url,
-            'ext': ext,
-            'protocol': 'm3u8',
-            'preference': preference - 100 if preference else -100,
-            'resolution': 'multiple',
-            'format_note': 'Quality selection URL',
-        }
-
-    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
-                              entry_protocol='m3u8', preference=None,
-                              m3u8_id=None, note=None, errnote=None,
-                              fatal=True, live=False, data=None, headers={},
-                              query={}):
-        res = self._download_webpage_handle(
-            m3u8_url, video_id,
-            note=note or 'Downloading m3u8 information',
-            errnote=errnote or 'Failed to download m3u8 information',
-            fatal=fatal, data=data, headers=headers, query=query)
-
-        if res is False:
-            return []
-
-        m3u8_doc, urlh = res
-        m3u8_url = urlh.geturl()
-
-        return self._parse_m3u8_formats(
-            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
-            preference=preference, m3u8_id=m3u8_id, live=live)
-
-    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
-                            entry_protocol='m3u8', preference=None,
-                            m3u8_id=None, live=False):
-        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
-            return []
-
-        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
-            return []
-
-        formats = []
-
-        format_url = lambda u: (
-            u
-            if re.match(r'^https?://', u)
-            else compat_urlparse.urljoin(m3u8_url, u))
-
-        # References:
-        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
-        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
-        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
-
-        # We should try extracting formats only from master playlists [1, 4.3.4],
-        # i.e. playlists that describe available qualities. On the other hand,
-        # media playlists [1, 4.3.3] should be returned as is since they contain
-        # just the media without quality renditions.
-        # Fortunately, a master playlist can be easily distinguished from a media
-        # playlist based on the availability of particular tags. As of [1, 4.3.3,
-        # 4.3.4] master playlist tags MUST NOT appear in a media playlist and vice
-        # versa. As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for
-        # every media playlist and MUST NOT appear in a master playlist, thus we
-        # can clearly detect a media playlist with this criterion.
-
-        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
-            return [{
-                'url': m3u8_url,
-                'format_id': m3u8_id,
-                'ext': ext,
-                'protocol': entry_protocol,
-                'preference': preference,
-            }]
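The criterion described in the comment above, made concrete with two hand-written, hedged manifests (same ie assumption):

    media_playlist = '#EXTM3U\n#EXT-X-TARGETDURATION:10\n#EXTINF:9.009,\nseg0.ts\n'
    master_playlist = ('#EXTM3U\n'
                       '#EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720\n'
                       'hi/index.m3u8\n')
    base = 'https://example.com/index.m3u8'
    ie._parse_m3u8_formats(media_playlist, base, m3u8_id='hls')
    # -> a single format whose url is the playlist itself (returned as is)
    ie._parse_m3u8_formats(master_playlist, base, m3u8_id='hls')
    # -> one format per variant: format_id 'hls-2560', tbr 2560.0, 1280x720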
-        groups = {}
-        last_stream_inf = {}
-
-        def extract_media(x_media_line):
-            media = parse_m3u8_attributes(x_media_line)
-            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
-            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
-            if not (media_type and group_id and name):
-                return
-            groups.setdefault(group_id, []).append(media)
-            if media_type not in ('VIDEO', 'AUDIO'):
-                return
-            media_url = media.get('URI')
-            if media_url:
-                format_id = []
-                for v in (m3u8_id, group_id, name):
-                    if v:
-                        format_id.append(v)
-                f = {
-                    'format_id': '-'.join(format_id),
-                    'url': format_url(media_url),
-                    'manifest_url': m3u8_url,
-                    'language': media.get('LANGUAGE'),
-                    'ext': ext,
-                    'protocol': entry_protocol,
-                    'preference': preference,
-                }
-                if media_type == 'AUDIO':
-                    f['vcodec'] = 'none'
-                formats.append(f)
-
-        def build_stream_name():
-            # Although the specification does not mention the NAME attribute for
-            # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
-            # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
-            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
-            stream_name = last_stream_inf.get('NAME')
-            if stream_name:
-                return stream_name
-            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
-            # from the corresponding rendition group
-            stream_group_id = last_stream_inf.get('VIDEO')
-            if not stream_group_id:
-                return
-            stream_group = groups.get(stream_group_id)
-            if not stream_group:
-                return stream_group_id
-            rendition = stream_group[0]
-            return rendition.get('NAME') or stream_group_id
-
-        # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
-        # chance to detect video-only formats when EXT-X-STREAM-INF tags
-        # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
-        for line in m3u8_doc.splitlines():
-            if line.startswith('#EXT-X-MEDIA:'):
-                extract_media(line)
-
-        for line in m3u8_doc.splitlines():
-            if line.startswith('#EXT-X-STREAM-INF:'):
-                last_stream_inf = parse_m3u8_attributes(line)
-            elif line.startswith('#') or not line.strip():
-                continue
-            else:
-                tbr = float_or_none(
-                    last_stream_inf.get('AVERAGE-BANDWIDTH')
-                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
-                format_id = []
-                if m3u8_id:
-                    format_id.append(m3u8_id)
-                stream_name = build_stream_name()
-                # The bandwidth of live streams may differ over time, thus making
-                # format_id unpredictable. So it's better to keep the provided
-                # format_id intact.
-                if not live:
-                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
-                manifest_url = format_url(line.strip())
-                f = {
-                    'format_id': '-'.join(format_id),
-                    'url': manifest_url,
-                    'manifest_url': m3u8_url,
-                    'tbr': tbr,
-                    'ext': ext,
-                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
-                    'protocol': entry_protocol,
-                    'preference': preference,
-                }
-                resolution = last_stream_inf.get('RESOLUTION')
-                if resolution:
-                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
-                    if mobj:
-                        f['width'] = int(mobj.group('width'))
-                        f['height'] = int(mobj.group('height'))
-                # Unified Streaming Platform
-                mobj = re.search(
-                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
-                if mobj:
-                    abr, vbr = mobj.groups()
-                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
-                    f.update({
-                        'vbr': vbr,
-                        'abr': abr,
-                    })
-                codecs = parse_codecs(last_stream_inf.get('CODECS'))
-                f.update(codecs)
-                audio_group_id = last_stream_inf.get('AUDIO')
-                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
-                # references a rendition group MUST have a CODECS attribute.
-                # However, this is not always respected; for example, [2]
-                # contains an EXT-X-STREAM-INF tag which references an AUDIO
-                # rendition group but does not have CODECS, and despite
-                # referencing an audio group it represents a complete
-                # (with audio and video) format. So, for such cases we will
-                # ignore references to rendition groups and treat them
-                # as complete formats.
-                if audio_group_id and codecs and f.get('vcodec') != 'none':
-                    audio_group = groups.get(audio_group_id)
-                    if audio_group and audio_group[0].get('URI'):
-                        # TODO: update acodec for audio only formats with
-                        # the same GROUP-ID
-                        f['acodec'] = 'none'
-                formats.append(f)
-
-                # for DailyMotion
-                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
-                if progressive_uri:
-                    http_f = f.copy()
-                    del http_f['manifest_url']
-                    http_f.update({
-                        'format_id': f['format_id'].replace('hls-', 'http-'),
-                        'protocol': 'http',
-                        'url': progressive_uri,
-                    })
-                    formats.append(http_f)
-
-                last_stream_inf = {}
-        return formats
-
-    @staticmethod
-    def _xpath_ns(path, namespace=None):
-        if not namespace:
-            return path
-        out = []
-        for c in path.split('/'):
-            if not c or c == '.':
-                out.append(c)
-            else:
-                out.append('{%s}%s' % (namespace, c))
-        return '/'.join(out)
-
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
-
-        if smil is False:
-            assert not fatal
-            return []
-
-        namespace = self._parse_smil_namespace(smil)
-
-        return self._parse_smil_formats(
-            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
-
-    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal)
-        if smil is False:
-            return {}
-        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
-
-    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
-        return self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
-
-    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
-        namespace = self._parse_smil_namespace(smil)
-
-        formats = self._parse_smil_formats(
-            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
-        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
-
-        video_id = os.path.splitext(url_basename(smil_url))[0]
-        title = None
-        description = None
-        upload_date = None
-        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
-            name = meta.attrib.get('name')
-            content = meta.attrib.get('content')
-            if not name or not content:
-                continue
-            if not title and name == 'title':
-                title = content
-            elif not description and name in ('description', 'abstract'):
-                description = content
-            elif not upload_date and name == 'date':
-                upload_date = unified_strdate(content)
-
-        thumbnails = [{
-            'id': image.get('type'),
-            'url': image.get('src'),
-            'width': int_or_none(image.get('width')),
-            'height': int_or_none(image.get('height')),
-        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
-
-        return {
-            'id': video_id,
-            'title': title or video_id,
-            'description': description,
-            'upload_date': upload_date,
-            'thumbnails': thumbnails,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
-
-    def _parse_smil_namespace(self, smil):
-        return self._search_regex(
-            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
-
-    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
-        base = smil_url
-        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
-            b = meta.get('base') or meta.get('httpBase')
-            if b:
-                base = b
-                break
-
-        formats = []
-        rtmp_count = 0
-        http_count = 0
-        m3u8_count = 0
-
-        srcs = []
-        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
-        for medium in media:
-            src = medium.get('src')
-            if not src or src in srcs:
-                continue
-            srcs.append(src)
-
-            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
-            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
-            width = int_or_none(medium.get('width'))
-            height = int_or_none(medium.get('height'))
-            proto = medium.get('proto')
-            ext = medium.get('ext')
-            src_ext = determine_ext(src)
-            streamer = medium.get('streamer') or base
-
-            if proto == 'rtmp' or streamer.startswith('rtmp'):
-                rtmp_count += 1
-                formats.append({
-                    'url': streamer,
-                    'play_path': src,
-                    'ext': 'flv',
-                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                    'tbr': bitrate,
-                    'filesize': filesize,
-                    'width': width,
-                    'height': height,
-                })
-                if transform_rtmp_url:
-                    streamer, src = transform_rtmp_url(streamer, src)
-                    formats[-1].update({
-                        'url': streamer,
-                        'play_path': src,
-                    })
-                continue
-
-            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
-            src_url = src_url.strip()
-
-            if proto == 'm3u8' or src_ext == 'm3u8':
-                m3u8_formats = self._extract_m3u8_formats(
-                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
-                if len(m3u8_formats) == 1:
-                    m3u8_count += 1
-                    m3u8_formats[0].update({
-                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
-                        'tbr': bitrate,
-                        'width': width,
-                        'height': height,
-                    })
-                formats.extend(m3u8_formats)
-            elif src_ext == 'f4m':
-                f4m_url = src_url
-                if not f4m_params:
-                    f4m_params = {
-                        'hdcore': '3.2.0',
-                        'plugin': 'flowplayer-3.2.0.1',
-                    }
-                f4m_url += '&' if '?' in f4m_url else '?'
-                f4m_url += compat_urllib_parse_urlencode(f4m_params)
-                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
-            elif src_ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    src_url, video_id, mpd_id='dash', fatal=False))
-            elif re.search(r'\.ism/[Mm]anifest', src_url):
-                formats.extend(self._extract_ism_formats(
-                    src_url, video_id, ism_id='mss', fatal=False))
-            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
-                http_count += 1
-                formats.append({
-                    'url': src_url,
-                    'ext': ext or src_ext or 'flv',
-                    'format_id': 'http-%d' % (bitrate or http_count),
-                    'tbr': bitrate,
-                    'filesize': filesize,
-                    'width': width,
-                    'height': height,
-                })
-
-        return formats
-
-    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
-        urls = []
-        subtitles = {}
-        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
-            src = textstream.get('src')
-            if not src or src in urls:
-                continue
-            urls.append(src)
-            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
-            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
-            subtitles.setdefault(lang, []).append({
-                'url': src,
-                'ext': ext,
-            })
-        return subtitles
-
-    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
-        xspf = self._download_xml(
-            xspf_url, playlist_id, 'Downloading xspf playlist',
-            'Unable to download xspf manifest', fatal=fatal)
-        if xspf is False:
-            return []
-        return self._parse_xspf(
-            xspf, playlist_id, xspf_url=xspf_url,
-            xspf_base_url=base_url(xspf_url))
-
-    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
-        NS_MAP = {
-            'xspf': 'http://xspf.org/ns/0/',
-            's1': 'http://static.streamone.nl/player/ns/0',
-        }
-
-        entries = []
-        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
-            title = xpath_text(
-                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
-            description = xpath_text(
-                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
-            thumbnail = xpath_text(
-                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
-            duration = float_or_none(
-                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
-
-            formats = []
-            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
-                format_url = urljoin(xspf_base_url, location.text)
-                if not format_url:
-                    continue
-                formats.append({
-                    'url': format_url,
-                    'manifest_url': xspf_url,
-                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
-                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
-                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
-                })
-            self._sort_formats(formats)
-
-            entries.append({
-                'id': playlist_id,
-                'title': title,
-                'description': description,
-                'thumbnail': thumbnail,
-                'duration': duration,
-                'formats': formats,
-            })
-        return entries
-
-    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
-        res = self._download_xml_handle(
-            mpd_url, video_id,
-            note=note or 'Downloading MPD manifest',
-            errnote=errnote or 'Failed to download MPD manifest',
-            fatal=fatal, data=data, headers=headers, query=query)
-        if res is False:
-            return []
-        mpd_doc, urlh = res
-        if mpd_doc is None:
-            return []
-        mpd_base_url = base_url(urlh.geturl())
-
-        return self._parse_mpd_formats(
-            mpd_doc, mpd_id, mpd_base_url, mpd_url)
-    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
-        """
-        Parse formats from MPD manifest.
-        References:
-         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
-            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
-         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
-        """
-        if mpd_doc.get('type') == 'dynamic':
-            return []
-
-        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
-
-        def _add_ns(path):
-            return self._xpath_ns(path, namespace)
-
-        def is_drm_protected(element):
-            return element.find(_add_ns('ContentProtection')) is not None
-
-        def extract_multisegment_info(element, ms_parent_info):
-            ms_info = ms_parent_info.copy()
-
-            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
-            # common attributes and elements. We will only extract what is
-            # relevant for us.
-            def extract_common(source):
-                segment_timeline = source.find(_add_ns('SegmentTimeline'))
-                if segment_timeline is not None:
-                    s_e = segment_timeline.findall(_add_ns('S'))
-                    if s_e:
-                        ms_info['total_number'] = 0
-                        ms_info['s'] = []
-                        for s in s_e:
-                            r = int(s.get('r', 0))
-                            ms_info['total_number'] += 1 + r
-                            ms_info['s'].append({
-                                't': int(s.get('t', 0)),
-                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
-                                'd': int(s.attrib['d']),
-                                'r': r,
-                            })
-                start_number = source.get('startNumber')
-                if start_number:
-                    ms_info['start_number'] = int(start_number)
-                timescale = source.get('timescale')
-                if timescale:
-                    ms_info['timescale'] = int(timescale)
-                segment_duration = source.get('duration')
-                if segment_duration:
-                    ms_info['segment_duration'] = float(segment_duration)
-
-            def extract_Initialization(source):
-                initialization = source.find(_add_ns('Initialization'))
-                if initialization is not None:
-                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
-
-            segment_list = element.find(_add_ns('SegmentList'))
-            if segment_list is not None:
-                extract_common(segment_list)
-                extract_Initialization(segment_list)
-                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
-                if segment_urls_e:
-                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
-            else:
-                segment_template = element.find(_add_ns('SegmentTemplate'))
-                if segment_template is not None:
-                    extract_common(segment_template)
-                    media = segment_template.get('media')
-                    if media:
-                        ms_info['media'] = media
-                    initialization = segment_template.get('initialization')
-                    if initialization:
-                        ms_info['initialization'] = initialization
-                    else:
-                        extract_Initialization(segment_template)
-            return ms_info
-
-        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
-        formats = []
-        for period in mpd_doc.findall(_add_ns('Period')):
-            period_duration = parse_duration(period.get('duration')) or mpd_duration
-            period_ms_info = extract_multisegment_info(period, {
-                'start_number': 1,
-                'timescale': 1,
-            })
-            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
-                if is_drm_protected(adaptation_set):
-                    continue
-                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
-                for representation in adaptation_set.findall(_add_ns('Representation')):
-                    if is_drm_protected(representation):
-                        continue
-                    representation_attrib = adaptation_set.attrib.copy()
-                    representation_attrib.update(representation.attrib)
-                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
-                    mime_type = representation_attrib['mimeType']
-                    content_type = mime_type.split('/')[0]
-                    if content_type == 'text':
-                        # TODO implement WebVTT downloading
-                        pass
-                    elif content_type in ('video', 'audio'):
-                        base_url = ''
-                        for element in (representation, adaptation_set, period, mpd_doc):
-                            base_url_e = element.find(_add_ns('BaseURL'))
-                            if base_url_e is not None:
-                                base_url = base_url_e.text + base_url
-                                if re.match(r'^https?://', base_url):
-                                    break
-                        if mpd_base_url and not re.match(r'^https?://', base_url):
-                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
-                                mpd_base_url += '/'
-                            base_url = mpd_base_url + base_url
-                        representation_id = representation_attrib.get('id')
-                        lang = representation_attrib.get('lang')
-                        url_el = representation.find(_add_ns('BaseURL'))
-                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
-                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
-                        f = {
-                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
-                            'manifest_url': mpd_url,
-                            'ext': mimetype2ext(mime_type),
-                            'width': int_or_none(representation_attrib.get('width')),
-                            'height': int_or_none(representation_attrib.get('height')),
-                            'tbr': float_or_none(bandwidth, 1000),
-                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
-                            'fps': int_or_none(representation_attrib.get('frameRate')),
-                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
-                            'format_note': 'DASH %s' % content_type,
-                            'filesize': filesize,
-                            'container': mimetype2ext(mime_type) + '_dash',
-                        }
-                        f.update(parse_codecs(representation_attrib.get('codecs')))
-                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
-
-                        def prepare_template(template_name, identifiers):
-                            tmpl = representation_ms_info[template_name]
-                            # First of all, % characters outside $...$ templates
-                            # must be escaped by doubling for proper processing
-                            # by the % operator string formatting used further (see
-                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
-                            t = ''
-                            in_template = False
-                            for c in tmpl:
-                                t += c
-                                if c == '$':
-                                    in_template = not in_template
-                                elif c == '%' and not in_template:
-                                    t += c
-                            # Next, $...$ templates are translated to their
-                            # %(...) counterparts to be used with the % operator
-                            t = t.replace('$RepresentationID$', representation_id)
-                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
-                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
-                            t = t.replace('$$', '$')
-                            return t
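The translation prepare_template performs, traced on one hypothetical template (representation_id assumed to be 'video_1'):

    tmpl = 'seg-$RepresentationID$-$Number%05d$.m4s'
    # stray % signs outside $...$ get doubled, then the placeholders are rewritten:
    #   $RepresentationID$  -> literal substitution
    #   $Number%05d$        -> %(Number)05d
    translated = 'seg-video_1-%(Number)05d.m4s'
    assert translated % {'Number': 42} == 'seg-video_1-00042.m4s'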
counterparts to be used with % operator - t = t.replace('$RepresentationID$', representation_id) - t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) - t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) - t = t.replace('$$', '$') - return t - - # @initialization is a regular template like @media one - # so it should be handled just the same way (see - # https://github.com/ytdl-org/youtube-dl/issues/11605) - if 'initialization' in representation_ms_info: - initialization_template = prepare_template( - 'initialization', - # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and - # $Time$ shall not be included for @initialization thus - # only $Bandwidth$ remains - ('Bandwidth', )) - representation_ms_info['initialization_url'] = initialization_template % { - 'Bandwidth': bandwidth, - } - - def location_key(location): - return 'url' if re.match(r'^https?://', location) else 'path' - - if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: - - media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) - media_location_key = location_key(media_template) - - # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ - # can't be used at the same time - if '%(Number' in media_template and 's' not in representation_ms_info: - segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: - segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - representation_ms_info['fragments'] = [{ - media_location_key: media_template % { - 'Number': segment_number, - 'Bandwidth': bandwidth, - }, - 'duration': segment_duration, - } for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] - else: - # $Number*$ or $Time$ in media template with S list available - # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg - # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - representation_ms_info['fragments'] = [] - segment_time = 0 - segment_d = None - segment_number = representation_ms_info['start_number'] - - def add_segment_url(): - segment_url = media_template % { - 'Time': segment_time, - 'Bandwidth': bandwidth, - 'Number': segment_number, - } - representation_ms_info['fragments'].append({ - media_location_key: segment_url, - 'duration': float_or_none(segment_d, representation_ms_info['timescale']), - }) - - for num, s in enumerate(representation_ms_info['s']): - segment_time = s.get('t') or segment_time - segment_d = s['d'] - add_segment_url() - segment_number += 1 - for r in range(s.get('r', 0)): - segment_time += segment_d - add_segment_url() - segment_number += 1 - segment_time += segment_d - elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI - # or any YouTube dashsegments video - fragments = [] - segment_index = 0 - timescale = representation_ms_info['timescale'] - for s in representation_ms_info['s']: - duration = float_or_none(s['d'], timescale) - for r in range(s.get('r', 0) + 1): - segment_uri = representation_ms_info['segment_urls'][segment_index] - fragments.append({ - location_key(segment_uri): segment_uri, -
'duration': duration, - }) - segment_index += 1 - representation_ms_info['fragments'] = fragments - elif 'segment_urls' in representation_ms_info: - # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 - # https://github.com/ytdl-org/youtube-dl/pull/14844 - fragments = [] - segment_duration = float_or_none( - representation_ms_info['segment_duration'], - representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None - for segment_url in representation_ms_info['segment_urls']: - fragment = { - location_key(segment_url): segment_url, - } - if segment_duration: - fragment['duration'] = segment_duration - fragments.append(fragment) - representation_ms_info['fragments'] = fragments - # If there is a fragments key available then we correctly recognized fragmented media. - # Otherwise we will assume unfragmented media with direct access. Technically, such an - # assumption is not necessarily correct since we may simply have no support for - # some forms of fragmented media renditions yet, but for now we'll use this fallback. - if 'fragments' in representation_ms_info: - f.update({ - # NB: mpd_url may be empty when MPD manifest is parsed from a string - 'url': mpd_url or base_url, - 'fragment_base_url': base_url, - 'fragments': [], - 'protocol': 'http_dash_segments', - }) - if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'] - if not f.get('url'): - f['url'] = initialization_url - f['fragments'].append({location_key(initialization_url): initialization_url}) - f['fragments'].extend(representation_ms_info['fragments']) - else: - # Assuming direct URL to unfragmented media. - f['url'] = base_url - formats.append(f) - else: - self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats - - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): - res = self._download_xml_handle( - ism_url, video_id, - note=note or 'Downloading ISM manifest', - errnote=errnote or 'Failed to download ISM manifest', - fatal=fatal, data=data, headers=headers, query=query) - if res is False: - return [] - ism_doc, urlh = res - if ism_doc is None: - return [] - - return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) - - def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): - """ - Parse formats from ISM manifest. - References: - 1. [MS-SSTR]: Smooth Streaming Protocol, - https://msdn.microsoft.com/en-us/library/ff469518.aspx - """ - if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: - return [] - - duration = int(ism_doc.attrib['Duration']) - timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 - - formats = [] - for stream in ism_doc.findall('StreamIndex'): - stream_type = stream.get('Type') - if stream_type not in ('video', 'audio'): - continue - url_pattern = stream.attrib['Url'] - stream_timescale = int_or_none(stream.get('TimeScale')) or timescale - stream_name = stream.get('Name') - for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) - # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL'): - self.report_warning('%s is not a supported codec' % fourcc) - continue - tbr = int(track.attrib['Bitrate']) // 1000 - # [1] does not mention Width and Height attributes.
However, - # they're often present while MaxWidth and MaxHeight are - # missing, so they should be used as fallbacks - width = int_or_none(track.get('MaxWidth') or track.get('Width')) - height = int_or_none(track.get('MaxHeight') or track.get('Height')) - sampling_rate = int_or_none(track.get('SamplingRate')) - - track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) - - fragments = [] - fragment_ctx = { - 'time': 0, - } - stream_fragments = stream.findall('c') - for stream_fragment_index, stream_fragment in enumerate(stream_fragments): - fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] - fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 - fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) - if not fragment_ctx['duration']: - try: - # derive duration from the start time of the next sibling <c> element - next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t']) - except IndexError: - next_fragment_time = duration - fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat - for _ in range(fragment_repeat): - fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), - 'duration': fragment_ctx['duration'] / stream_timescale, - }) - fragment_ctx['time'] += fragment_ctx['duration'] - - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - - formats.append({ - 'format_id': '-'.join(format_id), - 'url': ism_url, - 'manifest_url': ism_url, - 'ext': 'ismv' if stream_type == 'video' else 'isma', - 'width': width, - 'height': height, - 'tbr': tbr, - 'asr': sampling_rate, - 'vcodec': 'none' if stream_type == 'audio' else fourcc, - 'acodec': 'none' if stream_type == 'video' else fourcc, - 'protocol': 'ism', - 'fragments': fragments, - '_download_params': { - 'duration': duration, - 'timescale': stream_timescale, - 'width': width or 0, - 'height': height or 0, - 'fourcc': fourcc, - 'codec_private_data': track.get('CodecPrivateData'), - 'sampling_rate': sampling_rate, - 'channels': int_or_none(track.get('Channels', 2)), - 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), - 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), - }, - }) - return formats - - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): - def absolute_url(item_url): - return urljoin(base_url, item_url) - - def parse_content_type(content_type): - if not content_type: - return {} - ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) - if ctr: - mimetype, codecs = ctr.groups() - f = parse_codecs(codecs) - f['ext'] = mimetype2ext(mimetype) - return f - return {} - - def _media_formats(src, cur_media_type, type_info={}): - full_url = absolute_url(src) - ext = type_info.get('ext') or determine_ext(full_url) - if ext == 'm3u8': - is_plain_url = False - formats = self._extract_m3u8_formats( - full_url, video_id, ext='mp4', - entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference, fatal=False) - elif ext == 'mpd': - is_plain_url = False - formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id, fatal=False) - else: - is_plain_url = True - formats = [{ - 'url': full_url, - 'vcodec': 'none' if cur_media_type == 'audio' else None, - }] - return is_plain_url, formats - -
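# A minimal usage sketch of the parse_content_type helper above (the input
# string is assumed, mirroring a typical HTML5 <source type="..."> value):
#   parse_content_type('video/mp4; codecs="avc1.64001f, mp4a.40.2"')
#   would yield something like
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'ext': 'mp4'}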
entries = [] - # amp-video and amp-audio are very similar to their HTML5 counterparts - # so we will include them right here (see - # https://www.ampproject.org/docs/reference/components/amp-video) - # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ - _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' - media_tags = [(media_tag, media_tag_name, media_type, '') - for media_tag, media_tag_name, media_type - in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] - media_tags.extend(re.findall( - # We only allow video|audio followed by a whitespace or '>'. - # Allowing more characters may end up in a significant slowdown (see - # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)) - for media_tag, _, media_type, media_content in media_tags: - media_info = { - 'formats': [], - 'subtitles': {}, - } - media_attributes = extract_attributes(media_tag) - src = strip_or_none(media_attributes.get('src')) - if src: - _, formats = _media_formats(src, media_type) - media_info['formats'].extend(formats) - media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) - if media_content: - for source_tag in re.findall(r'<source[^>]+>', media_content): - s_attr = extract_attributes(source_tag) - # data-video-src and data-src are non-standard but seen - # several times in the wild - src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) - if not src: - continue - f = parse_content_type(s_attr.get('type')) - is_plain_url, formats = _media_formats(src, media_type, f) - if is_plain_url: - # width, height, res, label and title attributes are - # all non-standard but seen several times in the wild - labels = [ - s_attr.get(lbl) - for lbl in ('label', 'title') - if str_or_none(s_attr.get(lbl)) - ] - width = int_or_none(s_attr.get('width')) - height = (int_or_none(s_attr.get('height')) - or int_or_none(s_attr.get('res'))) - if not width or not height: - for lbl in labels: - resolution = parse_resolution(lbl) - if not resolution: - continue - width = width or resolution.get('width') - height = height or resolution.get('height') - for lbl in labels: - tbr = parse_bitrate(lbl) - if tbr: - break - else: - tbr = None - f.update({ - 'width': width, - 'height': height, - 'tbr': tbr, - 'format_id': s_attr.get('label') or s_attr.get('title'), - }) - f.update(formats[0]) - media_info['formats'].append(f) - else: - media_info['formats'].extend(formats) - for track_tag in re.findall(r'<track[^>]+>', media_content): - track_attributes = extract_attributes(track_tag) - kind = track_attributes.get('kind') - if not kind or kind in ('subtitles', 'captions'): - src = strip_or_none(track_attributes.get('src')) - if not src: - continue - lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') - media_info['subtitles'].setdefault(lang, []).append({ - 'url': absolute_url(src), - }) - for f in media_info['formats']: - f.setdefault('http_headers', {})['Referer'] = base_url - if media_info['formats'] or media_info['subtitles']: - entries.append(media_info) - return entries - - def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): - signed = 'hdnea=' in manifest_url - if not signed: - # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html - manifest_url = re.sub( -
r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?', - '', manifest_url).strip('?') - - formats = [] - - hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') - hds_host = hosts.get('hds') - if hds_host: - f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) - if 'hdcore=' not in f4m_url: - f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign - f4m_formats = self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False) - for entry in f4m_formats: - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.extend(f4m_formats) - - m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') - hls_host = hosts.get('hls') - if hls_host: - m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - - http_host = hosts.get('http') - if http_host and m3u8_formats and not signed: - REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' - qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') - qualities_length = len(qualities) - if len(m3u8_formats) in (qualities_length, qualities_length + 1): - i = 0 - for f in m3u8_formats: - if f['vcodec'] != 'none': - for protocol in ('http', 'https'): - http_f = f.copy() - del http_f['manifest_url'] - http_url = re.sub( - REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) - http_f.update({ - 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), - 'url': http_url, - 'protocol': protocol, - }) - formats.append(http_f) - i += 1 - - return formats - - def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query - url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - mobj = re.search( - r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) - url_base = mobj.group('url') - http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) - formats = [] - - def manifest_url(manifest): - m_url = '%s/%s' % (http_base_url, manifest) - if query: - m_url += '?%s' % query - return m_url - - if 'm3u8' not in skip_protocols: - formats.extend(self._extract_m3u8_formats( - manifest_url('playlist.m3u8'), video_id, 'mp4', - m3u8_entry_protocol, m3u8_id='hls', fatal=False)) - if 'f4m' not in skip_protocols: - formats.extend(self._extract_f4m_formats( - manifest_url('manifest.f4m'), - video_id, f4m_id='hds', fatal=False)) - if 'dash' not in skip_protocols: - formats.extend(self._extract_mpd_formats( - manifest_url('manifest.mpd'), - video_id, mpd_id='dash', fatal=False)) - if re.search(r'(?:/smil:|\.smil)', url_base): - if 'smil' not in skip_protocols: - rtmp_formats = self._extract_smil_formats( - manifest_url('jwplayer.smil'), - video_id, fatal=False) - for rtmp_format in rtmp_formats: - rtsp_format = rtmp_format.copy() - rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) - del rtsp_format['play_path'] - del rtsp_format['ext'] - rtsp_format.update({ - 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), - 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), - 'protocol': 'rtsp', - }) - formats.extend([rtmp_format, rtsp_format]) - else: - for protocol in ('rtmp', 'rtsp'): - if protocol not in skip_protocols: - formats.append({ - 
'url': '%s:%s' % (protocol, url_base), - 'format_id': protocol, - 'protocol': protocol, - }) - return formats - - def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): - mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', - webpage) - if mobj: - try: - jwplayer_data = self._parse_json(mobj.group('options'), - video_id=video_id, - transform_source=transform_source) - except ExtractorError: - pass - else: - if isinstance(jwplayer_data, dict): - return jwplayer_data - - def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) - - def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - - entries = [] - - # JWPlayer backward compatibility: single playlist item - # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] - - for video_data in jwplayer_data['playlist']: - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - this_video_id = video_id or video_data['mediaid'] - - formats = self._parse_jwplayer_formats( - video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, - mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) - - entry = { - 'id': this_video_id, - 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': clean_html(video_data.get('description')), - 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), - 'subtitles': subtitles, - } - # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 - if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): - entry.update({ - '_type': 'url_transparent', - 'url': formats[0]['url'], - }) - else: - self._sort_formats(formats) - entry['formats'] = formats - entries.append(entry) - if len(entries) == 1: - return entries[0] - else: - return self.playlist_result(entries) - - def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] - formats = [] - 
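# A typical (illustrative, assumed) entry in jwplayer_sources_data handled by
# the loop below; only 'file' is required, the remaining keys are optional hints:
#   {'file': '//cdn.example.com/video-720.mp4', 'type': 'video/mp4',
#    'label': '720p', 'width': 1280, 'height': 720, 'bitrate': 2000}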
for source in jwplayer_sources_data: - if not isinstance(source, dict): - continue - source_url = urljoin( - base_url, self._proto_relative_url(source.get('file'))) - if not source_url or source_url in urls: - continue - urls.append(source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, video_id, mpd_id=mpd_id, fatal=False)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - source_url, video_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ( - 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in a - # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'tbr': int_or_none(source.get('bitrate')), - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - return formats - - def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str - - def _int(self, v, name, fatal=False, **kwargs): - res = int_or_none(v, **kwargs) - if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) - if fatal: - raise ExtractorError(msg) - else: - self._downloader.report_warning(msg) - return res - - def _float(self, v, name, fatal=False, **kwargs): - res = float_or_none(v, **kwargs) - if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) - if fatal: - raise ExtractorError(msg) - else: - self._downloader.report_warning(msg) - return res - - def _set_cookie(self, domain, name, value, expire_time=None, port=None, - path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( - 0, name, value, port, port is not None, domain, True, - domain.startswith('.'), path, True, secure, expire_time, - discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) - - def _get_cookies(self, url): - """ Return a compat_cookies_SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies_SimpleCookie(req.get_header('Cookie')) - - def _apply_first_set_cookie_header(self, url_handle, cookie): - """ - Apply first Set-Cookie header instead of the last. Experimental.
- - Some sites (e.g. [1-3]) may serve two cookies under the same name - in the Set-Cookie header and expect the first (old) one to be set rather - than the second (new) one. However, per RFC 6265 the newer cookie is the - one that should be set into the cookie store, which is what actually - happens. We will work around this issue by resetting the cookie to - the first one manually. - 1. https://new.vk.com/ - 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 - 3. https://learning.oreilly.com/ - """ - for header, cookies in url_handle.headers.items(): - if header.lower() != 'set-cookie': - continue - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') - cookie_value = re.search( - r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) - if cookie_value: - value, domain = cookie_value.groups() - self._set_cookie(domain, cookie, value) - break - - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) - if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ - tests = [t] - else: - tests = getattr(self, '_TESTS', []) - for t in tests: - if not include_onlymatching and t.get('only_matching', False): - continue - t['name'] = type(self).__name__[:-len('IE')] - yield t - - def is_suitable(self, age_limit): - """ Test whether the extractor is generally suitable for the given - age limit (i.e. pornographic sites are not, all others usually are) """ - - any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): - if tc.get('playlist', []): - tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) - if not is_restricted: - return True - any_restricted = any_restricted or is_restricted - return not any_restricted - - def extract_subtitles(self, *args, **kwargs): - if (self._downloader.params.get('writesubtitles', False) - or self._downloader.params.get('listsubtitles')): - return self._get_subtitles(*args, **kwargs) - return {} - - def _get_subtitles(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - @staticmethod - def _merge_subtitle_items(subtitle_list1, subtitle_list2): - """ Merge subtitle items for one language. Items with duplicated URLs - will be dropped. """ - list1_urls = set([item['url'] for item in subtitle_list1]) - ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) - return ret - - @classmethod - def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): - """ Merge two subtitle dictionaries, language by language.
""" - ret = dict(subtitle_dict1) - for lang in subtitle_dict2: - ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - return ret - - def extract_automatic_captions(self, *args, **kwargs): - if (self._downloader.params.get('writeautomaticsub', False) - or self._downloader.params.get('listsubtitles')): - return self._get_automatic_captions(*args, **kwargs) - return {} - - def _get_automatic_captions(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - def mark_watched(self, *args, **kwargs): - if (self._downloader.params.get('mark_watched', False) - and (self._get_login_info()[0] is not None - or self._downloader.params.get('cookiefile') is not None)): - self._mark_watched(*args, **kwargs) - - def _mark_watched(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - def geo_verification_headers(self): - headers = {} - geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') - if geo_verification_proxy: - headers['Ytdl-request-proxy'] = geo_verification_proxy - return headers - - def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - - def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) - - -class SearchInfoExtractor(InfoExtractor): - """ - Base class for paged search queries extractors. - They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. - """ - - @classmethod - def _make_valid_url(cls): - return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY - - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - return self._get_n_results(query, self._MAX_RESULTS) - else: - n = int(prefix) - if n <= 0: - raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) - elif n > self._MAX_RESULTS: - self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) - n = self._MAX_RESULTS - return self._get_n_results(query, n) - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - raise NotImplementedError('This method must be implemented by subclasses') - - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py deleted file mode 100644 index 7e12499b1..000000000 --- a/youtube_dl/extractor/commonmistakes.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import unicode_literals - -import sys - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class CommonMistakesIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:url|URL)$ - ''' - - _TESTS = [{ - 'url': 'url', - 'only_matching': True, - }, { - 'url': 'URL', - 'only_matching': True, - }] - - def _real_extract(self, url): - msg = ( - 'You\'ve asked youtube-dl to download the URL "%s". ' - 'That doesn\'t make any sense. 
' - 'Simply remove the parameter in your command or configuration.' - ) % url - if not self._downloader.params.get('verbose'): - msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.' - raise ExtractorError(msg, expected=True) - - -class UnicodeBOMIE(InfoExtractor): - IE_DESC = False - _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' - - # Disable test for python 3.2 since BOM is broken in re in this version - # (see https://github.com/ytdl-org/youtube-dl/issues/9751) - _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ - 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', - 'only_matching': True, - }] - - def _real_extract(self, url): - real_url = self._match_id(url) - self.report_warning( - 'Your URL starts with a Byte Order Mark (BOM). ' - 'Removing the BOM and looking for "%s" ...' % real_url) - return self.url_result(real_url) diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py deleted file mode 100644 index d98331a4e..000000000 --- a/youtube_dl/extractor/commonprotocols.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) - - -class RtmpIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'(?i)rtmp[est]?://.+' - - _TESTS = [{ - 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', - 'only_matching': True, - }, { - 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._generic_id(url) - title = self._generic_title(url) - return { - 'id': video_id, - 'title': title, - 'formats': [{ - 'url': url, - 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, - }], - } - - -class MmsIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'(?i)mms://.+' - - _TEST = { - # Direct MMS link - 'url': 'mms://kentro.kaist.ac.kr/200907/MilesReid(0709).wmv', - 'info_dict': { - 'id': 'MilesReid(0709)', - 'ext': 'wmv', - 'title': 'MilesReid(0709)', - }, - 'params': { - 'skip_download': True, # rtsp downloads, requiring mplayer or mpv - }, - } - - def _real_extract(self, url): - video_id = self._generic_id(url) - title = self._generic_title(url) - - return { - 'id': video_id, - 'title': title, - 'url': url, - } diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py deleted file mode 100644 index d5e77af32..000000000 --- a/youtube_dl/extractor/condenast.py +++ /dev/null @@ -1,251 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urlparse, -) -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, - js_to_json, - mimetype2ext, - orderedSet, - parse_iso8601, - strip_or_none, - try_get, -) - - -class CondeNastIE(InfoExtractor): - """ - Condé Nast is a media group, some of its sites use a custom HTML5 player - that works the same in all of them. - """ - - # The keys are the supported sites and the values are the name to be shown - # to the user and in the extractor description. 
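# For illustration only: the _SITES mapping below drives both the domain
# alternation compiled into _VALID_URL and the generated IE_DESC, which would
# read roughly 'Condé Nast media group: Allure, Architectural Digest, ...'
# (full listing elided here).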
- _SITES = { - 'allure': 'Allure', - 'architecturaldigest': 'Architectural Digest', - 'arstechnica': 'Ars Technica', - 'bonappetit': 'Bon Appétit', - 'brides': 'Brides', - 'cnevids': 'Condé Nast', - 'cntraveler': 'Condé Nast Traveler', - 'details': 'Details', - 'epicurious': 'Epicurious', - 'glamour': 'Glamour', - 'golfdigest': 'Golf Digest', - 'gq': 'GQ', - 'newyorker': 'The New Yorker', - 'self': 'SELF', - 'teenvogue': 'Teen Vogue', - 'vanityfair': 'Vanity Fair', - 'vogue': 'Vogue', - 'wired': 'WIRED', - 'wmagazine': 'W Magazine', - } - - _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ - (?: - (?: - embed(?:js)?| - (?:script|inline)/video - )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| - (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) - )''' % '|'.join(_SITES.keys()) - IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - - EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) - - _TESTS = [{ - 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', - 'md5': '1921f713ed48aabd715691f774c451f7', - 'info_dict': { - 'id': '5171b343c2b4c00dd0c1ccb3', - 'ext': 'mp4', - 'title': '3D Printed Speakers Lit With LED', - 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', - 'uploader': 'wired', - 'upload_date': '20130314', - 'timestamp': 1363219200, - } - }, { - 'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series', - 'info_dict': { - 'id': '58d1865bfd2e6126e2000015', - 'ext': 'mp4', - 'title': 'The Only True Surprise? 
Trump’s an Idiot', - 'uploader': 'gq', - 'upload_date': '20170321', - 'timestamp': 1490126427, - 'description': 'How much grimmer would things be if these people were competent?', - }, - }, { - # JS embed - 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', - 'md5': 'f1a6f9cafb7083bab74a710f65d08999', - 'info_dict': { - 'id': '55f9cf8b61646d1acf00000c', - 'ext': 'mp4', - 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', - 'uploader': 'arstechnica', - 'upload_date': '20150916', - 'timestamp': 1442434920, - } - }, { - 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', - 'only_matching': True, - }, { - 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', - 'only_matching': True, - }] - - def _extract_series(self, url, webpage): - title = self._html_search_regex( - r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>', - webpage, 'series title') - url_object = compat_urllib_parse_urlparse(url) - base_url = '%s://%s' % (url_object.scheme, url_object.netloc) - m_paths = re.finditer( - r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage) - paths = orderedSet(m.group(1) for m in m_paths) - build_url = lambda path: compat_urlparse.urljoin(base_url, path) - entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] - return self.playlist_result(entries, playlist_title=title) - - def _extract_video_params(self, webpage, display_id): - query = self._parse_json( - self._search_regex( - r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', - default='{}'), - display_id, transform_source=js_to_json, fatal=False) - if query: - query['videoId'] = self._search_regex( - r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', - webpage, 'video id', default=None) - else: - params = extract_attributes(self._search_regex( - r'(<[^>]+data-js="video-player"[^>]+>)', - webpage, 'player params element')) - query.update({ - 'videoId': params['data-video'], - 'playerId': params['data-player'], - 'target': params['id'], - }) - return query - - def _extract_video(self, params): - video_id = params['videoId'] - - video_info = None - - # New API path - query = params.copy() - query['embedType'] = 'inline' - info_page = self._download_json( - 'http://player.cnevids.com/embed-api.json', video_id, - 'Downloading embed info', fatal=False, query=query) - - # Old fallbacks - if not info_page: - if params.get('playerId'): - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', video_id, - 'Downloading video info', fatal=False, query=params) - if info_page: - video_info = info_page.get('video') - if not video_info: - info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=params) - if not video_info: - info_page = self._download_webpage( - 'https://player.cnevids.com/inline/video/%s.js' % video_id, - video_id, 'Downloading inline info', query={ - 'target': params.get('target', 'embedplayer') - }) - - if not video_info: - video_info = self._parse_json( - self._search_regex( - r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), - video_id, transform_source=js_to_json)['video'] - - title = video_info['title'] - - formats = [] - for fdata in video_info['sources']: - src = fdata.get('src') - if not src: - continue - ext = mimetype2ext(fdata.get('type')) or determine_ext(src) - if ext == 'm3u8': - 
formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - quality = fdata.get('quality') - formats.append({ - 'format_id': ext + ('-%s' % quality if quality else ''), - 'url': src, - 'ext': ext, - 'quality': 1 if quality == 'high' else 0, - }) - self._sort_formats(formats) - - subtitles = {} - for t, caption in video_info.get('captions', {}).items(): - caption_url = caption.get('src') - if not (t in ('vtt', 'srt', 'tml') and caption_url): - continue - subtitles.setdefault('en', []).append({'url': caption_url}) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'thumbnail': video_info.get('poster_frame'), - 'uploader': video_info.get('brand'), - 'duration': int_or_none(video_info.get('duration')), - 'tags': video_info.get('tags'), - 'series': video_info.get('series_title'), - 'season': video_info.get('season_title'), - 'timestamp': parse_iso8601(video_info.get('premiere_date')), - 'categories': video_info.get('categories'), - 'subtitles': subtitles, - } - - def _real_extract(self, url): - video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() - - if video_id: - return self._extract_video({ - 'videoId': video_id, - 'playerId': player_id, - 'target': target, - }) - - webpage = self._download_webpage(url, display_id) - - if url_type == 'series': - return self._extract_series(url, webpage) - else: - video = try_get(self._parse_json(self._search_regex( - r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, - 'preload state', '{}'), display_id), - lambda x: x['transformed']['video']) - if video: - params = {'videoId': video['id']} - info = {'description': strip_or_none(video.get('description'))} - else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) - info.update(self._extract_video(params)) - return info diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py deleted file mode 100644 index e11aadf14..000000000 --- a/youtube_dl/extractor/corus.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .theplatform import ThePlatformFeedIE -from ..utils import ( - dict_get, - ExtractorError, - float_or_none, - int_or_none, -) - - -class CorusIE(ThePlatformFeedIE): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P<domain> - (?: - globaltv| - etcanada| - seriesplus| - wnetwork| - ytv - )\.com| - (?: - hgtv| - foodnetwork| - slice| - history| - showcase| - bigbrothercanada| - abcspark| - disney(?:channel|lachaine) - )\.ca - ) - /(?:[^/]+/)* - (?: - video\.html\?.*?\bv=| - videos?/(?:[^/]+/)*(?:[a-z0-9-]+-)? 
- ) - (?P<id> - [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}| - (?:[A-Z]{4})?\d{12,20} - ) - ''' - _TESTS = [{ - 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', - 'info_dict': { - 'id': '870923331648', - 'ext': 'mp4', - 'title': 'Movie Night Popcorn with Bryan', - 'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.', - 'upload_date': '20170206', - 'timestamp': 1486392197, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON'], - }, { - 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', - 'only_matching': True, - }, { - 'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/', - 'only_matching': True, - }, { - 'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video', - 'only_matching': True, - }, { - 'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video', - 'only_matching': True, - }, { - 'url': 'http://www.bigbrothercanada.ca/video/1457812035894/', - 'only_matching': True - }, { - 'url': 'https://www.bigbrothercanada.ca/video/big-brother-canada-704/1457812035894/', - 'only_matching': True - }, { - 'url': 'https://www.seriesplus.com/emissions/dre-mary-mort-sur-ordonnance/videos/deux-coeurs-battant/SERP0055626330000200/', - 'only_matching': True - }, { - 'url': 'https://www.disneychannel.ca/shows/gabby-duran-the-unsittables/video/crybaby-duran-clip/2f557eec-0588-11ea-ae2b-e2c6776b770e/', - 'only_matching': True - }] - _GEO_BYPASS = False - _SITE_MAP = { - 'globaltv': 'series', - 'etcanada': 'series', - 'foodnetwork': 'food', - 'bigbrothercanada': 'series', - 'disneychannel': 'disneyen', - 'disneylachaine': 'disneyfr', - } - - def _real_extract(self, url): - domain, video_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[0] - path = self._SITE_MAP.get(site, site) - if path != 'series': - path = 'migration/' + path - video = self._download_json( - 'https://globalcontent.corusappservices.com/templates/%s/playlist/' % path, - video_id, query={'byId': video_id}, - headers={'Accept': 'application/json'})[0] - title = video['title'] - - formats = [] - for source in video.get('sources', []): - smil_url = source.get('file') - if not smil_url: - continue - source_type = source.get('type') - note = 'Downloading%s smil file' % (' ' + source_type if source_type else '') - resp = self._download_webpage( - smil_url, video_id, note, fatal=False, - headers=self.geo_verification_headers()) - if not resp: - continue - error = self._parse_json(resp, video_id, fatal=False) - if error: - if error.get('exception') == 'GeoLocationBlocked': - self.raise_geo_restricted(countries=['CA']) - raise ExtractorError(error['description']) - smil = self._parse_xml(resp, video_id, fatal=False) - if smil is None: - continue - namespace = self._parse_smil_namespace(smil) - formats.extend(self._parse_smil_formats( - smil, smil_url, video_id, namespace)) - if not formats and video.get('drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - self._sort_formats(formats) - - subtitles = {} - for track in video.get('tracks', []): - track_url = track.get('file') - if not track_url: - continue - lang = 'fr' if site in ('disneylachaine', 'seriesplus') else 'en' - subtitles.setdefault(lang, []).append({'url': 
track_url}) - - metadata = video.get('metadata') or {} - get_number = lambda x: int_or_none(video.get('pl1$' + x) or metadata.get(x + 'Number')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': dict_get(video, ('defaultThumbnailUrl', 'thumbnail', 'image')), - 'description': video.get('description'), - 'timestamp': int_or_none(video.get('availableDate'), 1000), - 'subtitles': subtitles, - 'duration': float_or_none(metadata.get('duration')), - 'series': dict_get(video, ('show', 'pl1$show')), - 'season_number': get_number('season'), - 'episode_number': get_number('episode'), - } diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py deleted file mode 100644 index 6ea03e65c..000000000 --- a/youtube_dl/extractor/coub.py +++ /dev/null @@ -1,140 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, - qualities, -) - - -class CoubIE(InfoExtractor): - _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)' - - _TESTS = [{ - 'url': 'http://coub.com/view/5u5n1', - 'info_dict': { - 'id': '5u5n1', - 'ext': 'mp4', - 'title': 'The Matrix Moonwalk', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 4.6, - 'timestamp': 1428527772, - 'upload_date': '20150408', - 'uploader': 'Artyom Loskutnikov', - 'uploader_id': 'artyom.loskutnikov', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'age_limit': 0, - }, - }, { - 'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4', - 'only_matching': True, - }, { - 'url': 'coub:5u5n1', - 'only_matching': True, - }, { - # longer video id - 'url': 'http://coub.com/view/237d5l5h', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - coub = self._download_json( - 'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id) - - if coub.get('error'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, coub['error']), expected=True) - - title = coub['title'] - - file_versions = coub['file_versions'] - - QUALITIES = ('low', 'med', 'high') - - MOBILE = 'mobile' - IPHONE = 'iphone' - HTML5 = 'html5' - - SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5) - - quality_key = qualities(QUALITIES) - preference_key = qualities(SOURCE_PREFERENCE) - - formats = [] - - for kind, items in file_versions.get(HTML5, {}).items(): - if kind not in ('video', 'audio'): - continue - if not isinstance(items, dict): - continue - for quality, item in items.items(): - if not isinstance(item, dict): - continue - item_url = item.get('url') - if not item_url: - continue - formats.append({ - 'url': item_url, - 'format_id': '%s-%s-%s' % (HTML5, kind, quality), - 'filesize': int_or_none(item.get('size')), - 'vcodec': 'none' if kind == 'audio' else None, - 'quality': quality_key(quality), - 'preference': preference_key(HTML5), - }) - - iphone_url = file_versions.get(IPHONE, {}).get('url') - if iphone_url: - formats.append({ - 'url': iphone_url, - 'format_id': IPHONE, - 'preference': preference_key(IPHONE), - }) - - mobile_url = file_versions.get(MOBILE, {}).get('audio_url') - if mobile_url: - formats.append({ - 'url': mobile_url, - 'format_id': '%s-audio' % MOBILE, - 'preference': preference_key(MOBILE), - }) - - self._sort_formats(formats) - - thumbnail = coub.get('picture') - duration = float_or_none(coub.get('duration')) - timestamp = 
parse_iso8601(coub.get('published_at') or coub.get('created_at')) - uploader = coub.get('channel', {}).get('title') - uploader_id = coub.get('channel', {}).get('permalink') - - view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) - like_count = int_or_none(coub.get('likes_count')) - repost_count = int_or_none(coub.get('recoubs_count')) - - age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) - if age_restricted is not None: - age_limit = 18 if age_restricted is True else 0 - else: - age_limit = None - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'like_count': like_count, - 'repost_count': repost_count, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py deleted file mode 100644 index 49bf3a4f9..000000000 --- a/youtube_dl/extractor/crackle.py +++ /dev/null @@ -1,200 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals, division - -import hashlib -import hmac -import re -import time - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - parse_age_limit, - parse_duration, - url_or_none, - ExtractorError -) - - -class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' - _TESTS = [{ - # geo restricted to CA - 'url': 'https://www.crackle.com/andromeda/2502343', - 'info_dict': { - 'id': '2502343', - 'ext': 'mp4', - 'title': 'Under The Night', - 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', - 'duration': 2583, - 'view_count': int, - 'average_rating': 0, - 'age_limit': 14, - 'genre': 'Action, Sci-Fi', - 'creator': 'Allan Kroeker', - 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', - 'release_year': 2000, - 'series': 'Andromeda', - 'episode': 'Under The Night', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://www.sonycrackle.com/andromeda/2502343', - 'only_matching': True, - }] - - _MEDIA_FILE_SLOTS = { - '360p.mp4': { - 'width': 640, - 'height': 360, - }, - '480p.mp4': { - 'width': 768, - 'height': 432, - }, - '480p_1mbps.mp4': { - 'width': 852, - 'height': 480, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - country_code = self._downloader.params.get('geo_bypass_country', None) - countries = [country_code] if country_code else ( - 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI') - - last_e = None - - for country in countries: - try: - # Authorization generation algorithm is reverse engineered from: - # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js - media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country) - timestamp = time.strftime('%Y%m%d%H%M', time.gmtime()) - h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper() - media = self._download_json( - media_detail_url, video_id, 'Downloading media JSON as %s' % country, - 'Unable to download media JSON', headers={ - 'Accept': 'application/json', - 'Authorization': '|'.join([h, timestamp, '117', '1']), - }) - except 
ExtractorError as e: - # 401 means geo restriction, trying next country - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - last_e = e - continue - raise - - media_urls = media.get('MediaURLs') - if not media_urls or not isinstance(media_urls, list): - continue - - title = media['Title'] - - formats = [] - for e in media['MediaURLs']: - if e.get('UseDRM') is True: - continue - format_url = url_or_none(e.get('Path')) - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif format_url.endswith('.ism/Manifest'): - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - else: - mfs_path = e.get('Type') - mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path) - if not mfs_info: - continue - formats.append({ - 'url': format_url, - 'format_id': 'http-' + mfs_path.split('.')[0], - 'width': mfs_info['width'], - 'height': mfs_info['height'], - }) - self._sort_formats(formats) - - description = media.get('Description') - duration = int_or_none(media.get( - 'DurationInSeconds')) or parse_duration(media.get('Duration')) - view_count = int_or_none(media.get('CountViews')) - average_rating = float_or_none(media.get('UserRating')) - age_limit = parse_age_limit(media.get('Rating')) - genre = media.get('Genre') - release_year = int_or_none(media.get('ReleaseYear')) - creator = media.get('Directors') - artist = media.get('Cast') - - if media.get('MediaTypeDisplayValue') == 'Full Episode': - series = media.get('ShowName') - episode = title - season_number = int_or_none(media.get('Season')) - episode_number = int_or_none(media.get('Episode')) - else: - series = episode = season_number = episode_number = None - - subtitles = {} - cc_files = media.get('ClosedCaptionFiles') - if isinstance(cc_files, list): - for cc_file in cc_files: - if not isinstance(cc_file, dict): - continue - cc_url = url_or_none(cc_file.get('Path')) - if not cc_url: - continue - lang = cc_file.get('Locale') or 'en' - subtitles.setdefault(lang, []).append({'url': cc_url}) - - thumbnails = [] - images = media.get('Images') - if isinstance(images, list): - for image_key, image_url in images.items(): - mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key) - if not mobj: - continue - thumbnails.append({ - 'url': image_url, - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'average_rating': average_rating, - 'age_limit': age_limit, - 'genre': genre, - 'creator': creator, - 'artist': artist, - 'release_year': release_year, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'formats': formats, - } - - raise last_e diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py deleted file mode 100644 index bc2d1fa8b..000000000 --- a/youtube_dl/extractor/crunchyroll.py +++ /dev/null @@ -1,686 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import json -import zlib - -from hashlib import sha1 -from math import pow, sqrt, floor -from .common import InfoExtractor -from .vrv import VRVIE -from ..compat 
import ( - compat_b64decode, - compat_etree_Element, - compat_etree_fromstring, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - bytes_to_intlist, - extract_attributes, - float_or_none, - intlist_to_bytes, - int_or_none, - lowercase_escape, - merge_dicts, - remove_end, - sanitized_Request, - urlencode_postdata, - xpath_text, -) -from ..aes import ( - aes_cbc_decrypt, -) - - -class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/login' - _LOGIN_FORM = 'login_form' - _NETRC_MACHINE = 'crunchyroll' - - def _call_rpc_api(self, method, video_id, note=None, data=None): - data = data or {} - data['req'] = 'RpcApi' + method - data = compat_urllib_parse_urlencode(data).encode('utf-8') - return self._download_xml( - 'https://www.crunchyroll.com/xml/', - video_id, note, fatal=False, data=data, headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - def is_logged(webpage): - return 'href="/logout"' in webpage - - # Already logged in - if is_logged(login_page): - return - - login_form_str = self._search_regex( - r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, - login_page, 'login form', group='form') - - post_url = extract_attributes(login_form_str).get('action') - if not post_url: - post_url = self._LOGIN_URL - elif not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) - - login_form.update({ - 'login_form[name]': username, - 'login_form[password]': password, - }) - - response = self._download_webpage( - post_url, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - # Successful login - if is_logged(response): - return - - error = self._html_search_regex( - '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - raise ExtractorError('Unable to log in') - - def _real_initialize(self): - self._login() - - @staticmethod - def _add_skip_wall(url): - parsed_url = compat_urlparse.urlparse(url) - qs = compat_urlparse.parse_qs(parsed_url.query) - # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: - # > This content may be inappropriate for some people. - # > Are you sure you want to continue? - # since it's not disabled by default in crunchyroll account's settings. - # See https://github.com/ytdl-org/youtube-dl/issues/7202. 
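# Illustrative effect of the query rewrite below (example URL assumed):
#   http://www.crunchyroll.com/some-show/episode-1-123456
#   -> http://www.crunchyroll.com/some-show/episode-1-123456?skip_wall=1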
- qs['skip_wall'] = ['1'] - return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' - _TESTS = [{ - 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', - 'info_dict': { - 'id': '645513', - 'ext': 'mp4', - 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', - 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Yomiuri Telecasting Corporation (YTV)', - 'upload_date': '20131013', - 'url': 're:(?!.*&)', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', - 'info_dict': { - 'id': '589804', - 'ext': 'flv', - 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Danny Choo Network', - 'upload_date': '20120213', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', - 'info_dict': { - 'id': '702409', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Re:Zero Partners', - 'timestamp': 1462098900, - 'upload_date': '20160501', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', - 'info_dict': { - 'id': '727589', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Kadokawa Pictures Inc.', - 'timestamp': 1484130900, - 'upload_date': '20170111', - 'series': compat_str, - 'season': "KONOSUBA -God's blessing on this wonderful world! 
2", - 'season_number': 2, - 'episode': 'Give Me Deliverance From This Judicial Injustice!', - 'episode_number': 1, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', - 'only_matching': True, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', - 'only_matching': True, - }, { - # A description with double quotes - 'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080', - 'info_dict': { - 'id': '535080', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'Marvelous AQL Inc.', - 'timestamp': 1255512600, - 'upload_date': '20091014', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - # make sure we can extract an uploader name that's not a link - 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', - 'info_dict': { - 'id': '606899', - 'ext': 'mp4', - 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', - 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', - 'uploader': 'Geneon Entertainment', - 'upload_date': '20120717', - }, - 'params': { - # just test metadata extraction - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - # A video with a vastly different season name compared to the series name - 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', - 'info_dict': { - 'id': '590532', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'TV TOKYO', - 'timestamp': 1330956000, - 'upload_date': '20120305', - 'series': 'Nyarko-san: Another Crawling Chaos', - 'season': 'Haiyoru! Nyaruani (ONA)', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/media-723735', - 'only_matching': True, - }, { - 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', - 'only_matching': True, - }] - - _FORMAT_IDS = { - '360': ('60', '106'), - '480': ('61', '106'), - '720': ('62', '106'), - '1080': ('80', '108'), - } - - def _download_webpage(self, url_or_request, *args, **kwargs): - request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) - else sanitized_Request(url_or_request)) - # Accept-Language must be set explicitly to accept any language to avoid issues - # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. - # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction - # should be imposed or not (from what I can see it just takes the first language - # ignoring the priority and requires it to correspond the IP). By the way this causes - # Crunchyroll to not work in georestriction cases in some browsers that don't place - # the locale lang first in header. However allowing any language seems to workaround the issue. 
- request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) - - def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(compat_b64decode(data)) - iv = bytes_to_intlist(compat_b64decode(iv)) - id = int(id) - - def obfuscate_key_aux(count, modulo, start): - output = list(start) - for _ in range(count): - output.append(output[-1] + output[-2]) - # cut off start values - output = output[2:] - output = list(map(lambda x: x % modulo + 33, output)) - return output - - def obfuscate_key(key): - num1 = int(floor(pow(2, 25) * sqrt(6.9))) - num2 = (num1 ^ key) << 5 - num3 = key ^ num1 - num4 = num3 ^ (num3 >> 3) ^ num2 - prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) - shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) - # Extend 160 Bit hash to 256 Bit - return shaHash + [0] * 12 - - key = obfuscate_key(id) - - decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) - return zlib.decompress(decrypted_data) - - def _convert_subtitles_to_srt(self, sub_root): - output = '' - - for i, event in enumerate(sub_root.findall('./events/event'), 1): - start = event.attrib['start'].replace('.', ',') - end = event.attrib['end'].replace('.', ',') - text = event.attrib['text'].replace('\\N', '\n') - output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) - return output - - def _convert_subtitles_to_ass(self, sub_root): - output = '' - - def ass_bool(strvalue): - assvalue = '0' - if strvalue == '1': - assvalue = '-1' - return assvalue - - output = '[Script Info]\n' - output += 'Title: %s\n' % sub_root.attrib['title'] - output += 'ScriptType: v4.00+\n' - output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] - output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] - output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ -[V4+ Styles] -Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding -""" - for style in sub_root.findall('./styles/style'): - output += 'Style: ' + style.attrib['name'] - output += ',' + style.attrib['font_name'] - output += ',' + style.attrib['font_size'] - output += ',' + style.attrib['primary_colour'] - output += ',' + style.attrib['secondary_colour'] - output += ',' + style.attrib['outline_colour'] - output += ',' + style.attrib['back_colour'] - output += ',' + ass_bool(style.attrib['bold']) - output += ',' + ass_bool(style.attrib['italic']) - output += ',' + ass_bool(style.attrib['underline']) - output += ',' + ass_bool(style.attrib['strikeout']) - output += ',' + style.attrib['scale_x'] - output += ',' + style.attrib['scale_y'] - output += ',' + style.attrib['spacing'] - output += ',' + style.attrib['angle'] - output += ',' + style.attrib['border_style'] - output += ',' + style.attrib['outline'] - output += ',' + style.attrib['shadow'] - output += ',' + style.attrib['alignment'] - output += ',' + style.attrib['margin_l'] - output += ',' + style.attrib['margin_r'] - output += ',' + style.attrib['margin_v'] - output += ',' + style.attrib['encoding'] - output += '\n' - - output += """ -[Events] -Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text -""" - for event in sub_root.findall('./events/event'): - output += 'Dialogue: 0' - output += ',' + event.attrib['start'] - output += ',' + event.attrib['end'] - output += ',' + 
event.attrib['style'] - output += ',' + event.attrib['name'] - output += ',' + event.attrib['margin_l'] - output += ',' + event.attrib['margin_r'] - output += ',' + event.attrib['margin_v'] - output += ',' + event.attrib['effect'] - output += ',' + event.attrib['text'] - output += '\n' - - return output - - def _extract_subtitles(self, subtitle): - sub_root = compat_etree_fromstring(subtitle) - return [{ - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }] - - def _get_subtitles(self, video_id, webpage): - subtitles = {} - for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): - sub_doc = self._call_rpc_api( - 'Subtitle_GetXml', video_id, - 'Downloading subtitles for ' + sub_name, data={ - 'subtitle_script_id': sub_id, - }) - if not isinstance(sub_doc, compat_etree_Element): - continue - sid = sub_doc.get('id') - iv = xpath_text(sub_doc, 'iv', 'subtitle iv') - data = xpath_text(sub_doc, 'data', 'subtitle data') - if not sid or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8') - lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - - if mobj.group('prefix') == 'm': - mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') - webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url') - else: - webpage_url = 'http://www.' + mobj.group('url') - - webpage = self._download_webpage( - self._add_skip_wall(webpage_url), video_id, - headers=self.geo_verification_headers()) - note_m = self._html_search_regex( - r'<div class="showmedia-trailer-notice">(.+?)</div>', - webpage, 'trailer-notice', default='') - if note_m: - raise ExtractorError(note_m) - - mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage) - if mobj: - msg = json.loads(mobj.group('msg')) - if msg.get('type') == 'error': - raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) - - if 'To view this, please log in to verify you are 18 or older.' in webpage: - self.raise_login_required() - - media = self._parse_json(self._search_regex( - r'vilos\.config\.media\s*=\s*({.+?});', - webpage, 'vilos media', default='{}'), video_id) - media_metadata = media.get('metadata') or {} - - language = self._search_regex( - r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1', - webpage, 'language', default=None, group='lang') - - video_title = self._html_search_regex( - (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>', - r'<title>(.+?),\s+-\s+.+? 
Crunchyroll'), - webpage, 'video_title', default=None) - if not video_title: - video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) - video_title = re.sub(r' {2,}', ' ', video_title) - video_description = (self._parse_json(self._html_search_regex( - r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id) or media_metadata).get('description') - if video_description: - video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) - video_uploader = self._html_search_regex( - # try looking for both an uploader that's a link and one that's not - [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], - webpage, 'video_uploader', default=False) - - formats = [] - for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') - vrv_formats = self._extract_vrv_formats( - stream.get('url'), video_id, stream.get('format'), - audio_lang, hardsub_lang) - for f in vrv_formats: - if not hardsub_lang: - f['preference'] = 1 - language_preference = 0 - if audio_lang == language: - language_preference += 1 - if hardsub_lang == language: - language_preference += 1 - if language_preference: - f['language_preference'] = language_preference - formats.extend(vrv_formats) - if not formats: - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - if not available_fmts: - available_fmts = self._FORMAT_IDS.keys() - video_encode_ids = [] - - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if isinstance(streamdata, compat_etree_Element): - stream_info = streamdata.find('./{default}preload/stream_info') - if stream_info is not None: - stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if isinstance(stream_info, compat_etree_Element): - stream_infos.append(stream_info) - for stream_info in stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) - - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': 
int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) - continue - - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) - self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps')) - - metadata = self._call_rpc_api( - 'VideoPlayer_GetMediaMetadata', video_id, - note='Downloading media info', data={ - 'media_id': video_id, - }) - - subtitles = {} - for subtitle in media.get('subtitles', []): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ - 'url': subtitle_url, - 'ext': subtitle.get('format', 'ass'), - }) - if not subtitles: - subtitles = self.extract_subtitles(video_id, webpage) - - # webpage provide more accurate data than series_title from XML - series = self._html_search_regex( - r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', - webpage, 'series', fatal=False) - - season = episode = episode_number = duration = thumbnail = None - - if isinstance(metadata, compat_etree_Element): - season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number')) - duration = float_or_none(media_metadata.get('duration'), 1000) - thumbnail = xpath_text(metadata, 'episode_image_url') - - if not episode: - episode = media_metadata.get('title') - if not episode_number: - episode_number = int_or_none(media_metadata.get('episode_number')) - if not thumbnail: - thumbnail = media_metadata.get('thumbnail', {}).get('url') - - season_number = int_or_none(self._search_regex( - r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', - webpage, 'season number', default=None)) - - info = self._search_json_ld(webpage, video_id, default={}) - - return merge_dicts({ - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'duration': duration, - 'thumbnail': thumbnail, - 'uploader': video_uploader, - 'series': series, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'subtitles': subtitles, - 'formats': formats, - }, info) - - -class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): - IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' - - _TESTS = [{ - 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'info_dict': { - 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' - }, - 'playlist_count': 13, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 
'http://www.crunchyroll.com/cosplay-complex-ova', - 'info_dict': { - 'id': 'cosplay-complex-ova', - 'title': 'Cosplay Complex OVA' - }, - 'playlist_count': 3, - 'skip': 'Georestricted', - }, { - # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 - 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - self._add_skip_wall(url), show_id, - headers=self.geo_verification_headers()) - title = self._html_search_meta('name', webpage, default=None) - - episode_paths = re.findall( - r'(?s)<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"', - webpage) - entries = [ - self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll', ep_id) - for ep_id, ep in episode_paths - ] - entries.reverse() - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'entries': entries, - } diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py deleted file mode 100644 index bcdf27323..000000000 --- a/youtube_dl/extractor/cultureunplugged.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import re -import time - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - HEADRequest, -) - - -class CultureUnpluggedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?' - _TESTS = [{ - 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West', - 'md5': 'ac6c093b089f7d05e79934dcb3d228fc', - 'info_dict': { - 'id': '53662', - 'display_id': 'The-Next--Best-West', - 'ext': 'mp4', - 'title': 'The Next, Best West', - 'description': 'md5:0423cd00833dea1519cf014e9d0903b1', - 'thumbnail': r're:^https?://.*\.jpg$', - 'creator': 'Coldstream Creative', - 'duration': 2203, - 'view_count': int, - } - }, { - 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request - self._request_webpage(HEADRequest( - 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) - movie_data = self._download_json( - 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) - - video_url = movie_data['url'] - title = movie_data['title'] - - description = movie_data.get('synopsis') - creator = movie_data.get('producer') - duration = int_or_none(movie_data.get('duration')) - view_count = int_or_none(movie_data.get('views')) - - thumbnails = [{ - 'url': movie_data['%s_thumb' % size], - 'id': size, - 'preference': preference, - } for preference, size in enumerate(( - 'small', 'large')) if movie_data.get('%s_thumb' % size)] - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'creator': creator, - 'duration': duration, - 'view_count': view_count, - 'thumbnails': thumbnails, - } diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py deleted file mode 100644 index 48ff30432..000000000 --- a/youtube_dl/extractor/curiositystream.py 
+++ /dev/null @@ -1,177 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - urlencode_postdata, - compat_str, - ExtractorError, -) - - -class CuriosityStreamBaseIE(InfoExtractor): - _NETRC_MACHINE = 'curiositystream' - _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' - - def _handle_errors(self, result): - error = result.get('error', {}).get('message') - if error: - if isinstance(error, dict): - error = ', '.join(error.values()) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - - def _call_api(self, path, video_id, query=None): - headers = {} - if self._auth_token: - headers['X-Auth-Token'] = self._auth_token - result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers, query=query) - self._handle_errors(result) - return result['data'] - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ - 'email': email, - 'password': password, - })) - self._handle_errors(result) - self._auth_token = result['message']['auth_token'] - - -class CuriosityStreamIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' - _TEST = { - 'url': 'https://app.curiositystream.com/video/2', - 'info_dict': { - 'id': '2', - 'ext': 'mp4', - 'title': 'How Did You Develop The Internet?', - 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - }, - 'params': { - 'format': 'bestvideo', - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - formats = [] - for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ - 'encodingsNew': 'true', - 'encodingsFormat': encoding_format, - }) - for encoding in media.get('encodings', []): - playlist_url = encoding.get('master_playlist_url') - if encoding_format == 'm3u8': - # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol - formats.extend(self._extract_m3u8_formats( - playlist_url, video_id, 'mp4', - m3u8_id='hls', fatal=False)) - elif encoding_format == 'mpd': - formats.extend(self._extract_mpd_formats( - playlist_url, video_id, mpd_id='dash', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: - continue - fmt = f.copy() - rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) - self._sort_formats(formats) - - title = media['title'] 
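#   Worked example (hypothetical URL) of the RTMP split performed by the
#   regex a few lines above:
#     'rtmp://cdn.example.com/ondemand/mp4:clip.mp4'
#   yields url='rtmp://cdn.example.com/ondemand', app='ondemand' and
#   playpath='mp4:clip.mp4', which become the 'url', 'app' and 'play_path'
#   fields handed to the RTMP downloader.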
- - subtitles = {} - for closed_caption in media.get('closed_captions', []): - sub_url = closed_caption.get('file') - if not sub_url: - continue - lang = closed_caption.get('code') or closed_caption.get('language') or 'en' - subtitles.setdefault(lang, []).append({ - 'url': sub_url, - }) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': media.get('description'), - 'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'), - 'duration': int_or_none(media.get('duration')), - 'tags': media.get('tags'), - 'subtitles': subtitles, - } - - -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://app.curiositystream.com/collection/2', - 'info_dict': { - 'id': '2', - 'title': 'Curious Minds: The Internet', - 'description': 'How is the internet shaping our lives in the 21st Century?', - }, - 'playlist_mincount': 16, - }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', - 'only_matching': True, - }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api( - 'collections/' + collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - entries.append(self.url_result( - 'https://curiositystream.com/video/' + media_id, - CuriosityStreamIE.ie_key(), media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py deleted file mode 100644 index b8529050c..000000000 --- a/youtube_dl/extractor/dailymotion.py +++ /dev/null @@ -1,393 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import functools -import json -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - age_restricted, - clean_html, - ExtractorError, - int_or_none, - OnDemandPagedList, - try_get, - unescapeHTML, - urlencode_postdata, -) - - -class DailymotionBaseInfoExtractor(InfoExtractor): - _FAMILY_FILTER = None - _HEADERS = { - 'Content-Type': 'application/json', - 'Origin': 'https://www.dailymotion.com', - } - _NETRC_MACHINE = 'dailymotion' - - def _get_dailymotion_cookies(self): - return self._get_cookies('https://www.dailymotion.com/') - - @staticmethod - def _get_cookie_value(cookies, name): - cookie = cookies.get(name) - if cookie: - return cookie.value - - def _set_dailymotion_cookie(self, name, value): - self._set_cookie('www.dailymotion.com', name, value) - - def _real_initialize(self): - cookies = self._get_dailymotion_cookies() - ff = self._get_cookie_value(cookies, 'ff') - self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self._downloader.params.get('age_limit')) - self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off') - - def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): - if not self._HEADERS.get('Authorization'): - cookies = self._get_dailymotion_cookies() - token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') - if not token: - data = { - 'client_id': 'f1a362d288c1b98099c7', - 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', - } - username, 
password = self._get_login_info() - if username: - data.update({ - 'grant_type': 'password', - 'password': password, - 'username': username, - }) - else: - data['grant_type'] = 'client_credentials' - try: - token = self._download_json( - 'https://graphql.api.dailymotion.com/oauth/token', - None, 'Downloading Access Token', - data=urlencode_postdata(data))['access_token'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError(self._parse_json( - e.cause.read().decode(), xid)['error_description'], expected=True) - raise - self._set_dailymotion_cookie('access_token' if username else 'client_token', token) - self._HEADERS['Authorization'] = 'Bearer ' + token - - resp = self._download_json( - 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({ - 'query': '''{ - %s(xid: "%s"%s) { - %s - } -}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields), - }).encode(), headers=self._HEADERS) - obj = resp['data'][object_type] - if not obj: - raise ExtractorError(resp['errors'][0]['message'], expected=True) - return obj - - -class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'''(?ix) - https?:// - (?: - (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| - (?:www\.)?lequipe\.fr/video - ) - /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? - ''' - IE_NAME = 'dailymotion' - _TESTS = [{ - 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', - 'md5': '074b95bdee76b9e3654137aee9c79dfe', - 'info_dict': { - 'id': 'x5kesuj', - 'ext': 'mp4', - 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', - 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. 
Miller', - 'duration': 187, - 'timestamp': 1493651285, - 'upload_date': '20170501', - 'uploader': 'Deadline', - 'uploader_id': 'x1xm8ri', - 'age_limit': 0, - }, - }, { - 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', - 'md5': '2137c41a8e78554bb09225b8eb322406', - 'info_dict': { - 'id': 'x2iuewm', - 'ext': 'mp4', - 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 'description': 'Several come bundled with the Steam Controller.', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', - 'duration': 74, - 'timestamp': 1425657362, - 'upload_date': '20150306', - 'uploader': 'IGN', - 'uploader_id': 'xijv66', - 'age_limit': 0, - 'view_count': int, - }, - 'skip': 'video gone', - }, { - # Vevo video - 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', - 'info_dict': { - 'title': 'Roar (Official)', - 'id': 'USUV71301934', - 'ext': 'mp4', - 'uploader': 'Katy Perry', - 'upload_date': '20130905', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'VEVO is only available in some countries', - }, { - # age-restricted video - 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', - 'md5': '0d667a7b9cebecc3c89ee93099c4159d', - 'info_dict': { - 'id': 'xyh2zz', - 'ext': 'mp4', - 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', - 'uploader': 'HotWaves1012', - 'age_limit': 18, - }, - 'skip': 'video gone', - }, { - # geo-restricted, player v5 - 'url': 'http://www.dailymotion.com/video/xhza0o', - 'only_matching': True, - }, { - # with subtitles - 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', - 'only_matching': True, - }, { - 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', - 'only_matching': True, - }, { - 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', - 'only_matching': True, - }, { - 'url': 'https://www.lequipe.fr/video/x791mem', - 'only_matching': True, - }, { - 'url': 'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2', - 'only_matching': True, - }, { - 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw', - 'only_matching': True, - }] - _GEO_BYPASS = False - _COMMON_MEDIA_FIELDS = '''description - geoblockedCountries { - allowed - } - xid''' - - @staticmethod - def _extract_urls(webpage): - urls = [] - # Look for embedded Dailymotion player - # https://developer.dailymotion.com/player#player-parameters - for mobj in re.finditer( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage): - urls.append(unescapeHTML(mobj.group('url'))) - for mobj in re.finditer( - r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): - urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id')) - return urls - - def _real_extract(self, url): - video_id, playlist_id = re.match(self._VALID_URL, url).groups() - - if playlist_id: - if not self._downloader.params.get('noplaylist'): - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) - return self.url_result( - 'http://www.dailymotion.com/playlist/' + playlist_id, - 'DailymotionPlaylist', playlist_id) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - 
password = self._downloader.params.get('videopassword') - media = self._call_api( - 'media', video_id, '''... on Video { - %s - stats { - likes { - total - } - views { - total - } - } - } - ... on Live { - %s - audienceCount - isOnAir - }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata', - 'password: "%s"' % self._downloader.params.get('videopassword') if password else None) - xid = media['xid'] - - metadata = self._download_json( - 'https://www.dailymotion.com/player/metadata/video/' + xid, - xid, 'Downloading metadata JSON', - query={'app': 'com.dailymotion.neon'}) - - error = metadata.get('error') - if error: - title = error.get('title') or error['raw_message'] - # See https://developer.dailymotion.com/api#access-error - if error.get('code') == 'DM007': - allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list) - self.raise_geo_restricted(msg=title, countries=allowed_countries) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, title), expected=True) - - title = metadata['title'] - is_live = media.get('isOnAir') - formats = [] - for quality, media_list in metadata['qualities'].items(): - for m in media_list: - media_url = m.get('url') - media_type = m.get('type') - if not media_url or media_type == 'application/vnd.lumberjack.manifest': - continue - if media_type == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - f = { - 'url': media_url, - 'format_id': 'http-' + quality, - } - m = re.search(r'/H264-(\d+)x(\d+)(?:-(60)/)?', media_url) - if m: - width, height, fps = map(int_or_none, m.groups()) - f.update({ - 'fps': fps, - 'height': height, - 'width': width, - }) - formats.append(f) - for f in formats: - f['url'] = f['url'].split('#')[0] - if not f.get('fps') and f['format_id'].endswith('@60'): - f['fps'] = 60 - self._sort_formats(formats) - - subtitles = {} - subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} - for subtitle_lang, subtitle in subtitles_data.items(): - subtitles[subtitle_lang] = [{ - 'url': subtitle_url, - } for subtitle_url in subtitle.get('urls', [])] - - thumbnails = [] - for height, poster_url in metadata.get('posters', {}).items(): - thumbnails.append({ - 'height': int_or_none(height), - 'id': height, - 'url': poster_url, - }) - - owner = metadata.get('owner') or {} - stats = media.get('stats') or {} - get_count = lambda x: int_or_none(try_get(stats, lambda y: y[x + 's']['total'])) - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': clean_html(media.get('description')), - 'thumbnails': thumbnails, - 'duration': int_or_none(metadata.get('duration')) or None, - 'timestamp': int_or_none(metadata.get('created_time')), - 'uploader': owner.get('screenname'), - 'uploader_id': owner.get('id') or metadata.get('screenname'), - 'age_limit': 18 if metadata.get('explicit') else 0, - 'tags': metadata.get('tags'), - 'view_count': get_count('view') or int_or_none(media.get('audienceCount')), - 'like_count': get_count('like'), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - } - - -class DailymotionPlaylistBaseIE(DailymotionBaseInfoExtractor): - _PAGE_SIZE = 100 - - def _fetch_page(self, playlist_id, page): - page += 1 - videos = self._call_api( - self._OBJECT_TYPE, playlist_id, - '''videos(allowExplicit: %s, first: %d, page: %d) { - edges { - node { - xid - url - } 
- } - }''' % ('false' if self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page), - 'Downloading page %d' % page)['videos'] - for edge in videos['edges']: - node = edge['node'] - yield self.url_result( - node['url'], DailymotionIE.ie_key(), node['xid']) - - def _real_extract(self, url): - playlist_id = self._match_id(url) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, playlist_id), self._PAGE_SIZE) - return self.playlist_result( - entries, playlist_id) - - -class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): - IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)' - _TESTS = [{ - 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', - 'info_dict': { - 'id': 'xv4bw', - }, - 'playlist_mincount': 20, - }] - _OBJECT_TYPE = 'collection' - - -class DailymotionUserIE(DailymotionPlaylistBaseIE): - IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'https://www.dailymotion.com/user/nqtv', - 'info_dict': { - 'id': 'nqtv', - }, - 'playlist_mincount': 152, - }, { - 'url': 'http://www.dailymotion.com/user/UnderProject', - 'info_dict': { - 'id': 'UnderProject', - }, - 'playlist_mincount': 1000, - 'skip': 'Takes too long time', - }, { - 'url': 'https://www.dailymotion.com/user/nqtv', - 'info_dict': { - 'id': 'nqtv', - }, - 'playlist_mincount': 148, - 'params': { - 'age_limit': 0, - }, - }] - _OBJECT_TYPE = 'channel' diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py deleted file mode 100644 index 137095577..000000000 --- a/youtube_dl/extractor/daum.py +++ /dev/null @@ -1,266 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import itertools - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urlparse, -) - - -class DaumBaseIE(InfoExtractor): - _KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/' - - -class DaumIE(DaumBaseIE): - _VALID_URL = r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P<id>[^?#&]+)' - IE_NAME = 'daum.net' - - _TESTS = [{ - 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', - 'info_dict': { - 'id': 'vab4dyeDBysyBssyukBUjBz', - 'ext': 'mp4', - 'title': '마크 헌트 vs 안토니오 실바', - 'description': 'Mark Hunt vs Antonio Silva', - 'upload_date': '20131217', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'duration': 2117, - 'view_count': int, - 'comment_count': int, - 'uploader_id': 186139, - 'uploader': '콘간지', - 'timestamp': 1387310323, - }, - }, { - 'url': 'http://m.tvpot.daum.net/v/65139429', - 'info_dict': { - 'id': '65139429', - 'ext': 'mp4', - 'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', - 'description': 'md5:79794514261164ff27e36a21ad229fc5', - 'upload_date': '20150118', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'duration': 154, - 'view_count': int, - 'comment_count': int, - 'uploader': 'MBC 예능', - 'uploader_id': 132251, - 'timestamp': 1421604228, - }, - }, { - 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', - 'only_matching': True, - }, { - 'url': 'http://videofarm.daum.net/controller/player/VodPlayer.swf?vid=vwIpVpCQsT8%24&ref=', - 'info_dict': { - 'id': 'vwIpVpCQsT8$', - 'ext': 'flv', - 'title': '01-Korean War ( Trouble on the horizon )', - 'description': 'Korean War 01\r\nTrouble on the horizon\r\n전쟁의 먹구름', - 'upload_date': 
'20080223', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'duration': 249, - 'view_count': int, - 'comment_count': int, - 'uploader': '까칠한 墮落始祖 황비홍님의', - 'uploader_id': 560824, - 'timestamp': 1203770745, - }, - }, { - # Requires dte_type=WEB (#9972) - 'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU', - 'md5': 'a8917742069a4dd442516b86e7d66529', - 'info_dict': { - 'id': 's3794Uf1NZeZ1qMpGpeqeRU', - 'ext': 'mp4', - 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', - 'upload_date': '20170129', - 'uploader': '쇼! 음악중심', - 'uploader_id': 2653210, - 'timestamp': 1485684628, - }, - }] - - def _real_extract(self, url): - video_id = compat_urllib_parse_unquote(self._match_id(url)) - if not video_id.isdigit(): - video_id += '@my' - return self.url_result( - self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) - - -class DaumClipIE(DaumBaseIE): - _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P<id>\d+)' - IE_NAME = 'daum.net:clip' - _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s' - - _TESTS = [{ - 'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', - 'info_dict': { - 'id': '52554690', - 'ext': 'mp4', - 'title': 'DOTA 2GETHER 시즌2 6회 - 2부', - 'description': 'DOTA 2GETHER 시즌2 6회 - 2부', - 'upload_date': '20130831', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'duration': 3868, - 'view_count': int, - 'uploader': 'GOMeXP', - 'uploader_id': 6667, - 'timestamp': 1377911092, - }, - }, { - 'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super(DaumClipIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) - - -class DaumListIE(InfoExtractor): - def _get_entries(self, list_id, list_id_type): - name = None - entries = [] - for pagenum in itertools.count(1): - list_info = self._download_json( - 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page=%d&%s=%s' % ( - pagenum, list_id_type, list_id), list_id, 'Downloading list info - %s' % pagenum) - - entries.extend([ - self.url_result( - 'http://tvpot.daum.net/v/%s' % clip['vid']) - for clip in list_info['clip_list'] - ]) - - if not name: - name = list_info.get('playlist_bean', {}).get('name') or \ - list_info.get('potInfo', {}).get('name') - - if not list_info.get('has_more'): - break - - return name, entries - - def _check_clip(self, url, list_id): - query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query) - if 'clipid' in query_dict: - clip_id = query_dict['clipid'][0] - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % clip_id) - return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip') - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id) - - -class DaumPlaylistIE(DaumListIE): - _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View\.do|Top\.tv)\?.*?playlistid=(?P<id>[0-9]+)' - IE_NAME = 'daum.net:playlist' - _URL_TEMPLATE = 'http://tvpot.daum.net/mypot/View.do?playlistid=%s' - - _TESTS = [{ - 'note': 'Playlist url with clipid', - 'url': 
'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', - 'info_dict': { - 'id': '6213966', - 'title': 'Woorissica Official', - }, - 'playlist_mincount': 181 - }, { - 'note': 'Playlist url with clipid - noplaylist', - 'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', - 'info_dict': { - 'id': '73806844', - 'ext': 'mp4', - 'title': '151017 Airport', - 'upload_date': '20160117', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } - }] - - @classmethod - def suitable(cls, url): - return False if DaumUserIE.suitable(url) else super(DaumPlaylistIE, cls).suitable(url) - - def _real_extract(self, url): - list_id = self._match_id(url) - - clip_result = self._check_clip(url, list_id) - if clip_result: - return clip_result - - name, entries = self._get_entries(list_id, 'playlistid') - - return self.playlist_result(entries, list_id, name) - - -class DaumUserIE(DaumListIE): - _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View|Top)\.(?:do|tv)\?.*?ownerid=(?P<id>[0-9a-zA-Z]+)' - IE_NAME = 'daum.net:user' - - _TESTS = [{ - 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0', - 'info_dict': { - 'id': 'o2scDLIVbHc0', - 'title': '마이 리틀 텔레비전', - }, - 'playlist_mincount': 213 - }, { - 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&clipid=73801156', - 'info_dict': { - 'id': '73801156', - 'ext': 'mp4', - 'title': '[미공개] 김구라, 오만석이 부릅니다 \'오케피\' - 마이 리틀 텔레비전 20160116', - 'upload_date': '20160117', - 'description': 'md5:5e91d2d6747f53575badd24bd62b9f36' - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } - }, { - 'note': 'Playlist url has ownerid and playlistid, playlistid takes precedence', - 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&playlistid=6196631', - 'info_dict': { - 'id': '6196631', - 'title': '마이 리틀 텔레비전 - 20160109', - }, - 'playlist_count': 11 - }, { - 'url': 'http://tvpot.daum.net/mypot/Top.do?ownerid=o2scDLIVbHc0', - 'only_matching': True, - }, { - 'url': 'http://m.tvpot.daum.net/mypot/Top.tv?ownerid=45x1okb1If50&playlistid=3569733', - 'only_matching': True, - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - - clip_result = self._check_clip(url, list_id) - if clip_result: - return clip_result - - query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query) - if 'playlistid' in query_dict: - playlist_id = query_dict['playlistid'][0] - return self.url_result(DaumPlaylistIE._URL_TEMPLATE % playlist_id, 'DaumPlaylist') - - name, entries = self._get_entries(list_id, 'ownerid') - - return self.playlist_result(entries, list_id, name) diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py deleted file mode 100644 index aaedf2e3d..000000000 --- a/youtube_dl/extractor/dbtv.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class DBTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' - _TESTS = [{ - 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', - 'md5': 'b8f850ba1860adbda668d367f9b77699', - 'info_dict': { - 'id': 'PynxJnNWChE', - 'ext': 'mp4', - 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', - 'description': 'md5:49cc8370e7d66e8a2ef15c3b4631fd3f', - 'thumbnail': r're:https?://.*\.jpg', - 'upload_date': '20160916', - 'duration': 69, - 'uploader_id': 
'UCk5pvsyZJoYJBd7_oFPTlRQ', - 'uploader': 'Dagbladet', - }, - 'add_ie': ['Youtube'] - }, { - 'url': 'https://www.dagbladet.no/video/embed/xlGmyIeN9Jo/?autoplay=false', - 'only_matching': True, - }, { - 'url': 'https://www.dagbladet.no/video/truer-iran-bor-passe-dere/PalfB2Cw', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', - webpage)] - - def _real_extract(self, url): - display_id, video_id = re.match(self._VALID_URL, url).groups() - info = { - '_type': 'url_transparent', - 'id': video_id, - 'display_id': display_id, - } - if len(video_id) == 11: - info.update({ - 'url': video_id, - 'ie_key': 'Youtube', - }) - else: - info.update({ - 'url': 'jwplatform:' + video_id, - 'ie_key': 'JWPlatform', - }) - return info diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py deleted file mode 100644 index a38b2683d..000000000 --- a/youtube_dl/extractor/deezer.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - orderedSet, -) - - -class DeezerPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?deezer\.com/playlist/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.deezer.com/playlist/176747451', - 'info_dict': { - 'id': '176747451', - 'title': 'Best!', - 'uploader': 'Anonymous', - 'thumbnail': r're:^https?://cdn-images\.deezer\.com/images/cover/.*\.jpg$', - }, - 'playlist_count': 30, - 'skip': 'Only available in .de', - } - - def _real_extract(self, url): - if 'test' not in self._downloader.params: - self._downloader.report_warning('For now, this extractor only supports the 30 second previews. 
Patches welcome!') - - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - geoblocking_msg = self._html_search_regex( - r'<p class="soon-txt">(.*?)</p>', webpage, 'geoblocking message', - default=None) - if geoblocking_msg is not None: - raise ExtractorError( - 'Deezer said: %s' % geoblocking_msg, expected=True) - - data_json = self._search_regex( - (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>', - r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'), - webpage, 'data JSON') - data = json.loads(data_json) - - playlist_title = data.get('DATA', {}).get('TITLE') - playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME') - playlist_thumbnail = self._search_regex( - r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage, - 'playlist thumbnail') - - preview_pattern = self._search_regex( - r"var SOUND_PREVIEW_GATEWAY\s*=\s*'([^']+)';", webpage, - 'preview URL pattern', fatal=False) - entries = [] - for s in data['SONGS']['data']: - puid = s['MD5_ORIGIN'] - preview_video_url = preview_pattern.\ - replace('{0}', puid[0]).\ - replace('{1}', puid).\ - replace('{2}', s['MEDIA_VERSION']) - formats = [{ - 'format_id': 'preview', - 'url': preview_video_url, - 'preference': -100, # Only the first 30 seconds - 'ext': 'mp3', - }] - self._sort_formats(formats) - artists = ', '.join( - orderedSet(a['ART_NAME'] for a in s['ARTISTS'])) - entries.append({ - 'id': s['SNG_ID'], - 'duration': int_or_none(s.get('DURATION')), - 'title': '%s - %s' % (artists, s['SNG_TITLE']), - 'uploader': s['ART_NAME'], - 'uploader_id': s['ART_ID'], - 'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0, - 'formats': formats, - }) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'uploader': playlist_uploader, - 'thumbnail': playlist_thumbnail, - 'entries': entries, - } diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py deleted file mode 100644 index a4d0448c2..000000000 --- a/youtube_dl/extractor/dfb.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class DFBIE(InfoExtractor): - IE_NAME = 'tv.dfb.de' - _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)' - - _TEST = { - 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', - 'md5': 'ac0f98a52a330f700b4b3034ad240649', - 'info_dict': { - 'id': '11633', - 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', - 'ext': 'mp4', - 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', - 'upload_date': '20150714', - }, - } - - def _real_extract(self, url): - display_id, video_id = re.match(self._VALID_URL, url).groups() - - player_info = self._download_xml( - 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, - display_id) - video_info = player_info.find('video') - stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) - - formats = [] - # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats - for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'): - stream_access_info = self._download_xml(sa_url, display_id) - token_el = stream_access_info.find('token') - manifest_url = token_el.attrib['url'] + '?' 
+ 'hdnea=' + token_el.attrib['auth'] - if '.f4m' in manifest_url: - formats.extend(self._extract_f4m_formats( - manifest_url + '&hdcore=3.2.0', - display_id, f4m_id='hds', fatal=False)) - else: - formats.extend(self._extract_m3u8_formats( - manifest_url, display_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': video_info.find('title').text, - 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, - 'upload_date': unified_strdate(video_info.find('time_date').text), - 'formats': formats, - } diff --git a/youtube_dl/extractor/digiteka.py b/youtube_dl/extractor/digiteka.py deleted file mode 100644 index 3dfde0d8c..000000000 --- a/youtube_dl/extractor/digiteka.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class DigitekaIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?:www\.)?(?:digiteka\.net|ultimedia\.com)/ - (?: - deliver/ - (?P<embed_type> - generic| - musique - ) - (?:/[^/]+)*/ - (?: - src| - article - )| - default/index/video - (?P<site_type> - generic| - music - ) - /id - )/(?P<id>[\d+a-z]+)''' - _TESTS = [{ - # news - 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', - 'md5': '276a0e49de58c7e85d32b057837952a2', - 'info_dict': { - 'id': 's8uk0r', - 'ext': 'mp4', - 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 74, - 'upload_date': '20150317', - 'timestamp': 1426604939, - 'uploader_id': '3fszv', - }, - }, { - # music - 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', - 'md5': '2ea3513813cf230605c7e2ffe7eca61c', - 'info_dict': { - 'id': 'xvpfp8', - 'ext': 'mp4', - 'title': 'Two - C\'est La Vie (clip)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 233, - 'upload_date': '20150224', - 'timestamp': 1424760500, - 'uploader_id': '3rfzk', - }, - }, { - 'url': 'https://www.digiteka.net/deliver/generic/iframe/mdtk/01637594/src/lqm3kl/zone/1/showtitle/1/autoplay/yes', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - video_type = mobj.group('embed_type') or mobj.group('site_type') - if video_type == 'music': - video_type = 'musique' - - deliver_info = self._download_json( - 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type), - video_id) - - yt_id = deliver_info.get('yt_id') - if yt_id: - return self.url_result(yt_id, 'Youtube') - - jwconf = deliver_info['jwconf'] - - formats = [] - for source in jwconf['playlist'][0]['sources']: - formats.append({ - 'url': source['file'], - 'format_id': source.get('label'), - }) - - self._sort_formats(formats) - - title = deliver_info['title'] - thumbnail = jwconf.get('image') - duration = int_or_none(deliver_info.get('duration')) - timestamp = int_or_none(deliver_info.get('release_time')) - uploader_id = deliver_info.get('owner_id') - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': 
uploader_id, - 'formats': formats, - } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py deleted file mode 100644 index e0139cc86..000000000 --- a/youtube_dl/extractor/discovery.py +++ /dev/null @@ -1,118 +0,0 @@ -from __future__ import unicode_literals - -import random -import re -import string - -from .discoverygo import DiscoveryGoBaseIE -from ..compat import compat_urllib_parse_unquote -from ..utils import ExtractorError -from ..compat import compat_HTTPError - - -class DiscoveryIE(DiscoveryGoBaseIE): - _VALID_URL = r'''(?x)https?:// - (?P<site> - go\.discovery| - www\. - (?: - investigationdiscovery| - discoverylife| - animalplanet| - ahctv| - destinationamerica| - sciencechannel| - tlc - )| - watch\. - (?: - hgtv| - foodnetwork| - travelchannel| - diynetwork| - cookingchanneltv| - motortrend - ) - )\.com/tv-shows/(?P<show_slug>[^/]+)/(?:video|full-episode)s/(?P<id>[^./?#]+)''' - _TESTS = [{ - 'url': 'https://go.discovery.com/tv-shows/cash-cab/videos/riding-with-matthew-perry', - 'info_dict': { - 'id': '5a2f35ce6b66d17a5026e29e', - 'ext': 'mp4', - 'title': 'Riding with Matthew Perry', - 'description': 'md5:a34333153e79bc4526019a5129e7f878', - 'duration': 84, - }, - 'params': { - 'skip_download': True, # requires ffmpeg - } - }, { - 'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision', - 'only_matching': True, - }, { - 'url': 'https://go.discovery.com/tv-shows/alaskan-bush-people/videos/follow-your-own-road', - 'only_matching': True, - }, { - # using `show_slug` is important to get the correct video data - 'url': 'https://www.sciencechannel.com/tv-shows/mythbusters-on-science/full-episodes/christmas-special', - 'only_matching': True, - }] - _GEO_COUNTRIES = ['US'] - _GEO_BYPASS = False - _API_BASE_URL = 'https://api.discovery.com/v1/' - - def _real_extract(self, url): - site, show_slug, display_id = re.match(self._VALID_URL, url).groups() - - access_token = None - cookies = self._get_cookies(url) - - # prefer Affiliate Auth Token over Anonymous Auth Token - auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn') - if auth_storage_cookie and auth_storage_cookie.value: - auth_storage = self._parse_json(compat_urllib_parse_unquote( - compat_urllib_parse_unquote(auth_storage_cookie.value)), - display_id, fatal=False) or {} - access_token = auth_storage.get('a') or auth_storage.get('access_token') - - if not access_token: - access_token = self._download_json( - 'https://%s.com/anonymous' % site, display_id, - 'Downloading token JSON metadata', query={ - 'authRel': 'authorization', - 'client_id': '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'redirectUri': 'https://www.discovery.com/', - })['access_token'] - - headers = self.geo_verification_headers() - headers['Authorization'] = 'Bearer ' + access_token - - try: - video = self._download_json( - self._API_BASE_URL + 'content/videos', - display_id, 'Downloading content JSON metadata', - headers=headers, query={ - 'embed': 'show.name', - 'fields': 'authenticated,description.detailed,duration,episodeNumber,id,name,parental.rating,season.number,show,tags', - 'slug': display_id, - 'show_slug': show_slug, - })[0] - video_id = video['id'] - stream = self._download_json( - self._API_BASE_URL + 'streaming/video/' + video_id, - display_id, 'Downloading streaming JSON metadata', headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - 
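# [Editor's note: annotation added for readability; not part of the original diff.]
# At this point the Discovery API has answered 401/403. The lines below parse the
# JSON error body and branch on its 'description' text to tell a geo-block apart
# from the TV-provider ("Authorized Networks") auth wall. A purely hypothetical
# body, for illustration only, might look like:
#     {"description": "resource not available for country 'de'"}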
e_description = self._parse_json( - e.cause.read().decode(), display_id)['description'] - if 'resource not available for country' in e_description: - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - if 'Authorized Networks' in e_description: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.', expected=True) - raise ExtractorError(e_description) - raise - - return self._extract_video_info(video, stream, display_id) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py deleted file mode 100644 index c512b95d0..000000000 --- a/youtube_dl/extractor/discoverynetworks.py +++ /dev/null @@ -1,43 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .dplay import DPlayIE - - -class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' - - _TESTS = [{ - 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', - 'info_dict': { - 'id': '78867', - 'ext': 'mp4', - 'title': 'Die Welt da draußen', - 'description': 'md5:61033c12b73286e409d99a41742ef608', - 'timestamp': 1554069600, - 'upload_date': '20190331', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', - 'only_matching': True, - }, { - 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', - 'only_matching': True, - }, { - 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, programme, alternate_id = re.match(self._VALID_URL, url).groups() - country = 'GB' if domain == 'dplay.co.uk' else 'DE' - realm = 'questuk' if country == 'GB' else domain.replace('.', '') - return self._get_disco_api_info( - url, '%s/%s' % (programme, alternate_id), - 'sonic-eu1-prod.disco-api.com', realm, country) diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py deleted file mode 100644 index 0eee82fd6..000000000 --- a/youtube_dl/extractor/disney.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - compat_str, - determine_ext, - ExtractorError, - update_url_query, -) - - -class DisneyIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))''' - _TESTS = [{ - # Disney.EmbedVideo - 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', - 'info_dict': { - 'id': '545ed1857afee5a0ec239977', - 'ext': 'mp4', - 'title': 'Moana - Trailer', - 'description': 'A fun adventure for the entire Family! 
Bring home Moana on Digital HD Feb 21 & Blu-ray March 7', - 'upload_date': '20170112', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - # Grill.burger - 'url': 'http://www.starwars.com/video/rogue-one-a-star-wars-story-intro-featurette', - 'info_dict': { - 'id': '5454e9f4e9804a552e3524c8', - 'ext': 'mp4', - 'title': '"Intro" Featurette: Rogue One: A Star Wars Story', - 'upload_date': '20170104', - 'description': 'Go behind-the-scenes of Rogue One: A Star Wars Story in this featurette with Director Gareth Edwards and the cast of the film.', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2', - 'only_matching': True, - }, { - 'url': 'http://video.en.disneyme.com/watch/future-worm/robo-carp-2001-544b66002aa7353cdd3f5114', - 'only_matching': True, - }, { - 'url': 'http://video.disneyturkiye.com.tr/izle/7c-7-cuceler/kimin-sesi-zaten-5456f3d015f6b36c8afdd0e2', - 'only_matching': True, - }, { - 'url': 'http://disneyjunior.disney.com/embed/546a4798ddba3d1612e4005d', - 'only_matching': True, - }, { - 'url': 'http://www.starwars.com/embed/54690d1e6c42e5f09a0fb097', - 'only_matching': True, - }, { - 'url': 'http://spiderman.marvelkids.com/embed/522900d2ced3c565e4cc0677', - 'only_matching': True, - }, { - 'url': 'http://spiderman.marvelkids.com/videos/contest-of-champions-part-four-clip-1', - 'only_matching': True, - }, { - 'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo', - 'only_matching': True, - }, { - 'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268', - 'only_matching': True, - }, { - 'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, video_id, display_id = re.match(self._VALID_URL, url).groups() - if not video_id: - webpage = self._download_webpage(url, display_id) - grill = re.sub(r'"\s*\+\s*"', '', self._search_regex( - r'Grill\.burger\s*=\s*({.+})\s*:', - webpage, 'grill data')) - page_data = next(s for s in self._parse_json(grill, display_id)['stack'] if s.get('type') == 'video') - video_data = page_data['data'][0] - else: - webpage = self._download_webpage( - 'http://%s/embed/%s' % (domain, video_id), video_id) - page_data = self._parse_json(self._search_regex( - r'Disney\.EmbedVideo\s*=\s*({.+});', - webpage, 'embed data'), video_id) - video_data = page_data['video'] - - for external in video_data.get('externals', []): - if external.get('source') == 'vevo': - return self.url_result('vevo:' + external['data_id'], 'Vevo') - - video_id = video_data['id'] - title = video_data['title'] - - formats = [] - for flavor in video_data.get('flavors', []): - flavor_format = flavor.get('format') - flavor_url = flavor.get('url') - if not flavor_url or not re.match(r'https?://', flavor_url) or flavor_format == 'mp4_access': - continue - tbr = int_or_none(flavor.get('bitrate')) - if tbr == 99999: - # wrong ks(Kaltura Signature) causes 404 Error - flavor_url = update_url_query(flavor_url, {'ks': ''}) - m3u8_formats = self._extract_m3u8_formats( - flavor_url, video_id, 'mp4', - m3u8_id=flavor_format, fatal=False) - for f in m3u8_formats: - # Apple FairPlay - if '/fpshls/' in f['url']: - continue - formats.append(f) - continue - format_id = [] - if flavor_format: - format_id.append(flavor_format) - if tbr: - format_id.append(compat_str(tbr)) - ext = 
determine_ext(flavor_url) - if flavor_format == 'applehttp' or ext == 'm3u8': - ext = 'mp4' - width = int_or_none(flavor.get('width')) - height = int_or_none(flavor.get('height')) - formats.append({ - 'format_id': '-'.join(format_id), - 'url': flavor_url, - 'width': width, - 'height': height, - 'tbr': tbr, - 'ext': ext, - 'vcodec': 'none' if (width == 0 and height == 0) else None, - }) - if not formats and video_data.get('expired'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), - expected=True) - self._sort_formats(formats) - - subtitles = {} - for caption in video_data.get('captions', []): - caption_url = caption.get('url') - caption_format = caption.get('format') - if not caption_url or caption_format.startswith('unknown'): - continue - subtitles.setdefault(caption.get('language', 'en'), []).append({ - 'url': caption_url, - 'ext': { - 'webvtt': 'vtt', - }.get(caption_format, caption_format), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description') or video_data.get('short_desc'), - 'thumbnail': video_data.get('thumb') or video_data.get('thumb_secure'), - 'duration': int_or_none(video_data.get('duration_sec')), - 'upload_date': unified_strdate(video_data.get('publish_date')), - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py deleted file mode 100644 index 276fd4b09..000000000 --- a/youtube_dl/extractor/dispeak.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - remove_end, - xpath_element, - xpath_text, -) - - -class DigitallySpeakingIE(InfoExtractor): - _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' - - _TESTS = [{ - # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface - 'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml', - 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', - 'info_dict': { - 'id': '840376_BQRC', - 'ext': 'mp4', - 'title': 'Tenacious Design and The Interface of \'Destiny\'', - }, - }, { - # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC - 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', - 'only_matching': True, - }, { - # From http://www.gdcvault.com/play/1013700/Advanced-Material - 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', - 'only_matching': True, - }, { - # From https://gdcvault.com/play/1016624, empty speakerVideo - 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', - 'info_dict': { - 'id': '201210-822101_1349794556671DDDD', - 'ext': 'flv', - 'title': 'Pre-launch - Preparing to Take the Plunge', - }, - }, { - # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo - 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', - 'only_matching': True, - }] - - def _parse_mp4(self, metadata): - video_formats = [] - video_root = None - - mp4_video = xpath_text(metadata, './mp4video', default=None) - if mp4_video is not None: - mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video) - video_root = mobj.group('root') - if video_root is None: - http_host = xpath_text(metadata, 'httpHost', default=None) - if http_host: - video_root = 'http://%s/' % http_host - if 
video_root is None: - # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js - # Works for GPUTechConf, too - video_root = 'http://s3-2u.digitallyspeaking.com/' - - formats = metadata.findall('./MBRVideos/MBRVideo') - if not formats: - return None - for a_format in formats: - stream_name = xpath_text(a_format, 'streamName', fatal=True) - video_path = re.match(r'mp4\:(?P<path>.*)', stream_name).group('path') - url = video_root + video_path - bitrate = xpath_text(a_format, 'bitrate') - tbr = int_or_none(bitrate) - vbr = int_or_none(self._search_regex( - r'-(\d+)\.mp4', video_path, 'vbr', default=None)) - abr = tbr - vbr if tbr and vbr else None - video_formats.append({ - 'format_id': bitrate, - 'url': url, - 'tbr': tbr, - 'vbr': vbr, - 'abr': abr, - }) - return video_formats - - def _parse_flv(self, metadata): - formats = [] - akamai_url = xpath_text(metadata, './akamaiHost', fatal=True) - audios = metadata.findall('./audios/audio') - for audio in audios: - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(audio.get('url'), '.flv'), - 'ext': 'flv', - 'vcodec': 'none', - 'format_id': audio.get('code'), - }) - for video_key, format_id, preference in ( - ('slide', 'slides', -2), ('speaker', 'speaker', -1)): - video_path = xpath_text(metadata, './%sVideo' % video_key) - if not video_path: - continue - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(video_path, '.flv'), - 'ext': 'flv', - 'format_note': '%s video' % video_key, - 'quality': preference, - 'preference': preference, - 'format_id': format_id, - }) - return formats - - def _real_extract(self, url): - video_id = self._match_id(url) - - xml_description = self._download_xml(url, video_id) - metadata = xpath_element(xml_description, 'metadata') - - video_formats = self._parse_mp4(metadata) - if video_formats is None: - video_formats = self._parse_flv(metadata) - - return { - 'id': video_id, - 'formats': video_formats, - 'title': xpath_text(metadata, 'title', fatal=True), - 'duration': parse_duration(xpath_text(metadata, 'endTime')), - 'creator': xpath_text(metadata, 'speaker'), - } diff --git a/youtube_dl/extractor/dlive.py b/youtube_dl/extractor/dlive.py deleted file mode 100644 index d95c67a5b..000000000 --- a/youtube_dl/extractor/dlive.py +++ /dev/null @@ -1,97 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class DLiveVODIE(InfoExtractor): - IE_NAME = 'dlive:vod' - _VALID_URL = r'https?://(?:www\.)?dlive\.tv/p/(?P<uploader_id>.+?)\+(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://dlive.tv/p/pdp+3mTzOl4WR', - 'info_dict': { - 'id': '3mTzOl4WR', - 'ext': 'mp4', - 'title': 'Minecraft with james charles epic', - 'upload_date': '20190701', - 'timestamp': 1562011015, - 'uploader_id': 'pdp', - } - }, { - 'url': 'https://dlive.tv/p/pdpreplay+D-RD-xSZg', - 'only_matching': True, - }] - - def _real_extract(self, url): - uploader_id, vod_id = re.match(self._VALID_URL, url).groups() - broadcast = self._download_json( - 'https://graphigo.prd.dlive.tv/', vod_id, - data=json.dumps({'query': '''query { - pastBroadcast(permlink:"%s+%s") { - content - createdAt - length - playbackUrl - title - thumbnailUrl - viewCount - } -}''' % (uploader_id, vod_id)}).encode())['data']['pastBroadcast'] - title = broadcast['title'] - formats = self._extract_m3u8_formats( - broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - 
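# [Editor's note: annotation added for readability; not part of the original diff.]
# DLive's GraphQL API reports createdAt in epoch milliseconds, which is why the
# info dict below passes a bare scale of 1000 to int_or_none() (youtube-dl's
# utils helper, as implemented here, integer-divides by that scale to get epoch
# seconds). The live-stream extractor further down does the same for createdAt.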
return { - 'id': vod_id, - 'title': title, - 'uploader_id': uploader_id, - 'formats': formats, - 'description': broadcast.get('content'), - 'thumbnail': broadcast.get('thumbnailUrl'), - 'timestamp': int_or_none(broadcast.get('createdAt'), 1000), - 'view_count': int_or_none(broadcast.get('viewCount')), - } - - -class DLiveStreamIE(InfoExtractor): - IE_NAME = 'dlive:stream' - _VALID_URL = r'https?://(?:www\.)?dlive\.tv/(?!p/)(?P<id>[\w.-]+)' - - def _real_extract(self, url): - display_name = self._match_id(url) - user = self._download_json( - 'https://graphigo.prd.dlive.tv/', display_name, - data=json.dumps({'query': '''query { - userByDisplayName(displayname:"%s") { - livestream { - content - createdAt - title - thumbnailUrl - watchingCount - } - username - } -}''' % display_name}).encode())['data']['userByDisplayName'] - livestream = user['livestream'] - title = livestream['title'] - username = user['username'] - formats = self._extract_m3u8_formats( - 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, - display_name, 'mp4') - self._sort_formats(formats) - return { - 'id': display_name, - 'title': self._live_title(title), - 'uploader': display_name, - 'uploader_id': username, - 'formats': formats, - 'description': livestream.get('content'), - 'thumbnail': livestream.get('thumbnailUrl'), - 'is_live': True, - 'timestamp': int_or_none(livestream.get('createdAt'), 1000), - 'view_count': int_or_none(livestream.get('watchingCount')), - } diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py deleted file mode 100644 index bbb199094..000000000 --- a/youtube_dl/extractor/dplay.py +++ /dev/null @@ -1,369 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - int_or_none, - strip_or_none, - unified_timestamp, -) - - -class DPlayIE(InfoExtractor): - _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)' - _VALID_URL = r'''(?x)https?:// - (?P<domain> - (?:www\.)?(?P<host>d - (?: - play\.(?P<country>dk|fi|jp|se|no)| - iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no) - ) - )| - (?P<subdomain_country>es|it)\.dplay\.com - )/[^/]+''' + _PATH_REGEX - - _TESTS = [{ - # non geo restricted, via secure api, unsigned download hls URL - 'url': 'https://www.dplay.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', - 'info_dict': { - 'id': '13628', - 'display_id': 'nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', - 'ext': 'mp4', - 'title': 'Svensken lär sig njuta av livet', - 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', - 'duration': 2649.856, - 'timestamp': 1365453720, - 'upload_date': '20130408', - 'creator': 'Kanal 5', - 'series': 'Nugammalt - 77 händelser som format Sverige', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - # geo restricted, via secure api, unsigned download hls URL - 'url': 'http://www.dplay.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', - 'info_dict': { - 'id': '104465', - 'display_id': 'ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', - 'ext': 'mp4', - 'title': 'Ted Bundy: Mind Of A Monster', - 'description': 'md5:8b780f6f18de4dae631668b8a9637995', - 'duration': 5290.027, - 'timestamp': 1570694400, - 'upload_date': '20191010', - 'creator': 'ID - Investigation Discovery', - 'series': 'Ted 
Bundy: Mind Of A Monster', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - # disco-api - 'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7', - 'info_dict': { - 'id': '40206', - 'display_id': 'i-kongens-klr/sesong-1-episode-7', - 'ext': 'mp4', - 'title': 'Episode 7', - 'description': 'md5:e3e1411b2b9aebeea36a6ec5d50c60cf', - 'duration': 2611.16, - 'timestamp': 1516726800, - 'upload_date': '20180123', - 'series': 'I kongens klær', - 'season_number': 1, - 'episode_number': 7, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'skip': 'Available for Premium users', - }, { - 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', - 'md5': '2b808ffb00fc47b884a172ca5d13053c', - 'info_dict': { - 'id': '6918', - 'display_id': 'biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', - 'ext': 'mp4', - 'title': 'Luigi Di Maio: la psicosi di Stanislawskij', - 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'upload_date': '20160524', - 'timestamp': 1464076800, - 'series': 'Biografie imbarazzanti', - 'season_number': 1, - 'episode': 'Episode 1', - 'episode_number': 1, - }, - }, { - 'url': 'https://es.dplay.com/dmax/la-fiebre-del-oro/temporada-8-episodio-1/', - 'info_dict': { - 'id': '21652', - 'display_id': 'la-fiebre-del-oro/temporada-8-episodio-1', - 'ext': 'mp4', - 'title': 'Episodio 1', - 'description': 'md5:b9dcff2071086e003737485210675f69', - 'thumbnail': r're:^https?://.*\.png', - 'upload_date': '20180709', - 'timestamp': 1531173540, - 'series': 'La fiebre del oro', - 'season_number': 8, - 'episode': 'Episode 1', - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.dplay.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', - 'only_matching': True, - }, { - 'url': 'https://www.dplay.jp/video/gold-rush/24086', - 'only_matching': True, - }, { - 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', - 'only_matching': True, - }, { - 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', - 'only_matching': True, - }, { - 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', - 'only_matching': True, - }, { - 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', - 'only_matching': True, - }, { - 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', - 'only_matching': True, - }, { - 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', - 'only_matching': True, - }] - - def _process_errors(self, e, geo_countries): - info = self._parse_json(e.cause.read().decode('utf-8'), None) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code in ('access.denied.missingpackage', 'invalid.token'): - raise ExtractorError( - 'This video is only available for registered users. 
You may want to use --cookies.', expected=True) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['Authorization'] = 'Bearer ' + self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] - - def _download_video_playback_info(self, disco_base, video_id, headers): - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - video_id, headers=headers)['data']['attributes']['streaming'] - streaming_list = [] - for format_id, format_dict in streaming.items(): - streaming_list.append({ - 'type': format_id, - 'url': format_dict.get('url'), - }) - return streaming_list - - def _get_disco_api_info(self, url, display_id, disco_host, realm, country): - geo_countries = [country.upper()] - self._initialize_geo_bypass({ - 'countries': geo_countries, - }) - disco_base = 'https://%s/' % disco_host - headers = { - 'Referer': url, - } - self._update_disco_api_headers(headers, disco_base, display_id, realm) - try: - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - self._process_errors(e, geo_countries) - raise - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'].strip() - formats = [] - try: - streaming = self._download_video_playback_info( - disco_base, video_id, headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._process_errors(e, geo_countries) - raise - for format_dict in streaming: - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - format_id = format_dict.get('type') - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - creator = series = None - tags = [] - thumbnails = [] - included = video.get('included') or [] - if isinstance(included, list): - for e in included: - attributes = e.get('attributes') - if not attributes: - continue - e_type = e.get('type') - if e_type == 'channel': - creator = attributes.get('name') - elif e_type == 'image': - src = attributes.get('src') - if src: - thumbnails.append({ - 'url': src, - 'width': int_or_none(attributes.get('width')), - 'height': int_or_none(attributes.get('height')), - }) - if e_type == 'show': - series = attributes.get('name') - elif e_type == 'tag': - name = attributes.get('name') - if name: - tags.append(name) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': strip_or_none(info.get('description')), - 'duration': float_or_none(info.get('videoDuration'), 1000), - 'timestamp': 
unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'creator': creator, - 'tags': tags, - 'thumbnails': thumbnails, - 'formats': formats, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') - host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' - return self._get_disco_api_info( - url, display_id, host, 'dplay' + country, country) - - -class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', - 'info_dict': { - 'id': '1140794', - 'display_id': 'property-brothers-forever-home/food-and-family', - 'ext': 'mp4', - 'title': 'Food and Family', - 'description': 'The brothers help a Richmond family expand their single-level home.', - 'duration': 2583.113, - 'timestamp': 1609304400, - 'upload_date': '20201230', - 'creator': 'HGTV', - 'series': 'Property Brothers: Forever Home', - 'season_number': 1, - 'episode_number': 1, - }, - 'skip': 'Available for Premium users', - }] - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0' - - def _download_video_playback_info(self, disco_base, video_id, headers): - return self._download_json( - disco_base + 'playback/v3/videoPlaybackInfo', - video_id, headers=headers, data=json.dumps({ - 'deviceInfo': { - 'adBlocker': False, - }, - 'videoId': video_id, - 'wisteriaProperties': { - 'platform': 'desktop', - 'product': 'dplus_us', - }, - }).encode('utf-8'))['data']['attributes']['streaming'] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') - - -class HGTVDeIE(DPlayIE): - _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', - 'info_dict': { - 'id': '151205', - 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', - 'ext': 'mp4', - 'title': 'Wer braucht schon eine Toilette', - 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', - 'duration': 1177.024, - 'timestamp': 1595705400, - 'upload_date': '20200725', - 'creator': 'HGTV', - 'series': 'Tiny House - klein, aber oho', - 'season_number': 3, - 'episode_number': 3, - }, - 'params': { - 'format': 'bestvideo', - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py deleted file mode 100644 index 164e97c36..000000000 --- a/youtube_dl/extractor/drbonanza.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - js_to_json, - parse_duration, - unescapeHTML, -) - - -class DRBonanzaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' - _TEST = { - 'url': 
'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-', - 'info_dict': { - 'id': '40312', - 'display_id': 'matador---0824-komme-fremmede-', - 'ext': 'mp4', - 'title': 'MATADOR - 08:24. "Komme fremmede".', - 'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84', - 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', - 'duration': 4613, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') - - webpage = self._download_webpage(url, display_id) - - info = self._parse_html5_media_entries( - url, webpage, display_id, m3u8_id='hls', - m3u8_entry_protocol='m3u8_native')[0] - self._sort_formats(info['formats']) - - asset = self._parse_json( - self._search_regex( - r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset'), - display_id, transform_source=js_to_json) - - title = unescapeHTML(asset['AssetTitle']).strip() - - def extract(field): - return self._search_regex( - r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % field, - webpage, field, default=None) - - info.update({ - 'id': asset.get('AssetId') or video_id, - 'display_id': display_id, - 'title': title, - 'description': extract('Programinfo'), - 'duration': parse_duration(extract('Tid')), - 'thumbnail': asset.get('AssetImageUrl'), - }) - return info diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py deleted file mode 100644 index 14b6c00b0..000000000 --- a/youtube_dl/extractor/dropbox.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os.path -import re - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import url_basename - - -class DropboxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*' - _TESTS = [ - { - 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0', - 'info_dict': { - 'id': 'nelirfsxnmcfbfh', - 'ext': 'mp4', - 'title': 'youtube-dl test video \'ä"BaW_jenozKc' - } - }, { - 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - fn = compat_urllib_parse_unquote(url_basename(url)) - title = os.path.splitext(fn)[0] - video_url = re.sub(r'[?&]dl=0', '', url) - video_url += ('?' if '?' not in video_url else '&') + 'dl=1' - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py deleted file mode 100644 index 2baea585b..000000000 --- a/youtube_dl/extractor/drtuber.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - NO_DEFAULT, - parse_duration, - str_to_int, -) - - -class DrTuberIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' 
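# [Editor's note: illustrative sketch added for readability; not part of the
# original diff. It only re-checks the _VALID_URL pattern above against the URLs
# from the test cases below, to make the two named groups the extractor relies on
# explicit; _VALID_URL_SKETCH is the editor's name, the pattern itself is copied
# verbatim from the line above.]
import re

_VALID_URL_SKETCH = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?'

m = re.match(_VALID_URL_SKETCH, 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf')
assert m.group('id') == '1740434'
assert m.group('display_id') == 'hot-perky-blonde-naked-golf'

# /embed/ URLs carry no slug, so display_id is None and _real_extract falls back
# to the numeric id ("display_id = mobj.group('display_id') or video_id").
m2 = re.match(_VALID_URL_SKETCH, 'http://www.drtuber.com/embed/489939')
assert m2.group('id') == '489939' and m2.group('display_id') is None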
- _TESTS = [{ - 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', - 'md5': '93e680cf2536ad0dfb7e74d94a89facd', - 'info_dict': { - 'id': '1740434', - 'display_id': 'hot-perky-blonde-naked-golf', - 'ext': 'mp4', - 'title': 'hot perky blonde naked golf', - 'like_count': int, - 'comment_count': int, - 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], - 'thumbnail': r're:https?://.*\.jpg$', - 'age_limit': 18, - } - }, { - 'url': 'http://www.drtuber.com/embed/489939', - 'only_matching': True, - }, { - 'url': 'http://m.drtuber.com/video/3893529/lingerie-blowjob-from-beautiful-teen', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', - webpage) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage( - 'http://www.drtuber.com/video/%s' % video_id, display_id) - - video_data = self._download_json( - 'http://www.drtuber.com/player_config_json/', video_id, query={ - 'vid': video_id, - 'embed': 0, - 'aid': 0, - 'domain_id': 0, - }) - - formats = [] - for format_id, video_url in video_data['files'].items(): - if video_url: - formats.append({ - 'format_id': format_id, - 'quality': 2 if format_id == 'hq' else 1, - 'url': video_url - }) - self._sort_formats(formats) - - duration = int_or_none(video_data.get('duration')) or parse_duration( - video_data.get('duration_format')) - - title = self._html_search_regex( - (r'<h1[^>]+class=["\']title[^>]+>([^<]+)', - r'<title>([^<]+)\s*@\s+DrTuber', - r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', - r'<p[^>]+class="title_substrate">([^<]+)</p>', - r'<title>([^<]+) - \d+'), - webpage, 'title') - - thumbnail = self._html_search_regex( - r'poster="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - def extract_count(id_, name, default=NO_DEFAULT): - return str_to_int(self._html_search_regex( - r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, - webpage, '%s count' % name, default=default, fatal=False)) - - like_count = extract_count('rate_likes', 'like') - dislike_count = extract_count('rate_dislikes', 'dislike', default=None) - comment_count = extract_count('comments_count', 'comment') - - cats_str = self._search_regex( - r'<div[^>]+class="categories_list">(.+?)</div>', - webpage, 'categories', fatal=False) - categories = [] if not cats_str else re.findall( - r'<a title="([^"]+)"', cats_str) - - return { - 'id': video_id, - 'display_id': display_id, - 'formats': formats, - 'title': title, - 'thumbnail': thumbnail, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'comment_count': comment_count, - 'categories': categories, - 'age_limit': self._rta_search(webpage), - 'duration': duration, - } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py deleted file mode 100644 index c0036adb6..000000000 --- a/youtube_dl/extractor/drtv.py +++ /dev/null @@ -1,355 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import binascii -import hashlib -import re - - -from .common import InfoExtractor -from ..aes import aes_cbc_decrypt -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - bytes_to_intlist, - ExtractorError, - int_or_none, - intlist_to_bytes, - float_or_none, - mimetype2ext, - str_or_none, - try_get, - unified_timestamp, - update_url_query, - url_or_none, -) - - 
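# [Editor's note: annotation added for readability; not part of the original diff.]
# The DRTVIE extractor below bundles a small AES helper (decrypt_uri) for stream
# links that only expose an 'EncryptedUri'. Reading that helper, the envelope it
# unpacks appears to be laid out as follows (inferred from the code itself, not
# from any DR documentation):
#     e[0:2]      ignored
#     e[2:10]     ASCII hex: length n of the hex-encoded ciphertext that follows
#     e[10:10+n]  AES-CBC ciphertext, hex-encoded
#     e[10+n:]    hex string used both as the IV and, suffixed with the constant
#                 salt ':sRBzYNXBzkKgnjj8pGtkACch', SHA-256-hashed into the key
# The plaintext is then unpadded PKCS#7-style (the last byte gives the number of
# trailing bytes to drop) and truncated at the first '?'.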
-class DRTVIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| - (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ - ) - (?P<id>[\da-z_-]+) - ''' - _GEO_BYPASS = False - _GEO_COUNTRIES = ['DK'] - IE_NAME = 'drtv' - _TESTS = [{ - 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '25e659cccc9a2ed956110a299fdf5983', - 'info_dict': { - 'id': 'klassen-darlig-taber-10', - 'ext': 'mp4', - 'title': 'Klassen - Dårlig taber (10)', - 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', - 'timestamp': 1539085800, - 'upload_date': '20181009', - 'duration': 606.84, - 'series': 'Klassen', - 'season': 'Klassen I', - 'season_number': 1, - 'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b', - 'episode': 'Episode 10', - 'episode_number': 10, - 'release_year': 2016, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - # embed - 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'info_dict': { - 'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6', - 'ext': 'mp4', - 'title': 'christiania pusher street ryddes drdkrjpo', - 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', - 'timestamp': 1472800279, - 'upload_date': '20160902', - 'duration': 131.4, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - # with SignLanguage formats - 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', - 'info_dict': { - 'id': 'historien-om-danmark-stenalder', - 'ext': 'mp4', - 'title': 'Historien om Danmark: Stenalder', - 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', - 'timestamp': 1546628400, - 'upload_date': '20190104', - 'duration': 3502.56, - 'formats': 'mincount:20', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', - 'only_matching': True, - }, { - 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', - 'info_dict': { - 'id': '00951930010', - 'ext': 'mp4', - 'title': 'Bonderøven (1:8)', - 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', - 'timestamp': 1546542000, - 'upload_date': '20190103', - 'duration': 2576.6, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769', - 'only_matching': True, - }, { - 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', - 'only_matching': True, - }, { - 'url': 'https://www.dr.dk/drtv/program/jagten_220924', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - if '>Programmet er ikke længere tilgængeligt' in webpage: - raise ExtractorError( - 'Video %s is not available' % video_id, expected=True) - - video_id = self._search_regex( - (r'data-(?:material-identifier|episode-slug)="([^"]+)"', - r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), - webpage, 'video id', default=None) - - if not video_id: - video_id = self._search_regex( - r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', - webpage, 'urn', default=None) - if video_id: - video_id = compat_urllib_parse_unquote(video_id) - - _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard' - query = {'expanded': 'true'} - - if video_id: - programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) - else: - 
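# [Editor's note: annotation added for readability; not part of the original diff.]
# Fallback for the newer dr.dk/drtv pages: nothing usable could be scraped from
# the markup, so the code below reads the server-rendered state ("data = {...}")
# out of the page, takes the trailing production number from item.customId, and
# queries the programcard API by 'productionnumber' instead of by URN.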
programcard_url = _PROGRAMCARD_BASE - page = self._parse_json( - self._search_regex( - r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage, - 'data'), '1')['cache']['page'] - page = page[list(page.keys())[0]] - item = try_get( - page, (lambda x: x['item'], lambda x: x['entries'][0]['item']), - dict) - video_id = item['customId'].split(':')[-1] - query['productionnumber'] = video_id - - data = self._download_json( - programcard_url, video_id, 'Downloading video JSON', query=query) - - title = str_or_none(data.get('Title')) or re.sub( - r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '', - self._og_search_title(webpage)) - description = self._og_search_description( - webpage, default=None) or data.get('Description') - - timestamp = unified_timestamp( - data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime')) - - thumbnail = None - duration = None - - restricted_to_denmark = False - - formats = [] - subtitles = {} - - assets = [] - primary_asset = data.get('PrimaryAsset') - if isinstance(primary_asset, dict): - assets.append(primary_asset) - secondary_assets = data.get('SecondaryAssets') - if isinstance(secondary_assets, list): - for secondary_asset in secondary_assets: - if isinstance(secondary_asset, dict): - assets.append(secondary_asset) - - def hex_to_bytes(hex): - return binascii.a2b_hex(hex.encode('ascii')) - - def decrypt_uri(e): - n = int(e[2:10], 16) - a = e[10 + n:] - data = bytes_to_intlist(hex_to_bytes(e[10:10 + n])) - key = bytes_to_intlist(hashlib.sha256( - ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()) - iv = bytes_to_intlist(hex_to_bytes(a)) - decrypted = aes_cbc_decrypt(data, key, iv) - return intlist_to_bytes( - decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0] - - for asset in assets: - kind = asset.get('Kind') - if kind == 'Image': - thumbnail = url_or_none(asset.get('Uri')) - elif kind in ('VideoResource', 'AudioResource'): - duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) - restricted_to_denmark = asset.get('RestrictedToDenmark') - asset_target = asset.get('Target') - for link in asset.get('Links', []): - uri = link.get('Uri') - if not uri: - encrypted_uri = link.get('EncryptedUri') - if not encrypted_uri: - continue - try: - uri = decrypt_uri(encrypted_uri) - except Exception: - self.report_warning( - 'Unable to decrypt EncryptedUri', video_id) - continue - uri = url_or_none(uri) - if not uri: - continue - target = link.get('Target') - format_id = target or '' - if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): - preference = -1 - format_id += '-%s' % asset_target - elif asset_target == 'Default': - preference = 1 - else: - preference = None - if target == 'HDS': - f4m_formats = self._extract_f4m_formats( - uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id, fatal=False) - if kind == 'AudioResource': - for f in f4m_formats: - f['vcodec'] = 'none' - formats.extend(f4m_formats) - elif target == 'HLS': - formats.extend(self._extract_m3u8_formats( - uri, video_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, m3u8_id=format_id, - fatal=False)) - else: - bitrate = link.get('Bitrate') - if bitrate: - format_id += '-%s' % bitrate - formats.append({ - 'url': uri, - 'format_id': format_id, - 'tbr': int_or_none(bitrate), - 'ext': link.get('FileFormat'), - 'vcodec': 'none' if kind == 'AudioResource' else None, - 'preference': preference, - }) - subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist') - if isinstance(subtitles_list, list): - LANGS 
= { - 'Danish': 'da', - } - for subs in subtitles_list: - if not isinstance(subs, dict): - continue - sub_uri = url_or_none(subs.get('Uri')) - if not sub_uri: - continue - lang = subs.get('Language') or 'da' - subtitles.setdefault(LANGS.get(lang, lang), []).append({ - 'url': sub_uri, - 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' - }) - - if not formats and restricted_to_denmark: - self.raise_geo_restricted( - 'Unfortunately, DR is not allowed to show this program outside Denmark.', - countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - 'series': str_or_none(data.get('SeriesTitle')), - 'season': str_or_none(data.get('SeasonTitle')), - 'season_number': int_or_none(data.get('SeasonNumber')), - 'season_id': str_or_none(data.get('SeasonUrn')), - 'episode': str_or_none(data.get('EpisodeTitle')), - 'episode_number': int_or_none(data.get('EpisodeNumber')), - 'release_year': int_or_none(data.get('ProductionYear')), - } - - -class DRTVLiveIE(InfoExtractor): - IE_NAME = 'drtv:live' - _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' - _GEO_COUNTRIES = ['DK'] - _TEST = { - 'url': 'https://www.dr.dk/tv/live/dr1', - 'info_dict': { - 'id': 'dr1', - 'ext': 'mp4', - 'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - channel_id = self._match_id(url) - channel_data = self._download_json( - 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, - channel_id) - title = self._live_title(channel_data['Title']) - - formats = [] - for streaming_server in channel_data.get('StreamingServers', []): - server = streaming_server.get('Server') - if not server: - continue - link_type = streaming_server.get('LinkType') - for quality in streaming_server.get('Qualities', []): - for stream in quality.get('Streams', []): - stream_path = stream.get('Stream') - if not stream_path: - continue - stream_url = update_url_query( - '%s/%s' % (server, stream_path), {'b': ''}) - if link_type == 'HLS': - formats.extend(self._extract_m3u8_formats( - stream_url, channel_id, 'mp4', - m3u8_id=link_type, fatal=False, live=True)) - elif link_type == 'HDS': - formats.extend(self._extract_f4m_formats(update_url_query( - '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), - channel_id, f4m_id=link_type, fatal=False)) - self._sort_formats(formats) - - return { - 'id': channel_id, - 'title': title, - 'thumbnail': channel_data.get('PrimaryImageUri'), - 'formats': formats, - 'is_live': True, - } diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py deleted file mode 100644 index 114d2dbe3..000000000 --- a/youtube_dl/extractor/dtube.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re -from socket import timeout - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class DTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})' - _TEST = { - 'url': 'https://d.tube/#!/v/broncnutz/x380jtr1', - 'md5': '9f29088fa08d699a7565ee983f56a06e', - 'info_dict': { - 'id': 'x380jtr1', - 'ext': 'mp4', - 'title': 'Lefty 3-Rings is Back Baby!! 
NCAA Picks', - 'description': 'md5:60be222088183be3a42f196f34235776', - 'uploader_id': 'broncnutz', - 'upload_date': '20190107', - 'timestamp': 1546854054, - }, - 'params': { - 'format': '480p', - }, - } - - def _real_extract(self, url): - uploader_id, video_id = re.match(self._VALID_URL, url).groups() - result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({ - 'jsonrpc': '2.0', - 'method': 'get_content', - 'params': [uploader_id, video_id], - }).encode())['result'] - - metadata = json.loads(result['json_metadata']) - video = metadata['video'] - content = video['content'] - info = video.get('info', {}) - title = info.get('title') or result['title'] - - def canonical_url(h): - if not h: - return None - return 'https://video.dtube.top/ipfs/' + h - - formats = [] - for q in ('240', '480', '720', '1080', ''): - video_url = canonical_url(content.get('video%shash' % q)) - if not video_url: - continue - format_id = (q + 'p') if q else 'Source' - try: - self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) - self._downloader._opener.open(video_url, timeout=5).close() - except timeout: - self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, format_id)) - continue - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'height': int_or_none(q), - 'ext': 'mp4', - }) - - return { - 'id': video_id, - 'title': title, - 'description': content.get('description'), - 'thumbnail': canonical_url(info.get('snaphash')), - 'tags': content.get('tags') or metadata.get('tags'), - 'duration': info.get('duration'), - 'formats': formats, - 'timestamp': parse_iso8601(result.get('created')), - 'uploader_id': uploader_id, - } diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py deleted file mode 100644 index d740652f1..000000000 --- a/youtube_dl/extractor/dw.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) -from ..compat import compat_urlparse - - -class DWIE(InfoExtractor): - IE_NAME = 'dw' - _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)' - _TESTS = [{ - # video - 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', - 'md5': '7372046e1815c5a534b43f3c3c36e6e9', - 'info_dict': { - 'id': '19112290', - 'ext': 'mp4', - 'title': 'Intelligent light', - 'description': 'md5:90e00d5881719f2a6a5827cb74985af1', - 'upload_date': '20160311', - } - }, { - # audio - 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', - 'md5': '2814c9a1321c3a51f8a7aeb067a360dd', - 'info_dict': { - 'id': '19111941', - 'ext': 'mp3', - 'title': 'WorldLink: My business', - 'description': 'md5:bc9ca6e4e063361e21c920c53af12405', - 'upload_date': '20160311', - } - }, { - # DW documentaries, only last for one or two weeks - 'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798', - 'md5': '56b6214ef463bfb9a3b71aeb886f3cf1', - 'info_dict': { - 'id': '19274438', - 'ext': 'mp4', - 'title': 'Welcome to the 90s – Hip Hop', - 'description': 'Welcome to the 90s - The Golden Decade of Hip Hop', - 'upload_date': '20160521', - }, - 'skip': 'Video removed', - }] - - def _real_extract(self, url): - media_id = self._match_id(url) - webpage = self._download_webpage(url, media_id) - hidden_inputs = self._hidden_inputs(webpage) - title = hidden_inputs['media_title'] - media_id = hidden_inputs.get('media_id') or media_id - - if hidden_inputs.get('player_type') == 'video' 
and hidden_inputs.get('stream_file') == '1': - formats = self._extract_smil_formats( - 'http://www.dw.com/smil/v-%s' % media_id, media_id, - transform_source=lambda s: s.replace( - 'rtmp://tv-od.dw.de/flash/', - 'http://tv-download.dw.de/dwtv_video/flv/')) - self._sort_formats(formats) - else: - formats = [{'url': hidden_inputs['file_name']}] - - upload_date = hidden_inputs.get('display_date') - if not upload_date: - upload_date = self._html_search_regex( - r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage, - 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - return { - 'id': media_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': hidden_inputs.get('preview_image'), - 'duration': int_or_none(hidden_inputs.get('file_duration')), - 'upload_date': upload_date, - 'formats': formats, - } - - -class DWArticleIE(InfoExtractor): - IE_NAME = 'dw:article' - _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)' - _TEST = { - 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', - 'md5': '8ca657f9d068bbef74d6fc38b97fc869', - 'info_dict': { - 'id': '19105868', - 'ext': 'mp4', - 'title': 'The harsh life of refugees in Idomeni', - 'description': 'md5:196015cc7e48ebf474db9399420043c7', - 'upload_date': '20160310', - } - } - - def _real_extract(self, url): - article_id = self._match_id(url) - webpage = self._download_webpage(url, article_id) - hidden_inputs = self._hidden_inputs(webpage) - media_id = hidden_inputs['media_id'] - media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url') - media_url = compat_urlparse.urljoin(url, media_path) - return self.url_result(media_url, 'DW', media_id) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py deleted file mode 100644 index 36fef07b7..000000000 --- a/youtube_dl/extractor/eagleplatform.py +++ /dev/null @@ -1,206 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - unsmuggle_url, - url_or_none, -) - - -class EaglePlatformIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - eagleplatform:(?P<custom_host>[^/]+):| - https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id= - ) - (?P<id>\d+) - ''' - _TESTS = [{ - # http://lenta.ru/news/2015/03/06/navalny/ - 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used - 'info_dict': { - 'id': '227304', - 'ext': 'mp4', - 'title': 'Навальный вышел на свободу', - 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 87, - 'view_count': int, - 'age_limit': 0, - }, - }, { - # http://muz-tv.ru/play/7129/ - # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true - 'url': 'eagleplatform:media.clipyou.ru:12820', - 'md5': '358597369cf8ba56675c1df15e7af624', - 'info_dict': { - 'id': '12820', - 'ext': 'mp4', - 'title': "'O Sole Mio", - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 216, - 'view_count': int, - }, - 'skip': 'Georestricted', - }, { - # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) - 'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306', - 
'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - # Regular iframe embedding - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', - webpage) - if mobj is not None: - return mobj.group('url') - PLAYER_JS_RE = r''' - <script[^>]+ - src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) - .+? - ''' - # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/) - mobj = re.search( - r'''(?xs) - %s - <div[^>]+ - class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+ - data-id=["\'](?P<id>\d+) - ''' % PLAYER_JS_RE, webpage) - if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() - # Generalization of "Javascript code usage", "Combined usage" and - # "Usage without attaching to DOM" embeddings (see - # http://dultonmedia.github.io/eplayer/) - mobj = re.search( - r'''(?xs) - %s - <script> - .+? - new\s+EaglePlayer\( - (?:[^,]+\s*,\s*)? - { - .+? - \bid\s*:\s*["\']?(?P<id>\d+) - .+? - } - \s*\) - .+? - </script> - ''' % PLAYER_JS_RE, webpage) - if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() - - @staticmethod - def _handle_error(response): - status = int_or_none(response.get('status', 200)) - if status != 200: - raise ExtractorError(' '.join(response['errors']), expected=True) - - def _download_json(self, url_or_request, video_id, *args, **kwargs): - try: - response = super(EaglePlatformIE, self)._download_json( - url_or_request, video_id, *args, **kwargs) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) - self._handle_error(response) - raise - return response - - def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'): - return self._download_json(url_or_request, video_id, note)['data'][0] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = re.match(self._VALID_URL, url) - host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') - - headers = {} - query = { - 'id': video_id, - } - - referrer = smuggled_data.get('referrer') - if referrer: - headers['Referer'] = referrer - query['referrer'] = referrer - - player_data = self._download_json( - 'http://%s/api/player_data' % host, video_id, - headers=headers, query=query) - - media = player_data['data']['playlist']['viewports'][0]['medialist'][0] - - title = media['title'] - description = media.get('description') - thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:') - duration = int_or_none(media.get('duration')) - view_count = int_or_none(media.get('views')) - - age_restriction = media.get('age_restriction') - age_limit = None - if age_restriction: - age_limit = 0 if age_restriction == 'allow_all' else 18 - - secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:') - - formats = [] - - m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - - m3u8_formats_dict = {} - for f in m3u8_formats: - if f.get('height') is not None: - m3u8_formats_dict[f['height']] = f - - mp4_data = self._download_json( - # Secure mp4 URL is constructed according to Player.prototype.mp4 from - # 
http://lentaru.media.eagleplatform.com/player/player.js - re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4s', secure_m3u8), - video_id, 'Downloading mp4 JSON', fatal=False) - if mp4_data: - for format_id, format_url in mp4_data.get('data', {}).items(): - if not url_or_none(format_url): - continue - height = int_or_none(format_id) - if height is not None and m3u8_formats_dict.get(height): - f = m3u8_formats_dict[height].copy() - f.update({ - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - else: - f = { - 'format_id': 'http-%s' % format_id, - 'height': int_or_none(format_id), - } - f['url'] = format_url - formats.append(f) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py deleted file mode 100644 index 9bbd703e0..000000000 --- a/youtube_dl/extractor/egghead.py +++ /dev/null @@ -1,140 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - int_or_none, - try_get, - unified_timestamp, - url_or_none, -) - - -class EggheadBaseIE(InfoExtractor): - def _call_api(self, path, video_id, resource, fatal=True): - return self._download_json( - 'https://app.egghead.io/api/v1/' + path, - video_id, 'Downloading %s JSON' % resource, fatal=fatal) - - -class EggheadCourseIE(EggheadBaseIE): - IE_DESC = 'egghead.io course' - IE_NAME = 'egghead:course' - _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', - 'playlist_count': 29, - 'info_dict': { - 'id': '432655', - 'title': 'Professor Frisby Introduces Composable Functional JavaScript', - 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', - }, - }, { - 'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - series_path = 'series/' + playlist_id - lessons = self._call_api( - series_path + '/lessons', playlist_id, 'course lessons') - - entries = [] - for lesson in lessons: - lesson_url = url_or_none(lesson.get('http_url')) - if not lesson_url: - continue - lesson_id = lesson.get('id') - if lesson_id: - lesson_id = compat_str(lesson_id) - entries.append(self.url_result( - lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - - course = self._call_api( - series_path, playlist_id, 'course', False) or {} - - playlist_id = course.get('id') - if playlist_id: - playlist_id = compat_str(playlist_id) - - return self.playlist_result( - entries, playlist_id, course.get('title'), - course.get('description')) - - -class EggheadLessonIE(EggheadBaseIE): - IE_DESC = 'egghead.io lesson' - IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', - 'info_dict': { - 'id': '1196', - 'display_id': 'javascript-linear-data-flow-with-container-style-types-box', - 'ext': 'mp4', - 'title': 'Create linear data flow with container style types (Box)', - 
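# Editorial sketch (assumed data shapes, e.g. integer height keys): the mp4
# handling in EaglePlatformIE above pairs each progressive variant with the
# already-extracted HLS format of the same height, so the HTTP entry inherits
# the HLS metadata and only format_id and protocol are rewritten; unmatched
# heights fall back to a bare entry.
def pair_mp4_with_hls(mp4_urls_by_height, hls_formats):
    by_height = {f['height']: f for f in hls_formats if f.get('height')}
    paired = []
    for height, url in mp4_urls_by_height.items():
        if height in by_height:
            f = dict(by_height[height])
            f.update({
                'format_id': f['format_id'].replace('hls', 'http'),
                'protocol': 'http',
            })
        else:
            f = {'format_id': 'http-%d' % height, 'height': height}
        f['url'] = url
        paired.append(f)
    return paired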
'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', - 'thumbnail': r're:^https?:.*\.jpg$', - 'timestamp': 1481296768, - 'upload_date': '20161209', - 'duration': 304, - 'view_count': 0, - 'tags': 'count:2', - }, - 'params': { - 'skip_download': True, - 'format': 'bestvideo', - }, - }, { - 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', - 'only_matching': True, - }, { - 'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - lesson = self._call_api( - 'lessons/' + display_id, display_id, 'lesson') - - lesson_id = compat_str(lesson['id']) - title = lesson['title'] - - formats = [] - for _, format_url in lesson['media_urls'].items(): - format_url = url_or_none(format_url) - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, lesson_id, 'mp4', entry_protocol='m3u8', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, lesson_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - }) - self._sort_formats(formats) - - return { - 'id': lesson_id, - 'display_id': display_id, - 'title': title, - 'description': lesson.get('summary'), - 'thumbnail': lesson.get('thumb_nail'), - 'timestamp': unified_timestamp(lesson.get('published_at')), - 'duration': int_or_none(lesson.get('duration')), - 'view_count': int_or_none(lesson.get('plays_count')), - 'tags': try_get(lesson, lambda x: x['tag_list'], list), - 'series': try_get( - lesson, lambda x: x['series']['title'], compat_str), - 'formats': formats, - } diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py deleted file mode 100644 index 4e0f8bc81..000000000 --- a/youtube_dl/extractor/einthusan.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_str, - compat_urlparse, -) -from ..utils import ( - extract_attributes, - ExtractorError, - get_elements_by_class, - urlencode_postdata, -) - - -class EinthusanIE(InfoExtractor): - _VALID_URL = r'https?://(?P<host>einthusan\.(?:tv|com|ca))/movie/watch/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://einthusan.tv/movie/watch/9097/', - 'md5': 'ff0f7f2065031b8a2cf13a933731c035', - 'info_dict': { - 'id': '9097', - 'ext': 'mp4', - 'title': 'Ae Dil Hai Mushkil', - 'description': 'md5:33ef934c82a671a94652a9b4e54d931b', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi', - 'only_matching': True, - }, { - 'url': 'https://einthusan.com/movie/watch/9097/', - 'only_matching': True, - }, { - 'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi', - 'only_matching': True, - }] - - # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js - def _decrypt(self, encrypted_data, video_id): - return self._parse_json(compat_b64decode(( - encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] - )).decode('utf-8'), video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<h3>([^<]+)</h3>', webpage, 'title') - - player_params = 
extract_attributes(self._search_regex( - r'(<section[^>]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters')) - - page_id = self._html_search_regex( - '<html[^>]+data-pageid="([^"]+)"', webpage, 'page ID') - video_data = self._download_json( - 'https://%s/ajax/movie/watch/%s/' % (host, video_id), video_id, - data=urlencode_postdata({ - 'xEvent': 'UIVideoPlayer.PingOutcome', - 'xJson': json.dumps({ - 'EJOutcomes': player_params['data-ejpingables'], - 'NativeHLS': False - }), - 'arcVersion': 3, - 'appVersion': 59, - 'gorilla.csrf.Token': page_id, - }))['Data'] - - if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'): - raise ExtractorError( - 'Download rate reached. Please try again later.', expected=True) - - ej_links = self._decrypt(video_data['EJLinks'], video_id) - - formats = [] - - m3u8_url = ej_links.get('HLSLink') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) - - mp4_url = ej_links.get('MP4Link') - if mp4_url: - formats.append({ - 'url': mp4_url, - }) - - self._sort_formats(formats) - - description = get_elements_by_class('synopsis', webpage)[0] - thumbnail = self._html_search_regex( - r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''', - webpage, 'thumbnail url', fatal=False, group='url') - if thumbnail is not None: - thumbnail = compat_urlparse.urljoin(url, thumbnail) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py deleted file mode 100644 index bfecd3a41..000000000 --- a/youtube_dl/extractor/eporner.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - encode_base_n, - ExtractorError, - int_or_none, - merge_dicts, - parse_duration, - str_to_int, - url_or_none, -) - - -class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' 
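# Editorial sketch: EinthusanIE._decrypt above undoes a one-character shuffle
# before base64-decoding -- the first ten characters are kept, the final
# character is spliced in as the eleventh, characters at indices 10-11 are
# dropped, and the rest follows. A self-contained version:
import base64
import json

def decode_einthusan_payload(encrypted_data):
    reordered = (encrypted_data[:10] + encrypted_data[-1]
                 + encrypted_data[12:-1])
    return json.loads(base64.b64decode(reordered).decode('utf-8'))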
- _TESTS = [{ - 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', - 'md5': '39d486f046212d8e1b911c52ab4691f8', - 'info_dict': { - 'id': 'qlDUmNsj6VS', - 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', - 'ext': 'mp4', - 'title': 'Infamous Tiffany Teen Strip Tease Video', - 'description': 'md5:764f39abf932daafa37485eb46efa152', - 'timestamp': 1232520922, - 'upload_date': '20090121', - 'duration': 1838, - 'view_count': int, - 'age_limit': 18, - }, - 'params': { - 'proxy': '127.0.0.1:8118' - } - }, { - # New (May 2016) URL layout - 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', - 'only_matching': True, - }, { - 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', - 'only_matching': True, - }, { - 'url': 'http://www.eporner.com/embed/3YRUtzMcWn0', - 'only_matching': True, - }, { - 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage, urlh = self._download_webpage_handle(url, display_id) - - video_id = self._match_id(urlh.geturl()) - - hash = self._search_regex( - r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') - - title = self._og_search_title(webpage, default=None) or self._html_search_regex( - r'<title>(.+?) - EPORNER', webpage, 'title') - - # Reverse engineered from vjs.js - def calc_hash(s): - return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) - - video = self._download_json( - 'http://www.eporner.com/xhr/video/%s' % video_id, - display_id, note='Downloading video JSON', - query={ - 'hash': calc_hash(hash), - 'device': 'generic', - 'domain': 'www.eporner.com', - 'fallback': 'false', - }) - - if video.get('available') is False: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, video['message']), expected=True) - - sources = video['sources'] - - formats = [] - for kind, formats_dict in sources.items(): - if not isinstance(formats_dict, dict): - continue - for format_id, format_dict in formats_dict.items(): - if not isinstance(format_dict, dict): - continue - src = url_or_none(format_dict.get('src')) - if not src or not src.startswith('http'): - continue - if kind == 'hls': - formats.extend(self._extract_m3u8_formats( - src, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=kind, fatal=False)) - else: - height = int_or_none(self._search_regex( - r'(\d+)[pP]', format_id, 'height', default=None)) - fps = int_or_none(self._search_regex( - r'(\d+)fps', format_id, 'fps', default=None)) - - formats.append({ - 'url': src, - 'format_id': format_id, - 'height': height, - 'fps': fps, - }) - self._sort_formats(formats) - - json_ld = self._search_json_ld(webpage, display_id, default={}) - - duration = parse_duration(self._html_search_meta( - 'duration', webpage, default=None)) - view_count = str_to_int(self._search_regex( - r'id=["\']cinemaviews1["\'][^>]*>\s*([0-9,]+)', - webpage, 'view count', default=None)) - - return merge_dicts(json_ld, { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - 'age_limit': 18, - }) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py deleted file mode 100644 index c460dc7f9..000000000 --- a/youtube_dl/extractor/eroprofile.py +++ /dev/null @@ -1,92 +0,0 @@ -from __future__ import unicode_literals - -import re - 
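# Editorial sketch: calc_hash() in EpornerIE above re-encodes each
# 8-hex-digit quarter of the 32-character page hash in base 36 (via
# youtube-dl's encode_base_n). A self-contained equivalent, with the base-36
# encoder reimplemented for illustration:
def to_base36(num):
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    if num == 0:
        return '0'
    out = ''
    while num:
        num, rem = divmod(num, 36)
        out = digits[rem] + out
    return out

def calc_hash(hash32):
    return ''.join(
        to_base36(int(hash32[i:i + 8], 16)) for i in range(0, 32, 8))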
-from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..utils import ( - ExtractorError, - merge_dicts, -) - - -class EroProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)' - _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?' - _NETRC_MACHINE = 'eroprofile' - _TESTS = [{ - 'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore', - 'md5': 'c26f351332edf23e1ea28ce9ec9de32f', - 'info_dict': { - 'id': '3733775', - 'display_id': 'sexy-babe-softcore', - 'ext': 'm4v', - 'title': 'sexy babe softcore', - 'thumbnail': r're:https?://.*\.jpg', - 'age_limit': 18, - }, - 'skip': 'Video not found', - }, { - 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', - 'md5': '1baa9602ede46ce904c431f5418d8916', - 'info_dict': { - 'id': '1133519', - 'ext': 'm4v', - 'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file', - 'thumbnail': r're:https?://.*\.jpg', - 'age_limit': 18, - }, - 'skip': 'Requires login', - }] - - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - query = compat_urllib_parse_urlencode({ - 'username': username, - 'password': password, - 'url': 'http://www.eroprofile.com/', - }) - login_url = self._LOGIN_URL + query - login_page = self._download_webpage(login_url, None, False) - - m = re.search(r'Your username or password was incorrect\.', login_page) - if m: - raise ExtractorError( - 'Wrong username and/or password.', expected=True) - - self.report_login() - redirect_url = self._search_regex( - r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') - self._download_webpage(redirect_url, None, False) - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - m = re.search(r'You must be logged in to view this video\.', webpage) - if m: - self.raise_login_required('This video requires login') - - video_id = self._search_regex( - [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], - webpage, 'video id', default=None) - - title = self._html_search_regex( - (r'Title:</th><td>([^<]+)</td>', r'<h1[^>]*>(.+?)</h1>'), - webpage, 'title') - - info = self._parse_html5_media_entries(url, webpage, video_id)[0] - - return merge_dicts(info, { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'age_limit': 18, - }) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py deleted file mode 100644 index 6cf05e6da..000000000 --- a/youtube_dl/extractor/espn.py +++ /dev/null @@ -1,238 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .once import OnceIE -from ..compat import compat_str -from ..utils import ( - determine_ext, - int_or_none, - unified_timestamp, -) - - -class ESPNIE(OnceIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?: - (?:(?:\w+\.)+)?espn\.go| - (?:www\.)?espn - )\.com/ - (?: - (?: - video/(?:clip|iframe/twitter)| - watch/player - ) - (?: - .*?\?.*?\bid=| - /_/id/ - )| - [^/]+/video/ - ) - )| - (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ - ) - (?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'http://espn.go.com/video/clip?id=10365079', - 'info_dict': { - 'id': '10365079', - 'ext': 'mp4', - 'title': '30 for 30 Shorts: Judging Jewell', - 'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f', - 'timestamp': 
1390936111, - 'upload_date': '20140128', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://broadband.espn.go.com/video/clip?id=18910086', - 'info_dict': { - 'id': '18910086', - 'ext': 'mp4', - 'title': 'Kyrie spins around defender for two', - 'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b', - 'timestamp': 1489539155, - 'upload_date': '20170315', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672', - 'only_matching': True, - }, { - 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?id=19141491', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player/_/id/19141491', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/video/clip?id=10365079', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/video/clip/_/id/17989860', - 'only_matching': True, - }, { - 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', - 'only_matching': True, - }, { - 'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls', - 'only_matching': True, - }, { - 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - clip = self._download_json( - 'http://api-app.espn.com/v1/video/clips/%s' % video_id, - video_id)['videos'][0] - - title = clip['headline'] - - format_urls = set() - formats = [] - - def traverse_source(source, base_source_id=None): - for source_id, source in source.items(): - if source_id == 'alert': - continue - elif isinstance(source, compat_str): - extract_source(source, base_source_id) - elif isinstance(source, dict): - traverse_source( - source, - '%s-%s' % (base_source_id, source_id) - if base_source_id else source_id) - - def extract_source(source_url, source_id=None): - if source_url in format_urls: - return - format_urls.add(source_url) - ext = determine_ext(source_url) - if OnceIE.suitable(source_url): - formats.extend(self._extract_once_formats(source_url)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - source_url, video_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - source_url, video_id, f4m_id=source_id, fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=source_id, fatal=False)) - else: - f = { - 'url': source_url, - 'format_id': source_id, - } - mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url) - if mobj: - f.update({ - 'height': int(mobj.group(1)), - 'fps': int(mobj.group(2)), - 'tbr': int(mobj.group(3)), - }) - if source_id == 'mezzanine': - f['preference'] = 1 - formats.append(f) - - links = clip.get('links', {}) - traverse_source(links.get('source', {})) - traverse_source(links.get('mobile', {})) - self._sort_formats(formats) - - description = clip.get('caption') or clip.get('description') - thumbnail = clip.get('thumbnail') - duration = int_or_none(clip.get('duration')) - timestamp = 
unified_timestamp(clip.get('originalPublishDate')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } - - -class ESPNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'http://espn.go.com/nba/recap?gameId=400793786', - 'only_matching': True, - }, { - 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge', - 'only_matching': True, - }, { - 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings', - 'only_matching': True, - }, { - 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_id = self._search_regex( - r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P<id>\d+)', - webpage, 'video id', group='id') - - return self.url_result( - 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) - - -class FiveThirtyEightIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/', - 'info_dict': { - 'id': '56032156', - 'ext': 'flv', - 'title': 'FiveThirtyEight: The Raiders can still make the playoffs', - 'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.', - }, - 'params': { - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - embed_url = self._search_regex( - r'<iframe[^>]+src=["\'](https?://fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/\d+)', - webpage, 'embed url') - - return self.url_result(embed_url, 'AbcNewsVideo') diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py deleted file mode 100644 index 2c1c747a1..000000000 --- a/youtube_dl/extractor/europa.py +++ /dev/null @@ -1,93 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - orderedSet, - parse_duration, - qualities, - unified_strdate, - xpath_text -) - - -class EuropaIE(InfoExtractor): - _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)' - _TESTS = [{ - 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', - 'md5': '574f080699ddd1e19a675b0ddf010371', - 'info_dict': { - 'id': 'I107758', - 'ext': 'mp4', - 'title': 'TRADE - Wikileaks on TTIP', - 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20150811', - 'duration': 34, - 'view_count': int, - 'formats': 'mincount:3', - } - }, { - 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', - 'only_matching': True, - }, { - 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', - 
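# Editorial sketch: traverse_source()/extract_source() in ESPNIE above walk
# the clip's nested "links" tree -- string leaves are source URLs
# (deduplicated through a shared set), nested dicts extend the format id with
# their key, and "alert" entries are skipped. A flattened restatement that
# collects (url, source_id) pairs instead of dispatching on extension:
def collect_sources(tree, base_id=None, seen=None):
    seen = set() if seen is None else seen
    pairs = []
    for key, value in tree.items():
        if key == 'alert':
            continue
        if isinstance(value, str):
            if value not in seen:
                seen.add(value)
                pairs.append((value, base_id))
        elif isinstance(value, dict):
            child_id = '%s-%s' % (base_id, key) if base_id else key
            pairs.extend(collect_sources(value, child_id, seen))
    return pairs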
'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - playlist = self._download_xml( - 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id) - - def get_item(type_, preference): - items = {} - for item in playlist.findall('./info/%s/item' % type_): - lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) - if lang and label: - items[lang] = label.strip() - for p in preference: - if items.get(p): - return items[p] - - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - preferred_lang = query.get('sitelang', ('en', ))[0] - - preferred_langs = orderedSet((preferred_lang, 'en', 'int')) - - title = get_item('title', preferred_langs) or video_id - description = get_item('description', preferred_langs) - thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') - upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) - duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) - view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) - - language_preference = qualities(preferred_langs[::-1]) - - formats = [] - for file_ in playlist.findall('./files/file'): - video_url = xpath_text(file_, './url') - if not video_url: - continue - lang = xpath_text(file_, './lg') - formats.append({ - 'url': video_url, - 'format_id': lang, - 'format_note': xpath_text(file_, './lglabel'), - 'language_preference': language_preference(lang) - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'formats': formats - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py deleted file mode 100644 index 6e8fc3961..000000000 --- a/youtube_dl/extractor/extractors.py +++ /dev/null @@ -1,1644 +0,0 @@ -# flake8: noqa -from __future__ import unicode_literals - -from .abc import ( - ABCIE, - ABCIViewIE, -) -from .abcnews import ( - AbcNewsIE, - AbcNewsVideoIE, -) -from .abcotvs import ( - ABCOTVSIE, - ABCOTVSClipsIE, -) -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .adn import ADNIE -from .adobeconnect import AdobeConnectIE -from .adobetv import ( - AdobeTVEmbedIE, - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import ( - AENetworksIE, - AENetworksCollectionIE, - AENetworksShowIE, - HistoryTopicIE, - HistoryPlayerIE, - BiographyIE, -) -from .afreecatv import AfreecaTVIE -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .amara import AmaraIE -from .amcnetworks import AMCNetworksIE -from .americastestkitchen import ( - AmericasTestKitchenIE, - AmericasTestKitchenSeasonIE, -) -from .animeondemand import AnimeOnDemandIE -from .anvato import AnvatoIE -from .aol import AolIE -from .allocine import AllocineIE -from .aliexpress import AliExpressLiveIE -from .apa import APAIE -from .aparat import AparatIE -from .appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .applepodcasts import ApplePodcastsIE -from .archiveorg import ArchiveOrgIE -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .ard import ( - ARDBetaMediathekIE, - ARDIE, - 
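# Editorial sketch: language_preference in EuropaIE above is built with
# youtube-dl's qualities() helper, which scores an item by its index in a
# list; reversing preferred_langs first makes the most preferred language
# score highest. A minimal reimplementation plus usage under an assumed
# ?sitelang=fr request:
def qualities(ordered):
    def score(item):
        try:
            return ordered.index(item)
        except ValueError:
            return -1
    return score

preferred_langs = ['fr', 'en', 'int']  # hypothetical sitelang
language_preference = qualities(preferred_langs[::-1])
assert (language_preference('fr') > language_preference('en')
        > language_preference('int'))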
ARDMediathekIE, -) -from .arte import ( - ArteTVIE, - ArteTVEmbedIE, - ArteTVPlaylistIE, -) -from .arnes import ArnesIE -from .asiancrush import ( - AsianCrushIE, - AsianCrushPlaylistIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .atvat import ATVAtIE -from .audimedia import AudiMediaIE -from .audioboom import AudioBoomIE -from .audiomack import AudiomackIE, AudiomackAlbumIE -from .awaan import ( - AWAANIE, - AWAANVideoIE, - AWAANLiveIE, - AWAANSeasonIE, -) -from .azmedien import AZMedienIE -from .baidu import BaiduVideoIE -from .bandaichannel import BandaiChannelIE -from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCCoUkIPlayerEpisodesIE, - BBCCoUkIPlayerGroupIE, - BBCCoUkPlaylistIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .bellmedia import BellMediaIE -from .beatport import BeatportIE -from .bet import BetIE -from .bfi import BFIPlayerIE -from .bfmtv import ( - BFMTVIE, - BFMTVLiveIE, - BFMTVArticleIE, -) -from .bibeltv import BibelTVIE -from .bigflix import BigflixIE -from .bild import BildIE -from .bilibili import ( - BiliBiliIE, - BiliBiliBangumiIE, - BilibiliAudioIE, - BilibiliAudioAlbumIE, - BiliBiliPlayerIE, -) -from .biobiochiletv import BioBioChileTVIE -from .bitchute import ( - BitChuteIE, - BitChuteChannelIE, -) -from .biqle import BIQLEIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .bloomberg import BloombergIE -from .bokecc import BokeCCIE -from .bongacams import BongaCamsIE -from .bostonglobe import BostonGlobeIE -from .box import BoxIE -from .bpb import BpbIE -from .br import ( - BRIE, - BRMediathekIE, -) -from .bravotv import BravoTVIE -from .breakcom import BreakIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .businessinsider import BusinessInsiderIE -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .cammodels import CamModelsIE -from .camtube import CamTubeIE -from .camwithher import CamWithHerIE -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) -from .carambatv import ( - CarambaTVIE, - CarambaTVPageIE, -) -from .cartoonnetwork import CartoonNetworkIE -from .cbc import ( - CBCIE, - CBCPlayerIE, - CBCWatchVideoIE, - CBCWatchIE, - CBCOlympicsIE, -) -from .cbs import CBSIE -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) -from .cbsinteractive import CBSInteractiveIE -from .cbsnews import ( - CBSNewsEmbedIE, - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import ( - CBSSportsEmbedIE, - CBSSportsIE, - TwentyFourSevenSportsIE, -) -from .ccc import ( - CCCIE, - CCCPlaylistIE, -) -from .ccma import CCMAIE -from .cctv import CCTVIE -from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) -from .channel9 import Channel9IE -from .charlierose import CharlieRoseIE -from .chaturbate import ChaturbateIE -from .chilloutzone import ChilloutzoneIE -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemax import CinemaxIE -from .ciscolive import ( - CiscoLiveSessionIE, - CiscoLiveSearchIE, -) -from .cjsw import CJSWIE -from .cliphunter import CliphunterIE -from .clippit import ClippitIE -from .cliprs import ClipRsIE -from .clipsyndicate 
import ClipsyndicateIE -from .closertotruth import CloserToTruthIE -from .cloudflarestream import CloudflareStreamIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnbc import ( - CNBCIE, - CNBCVideoIE, -) -from .cnn import ( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .coub import CoubIE -from .comedycentral import ( - ComedyCentralIE, - ComedyCentralTVIE, -) -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, -) -from .condenast import CondeNastIE -from .contv import CONtvIE -from .corus import CorusIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .crooksandliars import CrooksAndLiarsIE -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE -) -from .cspan import CSpanIE -from .ctsnews import CtsNewsIE -from .ctv import CTVIE -from .ctvnews import CTVNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .curiositystream import ( - CuriosityStreamIE, - CuriosityStreamCollectionIE, -) -from .cwtv import CWTVIE -from .dailymail import DailyMailIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - DaumPlaylistIE, - DaumUserIE, -) -from .dbtv import DBTVIE -from .dctp import DctpTvIE -from .deezer import DeezerPlaylistIE -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .digg import DiggIE -from .dotsub import DotsubIE -from .douyutv import ( - DouyuShowIE, - DouyuTVIE, -) -from .dplay import ( - DPlayIE, - DiscoveryPlusIE, - HGTVDeIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import ( - DRTVIE, - DRTVLiveIE, -) -from .dtube import DTubeIE -from .dvtv import DVTVIE -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .discovery import DiscoveryIE -from .discoverygo import ( - DiscoveryGoIE, - DiscoveryGoPlaylistIE, -) -from .discoverynetworks import DiscoveryNetworksDeIE -from .discoveryvr import DiscoveryVRIE -from .disney import DisneyIE -from .dispeak import DigitallySpeakingIE -from .dropbox import DropboxIE -from .dw import ( - DWIE, - DWArticleIE, -) -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .egghead import ( - EggheadCourseIE, - EggheadLessonIE, -) -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentube import ( - EllenTubeIE, - EllenTubeVideoIE, - EllenTubePlaylistIE, -) -from .elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import EngadgetIE -from .eporner import EpornerIE -from .eroprofile import EroProfileIE -from .escapist import EscapistIE -from .espn import ( - ESPNIE, - ESPNArticleIE, - FiveThirtyEightIE, -) -from .esri import EsriVideoIE -from .europa import EuropaIE -from .expotv import ExpoTVIE -from .expressen import ExpressenIE -from .extremetube import ExtremeTubeIE -from .eyedotv import EyedoTVIE -from .facebook import ( - FacebookIE, - FacebookPluginsVideoIE, -) -from .faz import FazIE -from .fc2 import ( - FC2IE, - FC2EmbedIE, -) -from .fczenit import FczenitIE -from .filmon import ( - FilmOnIE, - FilmOnChannelIE, -) -from .filmweb import FilmwebIE -from .firsttv import FirstTVIE -from .fivemin import FiveMinIE -from .fivetv import FiveTVIE -from .flickr import FlickrIE 
-from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .formula1 import Formula1IE -from .fourtube import ( - FourTubeIE, - PornTubeIE, - PornerBrosIE, - FuxIE, -) -from .fox import FOXIE -from .fox9 import ( - FOX9IE, - FOX9NewsIE, -) -from .foxgay import FoxgayIE -from .foxnews import ( - FoxNewsIE, - FoxNewsArticleIE, -) -from .foxsports import FoxSportsIE -from .franceculture import FranceCultureIE -from .franceinter import FranceInterIE -from .francetv import ( - FranceTVIE, - FranceTVSiteIE, - FranceTVEmbedIE, - FranceTVInfoIE, - FranceTVInfoSportIE, - FranceTVJeunesseIE, - GenerationWhatIE, - CultureboxIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .freshlive import FreshLiveIE -from .frontendmasters import ( - FrontendMastersIE, - FrontendMastersLessonIE, - FrontendMastersCourseIE -) -from .fujitv import FujiTVFODPlus7IE -from .funimation import FunimationIE -from .funk import FunkIE -from .fusion import FusionIE -from .gaia import GaiaIE -from .gameinformer import GameInformerIE -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gaskrank import GaskrankIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .gedidigital import GediDigitalIE -from .generic import GenericIE -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .go import GoIE -from .godtube import GodTubeIE -from .golem import GolemIE -from .googledrive import GoogleDriveIE -from .googlepodcasts import ( - GooglePodcastsIE, - GooglePodcastsFeedIE, -) -from .googlesearch import GoogleSearchIE -from .goshgay import GoshgayIE -from .gputechconf import GPUTechConfIE -from .groupon import GrouponIE -from .hbo import HBOIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .hgtv import HGTVComShowIE -from .hketv import HKETVIE -from .hidive import HiDiveIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hitrecord import HitRecordIE -from .hornbunny import HornBunnyIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import ( - HotStarIE, - HotStarPlaylistIE, -) -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .hrti import ( - HRTiIE, - HRTiPlaylistIE, -) -from .huajiao import HuajiaoIE -from .huffpost import HuffPostIE -from .hungama import ( - HungamaIE, - HungamaSongIE, -) -from .hypem import HypemIE -from .ign import ( - IGNIE, - IGNVideoIE, - IGNArticleIE, -) -from .iheart import ( - IHeartRadioIE, - IHeartRadioPodcastIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, - ImgurGalleryIE, -) -from .ina import InaIE -from .inc import IncIE -from .indavideo import IndavideoEmbedIE -from .infoq import InfoQIE -from .instagram import ( - InstagramIE, - InstagramUserIE, - InstagramTagIE, -) -from .internazionale import InternazionaleIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE -from .iqiyi import IqiyiIE -from .ir90tv import Ir90TvIE -from .itv import ( - ITVIE, - ITVBTCCIE, -) -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .iwara import IwaraIE -from .izlesene import IzleseneIE -from .jamendo import ( - JamendoIE, - JamendoAlbumIE, -) -from 
.jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .joj import JojIE -from .jwplatform import JWPlatformIE -from .kakao import KakaoIE -from .kaltura import KalturaIE -from .kankan import KankanIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies import KeezMoviesIE -from .ketnet import KetnetIE -from .khanacademy import ( - KhanAcademyIE, - KhanAcademyUnitIE, -) -from .kickstarter import KickStarterIE -from .kinja import KinjaEmbedIE -from .kinopoisk import KinoPoiskIE -from .konserthusetplay import KonserthusetPlayIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kusi import KUSIIE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import LA7IE -from .laola1tv import ( - Laola1TvEmbedIE, - Laola1TvIE, - EHFTVIE, - ITTFIE, -) -from .lbry import ( - LBRYIE, - LBRYChannelIE, -) -from .lci import LCIIE -from .lcp import ( - LcpPlayIE, - LcpIE, -) -from .lecture2go import Lecture2GoIE -from .lecturio import ( - LecturioIE, - LecturioCourseIE, - LecturioDeCourseIE, -) -from .leeco import ( - LeIE, - LePlaylistIE, - LetvCloudIE, -) -from .lego import LEGOIE -from .lemonde import LemondeIE -from .lenta import LentaIE -from .libraryofcongress import LibraryOfCongressIE -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .line import ( - LineTVIE, - LineLiveIE, - LineLiveChannelIE, -) -from .linkedin import ( - LinkedInLearningIE, - LinkedInLearningCourseIE, -) -from .linuxacademy import LinuxAcademyIE -from .litv import LiTVIE -from .livejournal import LiveJournalIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import LnkGoIE -from .localnews8 import LocalNews8IE -from .lovehomeporn import LoveHomePornIE -from .lrt import LRTIE -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .mailru import ( - MailRuIE, - MailRuMusicIE, - MailRuMusicSearchIE, -) -from .malltv import MallTVIE -from .mangomolo import ( - MangomoloVideoIE, - MangomoloLiveIE, -) -from .manyvids import ManyVidsIE -from .maoritv import MaoriTVIE -from .markiza import ( - MarkizaIE, - MarkizaPageIE, -) -from .massengeschmacktv import MassengeschmackTVIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .medaltv import MedalTVIE -from .mediaset import MediasetIE -from .mediasite import ( - MediasiteIE, - MediasiteCatalogIE, - MediasiteNamedCatalogIE, -) -from .medici import MediciIE -from .megaphone import MegaphoneIE -from .meipai import MeipaiIE -from .melonvod import MelonVODIE -from .meta import METAIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .mgtv import MGTVIE -from .miaopai import MiaoPaiIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyIE, - MicrosoftVirtualAcademyCourseIE, -) -from .minds import ( - MindsIE, - MindsChannelIE, - MindsGroupIE, -) -from .ministrygrid import MinistryGridIE -from .minoto import MinotoIE -from .miomio import MioMioIE -from .mit import TechTVMITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixcloud import ( - MixcloudIE, - MixcloudUserIE, - MixcloudPlaylistIE, -) -from .mlb import ( - MLBIE, - MLBVideoIE, -) -from .mnet import MnetIE -from .moevideo import MoeVideoIE -from .mofosex import ( - MofosexIE, - MofosexEmbedIE, -) -from 
.mojvideo import MojvideoIE -from .morningstar import MorningstarIE -from .motherless import ( - MotherlessIE, - MotherlessGroupIE -) -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviezine import MoviezineIE -from .movingimage import MovingImageIE -from .msn import MSNIE -from .mtv import ( - MTVIE, - MTVVideoIE, - MTVServicesEmbeddedIE, - MTVDEIE, - MTVJapanIE, -) -from .muenchentv import MuenchenTVIE -from .mwave import MwaveIE, MwaveMeetGreetIE -from .mychannels import MyChannelsIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import ( - MyviIE, - MyviEmbedIE, -) -from .myvidster import MyVidsterIE -from .nationalgeographic import ( - NationalGeographicVideoIE, - NationalGeographicTVIE, -) -from .naver import NaverIE -from .nba import ( - NBAWatchEmbedIE, - NBAWatchIE, - NBAWatchCollectionIE, - NBAEmbedIE, - NBAIE, - NBAChannelIE, -) -from .nbc import ( - NBCIE, - NBCNewsIE, - NBCOlympicsIE, - NBCOlympicsStreamIE, - NBCSportsIE, - NBCSportsStreamIE, - NBCSportsVPlayerIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .netzkino import NetzkinoIE -from .nerdcubed import NerdCubedFeedIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import ( - NewgroundsIE, - NewgroundsPlaylistIE, -) -from .newstube import NewstubeIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, - NextTVIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nfl import ( - NFLIE, - NFLArticleIE, -) -from .nhk import ( - NhkVodIE, - NhkVodProgramIE, -) -from .nhl import NHLIE -from .nick import ( - NickIE, - NickBrIE, - NickDeIE, - NickNightIE, - NickRuIE, -) -from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import NineCNineMediaIE -from .ninegag import NineGagIE -from .ninenow import NineNowIE -from .nintendo import NintendoIE -from .njpwworld import NJPWWorldIE -from .nobelprize import NobelPrizeIE -from .nonktube import NonkTubeIE -from .noovo import NoovoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import ( - NovaEmbedIE, - NovaIE, -) -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .noz import NozIE -from .npo import ( - AndereTijdenIE, - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - HetKlokhuisIE, - VPROIE, - WNLIE, -) -from .npr import NprIE -from .nrk import ( - NRKIE, - NRKPlaylistIE, - NRKSkoleIE, - NRKTVIE, - NRKTVDirekteIE, - NRKRadioPodkastIE, - NRKTVEpisodeIE, - NRKTVEpisodesIE, - NRKTVSeasonIE, - NRKTVSeriesIE, -) -from .nrl import NRLTVIE -from .ntvcojp import NTVCoJpCUIE -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - NYTimesArticleIE, - NYTimesCookingIE, -) -from .nuvid import NuvidIE -from .nzz import NZZIE -from .odatv import OdaTVIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .ondemandkorea import OnDemandKoreaIE -from .onet import ( - OnetIE, - OnetChannelIE, - OnetMVPIE, - OnetPlIE, -) -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFFM4IE, - ORFFM4StoryIE, - ORFOE1IE, - ORFOE3IE, - ORFNOEIE, - ORFWIEIE, 
- ORFBGLIE, - ORFOOEIE, - ORFSTMIE, - ORFKTNIE, - ORFSBGIE, - ORFTIRIE, - ORFVBGIE, - ORFIPTVIE, -) -from .outsidetv import OutsideTVIE -from .packtpub import ( - PacktPubIE, - PacktPubCourseIE, -) -from .palcomp3 import ( - PalcoMP3IE, - PalcoMP3ArtistIE, - PalcoMP3VideoIE, -) -from .pandoratv import PandoraTVIE -from .parliamentliveuk import ParliamentLiveUKIE -from .patreon import PatreonIE -from .pbs import PBSIE -from .pearvideo import PearVideoIE -from .peertube import PeerTubeIE -from .people import PeopleIE -from .performgroup import PerformGroupIE -from .periscope import ( - PeriscopeIE, - PeriscopeUserIE, -) -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .picarto import ( - PicartoIE, - PicartoVodIE, -) -from .piksel import PikselIE -from .pinkbike import PinkbikeIE -from .pinterest import ( - PinterestIE, - PinterestCollectionIE, -) -from .pladform import PladformIE -from .platzi import ( - PlatziIE, - PlatziCourseIE, -) -from .playfm import PlayFMIE -from .playplustv import PlayPlusTVIE -from .plays import PlaysTVIE -from .playstuff import PlayStuffIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from .playwire import PlaywireIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podomatic import PodomaticIE -from .pokemon import PokemonIE -from .polskieradio import ( - PolskieRadioIE, - PolskieRadioCategoryIE, -) -from .popcorntimes import PopcorntimesIE -from .popcorntv import PopcornTVIE -from .porn91 import Porn91IE -from .porncom import PornComIE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubUserIE, - PornHubPagedVideoListIE, - PornHubUserVideosUploadIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .puhutv import ( - PuhuTVIE, - PuhuTVSerieIE, -) -from .presstv import PressTVIE -from .prosiebensat1 import ProSiebenSat1IE -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .r7 import ( - R7IE, - R7ArticleIE, -) -from .radiocanada import ( - RadioCanadaIE, - RadioCanadaAudioVideoIE, -) -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .rai import ( - RaiPlayIE, - RaiPlayLiveIE, - RaiPlayPlaylistIE, - RaiIE, -) -from .raywenderlich import ( - RayWenderlichIE, - RayWenderlichCourseIE, -) -from .rbmaradio import RBMARadioIE -from .rds import RDSIE -from .redbulltv import ( - RedBullTVIE, - RedBullEmbedIE, - RedBullTVRrnContentIE, - RedBullIE, -) -from .reddit import ( - RedditIE, - RedditRIE, -) -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .rentv import ( - RENTVIE, - RENTVArticleIE, -) -from .restudy import RestudyIE -from .reuters import ReutersIE -from .reverbnation import ReverbNationIE -from .rice import RICEIE -from .rmcdecouverte import RMCDecouverteIE -from .ro220 import Ro220IE -from .rockstargames import RockstarGamesIE -from .roosterteeth import RoosterTeethIE -from .rottentomatoes import RottenTomatoesIE -from .roxwel import RoxwelIE -from .rozhlas import RozhlasIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import ( - RTL2IE, - RTL2YouIE, - RTL2YouSeriesIE, -) -from .rtp import RTPIE -from .rts import RTSIE -from 
.rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE -from .rtvnh import RTVNHIE -from .rtvs import RTVSIE -from .ruhd import RUHDIE -from .rumble import RumbleEmbedIE -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, - RutubePlaylistIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .ruv import RuvIE -from .safari import ( - SafariIE, - SafariApiIE, - SafariCourseIE, -) -from .samplefocus import SampleFocusIE -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ( - ScrippsNetworksWatchIE, - ScrippsNetworksIE, -) -from .scte import ( - SCTEIE, - SCTECourseIE, -) -from .seeker import SeekerIE -from .senateisvp import SenateISVPIE -from .sendtonews import SendtoNewsIE -from .servus import ServusIE -from .sevenplus import SevenPlusIE -from .sexu import SexuIE -from .seznamzpravy import ( - SeznamZpravyIE, - SeznamZpravyArticleIE, -) -from .shahid import ( - ShahidIE, - ShahidShowIE, -) -from .shared import ( - SharedIE, - VivoIE, -) -from .showroomlive import ShowRoomLiveIE -from .simplecast import ( - SimplecastIE, - SimplecastEpisodeIE, - SimplecastPodcastIE, -) -from .sina import SinaIE -from .sixplay import SixPlayIE -from .skyit import ( - SkyItPlayerIE, - SkyItVideoIE, - SkyItVideoLiveIE, - SkyItIE, - SkyItAcademyIE, - SkyItArteIE, - CieloTVItIE, - TV8ItIE, -) -from .skylinewebcams import SkylineWebcamsIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .sky import ( - SkyNewsIE, - SkySportsIE, - SkySportsNewsIE, -) -from .slideshare import SlideshareIE -from .slideslive import SlidesLiveIE -from .slutload import SlutloadIE -from .snotr import SnotrIE -from .sohu import SohuIE -from .sonyliv import SonyLIVIE -from .soundcloud import ( - SoundcloudEmbedIE, - SoundcloudIE, - SoundcloudSetIE, - SoundcloudUserIE, - SoundcloudTrackStationIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE, -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .spankbang import ( - SpankBangIE, - SpankBangPlaylistIE, -) -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE -from .spike import ( - BellatorIE, - ParamountNetworkIE, -) -from .stitcher import ( - StitcherIE, - StitcherShowIE, -) -from .sport5 import Sport5IE -from .sportbox import SportBoxIE -from .sportdeutschland import SportDeutschlandIE -from .spotify import ( - SpotifyIE, - SpotifyShowIE, -) -from .spreaker import ( - SpreakerIE, - SpreakerPageIE, - SpreakerShowIE, - SpreakerShowPageIE, -) -from .springboardplatform import SpringboardPlatformIE -from .sprout import SproutIE -from .srgssr import ( - SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .stanfordoc import StanfordOpenClassroomIE -from .steam import SteamIE -from .storyfire import ( - StoryFireIE, - StoryFireUserIE, - StoryFireSeriesIE, -) -from .streamable import StreamableIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streetvoice import StreetVoiceIE -from .stretchinternet import StretchInternetIE -from .stv import STVPlayerIE -from .sunporno import SunPornoIE -from .sverigesradio import ( - SverigesRadioEpisodeIE, - SverigesRadioPublicationIE, -) -from .svt import ( - SVTIE, - SVTPageIE, - 
SVTPlayIE, - SVTSeriesIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import ( - TagesschauPlayerIE, - TagesschauIE, -) -from .tass import TassIE -from .tbs import TBSIE -from .tdslifeway import TDSLifewayIE -from .teachable import ( - TeachableIE, - TeachableCourseIE, -) -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .teamtreehouse import TeamTreeHouseIE -from .techtalks import TechTalksIE -from .ted import TEDIE -from .tele5 import Tele5IE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telemb import TeleMBIE -from .telequebec import ( - TeleQuebecIE, - TeleQuebecSquatIE, - TeleQuebecEmissionIE, - TeleQuebecLiveIE, - TeleQuebecVideoIE, -) -from .teletask import TeleTaskIE -from .telewebion import TelewebionIE -from .tennistv import TennisTVIE -from .tenplay import TenPlayIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .tfo import TFOIE -from .theintercept import TheInterceptIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thescene import TheSceneIE -from .thestar import TheStarIE -from .thesun import TheSunIE -from .theweatherchannel import TheWeatherChannelIE -from .thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from .thisoldhouse import ThisOldHouseIE -from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, -) -from .tinypic import TinyPicIE -from .tmz import ( - TMZIE, - TMZArticleIE, -) -from .tnaflix import ( - TNAFlixNetworkEmbedIE, - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ( - ToggleIE, - MeWatchIE, -) -from .tonline import TOnlineIE -from .toongoggles import ToonGogglesIE -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trovo import ( - TrovoIE, - TrovoVodIE, -) -from .trunews import TruNewsIE -from .trutv import TruTVIE -from .tube8 import Tube8IE -from .tubitv import TubiTvIE -from .tumblr import TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .tunepk import TunePkIE -from .turbo import TurboIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, - KatsomoIE, - MTVUutisetArticleIE, -) -from .tv2dk import ( - TV2DKIE, - TV2DKBornholmPlayIE, -) -from .tv2hu import TV2HuIE -from .tv4 import TV4IE -from .tv5mondeplus import TV5MondePlusIE -from .tv5unis import ( - TV5UnisVideoIE, - TV5UnisIE, -) -from .tva import ( - TVAIE, - QubIE, -) -from .tvanouvelles import ( - TVANouvellesIE, - TVANouvellesArticleIE, -) -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tver import TVerIE -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvn24 import TVN24IE -from .tvnet import TVNetIE -from .tvnoe import TVNoeIE -from .tvnow import ( - TVNowIE, - TVNowNewIE, - TVNowSeasonIE, - TVNowAnnualIE, - TVNowShowIE, -) -from .tvp import ( - TVPEmbedIE, - TVPIE, - TVPWebsiteIE, -) -from .tvplay import ( - TVPlayIE, - ViafreeIE, - TVPlayHomeIE, -) -from .tvplayer import TVPlayerIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentythreevideo import TwentyThreeVideoIE -from .twitcasting import TwitCastingIE 
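# Editorial note: extractors.py (being deleted above) is a flat import
# registry -- youtube-dl's extractor/__init__.py star-imports it and then
# collects every module-level name ending in "IE", keeping GenericIE as the
# last-resort fallback. Roughly, as a sketch from memory rather than the
# verbatim source:
def build_extractor_registry(module_globals):
    classes = [
        klass for name, klass in module_globals.items()
        if name.endswith('IE') and name != 'GenericIE'
    ]
    generic = module_globals.get('GenericIE')
    return classes + ([generic] if generic else [])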
-from .twitch import ( - TwitchVodIE, - TwitchCollectionIE, - TwitchVideosIE, - TwitchVideosClipsIE, - TwitchVideosCollectionsIE, - TwitchStreamIE, - TwitchClipsIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, - TwitterBroadcastIE, -) -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .ufctv import ( - UFCTVIE, - UFCArabiaIE, -) -from .uktvplay import UKTVPlayIE -from .digiteka import DigitekaIE -from .dlive import ( - DLiveVODIE, - DLiveStreamIE, -) -from .umg import UMGDeIE -from .unistra import UnistraIE -from .unity import UnityIE -from .uol import UOLIE -from .uplynk import ( - UplynkIE, - UplynkPreplayIE, -) -from .urort import UrortIE -from .urplay import URPlayIE -from .usanetwork import USANetworkIE -from .usatoday import USATodayIE -from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import ( - UstudioIE, - UstudioEmbedIE, -) -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veoh import VeohIE -from .vesti import VestiIE -from .vevo import ( - VevoIE, - VevoPlaylistIE, -) -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import ( - ViceIE, - ViceArticleIE, - ViceShowIE, -) -from .vidbit import VidbitIE -from .viddler import ViddlerIE -from .videa import VideaIE -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopress import VideoPressIE -from .vidio import VidioIE -from .vidlii import VidLiiIE -from .vidme import ( - VidmeIE, - VidmeUserIE, - VidmeUserLikesIE, -) -from .vier import VierIE, VierVideosIE -from .viewlift import ( - ViewLiftIE, - ViewLiftEmbedIE, -) -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoOndemandIE, - VimeoReviewIE, - VimeoUserIE, - VimeoWatchLaterIE, - VHXEmbedIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .viqeo import ViqeoIE -from .viu import ( - ViuIE, - ViuPlaylistIE, - ViuOTTIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, - VKWallPostIE, -) -from .vlive import ( - VLiveIE, - VLivePostIE, - VLiveChannelIE, -) -from .vodlocker import VodlockerIE -from .vodpl import VODPlIE -from .vodplatform import VODPlatformIE -from .voicerepublic import VoiceRepublicIE -from .voot import VootIE -from .voxmedia import ( - VoxMediaVolumeIE, - VoxMediaIE, -) -from .vrt import VRTIE -from .vrak import VrakIE -from .vrv import ( - VRVIE, - VRVSeriesIE, -) -from .vshare import VShareIE -from .vtm import VTMIE -from .medialaan import MedialaanIE -from .vube import VubeIE -from .vuclip import VuClipIE -from .vvvvid import ( - VVVVIDIE, - VVVVIDShowIE, -) -from .vyborymos import VyboryMosIE -from .vzaar import VzaarIE -from .wakanim import WakanimIE -from .walla import WallaIE -from .washingtonpost import ( - WashingtonPostIE, - WashingtonPostArticleIE, -) -from .wat import WatIE -from .watchbox import WatchBoxIE -from .watchindianporn import WatchIndianPornIE -from .wdr import ( - WDRIE, - WDRPageIE, - WDRElefantIE, - WDRMobileIE, -) -from .webcaster import ( - WebcasterIE, - WebcasterFeedIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import ( - WeiboIE, - WeiboMobileIE -) -from .weiqitv import WeiqiTVIE -from .wistia import ( - 
WistiaIE, - WistiaPlaylistIE, -) -from .worldstarhiphop import WorldStarHipHopIE -from .wsj import ( - WSJIE, - WSJArticleIE, -) -from .wwe import WWEIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - XHamsterEmbedIE, - XHamsterUserIE, -) -from .xiami import ( - XiamiSongIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) -from .ximalaya import ( - XimalayaIE, - XimalayaAlbumIE -) -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, - YahooJapanNewsIE, -) -from .yandexdisk import YandexDiskIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, - YandexMusicArtistTracksIE, - YandexMusicArtistAlbumsIE, -) -from .yandexvideo import YandexVideoIE -from .yapfiles import YapFilesIE -from .yesjapan import YesJapanIE -from .yinyuetai import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import ( - YoukuIE, - YoukuShowIE, -) -from .younow import ( - YouNowLiveIE, - YouNowChannelIE, - YouNowMomentIE, -) -from .youporn import YouPornIE -from .yourporn import YourPornIE -from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeTabIE, - YoutubePlaylistIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - #YoutubeSearchURLIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeYtBeIE, - YoutubeYtUserIE, - YoutubeWatchLaterIE, -) -from .zapiks import ZapiksIE -from .zattoo import ( - BBVTVIE, - EinsUndEinsTVIE, - EWETVIE, - GlattvisionTVIE, - MNetTVIE, - MyVisionTVIE, - NetPlusIE, - OsnatelTVIE, - QuantumTVIE, - QuicklineIE, - QuicklineLiveIE, - SaltTVIE, - SAKTVIE, - VTXTVIE, - WalyTVIE, - ZattooIE, - ZattooLiveIE, -) -from .zdf import ZDFIE, ZDFChannelIE -from .zhihu import ZhihuIE -from .zingmp3 import ( - ZingMp3IE, - ZingMp3AlbumIE, -) -from .zoom import ZoomIE -from .zype import ZypeIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py deleted file mode 100644 index 04650af39..000000000 --- a/youtube_dl/extractor/facebook.py +++ /dev/null @@ -1,709 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re -import socket - -from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_http_client, - compat_str, - compat_urllib_error, - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, -) -from ..utils import ( - clean_html, - error_to_compat_str, - ExtractorError, - float_or_none, - get_element_by_id, - int_or_none, - js_to_json, - limit_length, - parse_count, - qualities, - sanitized_Request, - try_get, - urlencode_postdata, - urljoin, -) - - -class FacebookIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https?:// - (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/ - (?:[^#]*?\#!/)? - (?: - (?: - video/video\.php| - photo\.php| - video\.php| - video/embed| - story\.php| - watch(?:/live)?/? 
- )\?(?:.*?)(?:v|video_id|story_fbid)=| - [^/]+/videos/(?:[^/]+/)?| - [^/]+/posts/| - groups/[^/]+/permalink/| - watchparty/ - )| - facebook: - ) - (?P<id>[0-9]+) - ''' - _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' - _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' - _NETRC_MACHINE = 'facebook' - IE_NAME = 'facebook' - - _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' - _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' - - _TESTS = [{ - 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', - 'md5': '6a40d33c0eccbb1af76cf0485a052659', - 'info_dict': { - 'id': '637842556329505', - 'ext': 'mp4', - 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', - 'uploader': 'Tennis on Facebook', - 'upload_date': '20140908', - 'timestamp': 1410199200, - }, - 'skip': 'Requires logging in', - }, { - # data.video - 'url': 'https://www.facebook.com/video.php?v=274175099429670', - 'info_dict': { - 'id': '274175099429670', - 'ext': 'mp4', - 'title': 're:^Asif Nawab Butt posted a video', - 'uploader': 'Asif Nawab Butt', - 'upload_date': '20140506', - 'timestamp': 1399398998, - 'thumbnail': r're:^https?://.*', - }, - 'expected_warnings': [ - 'title' - ] - }, { - 'note': 'Video with DASH manifest', - 'url': 'https://www.facebook.com/video.php?v=957955867617029', - 'md5': 'b2c28d528273b323abe5c6ab59f0f030', - 'info_dict': { - 'id': '957955867617029', - 'ext': 'mp4', - 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', - 'uploader': 'Demy de Zeeuw', - 'upload_date': '20160110', - 'timestamp': 1452431627, - }, - 'skip': 'Requires logging in', - }, { - 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', - 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', - 'info_dict': { - 'id': '544765982287235', - 'ext': 'mp4', - 'title': '"What are you doing running in the snow?"', - 'uploader': 'FailArmy', - }, - 'skip': 'Video gone', - }, { - 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', - 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', - 'info_dict': { - 'id': '1035862816472149', - 'ext': 'mp4', - 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', - 'uploader': 'S. Saint', - }, - 'skip': 'Video gone', - }, { - 'note': 'swf params escaped', - 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', - 'md5': '97ba073838964d12c70566e0085c2b91', - 'info_dict': { - 'id': '10153664894881749', - 'ext': 'mp4', - 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...', - 'thumbnail': r're:^https?://.*', - 'timestamp': 1456259628, - 'upload_date': '20160223', - 'uploader': 'Barack Obama', - }, - }, { - # have 1080P, but only up to 720p in swf params - # data.video.story.attachments[].media - 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '9571fae53d4165bbbadb17a94651dcdc', - 'info_dict': { - 'id': '10155529876156509', - 'ext': 'mp4', - 'title': 'She survived the holocaust — and years later, she’s getting her citizenship s...', - 'timestamp': 1477818095, - 'upload_date': '20161030', - 'uploader': 'CNN', - 'thumbnail': r're:^https?://.*', - 'view_count': int, - }, - }, { - # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', - 'info_dict': { - 'id': '1417995061575415', - 'ext': 'mp4', - 'title': 'md5:1db063d6a8c13faa8da727817339c857', - 'timestamp': 1486648217, - 'upload_date': '20170209', - 'uploader': 'Yaroslav Korpan', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', - 'info_dict': { - 'id': '1072691702860471', - 'ext': 'mp4', - 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d', - 'timestamp': 1477305000, - 'upload_date': '20161024', - 'uploader': 'La Guía Del Varón', - 'thumbnail': r're:^https?://.*', - }, - 'params': { - 'skip_download': True, - }, - }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', - 'info_dict': { - 'id': '1396382447100162', - 'ext': 'mp4', - 'title': 'md5:19a428bbde91364e3de815383b54a235', - 'timestamp': 1486035494, - 'upload_date': '20170202', - 'uploader': 'Elisabeth Ahtn', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.facebook.com/video.php?v=10204634152394104', - 'only_matching': True, - }, { - 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', - 'only_matching': True, - }, { - # data.mediaset.currMedia.edges - 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', - 'only_matching': True, - }, { - # data.video.story.attachments[].media - 'url': 'facebook:544765982287235', - 'only_matching': True, - }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', - 'only_matching': True, - }, { - # data.video.creation_story.attachments[].media - 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', - 'only_matching': True, - }, { - # data.video - 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', - 'only_matching': True, - }, { - # no title - 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', - 'only_matching': True, - }, { - # data.video - 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', - 'info_dict': { - 'id': '359649331226507', - 'ext': 'mp4', - 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. 
@Evil Geniuses', - 'uploader': 'ESL One Dota 2', - }, - 'params': { - 'skip_download': True, - }, - }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media - 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', - 'info_dict': { - 'id': '106560053808006', - }, - 'playlist_count': 2, - }, { - # data.video.story.attachments[].media - 'url': 'https://www.facebook.com/watch/?v=647537299265662', - 'only_matching': True, - }, { - # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media - 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', - 'info_dict': { - 'id': '10157667649866271', - }, - 'playlist_count': 3, - }, { - # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media - 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', - 'info_dict': { - 'id': '117576630041613', - 'ext': 'mp4', - # TODO: title can be extracted from video page - 'title': 'Facebook video #117576630041613', - 'uploader_id': '189393014416438', - 'upload_date': '20201123', - 'timestamp': 1606162592, - }, - 'skip': 'Requires logging in', - }, { - # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media - 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', - 'info_dict': { - 'id': '211567722618337', - 'ext': 'mp4', - 'title': 'Facebook video #211567722618337', - 'uploader_id': '127875227654254', - 'upload_date': '20161122', - 'timestamp': 1479793574, - }, - }, { - # data.video.creation_story.attachments[].media - 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', - 'only_matching': True, - }, { - 'url': 'https://www.facebook.com/watchparty/211641140192478', - 'info_dict': { - 'id': '211641140192478', - }, - 'playlist_count': 1, - 'skip': 'Requires logging in', - }] - _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' - _api_config = { - 'graphURI': '/api/graphql/' - } - - @staticmethod - def _extract_urls(webpage): - urls = [] - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', - webpage): - urls.append(mobj.group('url')) - # Facebook API embed - # see https://developers.facebook.com/docs/plugins/embedded-video-player - for mobj in re.finditer(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ - data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): - urls.append(mobj.group('url')) - return urls - - def _login(self): - useremail, password = self._get_login_info() - if useremail is None: - return - - login_page_req = sanitized_Request(self._LOGIN_URL) - self._set_cookie('facebook.com', 'locale', 'en_US') - login_page = self._download_webpage(login_page_req, None, - note='Downloading login page', - errnote='Unable to download login page') - lsd = self._search_regex( - r'<input type="hidden" name="lsd" value="([^"]*)"', - login_page, 'lsd') - lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') - - login_form = { - 'email': useremail, - 'pass': password, - 'lsd': lsd, - 'lgnrnd': lgnrnd, - 'next': 'http://facebook.com/home.php', - 'default_persistent': '0', - 'legacy_return': '1', - 'timezone': '-60', - 'trynum': '1', - } - request = 
sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - try: - login_results = self._download_webpage(request, None, - note='Logging in', errnote='unable to fetch login page') - if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: - error = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>', - login_results, 'login error', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.') - return - - fb_dtsg = self._search_regex( - r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) - h = self._search_regex( - r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) - - if not fb_dtsg or not h: - return - - check_form = { - 'fb_dtsg': fb_dtsg, - 'h': h, - 'name_action_selected': 'dont_save', - } - check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) - check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - check_response = self._download_webpage(check_req, None, - note='Confirming login') - if re.search(r'id="checkpointSubmitButton"', check_response) is not None: - self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) - return - - def _real_initialize(self): - self._login() - - def _extract_from_url(self, url, video_id): - webpage = self._download_webpage( - url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) - - video_data = None - - def extract_video_data(instances): - video_data = [] - for item in instances: - if try_get(item, lambda x: x[1][0]) == 'VideoConfig': - video_item = item[2][0] - if video_item.get('video_id'): - video_data.append(video_item['videoData']) - return video_data - - server_js_data = self._parse_json(self._search_regex( - [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], - webpage, 'server js data', default='{}'), video_id, fatal=False) - - if server_js_data: - video_data = extract_video_data(server_js_data.get('instances', [])) - - def extract_from_jsmods_instances(js_data): - if js_data: - return extract_video_data(try_get( - js_data, lambda x: x['jsmods']['instances'], list) or []) - - def extract_dash_manifest(video, formats): - dash_manifest = video.get('dash_manifest') - if dash_manifest: - formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) - - def process_formats(formats): - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. 
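# ('facebookexternalhit/1.1' below is the User-Agent of Facebook's own
# link-preview crawler; a per-format 'http_headers' entry is sent with every
# request made while downloading that format.)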
- for f in formats: - f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - - self._sort_formats(formats) - - def extract_relay_data(_filter): - return self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, - webpage, 'replay data', default='{}'), video_id, fatal=False) or {} - - def extract_relay_prefetched_data(_filter): - replay_data = extract_relay_data(_filter) - for require in (replay_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} - - if not video_data: - server_js_data = self._parse_json(self._search_regex([ - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, - r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX - ], webpage, 'js data', default='{}'), video_id, js_to_json, False) - video_data = extract_from_jsmods_instances(server_js_data) - - if not video_data: - data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') - if data: - entries = [] - - def parse_graphql_video(video): - formats = [] - q = qualities(['sd', 'hd']) - for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: - playable_url = video.get('playable_url' + suffix) - if not playable_url: - continue - formats.append({ - 'format_id': format_id, - 'quality': q(format_id), - 'url': playable_url, - }) - extract_dash_manifest(video, formats) - process_formats(formats) - v_id = video.get('videoId') or video.get('id') or video_id - info = { - 'id': v_id, - 'formats': formats, - 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), - } - description = try_get(video, lambda x: x['savable_description']['text']) - title = video.get('name') - if title: - info.update({ - 'title': title, - 'description': description, - }) - else: - info['title'] = description or 'Facebook video #%s' % v_id - entries.append(info) - - def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) - - nodes = data.get('nodes') or [] - node = data.get('node') or {} - if not nodes and node: - nodes.append(node) - for node in nodes: - story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} - attachments = try_get(story, [ - lambda x: x['attached_story']['attachments'], - lambda x: x['attachments'] - ], list) or [] - for attachment in attachments: - attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) - ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] - for n in ns: - parse_attachment(n) - parse_attachment(attachment) - - edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] - for edge in edges: - parse_attachment(edge, key='node') - - video = data.get('video') or {} - if video: - attachments = try_get(video, [ - lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'] - ], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: - parse_graphql_video(video) - - return self.playlist_result(entries, video_id) - - if not video_data: - 
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) - if m_msg is not None: - raise ExtractorError( - 'The video is not available, Facebook said: "%s"' % m_msg.group(1), - expected=True) - elif any(p in webpage for p in ( - '>You must log in to continue', - 'id="login_form"', - 'id="loginbutton"')): - self.raise_login_required() - - if not video_data and '/watchparty/' in url: - post_data = { - 'doc_id': 3731964053542869, - 'variables': json.dumps({ - 'livingRoomID': video_id, - }), - } - - prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') - if prefetched_data: - lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) - if lsd: - post_data[lsd['name']] = lsd['value'] - - relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') - for define in (relay_data.get('define') or []): - if define[0] == 'RelayAPIConfigDefaults': - self._api_config = define[2] - - living_room = self._download_json( - urljoin(url, self._api_config['graphURI']), video_id, - data=urlencode_postdata(post_data))['data']['living_room'] - - entries = [] - for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): - video = try_get(edge, lambda x: x['node']['video']) or {} - v_id = video.get('id') - if not v_id: - continue - v_id = compat_str(v_id) - entries.append(self.url_result( - self._VIDEO_PAGE_TEMPLATE % v_id, - self.ie_key(), v_id, video.get('name'))) - - return self.playlist_result(entries, video_id) - - if not video_data: - # Video info not in first request, do a secondary request using - # tahoe player specific URL - tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, - data=urlencode_postdata({ - '__a': 1, - '__pc': self._search_regex( - r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, - 'pkg cohort', default='PHASED:DEFAULT'), - '__rev': self._search_regex( - r'client_revision["\']\s*:\s*(\d+),', webpage, - 'client revision', default='3944515'), - 'fb_dtsg': self._search_regex( - r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', - webpage, 'dtsg token', default=''), - }), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - tahoe_js_data = self._parse_json( - self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, - 'tahoe js data', default='{}'), - video_id, fatal=False) - video_data = extract_from_jsmods_instances(tahoe_js_data) - - if not video_data: - raise ExtractorError('Cannot parse data') - - if len(video_data) > 1: - entries = [] - for v in video_data: - video_url = v[0].get('video_url') - if not video_url: - continue - entries.append(self.url_result(urljoin( - url, video_url), self.ie_key(), v[0].get('video_id'))) - return self.playlist_result(entries, video_id) - video_data = video_data[0] - - formats = [] - subtitles = {} - for f in video_data: - format_id = f['stream_type'] - if f and isinstance(f, dict): - f = [f] - if not f or not isinstance(f, list): - continue - for quality in ('sd', 'hd'): - for src_type in ('src', 'src_no_ratelimit'): - src = f[0].get('%s_%s' % (quality, src_type)) - if src: - preference = -10 if format_id == 'progressive' else 0 - if quality == 'hd': - preference += 5 - formats.append({ - 'format_id': '%s_%s_%s' % (format_id, quality, src_type), - 'url': src, - 'preference': preference, - }) - extract_dash_manifest(f[0], formats) - subtitles_src = f[0].get('subtitles_src') - if subtitles_src: - subtitles.setdefault('en', []).append({'url': subtitles_src}) - if not 
formats: - raise ExtractorError('Cannot find video formats') - - process_formats(formats) - - video_title = self._html_search_regex( - r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, - 'title', default=None) - if not video_title: - video_title = self._html_search_regex( - r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', - webpage, 'alternative title', default=None) - if not video_title: - video_title = self._html_search_meta( - 'description', webpage, 'title', default=None) - if video_title: - video_title = limit_length(video_title, 80) - else: - video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id( - 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - default=None) or self._og_search_title(webpage, fatal=False) - timestamp = int_or_none(self._search_regex( - r'<abbr[^>]+data-utime=["\'](\d+)', webpage, - 'timestamp', default=None)) - thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) - - view_count = parse_count(self._search_regex( - r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', - default=None)) - - info_dict = { - 'id': video_id, - 'title': video_title, - 'formats': formats, - 'uploader': uploader, - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'subtitles': subtitles, - } - - return info_dict - - def _real_extract(self, url): - video_id = self._match_id(url) - - real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url - return self._extract_from_url(real_url, video_id) - - -class FacebookPluginsVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)' - - _TESTS = [{ - 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', - 'md5': '5954e92cdfe51fe5782ae9bda7058a07', - 'info_dict': { - 'id': '10154383743583686', - 'ext': 'mp4', - 'title': 'What to do during the haze?', - 'uploader': 'Gov.sg', - 'upload_date': '20160826', - 'timestamp': 1472184808, - }, - 'add_ie': [FacebookIE.ie_key()], - }, { - 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', - 'only_matching': True, - }, { - 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result( - compat_urllib_parse_unquote(self._match_id(url)), - FacebookIE.ie_key()) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py deleted file mode 100644 index 435561147..000000000 --- a/youtube_dl/extractor/fc2.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) - - -class FC2IE(InfoExtractor): - _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)' - IE_NAME = 'fc2' - _NETRC_MACHINE = 'fc2' - _TESTS = [{ - 'url': 'http://video.fc2.com/en/content/20121103kUan1KHs', - 'md5': 'a6ebe8ebe0396518689d963774a54eb7', - 'info_dict': { - 'id': '20121103kUan1KHs', - 'ext': 
'flv', - 'title': 'Boxing again with Puff', - }, - }, { - 'url': 'http://video.fc2.com/en/content/20150125cEva0hDn/', - 'info_dict': { - 'id': '20150125cEva0hDn', - 'ext': 'mp4', - }, - 'params': { - 'username': 'ytdl@yt-dl.org', - 'password': '(snip)', - }, - 'skip': 'requires actual password', - }, { - 'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF', - 'only_matching': True, - }] - - def _login(self): - username, password = self._get_login_info() - if username is None or password is None: - return False - - # Log in - login_form_strs = { - 'email': username, - 'password': password, - 'done': 'video', - 'Submit': ' Login ', - } - - login_data = urlencode_postdata(login_form_strs) - request = sanitized_Request( - 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) - - login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in') - if 'mode=redirect&login=done' not in login_results: - self.report_warning('unable to log in: bad username or password') - return False - - # this is also needed - login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done') - self._download_webpage( - login_redir, None, note='Login redirect', errnote='Login redirect failed') - - return True - - def _real_extract(self, url): - video_id = self._match_id(url) - self._login() - webpage = None - if not url.startswith('fc2:'): - webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear - self._login() - - title = 'FC2 video %s' % video_id - thumbnail = None - if webpage is not None: - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) - refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url - - mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() - - info_url = ( - 'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'. - format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E'))) - - info_webpage = self._download_webpage( - info_url, video_id, note='Downloading info page') - info = compat_urlparse.parse_qs(info_webpage) - - if 'err_code' in info: - # most of the time we can still download wideo even if err_code is 403 or 602 - self.report_warning( - 'Error code was: %s... but still trying' % info['err_code'][0]) - - if 'filepath' not in info: - raise ExtractorError('Cannot download file. 
Are you logged in?') - - video_url = info['filepath'][0] + '?mid=' + info['mid'][0] - title_info = info.get('title') - if title_info: - title = title_info[0] - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'ext': 'flv', - 'thumbnail': thumbnail, - } - - -class FC2EmbedIE(InfoExtractor): - _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P<query>.+)' - IE_NAME = 'fc2:embed' - - _TEST = { - 'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】', - 'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a', - 'info_dict': { - 'id': '201403223kCqB3Ez', - 'ext': 'flv', - 'title': 'プリズン・ブレイク S1-01 マイケル 【吹替】', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_parse_qs(mobj.group('query')) - - video_id = query['i'][-1] - title = query.get('tl', ['FC2 video %s' % video_id])[0] - - sj = query.get('sj', [None])[0] - thumbnail = None - if sj: - # See thumbnailImagePath() in ServerConst.as of flv2.swf - thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( - sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) - - return { - '_type': 'url_transparent', - 'ie_key': FC2IE.ie_key(), - 'url': 'fc2:%s' % video_id, - 'title': title, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py deleted file mode 100644 index 56000bc5b..000000000 --- a/youtube_dl/extractor/filmweb.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class FilmwebIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?P<type>trailere|filmnytt)/article(?P<id>\d+)\.ece' - _TEST = { - 'url': 'http://www.filmweb.no/trailere/article1264921.ece', - 'md5': 'e353f47df98e557d67edaceda9dece89', - 'info_dict': { - 'id': '13033574', - 'ext': 'mp4', - 'title': 'Det som en gang var', - 'upload_date': '20160316', - 'timestamp': 1458140101, - 'uploader_id': '12639966', - 'uploader': 'Live Roaldset', - } - } - - def _real_extract(self, url): - article_type, article_id = re.match(self._VALID_URL, url).groups() - if article_type == 'filmnytt': - webpage = self._download_webpage(url, article_id) - article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id') - embed_code = self._download_json( - 'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp', - article_id, query={ - 'articleId': article_id, - })['embedCode'] - iframe_url = self._proto_relative_url(self._search_regex( - r'<iframe[^>]+src="([^"]+)', embed_code, 'iframe url')) - - return { - '_type': 'url_transparent', - 'id': article_id, - 'url': iframe_url, - 'ie_key': 'TwentyThreeVideo', - } diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py deleted file mode 100644 index 28617d83c..000000000 --- a/youtube_dl/extractor/firsttv.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - int_or_none, - qualities, - unified_strdate, - url_or_none, -) - - -class FirstTVIE(InfoExtractor): - IE_NAME = '1tv' - IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)' 
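# FC2EmbedIE above rebuilds the thumbnail URL from the 'sj' query parameter
# plus slices of the video id. A minimal worked sketch of that derivation,
# using the id and sj value from the embed test case:
video_id = '201403223kCqB3Ez'
sj = '11'
path = '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))
thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % (sj, path)
# -> http://video11-thumbnail.fc2.com/up/pic/201403/22/E/z/201403223kCqB3Ez.jpg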
- - _TESTS = [{ - # single format - 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', - 'info_dict': { - 'id': '40049', - 'ext': 'mp4', - 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', - 'upload_date': '20150212', - 'duration': 2694, - }, - }, { - # multiple formats - 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', - 'info_dict': { - 'id': '364746', - 'ext': 'mp4', - 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', - 'upload_date': '20160407', - 'duration': 179, - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00', - 'info_dict': { - 'id': '14:00', - 'title': 'Выпуск новостей в 14:00 1 декабря 2016 года. Новости. Первый канал', - 'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd', - }, - 'playlist_count': 13, - }, { - 'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - playlist_url = compat_urlparse.urljoin(url, self._search_regex( - r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'playlist url', group='url')) - - parsed_url = compat_urlparse.urlparse(playlist_url) - qs = compat_urlparse.parse_qs(parsed_url.query) - item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') - - items = self._download_json(playlist_url, display_id) - - if item_ids: - items = [ - item for item in items - if item.get('uid') and compat_str(item['uid']) in item_ids] - else: - items = [items[0]] - - entries = [] - QUALITIES = ('ld', 'sd', 'hd', ) - - for item in items: - title = item['title'] - quality = qualities(QUALITIES) - formats = [] - path = None - for f in item.get('mbr', []): - src = url_or_none(f.get('src')) - if not src: - continue - tbr = int_or_none(self._search_regex( - r'_(\d{3,})\.mp4', src, 'tbr', default=None)) - if not path: - path = self._search_regex( - r'//[^/]+/(.+?)_\d+\.mp4', src, - 'm3u8 path', default=None) - formats.append({ - 'url': src, - 'format_id': f.get('name'), - 'tbr': tbr, - 'source_preference': quality(f.get('name')), - # quality metadata of http formats may be incorrect - 'preference': -1, - }) - # m3u8 URL format is reverse engineered from [1] (search for - # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru) - # is taken from [2]. - # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted - # 2. 
http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834 - if not path and len(formats) == 1: - path = self._search_regex( - r'//[^/]+/(.+?$)', formats[0]['url'], - 'm3u8 path', default=None) - if path: - if len(formats) == 1: - m3u8_path = ',' - else: - tbrs = [compat_str(t) for t in sorted(f['tbr'] for f in formats)] - m3u8_path = '_,%s,%s' % (','.join(tbrs), '.mp4') - formats.extend(self._extract_m3u8_formats( - 'http://balancer-vod.1tv.ru/%s%s.urlset/master.m3u8' - % (path, m3u8_path), - display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) - duration = int_or_none(item.get('duration') or self._html_search_meta( - 'video:duration', webpage, 'video duration', fatal=False)) - upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', default=None)) - - entries.append({ - 'id': compat_str(item.get('id') or item['uid']), - 'thumbnail': thumbnail, - 'title': title, - 'upload_date': upload_date, - 'duration': int_or_none(duration), - 'formats': formats - }) - - title = self._html_search_regex( - (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"), - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) - description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description', default=None) - - return self.playlist_result(entries, display_id, title, description) diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py deleted file mode 100644 index c4c0f1b3d..000000000 --- a/youtube_dl/extractor/fivetv.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class FiveTVIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?5-tv\.ru/ - (?: - (?:[^/]+/)+(?P<id>\d+)| - (?P<path>[^/?#]+)(?:[/?#])? - ) - ''' - - _TESTS = [{ - 'url': 'http://5-tv.ru/news/96814/', - 'md5': 'bbff554ad415ecf5416a2f48c22d9283', - 'info_dict': { - 'id': '96814', - 'ext': 'mp4', - 'title': 'Россияне выбрали имя для общенациональной платежной системы', - 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 180, - }, - }, { - 'url': 'http://5-tv.ru/video/1021729/', - 'info_dict': { - 'id': '1021729', - 'ext': 'mp4', - 'title': '3D принтер', - 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 180, - }, - }, { - # redirect to https://www.5-tv.ru/projects/1000095/izvestia-glavnoe/ - 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', - 'info_dict': { - 'id': 'glavnoe', - 'ext': 'mp4', - 'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'redirect to «Известия. 
Главное» project page', - }, { - 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', - 'only_matching': True, - }, { - 'url': 'http://5-tv.ru/films/1507502/', - 'only_matching': True, - }, { - 'url': 'http://5-tv.ru/programs/broadcast/508713/', - 'only_matching': True, - }, { - 'url': 'http://5-tv.ru/angel/', - 'only_matching': True, - }, { - 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('path') - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - [r'<div[^>]+?class="(?:flow)?player[^>]+?data-href="([^"]+)"', - r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], - webpage, 'video url') - - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title') - duration = int_or_none(self._og_search_property( - 'video:duration', webpage, 'duration', default=None)) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, - } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py deleted file mode 100644 index 9f166efd4..000000000 --- a/youtube_dl/extractor/flickr.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - int_or_none, - qualities, -) - - -class FlickrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'md5': '164fe3fa6c22e18d448d4d5af2330f31', - 'info_dict': { - 'id': '5645318632', - 'ext': 'mpg', - 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', - 'title': 'Dark Hollow Waterfalls', - 'duration': 19, - 'timestamp': 1303528740, - 'upload_date': '20110423', - 'uploader_id': '10922353@N03', - 'uploader': 'Forest Wander', - 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', - 'comment_count': int, - 'view_count': int, - 'tags': list, - 'license': 'Attribution-ShareAlike', - } - } - _API_BASE_URL = 'https://api.flickr.com/services/rest?' 
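# A self-contained sketch of the REST call pattern that _call_api below
# implements; assumes a valid api_key (the extractor scrapes one from
# hermes_error_beacon.gne at runtime rather than shipping its own).
import json
from urllib.parse import urlencode
from urllib.request import urlopen

def call_flickr_api(method, photo_id, api_key, secret=None):
    query = {
        'method': 'flickr.%s' % method,
        'photo_id': photo_id,
        'api_key': api_key,
        'format': 'json',
        'nojsoncallback': 1,  # bare JSON instead of a JSONP callback
    }
    if secret:
        query['secret'] = secret  # required for video.getStreamInfo
    data = json.load(urlopen('https://api.flickr.com/services/rest?' + urlencode(query)))
    if data['stat'] != 'ok':
        raise RuntimeError(data['message'])
    return data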
- # https://help.yahoo.com/kb/flickr/SLN25525.html - _LICENSES = { - '0': 'All Rights Reserved', - '1': 'Attribution-NonCommercial-ShareAlike', - '2': 'Attribution-NonCommercial', - '3': 'Attribution-NonCommercial-NoDerivs', - '4': 'Attribution', - '5': 'Attribution-ShareAlike', - '6': 'Attribution-NoDerivs', - '7': 'No known copyright restrictions', - '8': 'United States government work', - '9': 'Public Domain Dedication (CC0)', - '10': 'Public Domain Work', - } - - def _call_api(self, method, video_id, api_key, note, secret=None): - query = { - 'photo_id': video_id, - 'method': 'flickr.%s' % method, - 'api_key': api_key, - 'format': 'json', - 'nojsoncallback': 1, - } - if secret: - query['secret'] = secret - data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note) - if data['stat'] != 'ok': - raise ExtractorError(data['message']) - return data - - def _real_extract(self, url): - video_id = self._match_id(url) - - api_key = self._download_json( - 'https://www.flickr.com/hermes_error_beacon.gne', video_id, - 'Downloading api key')['site_key'] - - video_info = self._call_api( - 'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] - if video_info['media'] == 'video': - streams = self._call_api( - 'video.getStreamInfo', video_id, api_key, - 'Downloading streams info', video_info['secret'])['streams'] - - preference = qualities( - ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) - - formats = [] - for stream in streams['stream']: - stream_type = compat_str(stream.get('type')) - formats.append({ - 'format_id': stream_type, - 'url': stream['_content'], - 'preference': preference(stream_type), - }) - self._sort_formats(formats) - - owner = video_info.get('owner', {}) - uploader_id = owner.get('nsid') - uploader_path = owner.get('path_alias') or uploader_id - uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None - - return { - 'id': video_id, - 'title': video_info['title']['_content'], - 'description': video_info.get('description', {}).get('_content'), - 'formats': formats, - 'timestamp': int_or_none(video_info.get('dateuploaded')), - 'duration': int_or_none(video_info.get('video', {}).get('duration')), - 'uploader_id': uploader_id, - 'uploader': owner.get('realname'), - 'uploader_url': uploader_url, - 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), - 'view_count': int_or_none(video_info.get('views')), - 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], - 'license': self._LICENSES.get(video_info.get('license')), - } - else: - raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py deleted file mode 100644 index be4e81342..000000000 --- a/youtube_dl/extractor/fourtube.py +++ /dev/null @@ -1,309 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, -) -from ..utils import ( - int_or_none, - parse_duration, - parse_iso8601, - str_or_none, - str_to_int, - try_get, - unified_timestamp, - url_or_none, -) - - -class FourTubeBaseIE(InfoExtractor): - def _extract_formats(self, url, video_id, media_id, sources): - token_url = 'https://%s/%s/desktop/%s' % ( - self._TKN_HOST, media_id, '+'.join(sources)) - - parsed_url = compat_urlparse.urlparse(url) - tokens = 
self._download_json(token_url, video_id, data=b'', headers={ - 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), - 'Referer': url, - }) - formats = [{ - 'url': tokens[format]['token'], - 'format_id': format + 'p', - 'resolution': format + 'p', - 'quality': int(format), - } for format in sources] - self._sort_formats(formats) - return formats - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') - - if kind == 'm' or not display_id: - url = self._URL_TEMPLATE % video_id - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta('name', webpage) - timestamp = parse_iso8601(self._html_search_meta( - 'uploadDate', webpage)) - thumbnail = self._html_search_meta('thumbnailUrl', webpage) - uploader_id = self._html_search_regex( - r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">', - webpage, 'uploader id', fatal=False) - uploader = self._html_search_regex( - r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">', - webpage, 'uploader', fatal=False) - - categories_html = self._search_regex( - r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>', - webpage, 'categories', fatal=False) - categories = None - if categories_html: - categories = [ - c.strip() for c in re.findall( - r'(?s)<li><a.*?>(.*?)</a>', categories_html)] - - view_count = str_to_int(self._search_regex( - r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', - webpage, 'view count', default=None)) - like_count = str_to_int(self._search_regex( - r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', - webpage, 'like count', default=None)) - duration = parse_duration(self._html_search_meta('duration', webpage)) - - media_id = self._search_regex( - r'<button[^>]+data-id=(["\'])(?P<id>\d+)\1[^>]+data-quality=', webpage, - 'media id', default=None, group='id') - sources = [ - quality - for _, quality in re.findall(r'<button[^>]+data-quality=(["\'])(.+?)\1', webpage)] - if not (media_id and sources): - player_js = self._download_webpage( - self._search_regex( - r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2', - webpage, 'player JS', group='url'), - video_id, 'Downloading player JS') - params_js = self._search_regex( - r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', - player_js, 'initialization parameters') - params = self._parse_json('[%s]' % params_js, video_id) - media_id = params[0] - sources = ['%s' % p for p in params[2]] - - formats = self._extract_formats(url, video_id, media_id, sources) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'categories': categories, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'timestamp': timestamp, - 'like_count': like_count, - 'view_count': view_count, - 'duration': duration, - 'age_limit': 18, - } - - -class FourTubeIE(FourTubeBaseIE): - IE_NAME = '4tube' - _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' 
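# A standalone sketch of the token handshake that _extract_formats above
# performs: an empty POST to the network's token host returns one signed
# media URL per requested quality. Parameter values are illustrative only.
import json
from urllib.request import Request, urlopen

def fetch_tokens(tkn_host, media_id, sources, page_url, origin):
    # e.g. tkn_host='token.4tube.com', sources=['480', '720']
    token_url = 'https://%s/%s/desktop/%s' % (tkn_host, media_id, '+'.join(sources))
    req = Request(token_url, data=b'', headers={  # data=b'' forces a POST
        'Origin': origin,  # scheme://host of the watch page
        'Referer': page_url,
    })
    tokens = json.load(urlopen(req))
    return {quality: tokens[quality]['token'] for quality in sources}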
- _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video' - _TKN_HOST = 'token.4tube.com' - _TESTS = [{ - 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '209733', - 'ext': 'mp4', - 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', - 'uploader': 'WCP Club', - 'uploader_id': 'wcp-club', - 'upload_date': '20131031', - 'timestamp': 1383263892, - 'duration': 583, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 18, - }, - }, { - 'url': 'http://www.4tube.com/embed/209733', - 'only_matching': True, - }, { - 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'only_matching': True, - }] - - -class FuxIE(FourTubeBaseIE): - _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' - _URL_TEMPLATE = 'https://www.fux.com/video/%s/video' - _TKN_HOST = 'token.fux.com' - _TESTS = [{ - 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', - 'info_dict': { - 'id': '195359', - 'ext': 'mp4', - 'title': 'Awesome fucking in the kitchen ends with cum swallow', - 'uploader': 'alenci2342', - 'uploader_id': 'alenci2342', - 'upload_date': '20131230', - 'timestamp': 1388361660, - 'duration': 289, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.fux.com/embed/195359', - 'only_matching': True, - }, { - 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', - 'only_matching': True, - }] - - -class PornTubeIE(FourTubeBaseIE): - _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' - _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' - _TKN_HOST = 'tkn.porntube.com' - _TESTS = [{ - 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', - 'info_dict': { - 'id': '7089759', - 'ext': 'mp4', - 'title': 'Teen couple doing anal', - 'uploader': 'Alexy', - 'uploader_id': '91488', - 'upload_date': '20150606', - 'timestamp': 1433595647, - 'duration': 5052, - 'view_count': int, - 'like_count': int, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.porntube.com/videos/squirting-teen-ballerina-ecg_1331406', - 'info_dict': { - 'id': '1331406', - 'ext': 'mp4', - 'title': 'Squirting Teen Ballerina on ECG', - 'uploader': 'Exploited College Girls', - 'uploader_id': '665', - 'channel': 'Exploited College Girls', - 'channel_id': '665', - 'upload_date': '20130920', - 'timestamp': 1379685485, - 'duration': 851, - 'view_count': int, - 'like_count': int, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.porntube.com/embed/7089759', - 'only_matching': True, - }, { - 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') - - webpage = self._download_webpage(url, display_id) - - video = self._parse_json( - self._search_regex( - r'INITIALSTATE\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'data', group='value'), video_id, - transform_source=lambda x: compat_urllib_parse_unquote( - compat_b64decode(x).decode('utf-8')))['page']['video'] - - title = video['title'] 
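# The INITIALSTATE blob parsed above is base64-encoded, percent-escaped
# JSON. A round-trip sketch of that decoding, using a hypothetical payload
# in place of the regex capture:
import base64
import json
from urllib.parse import quote, unquote

state = {'page': {'video': {'title': 'Example', 'mediaId': 123}}}  # stand-in
payload = base64.b64encode(quote(json.dumps(state)).encode()).decode()
decoded = json.loads(unquote(base64.b64decode(payload).decode('utf-8')))
assert decoded == state  # decoded['page']['video'] is the path read above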
- media_id = video['mediaId'] - sources = [compat_str(e['height']) - for e in video['encodings'] if e.get('height')] - formats = self._extract_formats(url, video_id, media_id, sources) - - thumbnail = url_or_none(video.get('masterThumb')) - uploader = try_get(video, lambda x: x['user']['username'], compat_str) - uploader_id = str_or_none(try_get( - video, lambda x: x['user']['id'], int)) - channel = try_get(video, lambda x: x['channel']['name'], compat_str) - channel_id = str_or_none(try_get( - video, lambda x: x['channel']['id'], int)) - like_count = int_or_none(video.get('likes')) - dislike_count = int_or_none(video.get('dislikes')) - view_count = int_or_none(video.get('playsQty')) - duration = int_or_none(video.get('durationInSeconds')) - timestamp = unified_timestamp(video.get('publishedAt')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'uploader': uploader or channel, - 'uploader_id': uploader_id or channel_id, - 'channel': channel, - 'channel_id': channel_id, - 'timestamp': timestamp, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'view_count': view_count, - 'duration': duration, - 'age_limit': 18, - } - - -class PornerBrosIE(FourTubeBaseIE): - _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' - _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s' - _TKN_HOST = 'token.pornerbros.com' - _TESTS = [{ - 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '181369', - 'ext': 'mp4', - 'title': 'Skinny brunette takes big cock down her anal hole', - 'uploader': 'PornerBros HD', - 'uploader_id': 'pornerbros-hd', - 'upload_date': '20130130', - 'timestamp': 1359527401, - 'duration': 1224, - 'view_count': int, - 'categories': list, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.pornerbros.com/embed/181369', - 'only_matching': True, - }, { - 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py deleted file mode 100644 index 63613cb85..000000000 --- a/youtube_dl/extractor/foxnews.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .amp import AMPIE -from .common import InfoExtractor - - -class FoxNewsIE(AMPIE): - IE_NAME = 'foxnews' - IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', - 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', - 'info_dict': { - 'id': '3937480', - 'ext': 'flv', - 'title': 'Frozen in Time', - 'description': '16-year-old girl is size of toddler', - 'duration': 265, - 'timestamp': 1304411491, - 'upload_date': '20110503', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', - 'md5': '5846c64a1ea05ec78175421b8323e2df', - 'info_dict': { - 'id': '3922535568001', - 'ext': 'mp4', - 'title': "Rep. 
Luis Gutierrez on if Obama's immigration plan is legal", - 'description': "Congressman discusses president's plan", - 'duration': 292, - 'timestamp': 1417662047, - 'upload_date': '20141204', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', - 'only_matching': True, - }, - { - 'url': 'http://video.foxbusiness.com/v/4442309889001', - 'only_matching': True, - }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - }, - ] - - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', - webpage)] - - def _real_extract(self, url): - host, video_id = re.match(self._VALID_URL, url).groups() - - info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) - info['id'] = video_id - return info - - -class FoxNewsArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' - IE_NAME = 'foxnews:article' - - _TESTS = [{ - # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', - 'info_dict': { - 'id': '5116295019001', - 'ext': 'mp4', - 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', - 'timestamp': 1473301045, - 'upload_date': '20160908', - }, - }, { - # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', - 'info_dict': { - 'id': '5748266721001', - 'ext': 'flv', - 'title': 'Kyle Kashuv has a positive message for the Trump White House', - 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 229, - 'timestamp': 1520594670, - 'upload_date': '20180309', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_id = self._html_search_regex( - r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', - webpage, 'video ID', group='id', default=None) - if video_id: - return self.url_result( - 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) - - return self.url_result( - FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py deleted file mode 100644 index e4ec2e200..000000000 --- a/youtube_dl/extractor/francetv.py +++ /dev/null @@ -1,546 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import re - -from 
.common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - clean_html, - determine_ext, - ExtractorError, - int_or_none, - parse_duration, - try_get, - url_or_none, - urljoin, -) -from .dailymotion import DailymotionIE - - -class FranceTVBaseInfoExtractor(InfoExtractor): - def _make_url_result(self, video_or_full_id, catalog=None): - full_id = 'francetv:%s' % video_or_full_id - if '@' not in video_or_full_id and catalog: - full_id += '@%s' % catalog - return self.url_result( - full_id, ie=FranceTVIE.ie_key(), - video_id=video_or_full_id.split('@')[0]) - - -class FranceTVIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https?:// - sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\? - .*?\bidDiffusion=[^&]+| - (?: - https?://videos\.francetv\.fr/video/| - francetv: - ) - (?P<id>[^@]+)(?:@(?P<catalog>.+))? - ) - ''' - - _TESTS = [{ - # without catalog - 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', - 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', - 'info_dict': { - 'id': '162311093', - 'ext': 'mp4', - 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1502623500, - 'upload_date': '20170813', - }, - }, { - # with catalog - 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', - 'only_matching': True, - }, { - 'url': 'http://videos.francetv.fr/video/NI_657393@Regions', - 'only_matching': True, - }, { - 'url': 'francetv:162311093', - 'only_matching': True, - }, { - 'url': 'francetv:NI_1004933@Zouzous', - 'only_matching': True, - }, { - 'url': 'francetv:NI_983319@Info-web', - 'only_matching': True, - }, { - 'url': 'francetv:NI_983319', - 'only_matching': True, - }, { - 'url': 'francetv:NI_657393@Regions', - 'only_matching': True, - }, { - # france-3 live - 'url': 'francetv:SIM_France3', - 'only_matching': True, - }] - - def _extract_video(self, video_id, catalogue=None): - # Videos are identified by idDiffusion so catalogue part is optional. - # However when provided, some extra formats may be returned so we pass - # it if available. 
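# [editor's note] A minimal standalone sketch of the request that the comment
# above describes, handy for probing the endpoint outside the extractor. The
# URL, the idDiffusion/catalogue query parameters and the NOK error shape are
# taken from the deleted code just below; whether the service still answers
# this way is an assumption, not a guarantee.
import requests

def fetch_francetv_info(id_diffusion, catalogue=''):
    # Same endpoint as _extract_video below; passing 'catalogue' may unlock
    # extra formats, so forward it whenever it is known.
    resp = requests.get(
        'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
        params={'idDiffusion': id_diffusion, 'catalogue': catalogue},
        timeout=10)
    resp.raise_for_status()
    info = resp.json()
    if info.get('status') == 'NOK':
        # Mirrors the extractor's own error handling below.
        raise RuntimeError(info.get('message', 'unknown error'))
    return info

# Usage (hypothetical): fetch_francetv_info('162311093')['titre']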
- info = self._download_json( - 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', - video_id, 'Downloading video JSON', query={ - 'idDiffusion': video_id, - 'catalogue': catalogue or '', - }) - - if info.get('status') == 'NOK': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, info['message']), - expected=True) - allowed_countries = info['videos'][0].get('geoblocage') - if allowed_countries: - georestricted = True - geo_info = self._download_json( - 'http://geo.francetv.fr/ws/edgescape.json', video_id, - 'Downloading geo restriction info') - country = geo_info['reponse']['geo_info']['country_code'] - if country not in allowed_countries: - raise ExtractorError( - 'The video is not available from your location', - expected=True) - else: - georestricted = False - - def sign(manifest_url, manifest_id): - for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): - signed_url = url_or_none(self._download_webpage( - 'https://%s/esi/TA' % host, video_id, - 'Downloading signed %s manifest URL' % manifest_id, - fatal=False, query={ - 'url': manifest_url, - })) - if signed_url: - return signed_url - return manifest_url - - is_live = None - - videos = [] - - for video in (info.get('videos') or []): - if video.get('statut') != 'ONLINE': - continue - if not video.get('url'): - continue - videos.append(video) - - if not videos: - for device_type in ['desktop', 'mobile']: - fallback_info = self._download_json( - 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, - video_id, 'Downloading fallback %s video JSON' % device_type, query={ - 'device_type': device_type, - 'browser': 'chrome', - }, fatal=False) - - if fallback_info and fallback_info.get('video'): - videos.append(fallback_info['video']) - - formats = [] - for video in videos: - video_url = video.get('url') - if not video_url: - continue - if is_live is None: - is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True - or video.get('is_live') is True - or '/live.francetv.fr/' in video_url) - format_id = video.get('format') - ext = determine_ext(video_url) - if ext == 'f4m': - if georestricted: - # See https://github.com/ytdl-org/youtube-dl/issues/3963 - # m3u8 urls work fine - continue - formats.extend(self._extract_f4m_formats( - sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id=format_id, fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - sign(video_url, format_id), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) - elif video_url.startswith('rtmp'): - formats.append({ - 'url': video_url, - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - }) - else: - if self._is_valid_url(video_url, video_id, format_id): - formats.append({ - 'url': video_url, - 'format_id': format_id, - }) - - self._sort_formats(formats) - - title = info['titre'] - subtitle = info.get('sous_titre') - if subtitle: - title += ' - %s' % subtitle - title = title.strip() - - subtitles = {} - subtitles_list = [{ - 'url': subformat['url'], - 'ext': subformat.get('format'), - } for subformat in info.get('subtitles', []) if subformat.get('url')] - if subtitles_list: - subtitles['fr'] = subtitles_list - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info.get('synopsis')), - 
'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), - 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), - 'is_live': is_live, - 'formats': formats, - 'subtitles': subtitles, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - catalog = mobj.group('catalog') - - if not video_id: - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = qs.get('idDiffusion', [None])[0] - catalog = qs.get('catalogue', [None])[0] - if not video_id: - raise ExtractorError('Invalid URL', expected=True) - - return self._extract_video(video_id, catalog) - - -class FranceTVSiteIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html' - - _TESTS = [{ - 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', - 'info_dict': { - 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', - 'ext': 'mp4', - 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1502623500, - 'upload_date': '20170813', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }, { - # france3 - 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', - 'only_matching': True, - }, { - # france4 - 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', - 'only_matching': True, - }, { - # france5 - 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', - 'only_matching': True, - }, { - # franceo - 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', - 'only_matching': True, - }, { - # france2 live - 'url': 'https://www.france.tv/france-2/direct.html', - 'only_matching': True, - }, { - 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', - 'only_matching': True, - }, { - 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', - 'only_matching': True, - }, { - 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', - 'only_matching': True, - }, { - 'url': 'https://www.france.tv/142749-rouge-sang.html', - 'only_matching': True, - }, { - # france-3 live - 'url': 'https://www.france.tv/france-3/direct.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - catalogue = None - video_id = self._search_regex( - r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'video id', default=None, group='id') - - if not video_id: - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') - - return self._make_url_result(video_id, catalogue) - - -class FranceTVEmbedIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' - - _TESTS = [{ - 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', - 'info_dict': { - 'id': 'NI_983319', - 'ext': 'mp4', - 'title': 'Le Pen Reims', - 'upload_date': '20170505', - 'timestamp': 
1493981780, - 'duration': 16, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, - video_id) - - return self._make_url_result(video['video_id'], video.get('catalog')) - - -class FranceTVInfoIE(FranceTVBaseInfoExtractor): - IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)' - - _TESTS = [{ - 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', - 'info_dict': { - 'id': '84981923', - 'ext': 'mp4', - 'title': 'Soir 3', - 'upload_date': '20130826', - 'timestamp': 1377548400, - 'subtitles': { - 'fr': 'mincount:2', - }, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }, { - 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', - 'only_matching': True, - }, { - 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', - 'only_matching': True, - }, { - 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', - 'only_matching': True, - }, { - # Dailymotion embed - 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', - 'md5': 'ee7f1828f25a648addc90cb2687b1f12', - 'info_dict': { - 'id': 'x4iiko0', - 'ext': 'mp4', - 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', - 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. 
Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', - 'timestamp': 1467011958, - 'upload_date': '20160627', - 'uploader': 'France Inter', - 'uploader_id': 'x2q2ez', - }, - 'add_ie': ['Dailymotion'], - }, { - 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', - 'only_matching': True, - }, { - # "<figure id=" pattern (#28792) - 'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - dailymotion_urls = DailymotionIE._extract_urls(webpage) - if dailymotion_urls: - return self.playlist_result([ - self.url_result(dailymotion_url, DailymotionIE.ie_key()) - for dailymotion_url in dailymotion_urls]) - - video_id = self._search_regex( - (r'player\.load[^;]+src:\s*["\']([^"\']+)', - r'id-video=([^@]+@[^"]+)', - r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), - webpage, 'video id') - - return self._make_url_result(video_id) - - -class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): - IE_NAME = 'sport.francetvinfo.fr' - _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', - 'info_dict': { - 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', - 'ext': 'mp4', - 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', - 'timestamp': 1523639962, - 'upload_date': '20180413', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') - return self._make_url_result(video_id, 'Sport-web') - - -class GenerationWhatIE(InfoExtractor): - IE_NAME = 'france2.fr:generation-what' - _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms', - 'info_dict': { - 'id': 'wtvKYUG45iw', - 'ext': 'mp4', - 'title': 'Generation What - Garde à vous - FRA', - 'uploader': 'Generation What', - 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w', - 'upload_date': '20160411', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://generation-what.francetv.fr/europe/video/present-arms', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - youtube_id = self._search_regex( - r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';", - webpage, 'youtube id') - - return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id) - - -class CultureboxIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689', - 'info_dict': { - 'id': 'EV_134885', - 'ext': 'mp4', - 'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 
5/7', - 'description': 'md5:19c44af004b88219f4daa50fa9a351d4', - 'upload_date': '20180206', - 'timestamp': 1517945220, - 'duration': 5981, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - if ">Ce live n'est plus disponible en replay<" in webpage: - raise ExtractorError( - 'Video %s is not available' % display_id, expected=True) - - video_id, catalogue = self._search_regex( - r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', - webpage, 'video id').split('@') - - return self._make_url_result(video_id, catalogue) - - -class FranceTVJeunesseIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))' - - _TESTS = [{ - 'url': 'https://www.zouzous.fr/heros/simon', - 'info_dict': { - 'id': 'simon', - }, - 'playlist_count': 9, - }, { - 'url': 'https://www.ludo.fr/heros/ninjago', - 'info_dict': { - 'id': 'ninjago', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.zouzous.fr/heros/simon?abc', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - playlist = self._download_json( - '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id) - - if not playlist.get('count'): - raise ExtractorError( - '%s is not available' % playlist_id, expected=True) - - entries = [] - for item in playlist['items']: - identity = item.get('identity') - if identity and isinstance(identity, compat_str): - entries.append(self._make_url_result(identity)) - - return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py deleted file mode 100644 index f1db33fb1..000000000 --- a/youtube_dl/extractor/frontendmasters.py +++ /dev/null @@ -1,263 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - parse_duration, - url_or_none, - urlencode_postdata, -) - - -class FrontendMastersBaseIE(InfoExtractor): - _API_BASE = 'https://api.frontendmasters.com/v1/kabuki' - _LOGIN_URL = 'https://frontendmasters.com/login/' - - _NETRC_MACHINE = 'frontendmasters' - - _QUALITIES = { - 'low': {'width': 480, 'height': 360}, - 'mid': {'width': 1280, 'height': 720}, - 'high': {'width': 1920, 'height': 1080} - } - - def _real_initialize(self): - self._login() - - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post_url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - # Successful login - if any(p in response for p in ( - 'wp-login.php?action=logout', '>Logout')): - return - - error = self._html_search_regex( - 
r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<', - response, 'error message', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') - - -class FrontendMastersPageBaseIE(FrontendMastersBaseIE): - def _download_course(self, course_name, url): - return self._download_json( - '%s/courses/%s' % (self._API_BASE, course_name), course_name, - 'Downloading course JSON', headers={'Referer': url}) - - @staticmethod - def _extract_chapters(course): - chapters = [] - lesson_elements = course.get('lessonElements') - if isinstance(lesson_elements, list): - chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)] - return chapters - - @staticmethod - def _extract_lesson(chapters, lesson_id, lesson): - title = lesson.get('title') or lesson_id - display_id = lesson.get('slug') - description = lesson.get('description') - thumbnail = lesson.get('thumbnail') - - chapter_number = None - index = lesson.get('index') - element_index = lesson.get('elementIndex') - if (isinstance(index, int) and isinstance(element_index, int) - and index < element_index): - chapter_number = element_index - index - chapter = (chapters[chapter_number - 1] - if chapter_number - 1 < len(chapters) else None) - - duration = None - timestamp = lesson.get('timestamp') - if isinstance(timestamp, compat_str): - mobj = re.search( - r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})', - timestamp) - if mobj: - duration = parse_duration(mobj.group('end')) - parse_duration( - mobj.group('start')) - - return { - '_type': 'url_transparent', - 'url': 'frontendmasters:%s' % lesson_id, - 'ie_key': FrontendMastersIE.ie_key(), - 'id': lesson_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'chapter': chapter, - 'chapter_number': chapter_number, - } - - -class FrontendMastersIE(FrontendMastersBaseIE): - _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba', - 'md5': '7f161159710d6b7016a4f4af6fcb05e2', - 'info_dict': { - 'id': 'a2qogef6ba', - 'ext': 'mp4', - 'title': 'a2qogef6ba', - }, - 'skip': 'Requires FrontendMasters account credentials', - }, { - 'url': 'frontendmasters:a2qogef6ba', - 'only_matching': True, - }] - - def _real_extract(self, url): - lesson_id = self._match_id(url) - - source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) - - formats = [] - for ext in ('webm', 'mp4'): - for quality in ('low', 'mid', 'high'): - resolution = self._QUALITIES[quality].copy() - format_id = '%s-%s' % (ext, quality) - format_url = self._download_json( - source_url, lesson_id, - 'Downloading %s source JSON' % format_id, query={ - 'f': ext, - 'r': resolution['height'], - }, headers={ - 'Referer': url, - }, fatal=False)['url'] - - if not format_url: - continue - - f = resolution.copy() - f.update({ - 'url': format_url, - 'ext': ext, - 'format_id': format_id, - }) - formats.append(f) - self._sort_formats(formats) - - subtitles = { - 'en': [{ - 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id), - }] - } - - return { - 'id': lesson_id, - 'title': lesson_id, - 'formats': formats, - 'subtitles': subtitles - } - - -class FrontendMastersLessonIE(FrontendMastersPageBaseIE): - _VALID_URL = 
r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)' - _TEST = { - 'url': 'https://frontendmasters.com/courses/web-development/tools', - 'info_dict': { - 'id': 'a2qogef6ba', - 'display_id': 'tools', - 'ext': 'mp4', - 'title': 'Tools', - 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'chapter': 'Introduction', - 'chapter_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires FrontendMasters account credentials', - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_name, lesson_name = mobj.group('course_name', 'lesson_name') - - course = self._download_course(course_name, url) - - lesson_id, lesson = next( - (video_id, data) - for video_id, data in course['lessonData'].items() - if data.get('slug') == lesson_name) - - chapters = self._extract_chapters(course) - return self._extract_lesson(chapters, lesson_id, lesson) - - -class FrontendMastersCourseIE(FrontendMastersPageBaseIE): - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)' - _TEST = { - 'url': 'https://frontendmasters.com/courses/web-development/', - 'info_dict': { - 'id': 'web-development', - 'title': 'Introduction to Web Development', - 'description': 'md5:9317e6e842098bf725d62360e52d49a6', - }, - 'playlist_count': 81, - 'skip': 'Requires FrontendMasters account credentials', - } - - @classmethod - def suitable(cls, url): - return False if FrontendMastersLessonIE.suitable(url) else super( - FrontendMastersBaseIE, cls).suitable(url) - - def _real_extract(self, url): - course_name = self._match_id(url) - - course = self._download_course(course_name, url) - - chapters = self._extract_chapters(course) - - lessons = sorted( - course['lessonData'].values(), key=lambda data: data['index']) - - entries = [] - for lesson in lessons: - lesson_name = lesson.get('slug') - if not lesson_name: - continue - lesson_id = lesson.get('hash') or lesson.get('statsId') - entries.append(self._extract_lesson(chapters, lesson_id, lesson)) - - title = course.get('title') - description = course.get('description') - - return self.playlist_result(entries, course_name, title, description) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py deleted file mode 100644 index d8f1e169a..000000000 --- a/youtube_dl/extractor/funimation.py +++ /dev/null @@ -1,158 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import random -import string - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - int_or_none, - js_to_json, - ExtractorError, - urlencode_postdata -) - - -class FunimationIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P<id>[^/?#&]+)' - - _NETRC_MACHINE = 'funimation' - _TOKEN = None - - _TESTS = [{ - 'url': 'https://www.funimation.com/shows/hacksign/role-play/', - 'info_dict': { - 'id': '91144', - 'display_id': 'role-play', - 'ext': 'mp4', - 'title': '.hack//SIGN - Role Play', - 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', - 'thumbnail': r're:https?://.*\.jpg', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', - 'info_dict': { - 'id': '210051', - 'display_id': 'broadcast-dub-preview', - 'ext': 'mp4', - 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', - 'thumbnail': 
r're:https?://.*\.(?:jpg|png)', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', - 'only_matching': True, - }, { - # with lang code - 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/', - 'only_matching': True, - }] - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - try: - data = self._download_json( - 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', - None, 'Logging in', data=urlencode_postdata({ - 'username': username, - 'password': password, - })) - self._TOKEN = data['token'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['error'] - raise ExtractorError(error, expected=True) - raise - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def _search_kane(name): - return self._search_regex( - r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name, - webpage, name, default=None) - - title_data = self._parse_json(self._search_regex( - r'TITLE_DATA\s*=\s*({[^}]+})', - webpage, 'title data', default=''), - display_id, js_to_json, fatal=False) or {} - - video_id = title_data.get('id') or self._search_regex([ - r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", - r'<iframe[^>]+src="/player/(\d+)', - ], webpage, 'video_id', default=None) - if not video_id: - player_url = self._html_search_meta([ - 'al:web:url', - 'og:video:url', - 'og:video:secure_url', - ], webpage, fatal=True) - video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id') - - title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage) - series = _search_kane('showName') - if series: - title = '%s - %s' % (series, title) - description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) - - try: - headers = {} - if self._TOKEN: - headers['Authorization'] = 'Token %s' % self._TOKEN - sources = self._download_json( - 'https://www.funimation.com/api/showexperience/%s/' % video_id, - video_id, headers=headers, query={ - 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), - })['items'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read(), video_id)['errors'][0] - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, error.get('detail') or error.get('title')), expected=True) - raise - - formats = [] - for source in sources: - source_url = source.get('src') - if not source_url: - continue - source_type = source.get('videoType') or determine_ext(source_url) - if source_type == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': source_type, - 'url': source_url, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage), - 'series': series, - 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')), - 'episode_number': int_or_none(title_data.get('episodeNum')), - 'episode': episode, - 'season_id': title_data.get('seriesId'), - 'formats': formats, - } diff 
--git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py deleted file mode 100644 index 81d1949fd..000000000 --- a/youtube_dl/extractor/funk.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .nexx import NexxIE -from ..utils import ( - int_or_none, - str_or_none, -) - - -class FunkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', - 'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81', - 'info_dict': { - 'id': '1155821', - 'ext': 'mp4', - 'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2', - 'description': 'md5:a691d0413ef4835588c5b03ded670c1f', - 'timestamp': 1514507395, - 'upload_date': '20171229', - }, - - }, { - 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id, nexx_id = re.match(self._VALID_URL, url).groups() - video = self._download_json( - 'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id) - return { - '_type': 'url_transparent', - 'url': 'nexx:741:' + nexx_id, - 'ie_key': NexxIE.ie_key(), - 'id': nexx_id, - 'title': video.get('title'), - 'description': video.get('description'), - 'duration': int_or_none(video.get('duration')), - 'channel_id': str_or_none(video.get('channelId')), - 'display_id': display_id, - 'tags': video.get('tags'), - 'thumbnail': video.get('imageUrlLandscape'), - } diff --git a/youtube_dl/extractor/gaia.py b/youtube_dl/extractor/gaia.py deleted file mode 100644 index e9527758f..000000000 --- a/youtube_dl/extractor/gaia.py +++ /dev/null @@ -1,130 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) -from ..utils import ( - ExtractorError, - int_or_none, - str_or_none, - strip_or_none, - try_get, - urlencode_postdata, -) - - -class GaiaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gaia\.com/video/(?P<id>[^/?]+).*?\bfullplayer=(?P<type>feature|preview)' - _TESTS = [{ - 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=feature', - 'info_dict': { - 'id': '89356', - 'ext': 'mp4', - 'title': 'Connecting with Universal Consciousness', - 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f', - 'upload_date': '20151116', - 'timestamp': 1447707266, - 'duration': 936, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=preview', - 'info_dict': { - 'id': '89351', - 'ext': 'mp4', - 'title': 'Connecting with Universal Consciousness', - 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f', - 'upload_date': '20151116', - 'timestamp': 1447707266, - 'duration': 53, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - _NETRC_MACHINE = 'gaia' - _jwt = None - - def _real_initialize(self): - auth = self._get_cookies('https://www.gaia.com/').get('auth') - if auth: - auth = self._parse_json( - compat_urllib_parse_unquote(auth.value), - None, fatal=False) - if not auth: - username, password = self._get_login_info() - if username is None: - return - auth = self._download_json( - 'https://auth.gaia.com/v1/login', - None, data=urlencode_postdata({ - 
'username': username, - 'password': password - })) - if auth.get('success') is False: - raise ExtractorError(', '.join(auth['messages']), expected=True) - if auth: - self._jwt = auth.get('jwt') - - def _real_extract(self, url): - display_id, vtype = re.search(self._VALID_URL, url).groups() - node_id = self._download_json( - 'https://brooklyn.gaia.com/pathinfo', display_id, query={ - 'path': 'video/' + display_id, - })['id'] - node = self._download_json( - 'https://brooklyn.gaia.com/node/%d' % node_id, node_id) - vdata = node[vtype] - media_id = compat_str(vdata['nid']) - title = node['title'] - - headers = None - if self._jwt: - headers = {'Authorization': 'Bearer ' + self._jwt} - media = self._download_json( - 'https://brooklyn.gaia.com/media/' + media_id, - media_id, headers=headers) - formats = self._extract_m3u8_formats( - media['mediaUrls']['bcHLS'], media_id, 'mp4') - self._sort_formats(formats) - - subtitles = {} - text_tracks = media.get('textTracks', {}) - for key in ('captions', 'subtitles'): - for lang, sub_url in text_tracks.get(key, {}).items(): - subtitles.setdefault(lang, []).append({ - 'url': sub_url, - }) - - fivestar = node.get('fivestar', {}) - fields = node.get('fields', {}) - - def get_field_value(key, value_key='value'): - return try_get(fields, lambda x: x[key][0][value_key]) - - return { - 'id': media_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'description': strip_or_none(get_field_value('body') or get_field_value('teaser')), - 'timestamp': int_or_none(node.get('created')), - 'subtitles': subtitles, - 'duration': int_or_none(vdata.get('duration')), - 'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])), - 'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])), - 'comment_count': int_or_none(node.get('comment_count')), - 'series': try_get(node, lambda x: x['series']['title'], compat_str), - 'season_number': int_or_none(get_field_value('season')), - 'season_id': str_or_none(get_field_value('series_nid', 'nid')), - 'episode_number': int_or_none(get_field_value('episode')), - } diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py deleted file mode 100644 index f00dab2f3..000000000 --- a/youtube_dl/extractor/gamestar.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - remove_end, -) - - -class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?game(?P<site>pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' - _TESTS = [{ - 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', - 'md5': 'ee782f1f8050448c95c5cacd63bc851c', - 'info_dict': { - 'id': '76110', - 'ext': 'mp4', - 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406542380, - 'upload_date': '20140728', - 'duration': 17, - } - }, { - 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', - 'only_matching': True, - }, { - 'url': 'http://www.gamestar.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - 
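# [editor's note] Annotation, not part of the original file: the TODO a few
# lines below observes that GameStar pages carry several ld+json blocks while
# _search_json_ld only parses the first one. A sketch of one way to scan all
# of them and pick out the VideoObject, using only the standard library; the
# regex is a looser variant of the one used below and is an approximation.
import json
import re

def find_video_object(webpage):
    for mobj in re.finditer(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(.+?)</script>',
            webpage):
        try:
            data = json.loads(mobj.group(2))
        except ValueError:
            continue
        # Top-level ld+json may be a single object or a list of objects.
        for item in (data if isinstance(data, list) else [data]):
            if isinstance(item, dict) and item.get('@type') == 'VideoObject':
                return item
    return None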
mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - # TODO: there are multiple ld+json objects in the webpage, - # while _search_json_ld finds only the first one - json_ld = self._parse_json(self._search_regex( - r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>', - webpage, 'JSON-LD', group='json_ld'), video_id) - info_dict = self._json_ld(json_ld, video_id) - info_dict['title'] = remove_end( - info_dict['title'], ' - Game%s' % site.title()) - - view_count = int_or_none(json_ld.get('interactionCount')) - comment_count = int_or_none(self._html_search_regex( - r'<span>Kommentare</span>\s*<span[^>]+class=["\']count[^>]+>\s*\(\s*([0-9]+)', - webpage, 'comment count', fatal=False)) - - info_dict.update({ - 'id': video_id, - 'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id, - 'ext': 'mp4', - 'view_count': view_count, - 'comment_count': comment_count - }) - - return info_dict diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py deleted file mode 100644 index 1726a6704..000000000 --- a/youtube_dl/extractor/gaskrank.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - unified_strdate, -) - - -class GaskrankIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.htm' - _TESTS = [{ - 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', - 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', - 'info_dict': { - 'id': '201601/26955', - 'ext': 'mp4', - 'title': 'Strike! 
Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['motorrad-fun'], - 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', - 'uploader_id': 'Bikefun', - 'upload_date': '20170110', - 'uploader_url': None, - } - }, { - 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', - 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', - 'info_dict': { - 'id': '201106/15920', - 'ext': 'mp4', - 'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['racing'], - 'display_id': 'isle-of-man-tt-2011-michael-du-15920', - 'uploader_id': 'IOM', - 'upload_date': '20170523', - 'uploader_url': 'www.iomtt.com', - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'title', webpage, fatal=True) - - categories = [re.match(self._VALID_URL, url).group('categories')] - - mobj = re.search( - r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', - webpage) - if mobj is not None: - uploader_id = mobj.groupdict().get('uploader_id') - upload_date = unified_strdate(mobj.groupdict().get('upload_date')) - - uploader_url = self._search_regex( - r'Homepage:\s*<[^>]*>(?P<uploader_url>[^<]*)', - webpage, 'uploader_url', default=None) - tags = re.findall( - r'/tv/tags/[^/]+/"\s*>(?P<tag>[^<]*?)<', - webpage) - - view_count = self._search_regex( - r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P<view_count>[0-9\.]*)', - webpage, 'view_count', default=None) - if view_count: - view_count = int_or_none(view_count.replace('.', '')) - - average_rating = self._search_regex( - r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P<average_rating>[0-9,]+)', - webpage, 'average_rating') - if average_rating: - average_rating = float_or_none(average_rating.replace(',', '.')) - - video_id = self._search_regex( - r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', - webpage, 'video id', default=display_id) - - entry = self._parse_html5_media_entries(url, webpage, video_id)[0] - entry.update({ - 'id': video_id, - 'title': title, - 'categories': categories, - 'display_id': display_id, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'uploader_url': uploader_url, - 'tags': tags, - 'view_count': view_count, - 'average_rating': average_rating, - }) - self._sort_formats(entry['formats']) - - return entry diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py deleted file mode 100644 index 57c67a451..000000000 --- a/youtube_dl/extractor/gazeta.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class GazetaIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' - _TESTS = [{ - 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', - 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', - 'info_dict': { - 'id': '205566', - 'ext': 'mp4', - 'title': '«70–80 процентов гражданских в Донецке на грани голода»', - 'description': 'md5:38617526050bd17b234728e7f9620a71', - 'thumbnail': r're:^https?://.*\.jpg', - }, - 'skip': 'video not found', - }, { - 'url': 
'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', - 'only_matching': True, - }, { - 'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml', - 'md5': '37f19f78355eb2f4256ee1688359f24c', - 'info_dict': { - 'id': '252048', - 'ext': 'mp4', - 'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"', - }, - 'add_ie': ['EaglePlatform'], - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - display_id = mobj.group('id') - embed_url = '%s?p=embed' % mobj.group('url') - embed_page = self._download_webpage( - embed_url, display_id, 'Downloading embed page') - - video_id = self._search_regex( - r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') - - return self.url_result( - 'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py deleted file mode 100644 index acc6478b8..000000000 --- a/youtube_dl/extractor/gdcvault.py +++ /dev/null @@ -1,220 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .kaltura import KalturaIE -from ..utils import ( - HEADRequest, - remove_start, - sanitized_Request, - smuggle_url, - urlencode_postdata, -) - - -class GDCVaultIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?' - _NETRC_MACHINE = 'gdcvault' - _TESTS = [ - { - 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', - 'md5': '7ce8388f544c88b7ac11c7ab1b593704', - 'info_dict': { - 'id': '201311826596_AWNY', - 'display_id': 'Doki-Doki-Universe-Sweet-Simple', - 'ext': 'mp4', - 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' - } - }, - { - 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', - 'info_dict': { - 'id': '201203272_1330951438328RSXR', - 'display_id': 'Embracing-the-Dark-Art-of', - 'ext': 'flv', - 'title': 'Embracing the Dark Art of Mathematical Modeling in AI' - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, - { - 'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or', - 'md5': 'a5eb77996ef82118afbbe8e48731b98e', - 'info_dict': { - 'id': '1015301', - 'display_id': 'Thexder-Meets-Windows-95-or', - 'ext': 'flv', - 'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment', - }, - 'skip': 'Requires login', - }, - { - 'url': 'http://gdcvault.com/play/1020791/', - 'only_matching': True, - }, - { - # Hard-coded hostname - 'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface', - 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', - 'info_dict': { - 'id': '840376_BQRC', - 'ext': 'mp4', - 'display_id': 'Tenacious-Design-and-The-Interface', - 'title': 'Tenacious Design and The Interface of \'Destiny\'', - }, - }, - { - # Multiple audios - 'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC', - 'info_dict': { - 'id': '12396_1299111843500GMPX', - 'ext': 'mp4', - 'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man', - }, - # 'params': { - # 'skip_download': True, # Requires rtmpdump - # 'format': 'jp', # The japanese audio - # } - }, - { - # gdc-player.html - 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', - 'info_dict': { - 'id': '9350_1238021887562UHXB', - 'display_id': 'An-American-engine-in-Tokyo', - 
'ext': 'mp4', - 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', - }, - }, - { - # Kaltura Embed - 'url': 'https://www.gdcvault.com/play/1026180/Mastering-the-Apex-of-Scaling', - 'info_dict': { - 'id': '0_h1fg8j3p', - 'ext': 'mp4', - 'title': 'Mastering the Apex of Scaling Game Servers (Presented by Multiplay)', - 'timestamp': 1554401811, - 'upload_date': '20190404', - 'uploader_id': 'joe@blazestreaming.com', - }, - 'params': { - 'format': 'mp4-408', - }, - }, - { - # Kaltura embed, whitespace between quote and embedded URL in iframe's src - 'url': 'https://www.gdcvault.com/play/1025699', - 'info_dict': { - 'id': '0_zagynv0a', - 'ext': 'mp4', - 'title': 'Tech Toolbox', - 'upload_date': '20190408', - 'uploader_id': 'joe@blazestreaming.com', - 'timestamp': 1554764629, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # HTML5 video - 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', - 'only_matching': True, - }, - ] - - def _login(self, webpage_url, display_id): - username, password = self._get_login_info() - if username is None or password is None: - self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') - return None - - mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url) - login_url = mobj.group('root_url') + 'api/login.php' - logout_url = mobj.group('root_url') + 'logout' - - login_form = { - 'email': username, - 'password': password, - } - - request = sanitized_Request(login_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(request, display_id, 'Logging in') - start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') - self._download_webpage(logout_url, display_id, 'Logging out') - - return start_page - - def _real_extract(self, url): - video_id, name = re.match(self._VALID_URL, url).groups() - display_id = name or video_id - - webpage_url = 'http://www.gdcvault.com/play/' + video_id - start_page = self._download_webpage(webpage_url, display_id) - - direct_url = self._search_regex( - r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', - start_page, 'url', default=None) - if direct_url: - title = self._html_search_regex( - r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>', - start_page, 'title') - video_url = 'http://www.gdcvault.com' + direct_url - # resolve the url so that we can detect the correct extension - video_url = self._request_webpage( - HEADRequest(video_url), video_id).geturl() - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - } - - embed_url = KalturaIE._extract_url(start_page) - if embed_url: - embed_url = smuggle_url(embed_url, {'source_url': url}) - ie_key = 'Kaltura' - else: - PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' - - xml_root = self._html_search_regex( - PLAYER_REGEX, start_page, 'xml root', default=None) - if xml_root is None: - # Probably need to authenticate - login_res = self._login(webpage_url, display_id) - if login_res is None: - self.report_warning('Could not login.') - else: - start_page = login_res - # Grab the url from the authenticated page - xml_root = self._html_search_regex( - PLAYER_REGEX, start_page, 'xml root') - - xml_name = self._html_search_regex( - r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>', - 
start_page, 'xml filename', default=None) - if not xml_name: - info = self._parse_html5_media_entries(url, start_page, video_id)[0] - info.update({ - 'title': remove_start(self._search_regex( - r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page, - 'title', default=None) or self._og_search_title( - start_page, default=None), 'GDC Vault - '), - 'id': video_id, - 'display_id': display_id, - }) - return info - embed_url = '%s/xml/%s' % (xml_root, xml_name) - ie_key = 'DigitallySpeaking' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'display_id': display_id, - 'url': embed_url, - 'ie_key': ie_key, - } diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py deleted file mode 100644 index 6c4153b40..000000000 --- a/youtube_dl/extractor/gedidigital.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, -) - - -class GediDigitalIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://video\. - (?: - (?: - (?:espresso\.)?repubblica - |lastampa - |ilsecoloxix - )| - (?: - iltirreno - |messaggeroveneto - |ilpiccolo - |gazzettadimantova - |mattinopadova - |laprovinciapavese - |tribunatreviso - |nuovavenezia - |gazzettadimodena - |lanuovaferrara - |corrierealpi - |lasentinella - )\.gelocal - )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)''' - _TESTS = [{ - 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', - 'md5': '84658d7fb9e55a6e57ecc77b73137494', - 'info_dict': { - 'id': '121559', - 'ext': 'mp4', - 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', - 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', - 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', - 'duration': 125, - }, - }, { - 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', - 'only_matching': True, - }, { - 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', - 'only_matching': True, - }, { - 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', - 'only_matching': True, - }, { - 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', - 'only_matching': True, - }, { - 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', - 'only_matching': True, - }, { - 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', - 'only_matching': True, - }, { - 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', - 'only_matching': True, - }, { - 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', - 'only_matching': True, - }, { - 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', - 'only_matching': True, - }, { - 'url': 
'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', - 'only_matching': True, - }, { - 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', - 'only_matching': True, - }, { - 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', - 'only_matching': True, - }, { - 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', - 'only_matching': True, - }, { - 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', - 'only_matching': True, - }, { - 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', - 'only_matching': True, - }, { - 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._html_search_meta( - ['twitter:title', 'og:title'], webpage, fatal=True) - player_data = re.findall( - r"PlayerFactory\.setParam\('(?P<type>format|param)',\s*'(?P<name>[^']+)',\s*'(?P<val>[^']+)'\);", - webpage) - - formats = [] - duration = thumb = None - for t, n, v in player_data: - if t == 'format': - if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): - continue - elif n.endswith('-vod-ak'): - formats.extend(self._extract_akamai_formats( - v, video_id, {'http': 'media.gedidigital.it'})) - else: - ext = determine_ext(v) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) - continue - f = { - 'format_id': n, - 'url': v, - } - if ext == 'mp3': - abr = int_or_none(self._search_regex( - r'-mp3-audio-(\d+)', v, 'abr', default=None)) - f.update({ - 'abr': abr, - 'tbr': abr, - 'vcodec': 'none' - }) - else: - mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) - if mobj: - f.update({ - 'height': int(mobj.group(1)), - 'vbr': int_or_none(mobj.group(2)), - }) - if not f.get('vbr'): - f['vbr'] = int_or_none(self._search_regex( - r'-video-rrtv-(\d+)', v, 'abr', default=None)) - formats.append(f) - elif t == 'param': - if n in ['image_full', 'image']: - thumb = v - elif n == 'videoDuration': - duration = int_or_none(v) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': self._html_search_meta( - ['twitter:description', 'og:description', 'description'], webpage), - 'thumbnail': thumb or self._og_search_thumbnail(webpage), - 'formats': formats, - 'duration': duration, - } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py deleted file mode 100644 index a9c064105..000000000 --- a/youtube_dl/extractor/generic.py +++ /dev/null @@ -1,3566 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import os -import re -import sys - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - HEADRequest, - int_or_none, - is_html, - js_to_json, - 
KNOWN_EXTENSIONS, - merge_dicts, - mimetype2ext, - orderedSet, - parse_duration, - sanitized_Request, - smuggle_url, - unescapeHTML, - unified_timestamp, - unsmuggle_url, - UnsupportedError, - url_or_none, - xpath_attr, - xpath_text, - xpath_with_ns, -) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .mofosex import MofosexEmbedIE -from .spankwire import SpankwireIE -from .youporn import YouPornIE -from .vimeo import ( - VimeoIE, - VHXEmbedIE, -) -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudEmbedIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .arte import ArteTVEmbedIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .teachable import TeachableIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE -from .viqeo import ViqeoIE -from .expressen import ExpressenIE -from .zype import ZypeIE -from .odnoklassniki import OdnoklassnikiIE -from .vk import VKIE -from .kinja import KinjaEmbedIE -from .arcpublishing import ArcPublishingIE -from .medialaan import MedialaanIE -from .simplecast import SimplecastIE - - -class GenericIE(InfoExtractor): - IE_DESC = 'Generic downloader that works on some sites' - _VALID_URL = r'.*' - IE_NAME = 'generic' - _TESTS = [ - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, 
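# [editor's note] Annotation, not original code: the direct-link tests in this
# list exercise GenericIE's last-resort path, which probes the URL itself and
# keys off the reported Content-Type when no dedicated extractor matches. A
# rough sketch of that probe, assuming a well-behaved server; the real
# extractor is more careful (it falls back to GET when HEAD is broken, copes
# with wrong MIME types, and sniffs the payload with is_html).
import requests

def looks_like_direct_media(url):
    head = requests.head(url, allow_redirects=True, timeout=10)
    ctype = head.headers.get('Content-Type', '').split(';')[0].strip().lower()
    return ctype.startswith(('audio/', 'video/')) or ctype == 'application/ogg'

# e.g. looks_like_direct_media('http://media.w3.org/2010/05/sintel/trailer.mp4')
# should return True for the first test above.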
- # Direct link to media delivered compressed (until Accept-Encoding is *) - { - 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', - 'md5': '128c42e68b13950268b648275386fc74', - 'info_dict': { - 'id': 'FictionJunction-Parallel_Hearts', - 'ext': 'flac', - 'title': 'FictionJunction-Parallel_Hearts', - 'upload_date': '20140522', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ], - 'skip': 'URL invalid', - }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented', - r'400.*Bad Request', - ], - }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'title': 'MSNBC Rachel Maddow (video)', - 'description': 're:.*her unique approach to storytelling.*', - }, - 'playlist': [{ - 'info_dict': { - 'ext': 'mov', - 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335', - 'title': 're:MSNBC Rachel Maddow', - 'description': 're:.*her unique approach to storytelling.*', - 'timestamp': int, - 'upload_date': compat_str, - 'duration': float, - }, - }], - }, - # RSS feed with item with description and thumbnails - { - 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', - 'info_dict': { - 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', - 'title': 're:.*100% Hydrogen.*', - 'description': 're:.*In this episode.*', - }, - 'playlist': [{ - 'info_dict': { - 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', - 'title': 're:Hydrogen!', - 'description': 're:.*In this episode we are going.*', - 'timestamp': 1567977776, - 'upload_date': '20190908', - 'duration': 459, - 'thumbnail': r're:^https?://.*\.jpg$', - 'episode_number': 1, - 'season_number': 1, - 'age_limit': 0, - }, - }], - 'params': { - 'skip_download': True, - }, - }, - # RSS feed with enclosures and unsupported link URLs - { - 'url': 'http://www.hellointernet.fm/podcast?format=rss', - 'info_dict': { - 'id': 'http://www.hellointernet.fm/podcast?format=rss', - 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', - 'title': 'Hello Internet', - }, - 'playlist_mincount': 100, - }, - # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng - { - 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', - 'info_dict': { - 'id': 'smil', - 'ext': 'mp4', - 'title': 'Automatics, robotics and biocybernetics', - 'description': 
'md5:815fc1deb6b3a2bff99de2d5325be482', - 'upload_date': '20130627', - 'formats': 'mincount:16', - 'subtitles': 'mincount:1', - }, - 'params': { - 'force_generic_extractor': True, - 'skip_download': True, - }, - }, - # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html - { - 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', - 'info_dict': { - 'id': 'hds', - 'ext': 'flv', - 'title': 'hds', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from https://www.restudy.dk/video/play/id/1637 - { - 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', - 'info_dict': { - 'id': 'video_1637', - 'ext': 'flv', - 'title': 'video_1637', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm - { - 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', - 'info_dict': { - 'id': 'smil-service', - 'ext': 'flv', - 'title': 'smil-service', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 - { - 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', - 'info_dict': { - 'id': '4719370', - 'ext': 'mp4', - 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html - { - 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', - 'info_dict': { - 'id': 'mZlp2ctYIUEB', - 'ext': 'mp4', - 'title': 'Tikibad ontruimd wegens brand', - 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 33, - }, - 'params': { - 'skip_download': True, - }, - }, - # MPD from http://dash-mse-test.appspot.com/media.html - { - 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', - 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', - 'info_dict': { - 'id': 'car-20120827-manifest', - 'ext': 'mp4', - 'title': 'car-20120827-manifest', - 'formats': 'mincount:9', - 'upload_date': '20130904', - }, - 'params': { - 'format': 'bestvideo', - }, - }, - # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 - { - 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', - 'info_dict': { - 'id': 'content', - 'ext': 'mp4', - 'title': 'content', - 'formats': 'mincount:8', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # m3u8 served with Content-Type: text/plain - { - 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', - 'info_dict': { - 'id': 'index', - 'ext': 'mp4', - 'title': 'index', - 'upload_date': '20140720', - 'formats': 'mincount:11', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', 
- 'description': r're:^Chris Ziegler takes a look at the\.*',
- 'uploader': 'The Verge',
- 'title': 'First Firefox OS phones side-by-side',
- },
- 'params': {
- 'skip_download': False,
- }
- },
- {
- # redirect in Refresh HTTP header
- 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
- 'info_dict': {
- 'id': 'pO8h3EaFRdo',
- 'ext': 'mp4',
- 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
- 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
- 'upload_date': '20150917',
- 'uploader_id': 'brtvofficial',
- 'uploader': 'Boiler Room',
- },
- 'params': {
- 'skip_download': False,
- },
- },
- {
- 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
- 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
- 'info_dict': {
- 'id': '13601338388002',
- 'ext': 'mp4',
- 'uploader': 'www.hodiho.fr',
- 'title': 'R\u00e9gis plante sa Jeep',
- }
- },
- # bandcamp page with custom domain
- {
- 'add_ie': ['Bandcamp'],
- 'url': 'http://bronyrock.com/track/the-pony-mash',
- 'info_dict': {
- 'id': '3235767654',
- 'ext': 'mp3',
- 'title': 'The Pony Mash',
- 'uploader': 'M_Pallante',
- },
- 'skip': 'There is a limit of 200 free downloads / month for the test song',
- },
- {
- # embedded brightcove video
- # it also tests brightcove videos that need to set the 'Referer'
- # in the http requests
- 'add_ie': ['BrightcoveLegacy'],
- 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
- 'info_dict': {
- 'id': '2765128793001',
- 'ext': 'mp4',
- 'title': 'Le cours de bourse : l’analyse technique',
- 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
- 'uploader': 'BFM BUSINESS',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # embedded with itemprop embedURL and video id spelled as `idVideo`
- 'add_ie': ['BrightcoveLegacy'],
- 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/',
- 'info_dict': {
- 'id': '5255628253001',
- 'ext': 'mp4',
- 'title': 'md5:37c519b1128915607601e75a87995fc0',
- 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26',
- 'uploader': 'BFM BUSINESS',
- 'uploader_id': '876450612001',
- 'timestamp': 1482255315,
- 'upload_date': '20161220',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # https://github.com/ytdl-org/youtube-dl/issues/2253
- 'url': 'http://bcove.me/i6nfkrc3',
- 'md5': '0ba9446db037002366bab3b3eb30c88c',
- 'info_dict': {
- 'id': '3101154703001',
- 'ext': 'mp4',
- 'title': 'Still no power',
- 'uploader': 'thestar.com',
- 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
- },
- 'add_ie': ['BrightcoveLegacy'],
- 'skip': 'video gone',
- },
- {
- 'url': 'http://www.championat.com/video/football/v/87/87499.html',
- 'md5': 'fb973ecf6e4a78a67453647444222983',
- 'info_dict': {
- 'id': '3414141473001',
- 'ext': 'mp4',
- 'title': 'Видео. 
Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in <iframe> - 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', - 'md5': '36d74ef5e37c8b4a2ce92880d208b968', - 'info_dict': { - 'id': '5360463607001', - 'ext': 'mp4', - 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', - 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', - 'uploader': 'United Nations', - 'uploader_id': '1362235914001', - 'timestamp': 1489593889, - 'upload_date': '20170315', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - { - # Brightcove with alternative playerID key - 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', - 'info_dict': { - 'id': 'nmeth.2062_SV1', - 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', - }, - 'playlist': [{ - 'info_dict': { - 'id': '2228375078001', - 'ext': 'mp4', - 'title': 'nmeth.2062-sv1', - 'description': 'nmeth.2062-sv1', - 'timestamp': 1363357591, - 'upload_date': '20130315', - 'uploader': 'Nature Publishing Group', - 'uploader_id': '1964492299001', - }, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - 'skip': 'video rotates...weekly?', - }, - { - # Brightcove:new type [2]. - 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', - 'md5': '2b35148fcf48da41c9fb4591650784f3', - 'info_dict': { - 'id': '5348741021001', - 'ext': 'mp4', - 'upload_date': '20170306', - 'uploader_id': '4191638492001', - 'timestamp': 1488769918, - 'title': 'VIDEO: St. 
Thomas More earns first trip to basketball semis',
-
- },
- },
- {
- # Alternative brightcove <video> attributes
- 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
- 'info_dict': {
- 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
- 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
- },
- 'playlist': [{
- 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
- 'info_dict': {
- 'id': '5311302538001',
- 'ext': 'mp4',
- 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
- 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
- 'timestamp': 1486321708,
- 'upload_date': '20170205',
- 'uploader_id': '800000640001',
- },
- 'only_matching': True,
- }],
- },
- # ooyala video
- {
- 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
- 'info_dict': {
- 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
- 'ext': 'mp4',
- 'title': '2cc213299525360.mov', # that's what we get
- 'duration': 238.231,
- },
- 'add_ie': ['Ooyala'],
- },
- {
- # ooyala video embedded with http://player.ooyala.com/iframe.js
- 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
- 'info_dict': {
- 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
- 'ext': 'mp4',
- 'title': '"Steve Jobs: Man in the Machine" trailer',
- 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
- 'duration': 135.427,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'movie expired',
- },
- # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
- {
- 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
- 'info_dict': {
- 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
- 'ext': 'mp4',
- 'title': 'Steampunk Fest Comes to Honesdale',
- 'duration': 43.276,
- },
- 'params': {
- 'skip_download': True,
- }
- },
- # embed.ly video
- {
- 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
- 'info_dict': {
- 'id': '9ODmcdjQcHQ',
- 'ext': 'mp4',
- 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
- 'upload_date': '20140225',
- 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
- 'uploader': 'Tested',
- 'uploader_id': 'testedcom',
- },
- # No need to test YoutubeIE here
- 'params': {
- 'skip_download': True,
- },
- },
- # funnyordie embed
- {
- 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
- 'info_dict': {
- 'id': '18e820ec3f',
- 'ext': 'mp4',
- 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
- 'description': 'Episode 
18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', - }, - # HEAD requests lead to endless 301, while GET is OK - 'expected_warnings': ['301'], - }, - # RUTV embed - { - 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', - 'info_dict': { - 'id': '776940', - 'ext': 'mp4', - 'title': 'Охотское море стало целиком российским', - 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # TVC embed - { - 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', - 'info_dict': { - 'id': '55304', - 'ext': 'mp4', - 'title': 'Дошкольное воспитание', - }, - }, - # SportBox embed - { - 'url': 'http://www.vestifinance.ru/articles/25753', - 'info_dict': { - 'id': '25753', - 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', - }, - 'playlist': [{ - 'info_dict': { - 'id': '370908', - 'title': 'Госзаказ. День 3', - 'ext': 'mp4', - } - }, { - 'info_dict': { - 'id': '370905', - 'title': 'Госзаказ. День 2', - 'ext': 'mp4', - } - }, { - 'info_dict': { - 'id': '370902', - 'title': 'Госзаказ. День 1', - 'ext': 'mp4', - } - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # Myvi.ru embed - { - 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', - 'info_dict': { - 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', - 'ext': 'mp4', - 'title': 'Ужастики, русский трейлер (2015)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 153, - } - }, - # XHamster embed - { - 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', - 'info_dict': { - 'id': 'showthread', - 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', - }, - 'playlist_mincount': 7, - # This forum does not allow <iframe> syntaxes anymore - # Now HTML tags are displayed as-is - 'skip': 'No videos on this page', - }, - # Embedded TED video - { - 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': '65fdff94098e4a607385a60c5177c638', - 'info_dict': { - 'id': '1969', - 'ext': 'mp4', - 'title': 'Hidden miracles of the natural world', - 'uploader': 'Louie Schwartzberg', - 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', - } - }, - # nowvideo embed hidden behind percent encoding - { - 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', - 'md5': '2baf4ddd70f697d94b1c18cf796d5107', - 'info_dict': { - 'id': '06e53103ca9aa', - 'ext': 'flv', - 'title': 'Macross Episode 001 Watch Macross Episode 001 onl', - 'description': 'No description', - }, - }, - # arte embed - { - 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html', - 'md5': '7653032cbb25bf6c80d80f217055fa43', - 'info_dict': { - 'id': '048195-004_PLUS7-F', - 'ext': 'flv', - 'title': 'X:enius', - 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168', - 'upload_date': '20140320', - }, - 'params': { - 'skip_download': 'Requires rtmpdump' - }, - 'skip': 'video gone', - }, - # francetv embed - { - 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'mp4', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 
'timestamp': 1424989860, - 'duration': 5400, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'expected_warnings': [ - 'Forbidden' - ] - }, - # Condé Nast embed - { - 'url': 'http://www.wired.com/2014/04/honda-asimo/', - 'md5': 'ba0dfe966fa007657bd1443ee672db0f', - 'info_dict': { - 'id': '53501be369702d3275860000', - 'ext': 'mp4', - 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', - } - }, - # Dailymotion embed - { - 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/', - 'md5': '441aeeb82eb72c422c7f14ec533999cd', - 'info_dict': { - 'id': 'k2mm4bCdJ6CQ2i7c8o2', - 'ext': 'mp4', - 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', - 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', - 'uploader': 'Spi0n', - 'uploader_id': 'xgditw', - 'upload_date': '20140425', - 'timestamp': 1398441542, - }, - 'add_ie': ['Dailymotion'], - }, - # DailyMail embed - { - 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', - 'info_dict': { - 'id': '1495629', - 'ext': 'mp4', - 'title': 'Care worker punches elderly dementia patient in head 11 times', - 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', - }, - 'add_ie': ['DailyMail'], - 'params': { - 'skip_download': True, - }, - }, - # YouTube embed - { - 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', - 'info_dict': { - 'id': 'FXRb4ykk4S0', - 'ext': 'mp4', - 'title': 'The NBL Auction 2014', - 'uploader': 'BADMINTON England', - 'uploader_id': 'BADMINTONEvents', - 'upload_date': '20140603', - 'description': 'md5:9ef128a69f1e262a700ed83edb163a73', - }, - 'add_ie': ['Youtube'], - 'params': { - 'skip_download': True, - } - }, - # MTVServices embed - { - 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', - 'md5': 'ca1aef97695ef2c1d6973256a57e5252', - 'info_dict': { - 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1', - 'ext': 'mp4', - 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', - 'description': 'Two valets share their love for movie star Liam Neesons.', - 'timestamp': 1349922600, - 'upload_date': '20121011', - }, - }, - # YouTube embed via <data-embed-url=""> - { - 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', - 'info_dict': { - 'id': '4vAffPZIT44', - 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', - 'uploader': 'Gameloft', - 'uploader_id': 'gameloft', - 'upload_date': '20140828', - 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', - }, - 'params': { - 'skip_download': True, - } - }, - # YouTube <object> embed - { - 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', - 'md5': '516718101ec834f74318df76259fb3cc', - 'info_dict': { - 'id': 'msN87y-iEx0', - 'ext': 'webm', - 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', - 'upload_date': '20080526', - 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', - 'uploader': 'Christopher Sykes', - 'uploader_id': 'ChristopherJSykes', - }, - 'add_ie': ['Youtube'], - }, - # Camtasia studio - { - 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', - 'playlist': [{ - 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', - 'ext': 'flv', - 'duration': 2235.90, - } - }, { - 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', - 'info_dict': { - 'id': 
'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', - 'ext': 'flv', - 'duration': 2235.93, - } - }], - 'info_dict': { - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - } - }, - # Flowplayer - { - 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', - 'md5': '9d65602bf31c6e20014319c7d07fba27', - 'info_dict': { - 'id': '5123ea6d5e5a7', - 'ext': 'mp4', - 'age_limit': 18, - 'uploader': 'www.handjobhub.com', - 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', - } - }, - # Multiple brightcove videos - # https://github.com/ytdl-org/youtube-dl/issues/2283 - { - 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', - 'info_dict': { - 'id': 'always-never', - 'title': 'Always / Never - The New Yorker', - }, - 'playlist_count': 3, - 'params': { - 'extract_flat': False, - 'skip_download': True, - } - }, - # MLB embed - { - 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', - 'md5': '96f09a37e44da40dd083e12d9a683327', - 'info_dict': { - 'id': '33322633', - 'ext': 'mp4', - 'title': 'Ump changes call to ball', - 'description': 'md5:71c11215384298a172a6dcb4c2e20685', - 'duration': 48, - 'timestamp': 1401537900, - 'upload_date': '20140531', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', - 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, - 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, - }, - }, - { - 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', - 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', - 'info_dict': { - 'id': 'uxjb0lwrcz', - 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1', - 'description': 'a Martin Fowler video from ThoughtWorks', - 'duration': 1715.0, - 'uploader': 'thoughtworks.wistia.com', - 'timestamp': 1401832161, - 'upload_date': '20140603', - }, - }, - # Wistia standard embed (async) - { - 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', - 'info_dict': { - 'id': '807fafadvk', - 'ext': 'mp4', - 'title': 'Drip Brennan Dunn Workshop', - 'description': 'a JV Webinars video from getdrip-1', - 'duration': 4986.95, - 'timestamp': 1463607249, - 'upload_date': '20160518', - }, - 'params': { - 'skip_download': True, - } - }, - # Soundcloud embed - { - 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', - 'info_dict': { - 'id': '174391317', - 'ext': 'mp3', - 'description': 'md5:ff867d6b555488ad3c52572bb33d432c', - 'uploader': 'Sophos Security', - 'title': 'Chet Chat 171 - Oct 29, 2014', - 'upload_date': '20141029', - } - }, - # Soundcloud multiple embeds - { - 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', - 'info_dict': { - 'id': '52809', - 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', - }, - 'playlist_mincount': 7, - }, - # TuneIn station embed - { - 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/', - 'info_dict': { - 'id': '204146', - 'ext': 'mp3', - 
'title': 'CNRV', - 'location': 'Paris, France', - 'is_live': True, - }, - 'params': { - # Live stream - 'skip_download': True, - }, - }, - # Livestream embed - { - 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', - 'info_dict': { - 'id': '67864563', - 'ext': 'flv', - 'upload_date': '20141112', - 'title': 'Rosetta #CometLanding webcast HL 10', - } - }, - # Another Livestream embed, without 'new.' in URL - { - 'url': 'https://www.freespeech.org/', - 'info_dict': { - 'id': '123537347', - 'ext': 'mp4', - 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - # Live stream - 'skip_download': True, - }, - }, - # LazyYT - { - 'url': 'https://skiplagged.com/', - 'info_dict': { - 'id': 'skiplagged', - 'title': 'Skiplagged: The smart way to find cheap flights', - }, - 'playlist_mincount': 1, - 'add_ie': ['Youtube'], - }, - # Cinchcast embed - { - 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', - 'info_dict': { - 'id': '7141703', - 'ext': 'mp3', - 'upload_date': '20141126', - 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', - } - }, - # Cinerama player - { - 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm', - 'info_dict': { - 'id': '730m_DandD_1901_512k', - 'ext': 'mp4', - 'uploader': 'www.abc.net.au', - 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', - } - }, - # embedded viddler video - { - 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597', - 'info_dict': { - 'id': '4d03aad9', - 'ext': 'mp4', - 'uploader': 'deadspin', - 'title': 'WALL-TO-GORTAT', - 'timestamp': 1422285291, - 'upload_date': '20150126', - }, - 'add_ie': ['Viddler'], - }, - # Libsyn embed - { - 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', - 'info_dict': { - 'id': '3377616', - 'ext': 'mp3', - 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': 'md5:601cb790edd05908957dae8aaa866465', - 'upload_date': '20150220', - }, - 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', - }, - # jwplayer YouTube - { - 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', - 'info_dict': { - 'id': 'Mrj4DVp2zeA', - 'ext': 'mp4', - 'upload_date': '20150212', - 'uploader': 'The National Archives UK', - 'description': 'md5:8078af856dca76edc42910b61273dbbf', - 'uploader_id': 'NationalArchives08', - 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', - }, - }, - # jwplayer rtmp - { - 'url': 'http://www.suffolk.edu/sjc/live.php', - 'info_dict': { - 'id': 'live', - 'ext': 'flv', - 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', - 'uploader': 'www.suffolk.edu', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', - }, - # Complex jwplayer - { - 'url': 'http://www.indiedb.com/games/king-machine/videos', - 'info_dict': { - 'id': 'videos', - 'ext': 'mp4', - 'title': 'king machine trailer 1', - 'description': 'Browse King Machine videos & audio for sweet media. 
Your eyes will thank you.', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - # JWPlayer config passed as variable - 'url': 'http://www.txxx.com/videos/3326530/ariele/', - 'info_dict': { - 'id': '3326530_hq', - 'ext': 'mp4', - 'title': 'ARIELE | Tube Cup', - 'uploader': 'www.txxx.com', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - } - }, - { - # JWPlatform iframe - 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', - 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', - 'info_dict': { - 'id': 'O0c5JcKT', - 'ext': 'mp4', - 'upload_date': '20171122', - 'timestamp': 1511366290, - 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, - { - # Video.js embed, multiple formats - 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', - 'info_dict': { - 'id': 'yygqldloqIk', - 'ext': 'mp4', - 'title': 'SolidWorks. Урок 6 Настройка чертежа', - 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', - 'upload_date': '20130314', - 'uploader': 'PROстое3D', - 'uploader_id': 'PROstoe3D', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Video.js embed, single format - 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', - 'info_dict': { - 'id': 'watch', - 'ext': 'mp4', - 'title': 'Step 1 - Good Foundation', - 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', - }, - 'params': { - 'skip_download': True, - }, - }, - # rtl.nl embed - { - 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'aanslagen-kopenhagen', - 'title': 'Aanslagen Kopenhagen', - } - }, - # Zapiks embed - { - 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', - 'info_dict': { - 'id': '118046', - 'ext': 'mp4', - 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', - } - }, - # Kaltura embed (different embed code) - { - 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', - 'info_dict': { - 'id': '1_a52wc67y', - 'ext': 'flv', - 'upload_date': '20150127', - 'uploader_id': 'PremierMedia', - 'timestamp': int, - 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? 
Conference 2014', - }, - }, - # Kaltura embed with single quotes - { - 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', - 'info_dict': { - 'id': '0_izeg5utt', - 'ext': 'mp4', - 'title': '35871', - 'timestamp': 1355743100, - 'upload_date': '20121217', - 'uploader_id': 'cplapp@learn360.com', - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura embedded via quoted entry_id - 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', - 'info_dict': { - 'id': '0_utuok90b', - 'ext': 'mp4', - 'title': '06_matthew_brender_raj_dutt', - 'timestamp': 1466638791, - 'upload_date': '20160622', - }, - 'add_ie': ['Kaltura'], - 'expected_warnings': [ - 'Could not send HEAD request' - ], - 'params': { - 'skip_download': True, - } - }, - { - # Kaltura embedded, some fileExt broken (#11480) - 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', - 'info_dict': { - 'id': '1_sgtvehim', - 'ext': 'mp4', - 'title': 'Our "Standard Models" of particle physics and cosmology', - 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', - 'timestamp': 1321158993, - 'upload_date': '20111113', - 'uploader_id': 'kps1', - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura iframe embed - 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/', - 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44', - 'info_dict': { - 'id': '0_f2cfbpwy', - 'ext': 'mp4', - 'title': 'I. M. Pei: A Centennial Celebration', - 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c', - 'upload_date': '20170403', - 'uploader_id': 'batchUser', - 'timestamp': 1491232186, - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura iframe embed, more sophisticated - 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', - 'info_dict': { - 'id': '1_9gzouybz', - 'ext': 'mp4', - 'title': 'lecture-05sep2017', - 'description': 'md5:40f347d91fd4ba047e511c5321064b49', - 'upload_date': '20170913', - 'uploader_id': 'eps2', - 'timestamp': 1505340777, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Kaltura'], - }, - { - # meta twitter:player - 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', - 'info_dict': { - 'id': '0_01b42zps', - 'ext': 'mp4', - 'title': 'Main Twerk (Video)', - 'upload_date': '20171208', - 'uploader_id': 'sebastian.salinas@thechive.com', - 'timestamp': 1512713057, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Kaltura'], - }, - # referrer protected EaglePlatform embed - { - 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', - 'info_dict': { - 'id': '582306', - 'ext': 'mp4', - 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 3382, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - # ClipYou (EaglePlatform) embed (custom URL) - { - 'url': 'http://muz-tv.ru/play/7129/', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used - 'info_dict': { - 'id': '12820', - 'ext': 'mp4', - 'title': "'O Sole Mio", - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 216, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is unavailable.', - }, - # Pladform embed - { - 'url': 'http://muz-tv.ru/kinozal/view/7400/', - 'info_dict': { - 'id': '100183293', - 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', - 'thumbnail': 
r're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, - }, - 'skip': 'HTTP Error 404: Not Found', - }, - # Playwire embed - { - 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html', - 'info_dict': { - 'id': '3519514', - 'ext': 'mp4', - 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', - 'thumbnail': r're:^https?://.*\.png$', - 'duration': 45.115, - }, - }, - # 5min embed - { - 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', - 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', - 'info_dict': { - 'id': '518726732', - 'ext': 'mp4', - 'title': 'Facebook Creates "On This Day" | Crunch Report', - 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', - 'timestamp': 1427237531, - 'uploader': 'Crunch Report', - 'upload_date': '20150324', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # Crooks and Liars embed - { - 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', - 'info_dict': { - 'id': '8RUoRhRi', - 'ext': 'mp4', - 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!", - 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', - 'timestamp': 1428207000, - 'upload_date': '20150405', - 'uploader': 'Heather', - }, - }, - # Crooks and Liars external embed - { - 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/', - 'info_dict': { - 'id': 'MTE3MjUtMzQ2MzA', - 'ext': 'mp4', - 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5', - 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec', - 'timestamp': 1265032391, - 'upload_date': '20100201', - 'uploader': 'Heather', - }, - }, - # NBC Sports vplayer embed - { - 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', - 'info_dict': { - 'id': 'ln7x1qSThw4k', - 'ext': 'flv', - 'title': "PFT Live: New leader in the 'new-look' defense", - 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', - 'uploader': 'NBCU-SPORTS', - 'upload_date': '20140107', - 'timestamp': 1389118457, - }, - 'skip': 'Invalid Page URL', - }, - # NBC News embed - { - 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', - 'md5': '1aa589c675898ae6d37a17913cf68d66', - 'info_dict': { - 'id': 'x_dtl_oa_LettermanliftPR_160608', - 'ext': 'mp4', - 'title': 'David Letterman: A Preview', - 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. 
Airs Sunday June 12 at 7/6c.', - 'upload_date': '20160609', - 'timestamp': 1465431544, - 'uploader': 'NBCU-NEWS', - }, - }, - # UDN embed - { - 'url': 'https://video.udn.com/news/300346', - 'md5': 'fd2060e988c326991037b9aff9df21a6', - 'info_dict': { - 'id': '300346', - 'ext': 'mp4', - 'title': '中一中男師變性 全校師生力挺', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON Expecting value'], - }, - # Brightcove URL in single quotes - { - 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', - 'md5': '4ae374f1f8b91c889c4b9203c8c752af', - 'info_dict': { - 'id': '4255764656001', - 'ext': 'mp4', - 'title': 'SN Presents: Russell Martin, World Citizen', - 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', - 'uploader': 'Rogers Sportsnet', - 'uploader_id': '1704050871', - 'upload_date': '20150525', - 'timestamp': 1432570283, - }, - }, - # Kinja embed - { - 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', - 'info_dict': { - 'id': '106351', - 'ext': 'mp4', - 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', - 'description': 'Migrated from OnionStudios', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'clickhole', - 'upload_date': '20150527', - 'timestamp': 1432744860, - } - }, - # SnagFilms embed - { - 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', - 'info_dict': { - 'id': '74849a00-85a9-11e1-9660-123139220831', - 'ext': 'mp4', - 'title': '#whilewewatch', - } - }, - # AdobeTVVideo embed - { - 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', - 'md5': '43662b577c018ad707a63766462b1e87', - 'info_dict': { - 'id': '2456', - 'ext': 'mp4', - 'title': 'New experience with Acrobat DC', - 'description': 'New experience with Acrobat DC', - 'duration': 248.667, - }, - }, - # BrightcoveInPageEmbed embed - { - 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', - 'info_dict': { - 'id': '4238694884001', - 'ext': 'flv', - 'title': 'Tabletop: Dread, Last Thoughts', - 'description': 'Tabletop: Dread, Last Thoughts', - 'duration': 51690, - }, - }, - # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' - # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm - { - 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', - 'info_dict': { - 'id': '4785848093001', - 'ext': 'mp4', - 'title': 'The Cardinal Pell Interview', - 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. 
', - 'uploader': 'GlobeCast Australia - GlobeStream', - 'uploader_id': '2733773828001', - 'upload_date': '20160304', - 'timestamp': 1457083087, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - { - # Brightcove embed with whitespace around attribute names - 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', - 'info_dict': { - 'id': '3167554373001', - 'ext': 'mp4', - 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", - 'description': 'md5:57bacb0e0f29349de4972bfda3191713', - 'uploader_id': '1079349493', - 'upload_date': '20140207', - 'timestamp': 1391810548, - }, - 'params': { - 'skip_download': True, - }, - }, - # Another form of arte.tv embed - { - 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', - 'md5': '850bfe45417ddf221288c88a0cffe2e2', - 'info_dict': { - 'id': '030273-562_PLUS7-F', - 'ext': 'mp4', - 'title': 'ARTE Reportage - Nulle part, en France', - 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', - 'upload_date': '20160409', - }, - }, - # Duplicated embedded video URLs - { - 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', - 'info_dict': { - 'id': '149298443_480_16c25b74_2', - 'ext': 'mp4', - 'title': 'vs. Blue Orange Spring Game', - 'uploader': 'www.hudl.com', - }, - }, - # twitter:player:stream embed - { - 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', - 'info_dict': { - 'id': 'master', - 'ext': 'mp4', - 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', - 'uploader': 'www.rtl.be', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - # twitter:player embed - { - 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', - 'md5': 'a3e0df96369831de324f0778e126653c', - 'info_dict': { - 'id': '4909620399001', - 'ext': 'mp4', - 'title': 'What Do Black Holes Sound Like?', - 'description': 'what do black holes sound like', - 'upload_date': '20160524', - 'uploader_id': '29913724001', - 'timestamp': 1464107587, - 'uploader': 'TheAtlantic', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - # Facebook <iframe> embed - { - 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', - 'md5': 'fbcde74f534176ecb015849146dd3aee', - 'info_dict': { - 'id': '599637780109885', - 'ext': 'mp4', - 'title': 'Facebook video #599637780109885', - }, - }, - # Facebook <iframe> embed, plugin video - { - 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', - 'info_dict': { - 'id': '1754168231264132', - 'ext': 'mp4', - 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', - 'uploader': 'Tariq Ramadan (official)', - 'timestamp': 1496758379, - 'upload_date': '20170606', - }, - 'params': { - 'skip_download': True, - }, - }, - # Facebook API embed - { - 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', - 'md5': 'a47372ee61b39a7b90287094d447d94e', - 'info_dict': { - 'id': '10153467542406923', - 'ext': 'mp4', - 'title': 'Facebook video #10153467542406923', - }, - }, - # Wordpress "YouTube Video Importer" plugin - { - 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', - 'md5': 'd16797741b560b485194eddda8121b48', - 'info_dict': { - 'id': 'HNTXWDXV9Is', - 'ext': 'mp4', - 'title': 'Blue Devils Drumline Stanford lot 2016', - 'upload_date': 
'20160627', - 'uploader_id': 'GENOCIDE8GENERAL10', - 'uploader': 'cylus cyrus', - }, - }, - { - # video stored on custom kaltura server - 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', - 'md5': '537617d06e64dfed891fa1593c4b30cc', - 'info_dict': { - 'id': '0_1iotm5bh', - 'ext': 'mp4', - 'title': 'Elecciones británicas: 5 lecciones para Rajoy', - 'description': 'md5:435a89d68b9760b92ce67ed227055f16', - 'uploader_id': 'videos.expansion@el-mundo.net', - 'upload_date': '20150429', - 'timestamp': 1430303472, - }, - 'add_ie': ['Kaltura'], - }, - { - # multiple kaltura embeds, nsfw - 'url': 'https://www.quartier-rouge.be/prive/femmes/kamila-avec-video-jaime-sadomie.html', - 'info_dict': { - 'id': 'kamila-avec-video-jaime-sadomie', - 'title': "Kamila avec vídeo “J'aime sadomie”", - }, - 'playlist_count': 8, - }, - { - # Non-standard Vimeo embed - 'url': 'https://openclassrooms.com/courses/understanding-the-web', - 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', - 'info_dict': { - 'id': '148867247', - 'ext': 'mp4', - 'title': 'Understanding the web - Teaser', - 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', - 'upload_date': '20151214', - 'uploader': 'OpenClassrooms', - 'uploader_id': 'openclassrooms', - }, - 'add_ie': ['Vimeo'], - }, - { - # generic vimeo embed that requires original URL passed as Referer - 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/', - 'only_matching': True, - }, - { - 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', - 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', - 'info_dict': { - 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', - 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': 'Royalty free test video', - 'timestamp': 1432816365, - 'upload_date': '20150528', - 'is_live': False, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [ArkenaIE.ie_key()], - }, - { - 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', - 'info_dict': { - 'id': '1c7141f46c', - 'ext': 'mp4', - 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [Vbox7IE.ie_key()], - }, - { - # DBTV embeds - 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', - 'info_dict': { - 'id': '43254897', - 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', - }, - 'playlist_mincount': 3, - }, - { - # Videa embeds - 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html', - 'info_dict': { - 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style', - 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. 
Style - DVD Talk Forum', - }, - 'playlist_mincount': 2, - }, - { - # 20 minuten embed - 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', - 'info_dict': { - 'id': '523629', - 'ext': 'mp4', - 'title': 'So kommen Sie bei Eis und Schnee sicher an', - 'description': 'md5:117c212f64b25e3d95747e5276863f7d', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [TwentyMinutenIE.ie_key()], - }, - { - # VideoPress embed - 'url': 'https://en.support.wordpress.com/videopress/', - 'info_dict': { - 'id': 'OcobLTqC', - 'ext': 'm4v', - 'title': 'IMG_5786', - 'timestamp': 1435711927, - 'upload_date': '20150701', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [VideoPressIE.ie_key()], - }, - { - # Rutube embed - 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', - 'info_dict': { - 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', - 'ext': 'flv', - 'title': 'Магаззино: Казань 2', - 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', - 'uploader': 'Магаззино', - 'upload_date': '20170228', - 'uploader_id': '996642', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [RutubeIE.ie_key()], - }, - { - # ThePlatform embedded with whitespaces in URLs - 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', - 'only_matching': True, - }, - { - # Senate ISVP iframe https - 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', - 'md5': 'fb8c70b0b515e5037981a2492099aab8', - 'info_dict': { - 'id': 'govtaff020316', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - }, - 'add_ie': [SenateISVPIE.ie_key()], - }, - { - # Limelight embeds (1 channel embed + 4 media embeds) - 'url': 'http://www.sedona.com/FacilitatorTraining2017', - 'info_dict': { - 'id': 'FacilitatorTraining2017', - 'title': 'Facilitator Training 2017', - }, - 'playlist_mincount': 5, - }, - { - # Limelight embed (LimelightPlayerUtil.embed) - 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', - 'info_dict': { - 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', - 'ext': 'mp4', - 'title': '07448641', - 'timestamp': 1499890639, - 'upload_date': '20170712', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['LimelightMedia'], - }, - { - 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', - 'info_dict': { - 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', - 'title': 'Standoff with Walnut Creek murder suspect ends', - 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', - }, - 'playlist_mincount': 4, - }, - { - # WashingtonPost embed - 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', - 'info_dict': { - 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', - 'ext': 'mp4', - 'title': "No one has seen the drama series based on Trump's life \u2014 until now", - 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. 
But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', - 'timestamp': 1455216756, - 'uploader': 'The Washington Post', - 'upload_date': '20160211', - }, - 'add_ie': [WashingtonPostIE.ie_key()], - }, - { - # Mediaset embed - 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', - 'info_dict': { - 'id': '720642', - 'ext': 'mp4', - 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [MediasetIE.ie_key()], - }, - { - # JOJ.sk embeds - 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', - 'info_dict': { - 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', - 'title': 'Slovenskom sa prehnala vlna silných búrok', - }, - 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], - }, - { - # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) - 'url': 'https://tvrain.ru/amp/418921/', - 'md5': 'cc00413936695987e8de148b67d14f1d', - 'info_dict': { - 'id': '418921', - 'ext': 'mp4', - 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', - }, - }, - { - # vzaar embed - 'url': 'http://help.vzaar.com/article/165-embedding-video', - 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', - 'info_dict': { - 'id': '8707641', - 'ext': 'mp4', - 'title': 'Building A Business Online: Principal Chairs Q & A', - }, - }, - { - # multiple HTML5 videos on one page - 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', - 'info_dict': { - 'id': 'keyscenarios', - 'title': 'Rescue Kit 14 Free Edition - Getting started', - }, - 'playlist_count': 4, - }, - { - # vshare embed - 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', - 'md5': '17b39f55b5497ae8b59f5fbce8e35886', - 'info_dict': { - 'id': '0f64ce6', - 'title': 'vl14062007715967', - 'ext': 'mp4', - } - }, - { - 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', - 'md5': 'aecd089f55b1cb5a59032cb049d3a356', - 'info_dict': { - 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', - 'ext': 'mp4', - 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', - 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', - 'timestamp': 1474354800, - 'upload_date': '20160920', - } - }, - { - 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', - 'info_dict': { - 'id': '1731611', - 'ext': 'mp4', - 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', - 'description': 'md5:eb5f23826a027ba95277d105f248b825', - 'timestamp': 1516100691, - 'upload_date': '20180116', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [SpringboardPlatformIE.ie_key()], - }, - { - 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', - 'info_dict': { - 'id': 'vMDE4NzI1Mjgt690b', - 'ext': 'mp4', - 'title': 'Котята', - }, - 'add_ie': [YapFilesIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # CloudflareStream embed - 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', - 'info_dict': { - 'id': '31c9291ab41fac05471db4e73aa11717', - 'ext': 'mp4', - 'title': '31c9291ab41fac05471db4e73aa11717', - }, - 'add_ie': [CloudflareStreamIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # PeerTube embed - 'url': 'https://joinpeertube.org/fr/home/', - 'info_dict': { - 'id': 
'home', - 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', - }, - 'playlist_count': 2, - }, - { - # Indavideo embed - 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', - 'info_dict': { - 'id': '1693903', - 'ext': 'mp4', - 'title': 'Így kell otthon hamburgert sütni', - 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', - 'timestamp': 1426330212, - 'upload_date': '20150314', - 'uploader': 'StreetKitchen', - 'uploader_id': '546363', - }, - 'add_ie': [IndavideoEmbedIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # APA embed via JWPlatform embed - 'url': 'http://www.vol.at/blue-man-group/5593454', - 'info_dict': { - 'id': 'jjv85FdZ', - 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://share-videos.se/auto/video/83645793?uid=13', - 'md5': 'b68d276de422ab07ee1d49388103f457', - 'info_dict': { - 'id': '83645793', - 'title': 'Lock up and get excited', - 'ext': 'mp4' - }, - 'skip': 'TODO: fix nested playlists processing in tests', - }, - { - # Viqeo embeds - 'url': 'https://viqeo.tv/', - 'info_dict': { - 'id': 'viqeo', - 'title': 'All-new video platform', - }, - 'playlist_count': 6, - }, - { - # Squarespace video embed, 2019-08-28 - 'url': 'http://ootboxford.com', - 'info_dict': { - 'id': 'Tc7b_JGdZfw', - 'title': 'Out of the Blue, at Childish Things 10', - 'ext': 'mp4', - 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', - 'uploader_id': 'helendouglashouse', - 'uploader': 'Helen & Douglas House', - 'upload_date': '20140328', - }, - 'params': { - 'skip_download': True, - }, - }, - # { - # # Zype embed - # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', - # 'info_dict': { - # 'id': '5b400b834b32992a310622b9', - # 'ext': 'mp4', - # 'title': 'Smoky Barbecue Favorites', - # 'thumbnail': r're:^https?://.*\.jpe?g', - # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - # 'upload_date': '20170909', - # 'timestamp': 1504915200, - # }, - # 'add_ie': [ZypeIE.ie_key()], - # 'params': { - # 'skip_download': True, - # }, - # }, - { - # videojs embed - 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', - 'info_dict': { - 'id': 'shell', - 'ext': 'mp4', - 'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано', - 'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download MPD manifest'], - }, - { - # DailyMotion embed with DM.player - 'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804', - 'info_dict': { - 'id': 'k6aKkGHd9FJs4mtJN39', - 'ext': 'mp4', - 'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final', - 'description': 'This video is private.', - 'uploader_id': 'x1jf30l', - 'uploader': 'beIN SPORTS USA', - 'upload_date': '20190528', - 'timestamp': 1559062971, - }, - 'params': { - 'skip_download': True, - }, - }, - # { - # # TODO: find another test - # # http://schema.org/VideoObject - # 'url': 'https://flipagram.com/f/nyvTSJMKId', - # 'md5': '888dcf08b7ea671381f00fab74692755', - # 'info_dict': { - # 'id': 'nyvTSJMKId', - # 'ext': 'mp4', - # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One 
Direction', - # 'description': '#love for cats.', - # 'timestamp': 1461244995, - # 'upload_date': '20160421', - # }, - # 'params': { - # 'force_generic_extractor': True, - # }, - # }, - { - # VHX Embed - 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', - 'info_dict': { - 'id': '858208', - 'ext': 'mp4', - 'title': 'Untitled', - 'uploader_id': 'user80538407', - 'uploader': 'OTT Videos', - }, - }, - { - # ArcPublishing PoWa video player - 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', - 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', - 'info_dict': { - 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', - 'ext': 'mp4', - 'title': 'Senate candidates wave to voters on Anchorage streets', - 'description': 'md5:91f51a6511f090617353dc720318b20e', - 'timestamp': 1604378735, - 'upload_date': '20201103', - 'duration': 1581, - }, - }, - { - # MyChannels SDK embed - # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen - 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', - 'md5': '90c0699c37006ef18e198c032d81739c', - 'info_dict': { - 'id': '194165', - 'ext': 'mp4', - 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', - 'timestamp': 1611740340, - 'upload_date': '20210127', - 'duration': 159, - }, - }, - { - # Simplecast player embed - 'url': 'https://www.bio.org/podcast', - 'info_dict': { - 'id': 'podcast', - 'title': 'I AM BIO Podcast | BIO', - }, - 'playlist_mincount': 52, - }, - { - # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) - 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', - 'only_matching': True, - }, - ] - - def report_following_redirect(self, new_url): - """Report information extraction.""" - self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - - def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - - NS_MAP = { - 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', - } - - entries = [] - for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - - if not next_url: - continue - - def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None - - entries.append({ - '_type': 'url_transparent', - 'url': next_url, - 'title': it.find('title').text, - 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': int_or_none(duration) or parse_duration(duration), - 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), - 'episode': itunes('title'), - 'episode_number': int_or_none(itunes('episode')), - 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, - }) - - return { - '_type': 'playlist', - 'id': url, - 'title': playlist_title, - 'description': 
playlist_desc, - 'entries': entries, - } - - def _extract_camtasia(self, url, video_id, webpage): - """ Returns None if no camtasia video can be found. """ - - camtasia_cfg = self._search_regex( - r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', - webpage, 'camtasia configuration file', default=None) - if camtasia_cfg is None: - return None - - title = self._html_search_meta('DC.title', webpage, fatal=True) - - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) - camtasia_cfg = self._download_xml( - camtasia_url, video_id, - note='Downloading camtasia configuration', - errnote='Failed to download camtasia configuration') - fileset_node = camtasia_cfg.find('./playlist/array/fileset') - - entries = [] - for n in fileset_node.getchildren(): - url_n = n.find('./uri') - if url_n is None: - continue - - entries.append({ - 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], - 'title': '%s - %s' % (title, n.tag), - 'url': compat_urlparse.urljoin(url, url_n.text), - 'duration': float_or_none(n.find('./duration').text), - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': title, - } - - def _real_extract(self, url): - if url.startswith('//'): - return self.url_result(self.http_scheme() + url) - - parsed_url = compat_urlparse.urlparse(url) - if not parsed_url.scheme: - default_search = self._downloader.params.get('default_search') - if default_search is None: - default_search = 'fixup_error' - - if default_search in ('auto', 'auto_warning', 'fixup_error'): - if re.match(r'^[^\s/]+\.[^\s/]+/', url): - self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') - return self.url_result('http://' + url) - elif default_search != 'fixup_error': - if default_search == 'auto_warning': - if re.match(r'^(?:url|URL)$', url): - raise ExtractorError( - 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, - expected=True) - else: - self._downloader.report_warning( - 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) - return self.url_result('ytsearch:' + url) - - if default_search in ('error', 'fixup_error'): - raise ExtractorError( - '%r is not a valid URL. 
'
-                    'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
-                    % (url, url), expected=True)
-            else:
-                if ':' not in default_search:
-                    default_search += ':'
-                return self.url_result(default_search + url)
-
-        url, smuggled_data = unsmuggle_url(url)
-        force_videoid = None
-        is_intentional = smuggled_data and smuggled_data.get('to_generic')
-        if smuggled_data and 'force_videoid' in smuggled_data:
-            force_videoid = smuggled_data['force_videoid']
-            video_id = force_videoid
-        else:
-            video_id = self._generic_id(url)
-
-        self.to_screen('%s: Requesting header' % video_id)
-
-        head_req = HEADRequest(url)
-        head_response = self._request_webpage(
-            head_req, video_id,
-            note=False, errnote='Could not send HEAD request to %s' % url,
-            fatal=False)
-
-        if head_response is not False:
-            # Check for redirect
-            new_url = head_response.geturl()
-            if url != new_url:
-                self.report_following_redirect(new_url)
-                if force_videoid:
-                    new_url = smuggle_url(
-                        new_url, {'force_videoid': force_videoid})
-                return self.url_result(new_url)
-
-        full_response = None
-        if head_response is False:
-            request = sanitized_Request(url)
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
-            head_response = full_response
-
-        info_dict = {
-            'id': video_id,
-            'title': self._generic_title(url),
-            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
-        }
-
-        # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '').lower()
-        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
-        if m:
-            format_id = compat_str(m.group('format_id'))
-            if format_id.endswith('mpegurl'):
-                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
-            elif format_id == 'f4m':
-                formats = self._extract_f4m_formats(url, video_id)
-            else:
-                formats = [{
-                    'format_id': format_id,
-                    'url': url,
-                    'vcodec': 'none' if m.group('type') == 'audio' else None
-                }]
-                info_dict['direct'] = True
-            self._sort_formats(formats)
-            info_dict['formats'] = formats
-            return info_dict
-
-        if not self._downloader.params.get('test', False) and not is_intentional:
-            force = self._downloader.params.get('force_generic_extractor', False)
-            self._downloader.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
-
-        if not full_response:
-            request = sanitized_Request(url)
-            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
-            # making it impossible to download only a chunk of the file (yet we need only the
-            # first 512 bytes to test whether it's HTML or not). With youtube-dl's default
-            # Accept-Encoding this would always result in downloading the whole file, which is
-            # not desirable. Therefore, for the extraction pass, we have to override
-            # Accept-Encoding to any encoding in order to accept raw bytes and be able to
-            # download only a chunk. It would probably be better to solve this by checking
-            # Content-Type for application/octet-stream after the HEAD request finishes,
-            # but it is not clear whether that can be relied upon.
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
-
-        first_bytes = full_response.read(512)
-
-        # Is it an M3U playlist?
-        if first_bytes.startswith(b'#EXTM3U'):
-            info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
-            self._sort_formats(info_dict['formats'])
-            return info_dict
-
-        # Maybe it's a direct link to a video?
-        # Be careful not to download the whole thing!
-        if not is_html(first_bytes):
-            self._downloader.report_warning(
-                'URL could be a direct video link, returning it as such.')
-            info_dict.update({
-                'direct': True,
-                'url': url,
-            })
-            return info_dict
-
-        webpage = self._webpage_read_content(
-            full_response, url, video_id, prefix=first_bytes)
-
-        if '<title>DPG Media Privacy Gate</title>' in webpage:
-            webpage = self._download_webpage(url, video_id)
-
-        self.report_extraction(video_id)
-
-        # Is it an RSS feed, a SMIL file, an XSPF playlist or an MPD manifest?
-        try:
-            doc = compat_etree_fromstring(webpage.encode('utf-8'))
-            if doc.tag == 'rss':
-                return self._extract_rss(url, video_id, doc)
-            elif doc.tag == 'SmoothStreamingMedia':
-                info_dict['formats'] = self._parse_ism_formats(doc, url)
-                self._sort_formats(info_dict['formats'])
-                return info_dict
-            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
-                smil = self._parse_smil(doc, url, video_id)
-                self._sort_formats(smil['formats'])
-                return smil
-            elif doc.tag == '{http://xspf.org/ns/0/}playlist':
-                return self.playlist_result(
-                    self._parse_xspf(
-                        doc, video_id, xspf_url=url,
-                        xspf_base_url=full_response.geturl()),
-                    video_id)
-            elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
-                info_dict['formats'] = self._parse_mpd_formats(
-                    doc,
-                    mpd_base_url=full_response.geturl().rpartition('/')[0],
-                    mpd_url=url)
-                self._sort_formats(info_dict['formats'])
-                return info_dict
-            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
-                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
-                self._sort_formats(info_dict['formats'])
-                return info_dict
-        except compat_xml_parse_error:
-            pass
-
-        # Is it a Camtasia project?
-        camtasia_res = self._extract_camtasia(url, video_id, webpage)
-        if camtasia_res is not None:
-            return camtasia_res
-
-        # Sometimes an embedded video player is hidden behind percent encoding
-        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
-        # Unescaping the whole page makes it possible to handle those cases in a generic way
-        # FIXME: unescaping the whole page may break URLs, commenting out for now.
-        # There should probably be a second run of the generic extractor on the unescaped webpage.
-        # webpage = compat_urllib_parse_unquote(webpage)
-
-        # Unescape squarespace embeds so they can be detected by the generic extractor,
-        # see https://github.com/ytdl-org/youtube-dl/issues/21294
-        webpage = re.sub(
-            r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
-            lambda x: unescapeHTML(x.group(0)), webpage)
-
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = self._og_search_title(
-            webpage, default=None) or self._html_search_regex(
-            r'(?s)<title>(.*?)</title>', webpage, 'video title',
-            default='video')
-
-        # Try to detect age limit automatically
-        age_limit = self._rta_search(webpage)
-        # And then there are the jokers who advertise that they use RTA,
-        # but actually don't.
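For context: `_rta_search()` is defined in `common.py` (outside this hunk) and essentially looks for the official rtalabel.org meta tag, while the `AGE_LIMIT_MARKERS` list just below catches pages that merely advertise the label in prose. A minimal standalone sketch of the meta-tag check, assuming the canonical RTA content string:

import re

# Sketch only; the exact regex lives in InfoExtractor._rta_search(). The
# content string below is the canonical RTA label documented at rtalabel.org.
RTA_META_RE = r'(?i)<meta\s+name="rating"\s+content="RTA-5042-1996-1400-1577-RTA"'

def rta_age_limit(webpage):
    # Pages carrying the official label are marked adult (18+)
    return 18 if re.search(RTA_META_RE, webpage) else 0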
- AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', - ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 - - # video uploader is domain name - video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - - video_description = self._og_search_description(webpage, default=None) - video_thumbnail = self._og_search_thumbnail(webpage, default=None) - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit, - }) - - # Look for Brightcove Legacy Studio embeds - bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) - if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': video_title, - 'id': video_id, - 'entries': entries, - } - - # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(self, webpage) - if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') - - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - arc_urls = ArcPublishingIE._extract_urls(webpage) - if arc_urls: - return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) - - mychannels_urls = MedialaanIE._extract_urls(webpage) - if mychannels_urls: - return self.playlist_from_matches( - mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vhx_url = VHXEmbedIE._extract_url(webpage) - if vhx_url: - return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - 
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for Teachable embeds, must be before Wistia - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - - # Look for embedded Wistia player - wistia_urls = WistiaIE._extract_urls(webpage) - if wistia_urls: - playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) - for entry in playlist['entries']: - entry.update({ - '_type': 'url_transparent', - 'uploader': video_uploader, - }) - return playlist - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Look for embedded Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe 
.*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') - - # Look for embedded Odnoklassniki player - odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) - if odnoklassniki_url: - return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - - # Look for sibnet embedded player - sibnet_urls = VKIE._extract_sibnet_urls(webpage) - if sibnet_urls: - return self.playlist_from_matches(sibnet_urls, video_id, video_title) - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for Simplecast embeds - simplecast_urls = SimplecastIE._extract_urls(webpage) - if simplecast_urls: - return self.playlist_from_matches( - simplecast_urls, video_id, video_title) - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for 
embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - - # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Mofosex player - mofosex_urls = MofosexEmbedIE._extract_urls(webpage) - if mofosex_urls: - return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) - - # Look for embedded Spankwire player - spankwire_urls = SpankwireIE._extract_urls(webpage) - if spankwire_urls: - return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) - - # Look for embedded YouPorn player - youporn_urls = YouPornIE._extract_urls(webpage) - if youporn_urls: - return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'TED') - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - arte_urls = ArteTVEmbedIE._extract_urls(webpage) - if arte_urls: - return self.playlist_from_matches(arte_urls, video_id, video_title) - - # Look for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - 
(["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for 5min embeds - mobj = re.search( - r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) - if mobj is not None: - return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = 
re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for Kinja embeds - kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) - if kinja_embed_urls: - return self.playlist_from_matches( - kinja_embed_urls, video_id, video_title) - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// - (?: - admin\.mangomolo\.com/analytics/index\.php/customers/embed| - player\.mangomolo\.com/v1 - )/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - 
(?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(self, webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return 
self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - viqeo_urls = ViqeoIE._extract_urls(webpage) - if viqeo_urls: - return self.playlist_from_matches( - viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) - - expressen_urls = ExpressenIE._extract_urls(webpage) - if expressen_urls: - return self.playlist_from_matches( - expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) - - zype_urls = ZypeIE._extract_urls(webpage) - if zype_urls: - return self.playlist_from_matches( - zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) - - # Look for HTML5 media - entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': '%s-%s' % (video_id, num), - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) - - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - if jwplayer_data: - try: - info = 
self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=url) - return merge_dicts(info, info_dict) - except ExtractorError: - # See https://github.com/ytdl-org/youtube-dl/pull/16735 - pass - - # Video.js embed - mobj = re.search( - r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', - webpage) - if mobj is not None: - sources = self._parse_json( - mobj.group(1), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] - formats = [] - for source in sources: - src = source.get('src') - if not src or not isinstance(src, compat_str): - continue - src = compat_urlparse.urljoin(url, src) - src_type = source.get('type') - if isinstance(src_type, compat_str): - src_type = src_type.lower() - ext = determine_ext(src).lower() - if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) - if src_type == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - elif src_type == 'application/x-mpegurl' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': src, - 'ext': (mimetype2ext(src_type) - or ext if ext in KNOWN_EXTENSIONS else 'mp4'), - 'http_headers': { - 'Referer': full_response.geturl(), - }, - }) - if formats: - self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict - - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) - - def check_video(vurl): - if YoutubeIE.suitable(vurl): - return True - if RtmpIE.suitable(vurl): - return True - vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') - - def filter_video(urls): - return list(filter(check_video, urls)) - - # Start with something easy: JW Player in SWFObject - found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: - # Look for gorilla-vid style embedding - found = filter_video(re.findall(r'''(?sx) - (?: - jw_plugins| - JWPlayerOptions| - jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup - ) - .*? - ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) - if not found: - # Broaden the search a little bit - found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) - if not found: - # Broaden the findall a little bit: JWPlayer JS loader - found = filter_video(re.findall( - r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) - if not found: - # Flow player - found = filter_video(re.findall(r'''(?xs) - flowplayer\("[^"]+",\s* - \{[^}]+?\}\s*, - \s*\{[^}]+? 
["']?clip["']?\s*:\s*\{\s* - ["']?url["']?\s*:\s*["']([^"']+)["'] - ''', webpage)) - if not found: - # Cinerama player - found = re.findall( - r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) - if not found: - # Try to find twitter cards info - # twitter:player:stream should be checked before twitter:player since - # it is expected to contain a raw stream (see - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) - found = filter_video(re.findall( - r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) - if not found: - # We look for Open Graph info: - # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) - m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) - # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: - if m_video_type is not None: - found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage)) - if not found: - REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' - found = re.search( - r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, - webpage) - if not found: - # Look also in Refresh HTTP header - refresh_header = head_response.headers.get('Refresh') - if refresh_header: - # In python 2 response HTTP headers are bytestrings - if sys.version_info < (3, 0) and isinstance(refresh_header, str): - refresh_header = refresh_header.decode('iso-8859-1') - found = re.search(REDIRECT_REGEX, refresh_header) - if found: - new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) - if new_url != url: - self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } - else: - found = None - - if not found: - # twitter:player is a https URL to iframe player that may or may not - # be supported by youtube-dl thus this is checked the very last (see - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) - embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url and embed_url != url: - return self.url_result(embed_url) - - if not found: - raise UnsupportedError(url) - - entries = [] - for video_url in orderedSet(found): - video_url = unescapeHTML(video_url) - video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) - - # Sometimes, jwplayer extraction will result in a YouTube URL - if YoutubeIE.suitable(video_url): - entries.append(self.url_result(video_url, 'Youtube')) - continue - - # here's a fun little line of code for you: - video_id = os.path.splitext(video_id)[0] - - entry_info_dict = { - 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, - } - - if RtmpIE.suitable(video_url): - entry_info_dict.update({ - '_type': 'url_transparent', - 'ie_key': RtmpIE.ie_key(), - 'url': video_url, - }) - entries.append(entry_info_dict) - continue - - ext = determine_ext(video_url) - if ext == 'smil': - entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) - elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) - elif ext == 'm3u8': - entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') - elif ext == 'mpd': - 
entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) - elif ext == 'f4m': - entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) - elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: - # Just matching .ism/manifest is not enough to be reliably sure - # whether it's actually an ISM manifest or some other streaming - # manifest since there are various streaming URL formats - # possible (see [1]) as well as some other shenanigans like - # .smil/manifest URLs that actually serve an ISM (see [2]) and - # so on. - # Thus the most reasonable way to solve this is to delegate - # to generic extractor in order to look into the contents of - # the manifest itself. - # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats - # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest - entry_info_dict = self.url_result( - smuggle_url(video_url, {'to_generic': True}), - GenericIE.ie_key()) - else: - entry_info_dict['url'] = video_url - - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) - - entries.append(entry_info_dict) - - if len(entries) == 1: - return entries[0] - else: - for num, e in enumerate(entries, start=1): - # 'url' results don't have a title - if e.get('title') is not None: - e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py deleted file mode 100644 index c6477958d..000000000 --- a/youtube_dl/extractor/giantbomb.py +++ /dev/null @@ -1,90 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - qualities, - unescapeHTML, -) - - -class GiantBombIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/(?:videos|shows)/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)' - _TESTS = [{ - 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/', - 'md5': '132f5a803e7e0ab0e274d84bda1e77ae', - 'info_dict': { - 'id': '2300-9782', - 'display_id': 'quick-look-destiny-the-dark-below', - 'ext': 'mp4', - 'title': 'Quick Look: Destiny: The Dark Below', - 'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24', - 'duration': 2399, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'https://www.giantbomb.com/shows/ben-stranding/2970-20212', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - video = json.loads(unescapeHTML(self._search_regex( - r'data-video="([^"]+)"', webpage, 'data-video'))) - - duration = int_or_none(video.get('lengthSeconds')) - - quality = qualities([ - 'f4m_low', 'progressive_low', 'f4m_high', - 'progressive_high', 'f4m_hd', 'progressive_hd']) - - formats = [] - for format_id, video_url in video['videoStreams'].items(): - if format_id == 'f4m_stream': - continue - ext = determine_ext(video_url) - if ext == 'f4m': - f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id) - if f4m_formats: - f4m_formats[0]['quality'] = quality(format_id) - formats.extend(f4m_formats) - 
elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, display_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'quality': quality(format_id), - }) - - if not formats: - youtube_id = video.get('youtubeID') - if youtube_id: - return self.url_result(youtube_id, 'Youtube') - - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py deleted file mode 100644 index 60d842d3a..000000000 --- a/youtube_dl/extractor/globo.py +++ /dev/null @@ -1,240 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import hashlib -import json -import random -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - orderedSet, - str_or_none, -) - - -class GloboIE(InfoExtractor): - _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' - _NETRC_MACHINE = 'globo' - _TESTS = [{ - 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', - 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', - 'info_dict': { - 'id': '3607726', - 'ext': 'mp4', - 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', - 'duration': 103.204, - 'uploader': 'Globo.com', - 'uploader_id': '265', - }, - }, { - 'url': 'http://globoplay.globo.com/v/4581987/', - 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff', - 'info_dict': { - 'id': '4581987', - 'ext': 'mp4', - 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', - 'duration': 137.973, - 'uploader': 'Rede Globo', - 'uploader_id': '196', - }, - }, { - 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', - 'only_matching': True, - }, { - 'url': 'http://globosatplay.globo.com/globonews/v/4472924/', - 'only_matching': True, - }, { - 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/', - 'only_matching': True, - }, { - 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'only_matching': True, - }, { - 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', - 'only_matching': True, - }, { - 'url': 'globo:3607726', - 'only_matching': True, - }] - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - - try: - glb_id = (self._download_json( - 'https://login.globo.com/api/authentication', None, data=json.dumps({ - 'payload': { - 'email': email, - 'password': password, - 'serviceId': 4654, - }, - }).encode(), headers={ - 'Content-Type': 'application/json; charset=utf-8', - }) or {}).get('glbId') - if glb_id: - self._set_cookie('.globo.com', 'GLBID', glb_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - resp = self._parse_json(e.cause.read(), None) - raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) - raise - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 
'http://api.globovideos.com/videos/%s/playlist' % video_id, - video_id)['videos'][0] - if video.get('encrypted') is True: - raise ExtractorError('This video is DRM protected.', expected=True) - - title = video['title'] - - formats = [] - subtitles = {} - for resource in video['resources']: - resource_id = resource.get('_id') - resource_url = resource.get('url') - resource_type = resource.get('type') - if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'): - continue - - if resource_type == 'subtitle': - subtitles.setdefault(resource.get('language') or 'por', []).append({ - 'url': resource_url, - }) - continue - - security = self._download_json( - 'http://security.video.globo.com/videos/%s/hash' % video_id, - video_id, 'Downloading security hash for %s' % resource_id, query={ - 'player': 'desktop', - 'version': '5.19.1', - 'resource_id': resource_id, - }) - - security_hash = security.get('hash') - if not security_hash: - message = security.get('message') - if message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, message), expected=True) - continue - - hash_code = security_hash[:2] - padding = '%010d' % random.randint(1, 10000000000) - if hash_code in ('04', '14'): - received_time = security_hash[3:13] - received_md5 = security_hash[24:] - hash_prefix = security_hash[:23] - elif hash_code in ('02', '12', '03', '13'): - received_time = security_hash[2:12] - received_md5 = security_hash[22:] - padding += '1' - hash_prefix = '05' + security_hash[:22] - - padded_sign_time = compat_str(int(received_time) + 86400) + padding - md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() - signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_prefix + padded_sign_time + signed_md5 - signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '') - - if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats( - signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats( - signed_url, resource_id, mpd_id='dash', fatal=False)) - elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'): - formats.extend(self._extract_ism_formats( - signed_url, resource_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': signed_url, - 'format_id': 'http-%s' % resource_id, - 'height': int_or_none(resource.get('height')), - }) - - self._sort_formats(formats) - - duration = float_or_none(video.get('duration'), 1000) - uploader = video.get('channel') - uploader_id = str_or_none(video.get('channel_id')) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, - 'subtitles': subtitles, - } - - -class GloboArticleIE(InfoExtractor): - _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?' 
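Note that this `_VALID_URL` is broad enough to match almost any globo.com article URL, so the class has to step aside whenever the more specific `GloboIE` pattern matches; that is what the `suitable()` override further down does. A minimal sketch of this standard deferral pattern, with hypothetical extractor names:

from .common import InfoExtractor  # same relative import the extractors here use


class SpecificIE(InfoExtractor):
    # Hypothetical extractor for a single, well-defined video URL shape
    _VALID_URL = r'https?://example\.com/v/(?P<id>\d+)'


class CatchAllIE(InfoExtractor):
    # Hypothetical catch-all extractor for the rest of the site
    _VALID_URL = r'https?://example\.com/.+'

    @classmethod
    def suitable(cls, url):
        # Step aside whenever the more specific extractor matches
        return False if SpecificIE.suitable(url) else super(CatchAllIE, cls).suitable(url)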
- - _VIDEOID_REGEXES = [ - r'\bdata-video-id=["\'](\d{7,})', - r'\bdata-player-videosids=["\'](\d{7,})', - r'\bvideosIDs\s*:\s*["\']?(\d{7,})', - r'\bdata-id=["\'](\d{7,})', - r'<div[^>]+\bid=["\'](\d{7,})', - ] - - _TESTS = [{ - 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', - 'info_dict': { - 'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes', - 'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões', - 'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12', - }, - 'playlist_count': 1, - }, { - 'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html', - 'info_dict': { - 'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato', - 'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF", - 'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c', - }, - 'playlist_count': 6, - }, { - 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', - 'only_matching': True, - }, { - 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', - 'only_matching': True, - }, { - 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_ids = [] - for video_regex in self._VIDEOID_REGEXES: - video_ids.extend(re.findall(video_regex, webpage)) - entries = [ - self.url_result('globo:%s' % video_id, GloboIE.ie_key()) - for video_id in orderedSet(video_ids)] - title = self._og_search_title(webpage, fatal=False) - description = self._html_search_meta('description', webpage) - return self.playlist_result(entries, display_id, title, description) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py deleted file mode 100644 index 878ba14e6..000000000 --- a/youtube_dl/extractor/go.py +++ /dev/null @@ -1,315 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .adobepass import AdobePassIE -from ..compat import compat_str -from ..utils import ( - int_or_none, - determine_ext, - parse_age_limit, - try_get, - urlencode_postdata, - ExtractorError, -) - - -class GoIE(AdobePassIE): - _SITE_INFO = { - 'abc': { - 'brand': '001', - 'requestor_id': 'ABC', - }, - 'freeform': { - 'brand': '002', - 'requestor_id': 'ABCFamily', - }, - 'watchdisneychannel': { - 'brand': '004', - 'resource_id': 'Disney', - }, - 'watchdisneyjunior': { - 'brand': '008', - 'resource_id': 'DisneyJunior', - }, - 'watchdisneyxd': { - 'brand': '009', - 'resource_id': 'DisneyXD', - }, - 'disneynow': { - 'brand': '011', - 'resource_id': 'Disney', - }, - 'fxnow.fxnetworks': { - 'brand': '025', - 'requestor_id': 'dtci', - }, - } - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?P<sub_domain>%s)\.)?go| - (?P<sub_domain_2>abc|freeform|disneynow|fxnow\.fxnetworks) - )\.com/ - (?: - (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)| - (?:[^/]+/)*(?P<display_id>[^/?\#]+) - ) - ''' % '|'.join(list(_SITE_INFO.keys())) - _TESTS = [{ - 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', - 'info_dict': { - 'id': 
'VDKA3807643', - 'ext': 'mp4', - 'title': 'The Traitor in the White House', - 'description': 'md5:05b009d2d145a1e85d25111bd37222e8', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'This content is no longer available.', - }, { - 'url': 'http://watchdisneyxd.go.com/doraemon', - 'info_dict': { - 'title': 'Doraemon', - 'id': 'SH55574025', - }, - 'playlist_mincount': 51, - }, { - 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', - 'info_dict': { - 'id': 'VDKA3609139', - 'ext': 'mp4', - 'title': 'This Guilty Blood', - 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', - 'age_limit': 14, - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', - 'info_dict': { - 'id': 'VDKA13435179', - 'ext': 'mp4', - 'title': 'The Bet', - 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', - 'age_limit': 14, - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', - 'info_dict': { - 'id': 'VDKA12782841', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'md5:fa73584a95761c605d9d54904e35b407', - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', - 'info_dict': { - 'id': 'VDKA22600213', - 'ext': 'mp4', - 'title': 'Pilot', - 'description': 'md5:74306df917cfc199d76d061d66bebdb4', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', - 'only_matching': True, - }, { - 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', - 'only_matching': True, - }, { - # brand 004 - 'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915', - 'only_matching': True, - }, { - # brand 008 - 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', - 'only_matching': True, - }, { - 'url': 'https://disneynow.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', - 'only_matching': True, - }] - - def _extract_videos(self, brand, video_id='-1', show_id='-1'): - display_id = video_id if video_id != '-1' else show_id - return self._download_json( - 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), - display_id)['video'] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - sub_domain = mobj.group('sub_domain') or mobj.group('sub_domain_2') - video_id, display_id = mobj.group('id', 'display_id') - site_info = self._SITE_INFO.get(sub_domain, {}) - brand = site_info.get('brand') - if not video_id or not site_info: - webpage = self._download_webpage(url, display_id or video_id) - data = self._parse_json( - self._search_regex( - r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, - 'data', default='{}'), - display_id or video_id, fatal=False) - # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot - layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) - video_id = None - if 
layout: - video_id = try_get( - layout, - (lambda x: x['videoid'], lambda x: x['video']['id']), - compat_str) - if not video_id: - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # page.analytics.videoIdCode - r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) - if not site_info: - brand = self._search_regex( - (r'data-brand=\s*["\']\s*(\d+)', - r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand', - default='004') - site_info = next( - si for _, si in self._SITE_INFO.items() - if si.get('brand') == brand) - if not video_id: - # show extraction works for Disney, DisneyJunior and DisneyXD - # ABC and Freeform has different layout - show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') - videos = self._extract_videos(brand, show_id=show_id) - show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) - entries = [] - for video in videos: - entries.append(self.url_result( - video['url'], 'Go', video.get('id'), video.get('title'))) - entries.reverse() - return self.playlist_result(entries, show_id, show_title) - video_data = self._extract_videos(brand, video_id)[0] - video_id = video_data['id'] - title = video_data['title'] - - formats = [] - for asset in video_data.get('assets', {}).get('asset', []): - asset_url = asset.get('value') - if not asset_url: - continue - format_id = asset.get('format') - ext = determine_ext(asset_url) - if ext == 'm3u8': - video_type = video_data.get('type') - data = { - 'video_id': video_data['id'], - 'video_type': video_type, - 'brand': brand, - 'device': '001', - } - if video_data.get('accesslevel') == '1': - requestor_id = site_info.get('requestor_id', 'DisneyChannels') - resource = site_info.get('resource_id') or self._get_mvpd_resource( - requestor_id, title, video_id, None) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - data.update({ - 'token': auth, - 'token_type': 'ap', - 'adobe_requestor_id': requestor_id, - }) - else: - self._initialize_geo_bypass({'countries': ['US']}) - entitlement = self._download_json( - 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', - video_id, data=urlencode_postdata(data)) - errors = entitlement.get('errors', {}).get('errors', []) - if errors: - for error in errors: - if error.get('code') == 1002: - self.raise_geo_restricted( - error['message'], countries=['US']) - error_message = ', '.join([error['message'] for error in errors]) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) - asset_url += '?' 
+ entitlement['uplynkData']['sessionKey'] - formats.extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) - else: - f = { - 'format_id': format_id, - 'url': asset_url, - 'ext': ext, - } - if re.search(r'(?:/mp4/source/|_source\.mp4)', asset_url): - f.update({ - 'format_id': ('%s-' % format_id if format_id else '') + 'SOURCE', - 'preference': 1, - }) - else: - mobj = re.search(r'/(\d+)x(\d+)/', asset_url) - if mobj: - height = int(mobj.group(2)) - f.update({ - 'format_id': ('%s-' % format_id if format_id else '') + '%dP' % height, - 'width': int(mobj.group(1)), - 'height': height, - }) - formats.append(f) - self._sort_formats(formats) - - subtitles = {} - for cc in video_data.get('closedcaption', {}).get('src', []): - cc_url = cc.get('value') - if not cc_url: - continue - ext = determine_ext(cc_url) - if ext == 'xml': - ext = 'ttml' - subtitles.setdefault(cc.get('lang'), []).append({ - 'url': cc_url, - 'ext': ext, - }) - - thumbnails = [] - for thumbnail in video_data.get('thumbnails', {}).get('thumbnail', []): - thumbnail_url = thumbnail.get('value') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('longdescription') or video_data.get('description'), - 'duration': int_or_none(video_data.get('duration', {}).get('value'), 1000), - 'age_limit': parse_age_limit(video_data.get('tvrating', {}).get('rating')), - 'episode_number': int_or_none(video_data.get('episodenumber')), - 'series': video_data.get('show', {}).get('title'), - 'season_number': int_or_none(video_data.get('season', {}).get('num')), - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py deleted file mode 100644 index 92efd16b3..000000000 --- a/youtube_dl/extractor/godtube.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) - - -class GodTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?godtube\.com/watch/\?v=(?P<id>[\da-zA-Z]+)' - _TESTS = [ - { - 'url': 'https://www.godtube.com/watch/?v=0C0CNNNU', - 'md5': '77108c1e4ab58f48031101a1a2119789', - 'info_dict': { - 'id': '0C0CNNNU', - 'ext': 'mp4', - 'title': 'Woman at the well.', - 'duration': 159, - 'timestamp': 1205712000, - 'uploader': 'beverlybmusic', - 'upload_date': '20080317', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - config = self._download_xml( - 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), - video_id, 'Downloading player config XML') - - video_url = config.find('file').text - uploader = config.find('author').text - timestamp = parse_iso8601(config.find('date').text) - duration = parse_duration(config.find('duration').text) - thumbnail = config.find('image').text - - media = self._download_xml( - 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') - - title = media.find('title').text - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, - } diff --git 
a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py deleted file mode 100644 index 3f2de00f1..000000000 --- a/youtube_dl/extractor/googledrive.py +++ /dev/null @@ -1,278 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_parse_qs -from ..utils import ( - determine_ext, - ExtractorError, - get_element_by_class, - int_or_none, - lowercase_escape, - try_get, - update_url_query, -) - - -class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:docs|drive)\.google\.com/ - (?: - (?:uc|open)\?.*?id=| - file/d/ - )| - video\.google\.com/get_player\?.*?docid= - ) - (?P<id>[a-zA-Z0-9_-]{28,}) - ''' - _TESTS = [{ - 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': '5c602afbbf2c1db91831f5d82f678554', - 'info_dict': { - 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', - 'ext': 'mp4', - 'title': 'Big Buck Bunny.mp4', - 'duration': 45, - } - }, { - # video can't be watched anonymously due to view count limit reached, - # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) - 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', - 'only_matching': True, - }, { - # video id is longer than 28 characters - 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'only_matching': True, - }, { - 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', - 'only_matching': True, - }, { - 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', - 'only_matching': True, - }] - _FORMATS_EXT = { - '5': 'flv', - '6': 'flv', - '13': '3gp', - '17': '3gp', - '18': 'mp4', - '22': 'mp4', - '34': 'flv', - '35': 'flv', - '36': '3gp', - '37': 'mp4', - '38': 'mp4', - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - '59': 'mp4', - } - _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' - _CAPTIONS_ENTRY_TAG = { - 'subtitles': 'track', - 'automatic_captions': 'target', - } - _caption_formats_ext = [] - _captions_xml = None - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', - webpage) - if mobj: - return 'https://drive.google.com/file/d/%s' % mobj.group('id') - - def _download_subtitles_xml(self, video_id, subtitles_id, hl): - if self._captions_xml: - return - self._captions_xml = self._download_xml( - self._BASE_URL_CAPTIONS, video_id, query={ - 'id': video_id, - 'vid': subtitles_id, - 'hl': hl, - 'v': video_id, - 'type': 'list', - 'tlangs': '1', - 'fmts': '1', - 'vssids': '1', - }, note='Downloading subtitles XML', - errnote='Unable to download subtitles XML', fatal=False) - if self._captions_xml: - for f in self._captions_xml.findall('format'): - if f.attrib.get('fmt_code') and not f.attrib.get('default'): - self._caption_formats_ext.append(f.attrib['fmt_code']) - - def _get_captions_by_type(self, video_id, subtitles_id, caption_type, - origin_lang_code=None): - if not subtitles_id or not caption_type: - return - captions = {} - for caption_entry in self._captions_xml.findall( - self._CAPTIONS_ENTRY_TAG[caption_type]): - caption_lang_code = caption_entry.attrib.get('lang_code') - if not caption_lang_code: - continue - caption_format_data = [] - for caption_format in self._caption_formats_ext: - query = { - 'vid': subtitles_id, - 'v': video_id, - 'fmt': caption_format, - 'lang': (caption_lang_code 
if origin_lang_code is None - else origin_lang_code), - 'type': 'track', - 'name': '', - 'kind': '', - } - if origin_lang_code is not None: - query.update({'tlang': caption_lang_code}) - caption_format_data.append({ - 'url': update_url_query(self._BASE_URL_CAPTIONS, query), - 'ext': caption_format, - }) - captions[caption_lang_code] = caption_format_data - return captions - - def _get_subtitles(self, video_id, subtitles_id, hl): - if not subtitles_id or not hl: - return - self._download_subtitles_xml(video_id, subtitles_id, hl) - if not self._captions_xml: - return - return self._get_captions_by_type(video_id, subtitles_id, 'subtitles') - - def _get_automatic_captions(self, video_id, subtitles_id, hl): - if not subtitles_id or not hl: - return - self._download_subtitles_xml(video_id, subtitles_id, hl) - if not self._captions_xml: - return - track = self._captions_xml.find('track') - if track is None: - return - origin_lang_code = track.attrib.get('lang_code') - if not origin_lang_code: - return - return self._get_captions_by_type( - video_id, subtitles_id, 'automatic_captions', origin_lang_code) - - def _real_extract(self, url): - video_id = self._match_id(url) - video_info = compat_parse_qs(self._download_webpage( - 'https://drive.google.com/get_video_info', - video_id, query={'docid': video_id})) - - def get_value(key): - return try_get(video_info, lambda x: x[key][0]) - - reason = get_value('reason') - title = get_value('title') - if not title and reason: - raise ExtractorError(reason, expected=True) - - formats = [] - fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') - fmt_list = (get_value('fmt_list') or '').split(',') - if fmt_stream_map and fmt_list: - resolutions = {} - for fmt in fmt_list: - mobj = re.search( - r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt) - if mobj: - resolutions[mobj.group('format_id')] = ( - int(mobj.group('width')), int(mobj.group('height'))) - - for fmt_stream in fmt_stream_map: - fmt_stream_split = fmt_stream.split('|') - if len(fmt_stream_split) < 2: - continue - format_id, format_url = fmt_stream_split[:2] - f = { - 'url': lowercase_escape(format_url), - 'format_id': format_id, - 'ext': self._FORMATS_EXT[format_id], - } - resolution = resolutions.get(format_id) - if resolution: - f.update({ - 'width': resolution[0], - 'height': resolution[1], - }) - formats.append(f) - - source_url = update_url_query( - 'https://drive.google.com/uc', { - 'id': video_id, - 'export': 'download', - }) - - def request_source_file(source_url, kind): - return self._request_webpage( - source_url, video_id, note='Requesting %s file' % kind, - errnote='Unable to request %s file' % kind, fatal=False) - urlh = request_source_file(source_url, 'source') - if urlh: - def add_source_format(urlh): - formats.append({ - # Use redirect URLs as download URLs in order to calculate - # correct cookies in _calc_cookies. - # Using original URLs may result in redirect loop due to - # google.com's cookies mistakenly used for googleusercontent.com - # redirect URLs (see #23919). 
- 'url': urlh.geturl(), - 'ext': determine_ext(title, 'mp4').lower(), - 'format_id': 'source', - 'quality': 1, - }) - if urlh.headers.get('Content-Disposition'): - add_source_format(urlh) - else: - confirmation_webpage = self._webpage_read_content( - urlh, url, video_id, note='Downloading confirmation page', - errnote='Unable to confirm download', fatal=False) - if confirmation_webpage: - confirm = self._search_regex( - r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', default=None) - if confirm: - confirmed_source_url = update_url_query(source_url, { - 'confirm': confirm, - }) - urlh = request_source_file(confirmed_source_url, 'confirmed source') - if urlh and urlh.headers.get('Content-Disposition'): - add_source_format(urlh) - else: - self.report_warning( - get_element_by_class('uc-error-subcaption', confirmation_webpage) - or get_element_by_class('uc-error-caption', confirmation_webpage) - or 'unable to extract confirmation code') - - if not formats and reason: - raise ExtractorError(reason, expected=True) - - self._sort_formats(formats) - - hl = get_value('hl') - subtitles_id = None - ttsurl = get_value('ttsurl') - if ttsurl: - # the video Id for subtitles will be the last value in the ttsurl - # query string - subtitles_id = ttsurl.encode('utf-8').decode( - 'unicode_escape').split('=')[-1] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, - 'duration': int_or_none(get_value('length_seconds')), - 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), - 'automatic_captions': self.extract_automatic_captions( - video_id, subtitles_id, hl), - } diff --git a/youtube_dl/extractor/googlepodcasts.py b/youtube_dl/extractor/googlepodcasts.py deleted file mode 100644 index 31ad79907..000000000 --- a/youtube_dl/extractor/googlepodcasts.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - clean_podcast_url, - int_or_none, - try_get, - urlencode_postdata, -) - - -class GooglePodcastsBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' - - def _batch_execute(self, func_id, video_id, params): - return json.loads(self._download_json( - 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', - video_id, data=urlencode_postdata({ - 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), - }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) - - def _extract_episode(self, episode): - return { - 'id': episode[4][3], - 'title': episode[8], - 'url': clean_podcast_url(episode[13]), - 'thumbnail': episode[2], - 'description': episode[9], - 'creator': try_get(episode, lambda x: x[14]), - 'timestamp': int_or_none(episode[11]), - 'duration': int_or_none(episode[12]), - 'series': episode[1], - } - - -class GooglePodcastsIE(GooglePodcastsBaseIE): - IE_NAME = 'google:podcasts' - _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)' - _TEST = { - 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', - 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', - 'info_dict': { - 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', - 'ext': 'mp3', - 'title': 'WWDTM New Year 2021', - 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and 
Kellee Edwards.', - 'upload_date': '20210102', - 'timestamp': 1609606800, - 'duration': 2901, - 'series': "Wait Wait... Don't Tell Me!", - } - } - - def _real_extract(self, url): - b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() - episode = self._batch_execute( - 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] - return self._extract_episode(episode) - - -class GooglePodcastsFeedIE(GooglePodcastsBaseIE): - IE_NAME = 'google:podcasts:feed' - _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)' - _TEST = { - 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', - 'info_dict': { - 'title': "Wait Wait... Don't Tell Me!", - 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", - }, - 'playlist_mincount': 20, - } - - def _real_extract(self, url): - b64_feed_url = self._match_id(url) - data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) - - entries = [] - for episode in (try_get(data, lambda x: x[1][0]) or []): - entries.append(self._extract_episode(episode)) - - feed = try_get(data, lambda x: x[3]) or [] - return self.playlist_result( - entries, playlist_title=try_get(feed, lambda x: x[0]), - playlist_description=try_get(feed, lambda x: x[2])) diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py deleted file mode 100644 index 5279fa807..000000000 --- a/youtube_dl/extractor/googlesearch.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import unicode_literals - -import itertools -import re - -from .common import SearchInfoExtractor - - -class GoogleSearchIE(SearchInfoExtractor): - IE_DESC = 'Google Video search' - _MAX_RESULTS = 1000 - IE_NAME = 'video.google:search' - _SEARCH_KEY = 'gvsearch' - _TEST = { - 'url': 'gvsearch15:python language', - 'info_dict': { - 'id': 'python language', - 'title': 'python language', - }, - 'playlist_count': 15, - } - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - entries = [] - res = { - '_type': 'playlist', - 'id': query, - 'title': query, - } - - for pagenum in itertools.count(): - webpage = self._download_webpage( - 'http://www.google.com/search', - 'gvsearch:' + query, - note='Downloading result page %s' % (pagenum + 1), - query={ - 'tbm': 'vid', - 'q': query, - 'start': pagenum * 10, - 'hl': 'en', - }) - - for hit_idx, mobj in enumerate(re.finditer( - r'<h3 class="r"><a href="([^"]+)"', webpage)): - - # Skip playlists - if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage): - continue - - entries.append({ - '_type': 'url', - 'url': mobj.group(1) - }) - - if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage): - res['entries'] = entries[:n] - return res diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py deleted file mode 100644 index 18c252012..000000000 --- a/youtube_dl/extractor/hearthisat.py +++ /dev/null @@ -1,135 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - HEADRequest, - KNOWN_EXTENSIONS, - sanitized_Request, - str_to_int, - urlencode_postdata, - urlhandle_detect_ext, -) - - -class HearThisAtIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$' - _PLAYLIST_URL = 'https://hearthis.at/playlist.php' - _TESTS = [{ - 'url': 
'https://hearthis.at/moofi/dr-kreep', - 'md5': 'ab6ec33c8fed6556029337c7885eb4e0', - 'info_dict': { - 'id': '150939', - 'ext': 'wav', - 'title': 'Moofi - Dr. Kreep', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421564134, - 'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP', - 'upload_date': '20150118', - 'comment_count': int, - 'view_count': int, - 'like_count': int, - 'duration': 71, - 'categories': ['Experimental'], - } - }, { - # 'download' link redirects to the original webpage - 'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/', - 'md5': '5980ceb7c461605d30f1f039df160c6e', - 'info_dict': { - 'id': '811296', - 'ext': 'mp3', - 'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!', - 'description': 'Listen to DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance', - 'upload_date': '20160328', - 'timestamp': 1459186146, - 'thumbnail': r're:^https?://.*\.jpg$', - 'comment_count': int, - 'view_count': int, - 'like_count': int, - 'duration': 4360, - 'categories': ['Dance'], - }, - }] - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - display_id = '{artist:s} - {title:s}'.format(**m.groupdict()) - - webpage = self._download_webpage(url, display_id) - track_id = self._search_regex( - r'intTrackId\s*=\s*(\d+)', webpage, 'track ID') - - payload = urlencode_postdata({'tracks[]': track_id}) - req = sanitized_Request(self._PLAYLIST_URL, payload) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - track = self._download_json(req, track_id, 'Downloading playlist')[0] - title = '{artist:s} - {title:s}'.format(**track) - - categories = None - if track.get('category'): - categories = [track['category']] - - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>' - view_count = str_to_int(self._search_regex( - meta_span % 'plays_count', webpage, 'view count', fatal=False)) - like_count = str_to_int(self._search_regex( - meta_span % 'likes_count', webpage, 'like count', fatal=False)) - comment_count = str_to_int(self._search_regex( - meta_span % 'comment_count', webpage, 'comment count', fatal=False)) - duration = str_to_int(self._search_regex( - r'data-length="(\d+)', webpage, 'duration', fatal=False)) - timestamp = str_to_int(self._search_regex( - r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False)) - - formats = [] - mp3_url = self._search_regex( - r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"', - webpage, 'mp3 URL', fatal=False) - if mp3_url: - formats.append({ - 'format_id': 'mp3', - 'vcodec': 'none', - 'acodec': 'mp3', - 'url': mp3_url, - }) - download_path = self._search_regex( - r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"', - webpage, 'download URL', default=None) - if download_path: - download_url = compat_urlparse.urljoin(url, download_path) - ext_req = HEADRequest(download_url) - ext_handle = self._request_webpage( - ext_req, display_id, note='Determining extension') - ext = urlhandle_detect_ext(ext_handle) - if ext in KNOWN_EXTENSIONS: - formats.append({ - 'format_id': 'download', - 'vcodec': 'none', - 'ext': ext, - 'url': download_url, - 'preference': 2, # Usually better quality - }) - self._sort_formats(formats) - - return { - 'id': track_id, - 'display_id': display_id, - 'title': title, - 
'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'view_count': view_count, - 'comment_count': comment_count, - 'like_count': like_count, - 'categories': categories, - } diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py deleted file mode 100644 index f26f80265..000000000 --- a/youtube_dl/extractor/hidive.py +++ /dev/null @@ -1,118 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - url_or_none, - urlencode_postdata, -) - - -class HiDiveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<title>[^/]+)/(?P<key>[^/?#&]+)' - # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, - # so disabling geo bypass completely - _GEO_BYPASS = False - _NETRC_MACHINE = 'hidive' - _LOGIN_URL = 'https://www.hidive.com/account/login' - - _TESTS = [{ - 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', - 'info_dict': { - 'id': 'the-comic-artist-and-his-assistants/s01e001', - 'ext': 'mp4', - 'title': 'the-comic-artist-and-his-assistants/s01e001', - 'series': 'the-comic-artist-and-his-assistants', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires Authentication', - }] - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - - webpage = self._download_webpage(self._LOGIN_URL, None) - form = self._search_regex( - r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', - webpage, 'login form') - data = self._hidden_inputs(form) - data.update({ - 'Email': email, - 'Password': password, - }) - self._download_webpage( - self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title, key = mobj.group('title', 'key') - video_id = '%s/%s' % (title, key) - - settings = self._download_json( - 'https://www.hidive.com/play/settings', video_id, - data=urlencode_postdata({ - 'Title': title, - 'Key': key, - 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783', - })) - - restriction = settings.get('restrictionReason') - if restriction == 'RegionRestricted': - self.raise_geo_restricted() - - if restriction and restriction != 'None': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, restriction), expected=True) - - formats = [] - subtitles = {} - for rendition_id, rendition in settings['renditions'].items(): - bitrates = rendition.get('bitrates') - if not isinstance(bitrates, dict): - continue - m3u8_url = url_or_none(bitrates.get('hls')) - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='%s-hls' % rendition_id, fatal=False)) - cc_files = rendition.get('ccFiles') - if not isinstance(cc_files, list): - continue - for cc_file in cc_files: - if not isinstance(cc_file, list) or len(cc_file) < 3: - continue - cc_lang = cc_file[0] - cc_url = url_or_none(cc_file[2]) - if not isinstance(cc_lang, compat_str) or not cc_url: - continue - subtitles.setdefault(cc_lang, []).append({ - 'url': cc_url, - }) - self._sort_formats(formats) - - season_number = int_or_none(self._search_regex( - r's(\d+)', key, 'season number', default=None)) - episode_number = int_or_none(self._search_regex( - r'e(\d+)', key, 'episode 
number', default=None)) - - return { - 'id': video_id, - 'title': video_id, - 'subtitles': subtitles, - 'formats': formats, - 'series': title, - 'season_number': season_number, - 'episode_number': episode_number, - } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py deleted file mode 100644 index 1620822b6..000000000 --- a/youtube_dl/extractor/hotstar.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import hmac -import json -import re -import time -import uuid - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - str_or_none, - try_get, - url_or_none, -) - - -class HotStarBaseIE(InfoExtractor): - _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - - def _call_api_impl(self, path, video_id, headers, query, data=None): - st = int(time.time()) - exp = st + 6000 - auth = 'st=%d~exp=%d~acl=/*' % (st, exp) - auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() - h = {'hotstarauth': auth} - h.update(headers) - return self._download_json( - 'https://api.hotstar.com/' + path, - video_id, headers=h, query=query, data=data) - - def _call_api(self, path, video_id, query_name='contentId'): - response = self._call_api_impl(path, video_id, { - 'x-country-code': 'IN', - 'x-platform-code': 'JIO', - }, { - query_name: video_id, - 'tas': 10000, - }) - if response['statusCode'] != 'OK': - raise ExtractorError( - response['body']['message'], expected=True) - return response['body']['results'] - - def _call_api_v2(self, path, video_id, headers, query=None, data=None): - h = {'X-Request-Id': compat_str(uuid.uuid4())} - h.update(headers) - try: - return self._call_api_impl( - path, video_id, h, query, data) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - if e.cause.code == 402: - self.raise_login_required() - message = self._parse_json(e.cause.read().decode(), video_id)['message'] - if message in ('Content not available in region', 'Country is not supported'): - raise self.raise_geo_restricted(message) - raise ExtractorError(message) - raise e - - -class HotStarIE(HotStarBaseIE): - IE_NAME = 'hotstar' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+[/-])?(?P<id>\d{10})' - _TESTS = [{ - # contentData - 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', - 'info_dict': { - 'id': '1000076273', - 'ext': 'mp4', - 'title': 'Can You Not Spread Rumours?', - 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', - 'timestamp': 1447248600, - 'upload_date': '20151111', - 'duration': 381, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - # contentDetail - 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', - 'only_matching': True, - }, { - 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', - 'only_matching': True, - }, { - 'url': 'http://www.hotstar.com/1000000515', - 'only_matching': True, - }, { - # only available via api v2 - 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/tv/start-music/1260005217/cooks-vs-comalis/1100039717', - 'only_matching': True, - }] - _GEO_BYPASS = False - _DEVICE_ID = None - _USER_TOKEN = None - - def _real_extract(self, url): - video_id = 
self._match_id(url) - - webpage = self._download_webpage(url, video_id) - app_state = self._parse_json(self._search_regex( - r'<script>window\.APP_STATE\s*=\s*({.+?})</script>', - webpage, 'app state'), video_id) - video_data = {} - getters = list( - lambda x, k=k: x['initialState']['content%s' % k]['content'] - for k in ('Data', 'Detail') - ) - for v in app_state.values(): - content = try_get(v, getters, dict) - if content and content.get('contentId') == video_id: - video_data = content - break - - title = video_data['title'] - - if video_data.get('drmProtected'): - raise ExtractorError('This video is DRM protected.', expected=True) - - headers = {'Referer': url} - formats = [] - geo_restricted = False - - if not self._USER_TOKEN: - self._DEVICE_ID = compat_str(uuid.uuid4()) - self._USER_TOKEN = self._call_api_v2('um/v3/users', video_id, { - 'X-HS-Platform': 'PCTV', - 'Content-Type': 'application/json', - }, data=json.dumps({ - 'device_ids': [{ - 'id': self._DEVICE_ID, - 'type': 'device_id', - }], - }).encode())['user_identity'] - - playback_sets = self._call_api_v2( - 'play/v2/playback/content/' + video_id, video_id, { - 'X-HS-Platform': 'web', - 'X-HS-AppVersion': '6.99.1', - 'X-HS-UserToken': self._USER_TOKEN, - }, query={ - 'device-id': self._DEVICE_ID, - 'desired-config': 'encryption:plain', - 'os-name': 'Windows', - 'os-version': '10', - })['data']['playBackSets'] - for playback_set in playback_sets: - if not isinstance(playback_set, dict): - continue - format_url = url_or_none(playback_set.get('playbackUrl')) - if not format_url: - continue - format_url = re.sub( - r'(?<=//staragvod)(\d)', r'web\1', format_url) - tags = str_or_none(playback_set.get('tagsCombination')) or '' - if tags and 'encryption:plain' not in tags: - continue - ext = determine_ext(format_url) - try: - if 'package:hls' in tags or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='hls', headers=headers)) - elif 'package:dash' in tags or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', headers=headers)) - elif ext == 'f4m': - # produce broken files - pass - else: - formats.append({ - 'url': format_url, - 'width': int_or_none(playback_set.get('width')), - 'height': int_or_none(playback_set.get('height')), - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - geo_restricted = True - continue - if not formats and geo_restricted: - self.raise_geo_restricted(countries=['IN']) - self._sort_formats(formats) - - for f in formats: - f.setdefault('http_headers', {}).update(headers) - - image = try_get(video_data, lambda x: x['image']['h'], compat_str) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': 'https://img1.hotstarext.com/image/upload/' + image if image else None, - 'description': video_data.get('description'), - 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), - 'formats': formats, - 'channel': video_data.get('channelName'), - 'channel_id': str_or_none(video_data.get('channelId')), - 'series': video_data.get('showName'), - 'season': video_data.get('seasonName'), - 'season_number': int_or_none(video_data.get('seasonNo')), - 'season_id': str_or_none(video_data.get('seasonId')), - 'episode': title, - 'episode_number': int_or_none(video_data.get('episodeNo')), - } - - -class HotStarPlaylistIE(HotStarBaseIE): - IE_NAME = 
'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:[a-z]{2}/)?tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' - _TESTS = [{ - 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', - 'info_dict': { - 'id': '3_2_26', - }, - 'playlist_mincount': 20, - }, { - 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/us/tv/masterchef-india/s-830/list/episodes/t-1_2_830', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId') - - entries = [ - self.url_result( - 'https://www.hotstar.com/%s' % video['contentId'], - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['assets']['items'] - if video.get('contentId')] - - return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py deleted file mode 100644 index 23f7b1fc9..000000000 --- a/youtube_dl/extractor/hrti.py +++ /dev/null @@ -1,208 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - parse_age_limit, - sanitized_Request, - try_get, -) - - -class HRTiBaseIE(InfoExtractor): - """ - Base Information Extractor for Croatian Radiotelevision - video on demand site https://hrti.hrt.hr - Reverse engineered from the JavaScript app in app.min.js - """ - _NETRC_MACHINE = 'hrti' - - _APP_LANGUAGE = 'hr' - _APP_VERSION = '1.1' - _APP_PUBLICATION_ID = 'all_in_one' - _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' - - def _initialize_api(self): - init_data = { - 'application_publication_id': self._APP_PUBLICATION_ID - } - - uuid = self._download_json( - self._API_URL, None, note='Downloading uuid', - errnote='Unable to download uuid', - data=json.dumps(init_data).encode('utf-8'))['uuid'] - - app_data = { - 'uuid': uuid, - 'application_publication_id': self._APP_PUBLICATION_ID, - 'application_version': self._APP_VERSION - } - - req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) - req.get_method = lambda: 'PUT' - - resources = self._download_json( - req, None, note='Downloading session information', - errnote='Unable to download session information') - - self._session_id = resources['session_id'] - - modules = resources['modules'] - - self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( - language=self._APP_LANGUAGE, - application_id=self._APP_PUBLICATION_ID) - - self._login_url = (modules['user']['resources']['login']['uri'] - + '/format/json').format(session_id=self._session_id) - - self._logout_url = modules['user']['resources']['logout']['uri'] - - def _login(self): - username, password = self._get_login_info() - # TODO: figure out authentication with cookies - if username is None or password is None: - self.raise_login_required() - - auth_data = { - 'username': username, - 'password': password, - } - - try: - auth_info = self._download_json( - self._login_url, None, note='Logging in', errnote='Unable to log in', - data=json.dumps(auth_data).encode('utf-8')) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: - auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) - else: - raise - - error_message = 
auth_info.get('error', {}).get('message') - if error_message: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error_message), - expected=True) - - self._token = auth_info['secure_streaming_token'] - - def _real_initialize(self): - self._initialize_api() - self._login() - - -class HRTiIE(HRTiBaseIE): - _VALID_URL = r'''(?x) - (?: - hrti:(?P<short_id>[0-9]+)| - https?:// - hrti\.hrt\.hr/(?:\#/)?video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? - ) - ''' - _TESTS = [{ - 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', - 'info_dict': { - 'id': '2181385', - 'display_id': 'republika-dokumentarna-serija-16-hd', - 'ext': 'mp4', - 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', - 'description': 'md5:48af85f620e8e0e1df4096270568544f', - 'duration': 2922, - 'view_count': int, - 'average_rating': int, - 'episode_number': int, - 'season_number': int, - 'age_limit': 12, - }, - 'skip': 'Requires account credentials', - }, { - 'url': 'https://hrti.hrt.hr/#/video/show/2181385/', - 'only_matching': True, - }, { - 'url': 'hrti:2181385', - 'only_matching': True, - }, { - 'url': 'https://hrti.hrt.hr/video/show/3873068/cuvar-dvorca-dramska-serija-14', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('short_id') or mobj.group('id') - display_id = mobj.group('display_id') or video_id - - video = self._download_json( - '%s/video_id/%s/format/json' % (self._search_url, video_id), - display_id, 'Downloading video metadata JSON')['video'][0] - - title_info = video['title'] - title = title_info['title_long'] - - movie = video['video_assets']['movie'][0] - m3u8_url = movie['url'].format(TOKEN=self._token) - formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) - - description = clean_html(title_info.get('summary_long')) - age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) - view_count = int_or_none(video.get('views')) - average_rating = int_or_none(video.get('user_rating')) - duration = int_or_none(movie.get('duration')) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'average_rating': average_rating, - 'age_limit': age_limit, - 'formats': formats, - } - - -class HRTiPlaylistIE(HRTiBaseIE): - _VALID_URL = r'https?://hrti\.hrt\.hr/(?:#/)?video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' 
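For reference, the two-step session bootstrap that HRTiBaseIE._initialize_api performs above can be restated as a standalone Python 3 sketch. The endpoint URL, publication id, application version and response fields are taken from the deleted code; the helper names and the use of plain urllib (the extractor goes through _download_json, which manages its own request plumbing) are assumptions made for the sake of a self-contained example.

    import json
    import urllib.request

    API_URL = ('http://clientapi.hrt.hr/client_api.php'
               '/config/identify/format/json')
    PUBLICATION_ID = 'all_in_one'

    def hrti_bootstrap():
        def call(method, payload):
            # Both steps exchange JSON bodies with the same endpoint;
            # only the HTTP method differs.
            req = urllib.request.Request(
                API_URL, data=json.dumps(payload).encode('utf-8'),
                method=method)
            with urllib.request.urlopen(req) as resp:
                return json.loads(resp.read().decode('utf-8'))

        # Step 1: a POST registers the client and returns a uuid.
        uuid = call('POST', {
            'application_publication_id': PUBLICATION_ID,
        })['uuid']
        # Step 2: a PUT with that uuid returns the session id plus the
        # per-session search/login/logout resource URIs.
        resources = call('PUT', {
            'uuid': uuid,
            'application_publication_id': PUBLICATION_ID,
            'application_version': '1.1',
        })
        return resources['session_id'], resources['modules']

The login call that follows posts the username and password to the 'login' resource URI and keeps the returned secure_streaming_token, which is later substituted into the movie URL template as TOKEN.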
- _TESTS = [{ - 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', - 'info_dict': { - 'id': '212', - 'title': 'ekumena', - }, - 'playlist_mincount': 8, - 'skip': 'Requires account credentials', - }, { - 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', - 'only_matching': True, - }, { - 'url': 'https://hrti.hrt.hr/video/list/category/212/ekumena', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - category_id = mobj.group('id') - display_id = mobj.group('display_id') or category_id - - response = self._download_json( - '%s/category_id/%s/format/json' % (self._search_url, category_id), - display_id, 'Downloading video metadata JSON') - - video_ids = try_get( - response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], - list) or [video['id'] for video in response.get('videos', []) if video.get('id')] - - entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] - - return self.playlist_result(entries, category_id, display_id) diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py deleted file mode 100644 index 3fdaac5b6..000000000 --- a/youtube_dl/extractor/hungama.py +++ /dev/null @@ -1,117 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - urlencode_postdata, -) - - -class HungamaIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?hungama\.com/ - (?: - (?:video|movie)/[^/]+/| - tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ - ) - (?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', - 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', - 'info_dict': { - 'id': '2931166', - 'ext': 'mp4', - 'title': 'Lucky Ali - Kitni Haseen Zindagi', - 'track': 'Kitni Haseen Zindagi', - 'artist': 'Lucky Ali', - 'album': 'Aks', - 'release_year': 2000, - } - }, { - 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', - 'only_matching': True, - }, { - 'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - info = self._search_json_ld(webpage, video_id) - - m3u8_url = self._download_json( - 'https://www.hungama.com/index.php', video_id, - data=urlencode_postdata({'content_id': video_id}), headers={ - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'X-Requested-With': 'XMLHttpRequest', - }, query={ - 'c': 'common', - 'm': 'get_video_mdn_url', - })['stream_url'] - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) - - info.update({ - 'id': video_id, - 'formats': formats, - }) - return info - - -class HungamaSongIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', - 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', - 'info_dict': { - 'id': '2931166', - 'ext': 'mp4', - 'title': 'Lucky Ali - Kitni Haseen Zindagi', - 'track': 'Kitni Haseen Zindagi', - 'artist': 'Lucky Ali', - 'album': 'Aks', - 'release_year': 2000, - } - } - - def _real_extract(self, url): - audio_id = self._match_id(url) - - data = self._download_json( - 'https://www.hungama.com/audio-player-data/track/%s' % audio_id, - 
audio_id, query={'_country': 'IN'})[0] - - track = data['song_name'] - artist = data.get('singer_name') - - m3u8_url = self._download_json( - data.get('file') or data['preview_link'], - audio_id)['response']['media_url'] - - formats = self._extract_m3u8_formats( - m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) - - title = '%s - %s' % (artist, track) if artist else track - thumbnail = data.get('img_src') or data.get('album_image') - - return { - 'id': audio_id, - 'title': title, - 'thumbnail': thumbnail, - 'track': track, - 'artist': artist, - 'album': data.get('album_name'), - 'release_year': int_or_none(data.get('date')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py deleted file mode 100644 index 0d9f50ed2..000000000 --- a/youtube_dl/extractor/ign.py +++ /dev/null @@ -1,257 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - HEADRequest, - determine_ext, - int_or_none, - parse_iso8601, - strip_or_none, - try_get, -) - - -class IGNBaseIE(InfoExtractor): - def _call_api(self, slug): - return self._download_json( - 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) - - -class IGNIE(IGNBaseIE): - """ - Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. - Some videos of it.ign.com are also supported - """ - - _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)' - IE_NAME = 'ign.com' - _PAGE_TYPE = 'video' - - _TESTS = [{ - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'd2e1586d9987d40fad7867bf96a018ea', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'tags': 'count:9', - } - }, { - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', - 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:817a20299de610bd56f13175386da6fa', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'tags': 'count:4', - } - }, { - 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - video = self._call_api(display_id) - video_id = video['videoId'] - metadata = video['metadata'] - title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] - - formats = [] - refs = video.get('refs') or {} - - m3u8_url = refs.get('m3uUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - f4m_url = refs.get('f4mUrl') - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False)) - - for asset in (video.get('assets') or []): - asset_url = asset.get('url') - if not asset_url: - continue - formats.append({ - 'url': asset_url, - 'tbr': int_or_none(asset.get('bitrate'), 1000), - 'fps': int_or_none(asset.get('frame_rate')), - 'height': int_or_none(asset.get('height')), - 'width': 
int_or_none(asset.get('width')), - }) - - mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) - if mezzanine_url: - formats.append({ - 'ext': determine_ext(mezzanine_url, 'mp4'), - 'format_id': 'mezzanine', - 'preference': 1, - 'url': mezzanine_url, - }) - - self._sort_formats(formats) - - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) - - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(metadata.get('description')), - 'timestamp': parse_iso8601(metadata.get('publishDate')), - 'duration': int_or_none(metadata.get('duration')), - 'display_id': display_id, - 'thumbnails': thumbnails, - 'formats': formats, - 'tags': tags, - } - - -class IGNVideoIE(InfoExtractor): - _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' - _TESTS = [{ - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', - 'info_dict': { - 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', - 'ext': 'mp4', - 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', - 'description': 'Taking out assassination targets in Hitman has never been more stylish.', - 'timestamp': 1444665600, - 'upload_date': '20151012', - } - }, { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, { - # Youtube embed - 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', - 'only_matching': True, - }, { - # Twitter embed - 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', - 'only_matching': True, - }, { - # Vimeo embed - 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() - ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] - if ign_url: - return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) - - -class IGNArticleIE(IGNBaseIE): - _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' - _PAGE_TYPE = 'article' - _TESTS = [{ - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '524497489e4e8ff5848ece34', - 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. 
Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - }, - }, - ], - 'params': { - 'playlist_items': '2-3', - 'skip_download': True, - }, - }, { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'info_dict': { - 'id': '53ee806780a81ec46e0790f8', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - }, - 'playlist_count': 2, - }, { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, { - # Youtube embed - 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', - 'only_matching': True, - }, { - # IMDB embed - 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', - 'only_matching': True, - }, { - # Facebook embed - 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', - 'only_matching': True, - }, { - # Brightcove embed - 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - article = self._call_api(display_id) - - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): - yield self.url_result(video_url) - - return self.playlist_result( - entries(), article.get('articleId'), - strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) diff --git a/youtube_dl/extractor/imggaming.py b/youtube_dl/extractor/imggaming.py deleted file mode 100644 index e11f92053..000000000 --- a/youtube_dl/extractor/imggaming.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - str_or_none, - try_get, -) - - -class ImgGamingBaseIE(InfoExtractor): - _API_BASE = 'https://dce-frontoffice.imggaming.com/api/v2/' - _API_KEY = '857a1e5d-e35e-4fdf-805b-a87b6f8364bf' - _HEADERS = None - _MANIFEST_HEADERS = {'Accept-Encoding': 'identity'} - _REALM = None - _VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?' - - def _real_initialize(self): - self._HEADERS = { - 'Realm': 'dce.' 
+ self._REALM, - 'x-api-key': self._API_KEY, - } - - email, password = self._get_login_info() - if email is None: - self.raise_login_required() - - p_headers = self._HEADERS.copy() - p_headers['Content-Type'] = 'application/json' - self._HEADERS['Authorization'] = 'Bearer ' + self._download_json( - self._API_BASE + 'login', - None, 'Logging in', data=json.dumps({ - 'id': email, - 'secret': password, - }).encode(), headers=p_headers)['authorisationToken'] - - def _call_api(self, path, media_id): - return self._download_json( - self._API_BASE + path + media_id, media_id, headers=self._HEADERS) - - def _extract_dve_api_url(self, media_id, media_type): - stream_path = 'stream' - if media_type == 'video': - stream_path += '/vod/' - else: - stream_path += '?eventId=' - try: - return self._call_api( - stream_path, media_id)['playerUrlCallback'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - raise ExtractorError( - self._parse_json(e.cause.read().decode(), media_id)['messages'][0], - expected=True) - raise - - def _real_extract(self, url): - domain, media_type, media_id, playlist_id = re.match(self._VALID_URL, url).groups() - - if playlist_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % media_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) - media_type, media_id = 'playlist', playlist_id - - if media_type == 'playlist': - playlist = self._call_api('vod/playlist/', media_id) - entries = [] - for video in try_get(playlist, lambda x: x['videos']['vods']) or []: - video_id = str_or_none(video.get('id')) - if not video_id: - continue - entries.append(self.url_result( - 'https://%s/video/%s' % (domain, video_id), - self.ie_key(), video_id)) - return self.playlist_result( - entries, media_id, playlist.get('title'), - playlist.get('description')) - - dve_api_url = self._extract_dve_api_url(media_id, media_type) - video_data = self._download_json(dve_api_url, media_id) - is_live = media_type == 'live' - if is_live: - title = self._live_title(self._call_api('event/', media_id)['title']) - else: - title = video_data['name'] - - formats = [] - for proto in ('hls', 'dash'): - media_url = video_data.get(proto + 'Url') or try_get(video_data, lambda x: x[proto]['url']) - if not media_url: - continue - if proto == 'hls': - m3u8_formats = self._extract_m3u8_formats( - media_url, media_id, 'mp4', 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False, headers=self._MANIFEST_HEADERS) - for f in m3u8_formats: - f.setdefault('http_headers', {}).update(self._MANIFEST_HEADERS) - formats.append(f) - else: - formats.extend(self._extract_mpd_formats( - media_url, media_id, mpd_id='dash', fatal=False, - headers=self._MANIFEST_HEADERS)) - self._sort_formats(formats) - - subtitles = {} - for subtitle in video_data.get('subtitles', []): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('lang', 'en_US'), []).append({ - 'url': subtitle_url, - }) - - return { - 'id': media_id, - 'title': title, - 'formats': formats, - 'thumbnail': video_data.get('thumbnailUrl'), - 'description': video_data.get('description'), - 'duration': int_or_none(video_data.get('duration')), - 'tags': video_data.get('tags'), - 'is_live': is_live, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py deleted file mode 100644 index a5ba03efa..000000000 
--- a/youtube_dl/extractor/imgur.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - js_to_json, - mimetype2ext, - ExtractorError, -) - - -class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)' - - _TESTS = [{ - 'url': 'https://i.imgur.com/A61SaA1.gifv', - 'info_dict': { - 'id': 'A61SaA1', - 'ext': 'mp4', - 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - }, - }, { - 'url': 'https://imgur.com/A61SaA1', - 'only_matching': True, - }, { - 'url': 'https://i.imgur.com/crGpqCV.mp4', - 'only_matching': True, - }, { - # no title - 'url': 'https://i.imgur.com/jxBXAMC.gifv', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) - - width = int_or_none(self._og_search_property( - 'video:width', webpage, default=None)) - height = int_or_none(self._og_search_property( - 'video:height', webpage, default=None)) - - video_elements = self._search_regex( - r'(?s)<div class="video-elements">(.*?)</div>', - webpage, 'video elements', default=None) - if not video_elements: - raise ExtractorError( - 'No sources found for video %s. Maybe an image?' % video_id, - expected=True) - - formats = [] - for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements): - formats.append({ - 'format_id': m.group('type').partition('/')[2], - 'url': self._proto_relative_url(m.group('src')), - 'ext': mimetype2ext(m.group('type')), - 'width': width, - 'height': height, - 'http_headers': { - 'User-Agent': 'youtube-dl (like wget)', - }, - }) - - gif_json = self._search_regex( - r'(?s)var\s+videoItem\s*=\s*(\{.*?\})', - webpage, 'GIF code', fatal=False) - if gif_json: - gifd = self._parse_json( - gif_json, video_id, transform_source=js_to_json) - formats.append({ - 'format_id': 'gif', - 'preference': -10, - 'width': width, - 'height': height, - 'ext': 'gif', - 'acodec': 'none', - 'vcodec': 'gif', - 'container': 'gif', - 'url': self._proto_relative_url(gifd['gifUrl']), - 'filesize': gifd.get('size'), - 'http_headers': { - 'User-Agent': 'youtube-dl (like wget)', - }, - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': self._og_search_title(webpage, default=video_id), - } - - -class ImgurGalleryIE(InfoExtractor): - IE_NAME = 'imgur:gallery' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)' - - _TESTS = [{ - 'url': 'http://imgur.com/gallery/Q95ko', - 'info_dict': { - 'id': 'Q95ko', - 'title': 'Adding faces make every GIF better', - }, - 'playlist_count': 25, - }, { - 'url': 'http://imgur.com/topic/Aww/ll5Vk', - 'only_matching': True, - }, { - 'url': 'https://imgur.com/gallery/YcAQlkx', - 'info_dict': { - 'id': 'YcAQlkx', - 'ext': 'mp4', - 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - } - }, { - 'url': 'http://imgur.com/topic/Funny/N8rOudd', - 'only_matching': True, - }, { - 'url': 'http://imgur.com/r/aww/VQcQPhM', - 'only_matching': True, - }] - - def _real_extract(self, url): - gallery_id = self._match_id(url) - - data = self._download_json( - 'https://imgur.com/gallery/%s.json' % gallery_id, - gallery_id)['data']['image'] - - if data.get('is_album'): - entries = [ - 
self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) - for image in data['album_images']['images'] if image.get('hash')] - return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) - - return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) - - -class ImgurAlbumIE(ImgurGalleryIE): - IE_NAME = 'imgur:album' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' - - _TESTS = [{ - 'url': 'http://imgur.com/a/j6Orj', - 'info_dict': { - 'id': 'j6Orj', - 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', - }, - 'playlist_count': 12, - }] diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py deleted file mode 100644 index 12e10143c..000000000 --- a/youtube_dl/extractor/instagram.py +++ /dev/null @@ -1,474 +0,0 @@ -from __future__ import unicode_literals - -import itertools -import hashlib -import json -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) -from ..utils import ( - ExtractorError, - float_or_none, - get_element_by_attribute, - int_or_none, - lowercase_escape, - std_headers, - try_get, - url_or_none, -) - - -class InstagramIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' - _TESTS = [{ - 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', - 'md5': '0d2da106a9d2631273e192b372806516', - 'info_dict': { - 'id': 'aye83DjauH', - 'ext': 'mp4', - 'title': 'Video by naomipq', - 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, - 'timestamp': 1371748545, - 'upload_date': '20130620', - 'uploader_id': 'naomipq', - 'uploader': 'B E A U T Y F O R A S H E S', - 'like_count': int, - 'comment_count': int, - 'comments': list, - }, - }, { - # missing description - 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', - 'info_dict': { - 'id': 'BA-pQFBG8HZ', - 'ext': 'mp4', - 'title': 'Video by britneyspears', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, - 'timestamp': 1453760977, - 'upload_date': '20160125', - 'uploader_id': 'britneyspears', - 'uploader': 'Britney Spears', - 'like_count': int, - 'comment_count': int, - 'comments': list, - }, - 'params': { - 'skip_download': True, - }, - }, { - # multi video post - 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', - 'playlist': [{ - 'info_dict': { - 'id': 'BQ0dSaohpPW', - 'ext': 'mp4', - 'title': 'Video 1', - }, - }, { - 'info_dict': { - 'id': 'BQ0dTpOhuHT', - 'ext': 'mp4', - 'title': 'Video 2', - }, - }, { - 'info_dict': { - 'id': 'BQ0dT7RBFeF', - 'ext': 'mp4', - 'title': 'Video 3', - }, - }], - 'info_dict': { - 'id': 'BQ0eAlwhDrw', - 'title': 'Post by instagram', - 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', - }, - }, { - # IGTV - 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', - 'info_dict': { - 'id': 'BkfuX9UB-eK', - 'ext': 'mp4', - 'title': 'Fingerboarding Tricks with @cass.fb', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 53.83, - 'timestamp': 1530032919, - 'upload_date': '20180626', - 'uploader_id': 'instagram', - 'uploader': 'Instagram', - 'like_count': int, - 'comment_count': int, - 'comments': list, - 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', - } - }, { - 'url': 'https://instagram.com/p/-Cmh1cukG2/', - 'only_matching': True, - }, { - 'url': 
'http://instagram.com/p/9o6LshA7zy/embed/', - 'only_matching': True, - }, { - 'url': 'https://www.instagram.com/tv/aye83DjauH/', - 'only_matching': True, - }, { - 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', - 'only_matching': True, - }] - - @staticmethod - def _extract_embed_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', - webpage) - if mobj: - return mobj.group('url') - - blockquote_el = get_element_by_attribute( - 'class', 'instagram-media', webpage) - if blockquote_el is None: - return - - mobj = re.search( - r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) - if mobj: - return mobj.group('link') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = mobj.group('url') - - webpage = self._download_webpage(url, video_id) - - (media, video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count, comments, height, - width) = [None] * 12 - - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - webpage, 'shared data', default='{}'), - video_id, fatal=False) - if shared_data: - media = try_get( - shared_data, - (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], - lambda x: x['entry_data']['PostPage'][0]['media']), - dict) - # _sharedData.entry_data.PostPage is empty when authenticated (see - # https://github.com/ytdl-org/youtube-dl/pull/22880) - if not media: - additional_data = self._parse_json( - self._search_regex( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - title = media.get('title') - thumbnail = media.get('display_src') or media.get('display_url') - duration = float_or_none(media.get('video_duration')) - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') - - def get_count(keys, kind): - if not isinstance(keys, (list, tuple)): - keys = [keys] - for key in keys: - count = int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - if count is not None: - return count - like_count = get_count('preview_like', 'like') - comment_count = get_count( - ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') - - comments = [{ - 'author': comment.get('user', {}).get('username'), - 'author_id': comment.get('user', {}).get('id'), - 'id': comment.get('id'), - 'text': comment.get('text'), - 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get( - 'comments', {}).get('nodes', []) if comment.get('text')] - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = 
url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': node.get('title') or 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'duration': float_or_none(node.get('video_duration')), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) - - if not video_url: - video_url = self._og_search_video_url(webpage, secure=False) - - formats = [{ - 'url': video_url, - 'width': width, - 'height': height, - }] - - if not uploader_id: - uploader_id = self._search_regex( - r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', - webpage, 'uploader id', fatal=False) - - if not description: - description = self._search_regex( - r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None) - if description is not None: - description = lowercase_escape(description) - - if not thumbnail: - thumbnail = self._og_search_thumbnail(webpage) - - return { - 'id': video_id, - 'formats': formats, - 'ext': 'mp4', - 'title': title or 'Video by %s' % uploader_id, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'uploader': uploader, - 'like_count': like_count, - 'comment_count': comment_count, - 'comments': comments, - } - - -class InstagramPlaylistIE(InfoExtractor): - # A superclass for handling any kind of query based on GraphQL which - # results in a playlist. - - _gis_tmpl = None # used to cache GIS request type - - def _parse_graphql(self, webpage, item_id): - # Reads a webpage and returns its GraphQL data. - return self._parse_json( - self._search_regex( - r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), - item_id) - - def _extract_graphql(self, data, url): - # Parses GraphQL queries containing videos and generates a playlist. 
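A note on the GIS signing used below: the X-Instagram-GIS header sent with every GraphQL page request is just the hex MD5 digest of the chosen gis template and the JSON-serialized query variables, joined by a colon. A minimal standalone sketch of that computation (the template value is the fallback rhx_gis constant from the code above; the id in the variables is illustrative):

    import hashlib
    import json

    def instagram_gis(gis_tmpl, variables):
        # X-Instagram-GIS = md5("<gis template>:<JSON-encoded query variables>")
        return hashlib.md5(
            ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest()

    variables = json.dumps({'id': '12345', 'first': 12, 'after': ''})
    print(instagram_gis('3c7ca9dcefcf966d11dacf1f151335e8', variables))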
- def get_count(suffix): - return int_or_none(try_get( - node, lambda x: x['edge_media_' + suffix]['count'])) - - uploader_id = self._match_id(url) - csrf_token = data['config']['csrf_token'] - rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' - - cursor = '' - for page_num in itertools.count(1): - variables = { - 'first': 12, - 'after': cursor, - } - variables.update(self._query_vars_for(data)) - variables = json.dumps(variables) - - if self._gis_tmpl: - gis_tmpls = [self._gis_tmpl] - else: - gis_tmpls = [ - '%s' % rhx_gis, - '', - '%s:%s' % (rhx_gis, csrf_token), - '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), - ] - - # try all of the ways to generate a GIS query, and not only use the - # first one that works, but cache it for future requests - for gis_tmpl in gis_tmpls: - try: - json_data = self._download_json( - 'https://www.instagram.com/graphql/query/', uploader_id, - 'Downloading JSON page %d' % page_num, headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-Instagram-GIS': hashlib.md5( - ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), - }, query={ - 'query_hash': self._QUERY_HASH, - 'variables': variables, - }) - media = self._parse_timeline_from(json_data) - self._gis_tmpl = gis_tmpl - break - except ExtractorError as e: - # if it's an error caused by a bad query, and there are - # more GIS templates to try, ignore it and keep trying - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - if gis_tmpl != gis_tmpls[-1]: - continue - raise - - edges = media.get('edges') - if not edges or not isinstance(edges, list): - break - - for edge in edges: - node = edge.get('node') - if not node or not isinstance(node, dict): - continue - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info - - page_info = media.get('page_info') - if not page_info or not isinstance(page_info, dict): - break - - has_next_page = page_info.get('has_next_page') - if not has_next_page: - break - - cursor = page_info.get('end_cursor') - if not cursor or not isinstance(cursor, compat_str): - break - - def _real_extract(self, url): - user_or_tag = self._match_id(url) - webpage = self._download_webpage(url, user_or_tag) - data = self._parse_graphql(webpage, user_or_tag) - - self._set_cookie('instagram.com', 'ig_pr', '1') - - return self.playlist_result( - self._extract_graphql(data, url), user_or_tag, user_or_tag) - - -class InstagramUserIE(InstagramPlaylistIE): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' - IE_DESC = 'Instagram user profile' - IE_NAME = 'instagram:user' - _TEST = { - 'url': 'https://instagram.com/porsche', - 'info_dict': { - 'id': 'porsche', - 'title': 'porsche', - }, - 'playlist_count': 5, - 'params': { 
- 'extract_flat': True, - 'skip_download': True, - 'playlistend': 5, - } - } - - _QUERY_HASH = '42323d64886122307be10013ad2dcc44', - - @staticmethod - def _parse_timeline_from(data): - # extracts the media timeline data from a GraphQL result - return data['data']['user']['edge_owner_to_timeline_media'] - - @staticmethod - def _query_vars_for(data): - # returns a dictionary of variables to add to the timeline query based - # on the GraphQL of the original page - return { - 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] - } - - -class InstagramTagIE(InstagramPlaylistIE): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' - IE_DESC = 'Instagram hashtag search' - IE_NAME = 'instagram:tag' - _TEST = { - 'url': 'https://instagram.com/explore/tags/lolcats', - 'info_dict': { - 'id': 'lolcats', - 'title': 'lolcats', - }, - 'playlist_count': 50, - 'params': { - 'extract_flat': True, - 'skip_download': True, - 'playlistend': 50, - } - } - - _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', - - @staticmethod - def _parse_timeline_from(data): - # extracts the media timeline data from a GraphQL result - return data['data']['hashtag']['edge_hashtag_to_media'] - - @staticmethod - def _query_vars_for(data): - # returns a dictionary of variables to add to the timeline query based - # on the GraphQL of the original page - return { - 'tag_name': - data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] - } diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py deleted file mode 100644 index 59b0a90c3..000000000 --- a/youtube_dl/extractor/internetvideoarchive.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) - - -class InternetVideoArchiveIE(InfoExtractor): - _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' - - _TEST = { - 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', - 'info_dict': { - 'id': '194487', - 'ext': 'mp4', - 'title': 'Kick-Ass 2', - 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - @staticmethod - def _build_json_url(query): - return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' 
+ query - - def _real_extract(self, url): - query = compat_parse_qs(compat_urlparse.urlparse(url).query) - video_id = query['publishedid'][0] - data = self._download_json( - 'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx', - video_id, data=json.dumps({ - 'customerid': query['customerid'][0], - 'publishedid': video_id, - }).encode()) - title = data['Title'] - formats = self._extract_m3u8_formats( - data['VideoUrl'], video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - file_url = formats[0]['url'] - if '.ism/' in file_url: - replace_url = lambda x: re.sub(r'\.ism/[^?]+', '.ism/' + x, file_url) - formats.extend(self._extract_f4m_formats( - replace_url('.f4m'), video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': data.get('PosterUrl'), - 'description': data.get('Description'), - } diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py deleted file mode 100644 index 648ae6741..000000000 --- a/youtube_dl/extractor/iprima.py +++ /dev/null @@ -1,149 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import time - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - js_to_json, -) - - -class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _GEO_BYPASS = False - - _TESTS = [{ - 'url': 'https://prima.iprima.cz/particka/92-epizoda', - 'info_dict': { - 'id': 'p51388', - 'ext': 'mp4', - 'title': 'Partička (92)', - 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'https://cnn.iprima.cz/videa/70-epizoda', - 'info_dict': { - 'id': 'p681554', - 'ext': 'mp4', - 'title': 'HLAVNÍ ZPRÁVY 3.5.2020', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'http://play.iprima.cz/particka/particka-92', - 'only_matching': True, - }, { - # geo restricted - 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', - 'only_matching': True, - }, { - # iframe api.play-backend.iprima.cz - 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', - 'only_matching': True, - }, { - # iframe prima.iprima.cz - 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', - 'only_matching': True, - }, { - 'url': 'http://www.iprima.cz/filmy/desne-rande', - 'only_matching': True, - }, { - 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby', - 'only_matching': True, - }, { - 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy', - 'only_matching': True, - }, { - 'url': 'https://cool.iprima.cz/derava-silnice-nevadi', - 'only_matching': True, - }, { - 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', - 'only_matching': True, - }, { - 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1') - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title( - webpage, default=None) or self._search_regex( - r'<h1>([^<]+)', webpage, 'title') - - video_id = 
self._search_regex( - (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)', - r'data-product="([^"]+)">', - r'id=["\']player-(p\d+)"', - r'playerId\s*:\s*["\']player-(p\d+)', - r'\bvideos\s*=\s*["\'](p\d+)'), - webpage, 'real id') - - playerpage = self._download_webpage( - 'http://play.iprima.cz/prehravac/init', - video_id, note='Downloading player', query={ - '_infuse': 1, - '_ts': round(time.time()), - 'productId': video_id, - }, headers={'Referer': url}) - - formats = [] - - def extract_formats(format_url, format_key=None, lang=None): - ext = determine_ext(format_url) - new_formats = [] - if format_key == 'hls' or ext == 'm3u8': - new_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - elif format_key == 'dash' or ext == 'mpd': - return - new_formats = self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False) - if lang: - for f in new_formats: - if not f.get('language'): - f['language'] = lang - formats.extend(new_formats) - - options = self._parse_json( - self._search_regex( - r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]', - playerpage, 'player options', default='{}'), - video_id, transform_source=js_to_json, fatal=False) - if options: - for key, tracks in options.get('tracks', {}).items(): - if not isinstance(tracks, list): - continue - for track in tracks: - src = track.get('src') - if src: - extract_formats(src, key.lower(), track.get('lang')) - - if not formats: - for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): - extract_formats(src) - - if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage: - self.raise_geo_restricted(countries=['CZ']) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'formats': formats, - 'description': self._og_search_description(webpage, default=None), - } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py deleted file mode 100644 index 6df521426..000000000 --- a/youtube_dl/extractor/iqiyi.py +++ /dev/null @@ -1,219 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import itertools -import re -import time - -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - clean_html, - get_element_by_id, - get_element_by_attribute, - ExtractorError, - ohdave_rsa_encrypt, - remove_start, -) - - -def md5_text(text): - return hashlib.md5(text.encode('utf-8')).hexdigest() - - -class IqiyiIE(InfoExtractor): - IE_NAME = 'iqiyi' - IE_DESC = '爱奇艺' - - _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' - - _NETRC_MACHINE = 'iqiyi' - - _TESTS = [{ - 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - # MD5 checksum differs on my machine and Travis CI - 'info_dict': { - 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', - 'ext': 'mp4', - 'title': '美国德州空中惊现奇异云团 酷似UFO', - } - }, { - 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', - 'md5': 'b7dc800a4004b1b57749d9abae0472da', - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb', - 'ext': 'mp4', - # This can be either Simplified Chinese or Traditional Chinese - 'title': r're:^(?:名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇|名偵探柯南 國語版:第752集 迫近灰原秘密的黑影 下篇)$', - }, - 'skip': 'Geo-restricted to China', - }, { - 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', - 'only_matching': True, - }, { - 'url': 
'http://www.iqiyi.com/a_19rrhbc6kt.html', - 'only_matching': True, - }, { - 'url': 'http://yule.iqiyi.com/pcb.html', - 'info_dict': { - 'id': '4a0af228fddb55ec96398a364248ed7f', - 'ext': 'mp4', - 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰', - }, - }, { - # VIP-only video. The first 2 parts (6 minutes) are available without login - # MD5 sums omitted as values are different on Travis CI and my machine - 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html', - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1', - 'ext': 'mp4', - 'title': '泰坦尼克号', - }, - 'skip': 'Geo-restricted to China', - }, { - 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', - 'info_dict': { - 'id': '202918101', - 'title': '灌篮高手 国语版', - }, - 'playlist_count': 101, - }, { - 'url': 'http://www.pps.tv/w_19rrbav0ph.html', - 'only_matching': True, - }] - - _FORMATS_MAP = { - '96': 1, # 216p, 240p - '1': 2, # 336p, 360p - '2': 3, # 480p, 504p - '21': 4, # 504p - '4': 5, # 720p - '17': 5, # 720p - '5': 6, # 1072p, 1080p - '18': 7, # 1080p - } - - def _real_initialize(self): - self._login() - - @staticmethod - def _rsa_fun(data): - # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js - N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd - e = 65537 - - return ohdave_rsa_encrypt(data, e, N) - - def _login(self): - raise ExtractorError("iQiyi's non-free authentication algorithm has made login impossible", expected=True) - - def get_raw_data(self, tvid, video_id): - tm = int(time.time() * 1000) - - key = 'd5fb4bd9d50c4be6948c97edd7254b0e' - sc = md5_text(compat_str(tm) + key + tvid) - params = { - 'tvid': tvid, - 'vid': video_id, - 'src': '76f90cbd92f94a2e925d83e8ccd22cb7', - 'sc': sc, - 't': tm, - } - - return self._download_json( - 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), - video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), - query=params, headers=self.geo_verification_headers()) - - def _extract_playlist(self, webpage): - PAGE_SIZE = 50 - - links = re.findall( - r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"', - webpage) - if not links: - return - - album_id = self._search_regex( - r'albumId\s*:\s*(\d+),', webpage, 'album ID') - album_title = self._search_regex( - r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False) - - entries = list(map(self.url_result, links)) - - # Start from 2 because links in the first page are already on webpage - for page_num in itertools.count(2): - pagelist_page = self._download_webpage( - 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE), - album_id, - note='Download playlist page %d' % page_num, - errnote='Failed to download playlist page %d' % page_num) - pagelist = self._parse_json( - remove_start(pagelist_page, 'var tvInfoJs='), album_id) - vlist = pagelist['data']['vlist'] - for item in vlist: - entries.append(self.url_result(item['vurl'])) - if len(vlist) < PAGE_SIZE: - break - - return self.playlist_result(entries, album_id, album_title) - - def _real_extract(self, url): - webpage = self._download_webpage( - url, 'temp_id', note='download video page') - - # There's no simple way to determine whether an URL is a playlist or not - # Sometimes there are playlist links in individual videos, so treat it - # as a single video first - tvid = self._search_regex( - r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None) - if 
tvid is None: - playlist_result = self._extract_playlist(webpage) - if playlist_result: - return playlist_result - raise ExtractorError('Can\'t find any video') - - video_id = self._search_regex( - r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - - formats = [] - for _ in range(5): - raw_data = self.get_raw_data(tvid, video_id) - - if raw_data['code'] != 'A00000': - if raw_data['code'] == 'A00111': - self.raise_geo_restricted() - raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) - - data = raw_data['data'] - - for stream in data['vidl']: - if 'm3utx' not in stream: - continue - vd = compat_str(stream['vd']) - formats.append({ - 'url': stream['m3utx'], - 'format_id': vd, - 'ext': 'mp4', - 'preference': self._FORMATS_MAP.get(vd, -1), - 'protocol': 'm3u8_native', - }) - - if formats: - break - - self._sleep(5, video_id) - - self._sort_formats(formats) - title = (get_element_by_id('widget-videotitle', webpage) - or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) - or self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py deleted file mode 100644 index e86c40b42..000000000 --- a/youtube_dl/extractor/itv.py +++ /dev/null @@ -1,185 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from .brightcove import BrightcoveNewIE -from ..utils import ( - clean_html, - determine_ext, - extract_attributes, - get_element_by_class, - JSON_LD_RE, - merge_dicts, - parse_duration, - smuggle_url, - url_or_none, -) - - -class ITVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' - _GEO_COUNTRIES = ['GB'] - _TESTS = [{ - 'url': 'https://www.itv.com/hub/liar/2a4547a0012', - 'info_dict': { - 'id': '2a4547a0012', - 'ext': 'mp4', - 'title': 'Liar - Series 2 - Episode 6', - 'description': 'md5:d0f91536569dec79ea184f0a44cca089', - 'series': 'Liar', - 'season_number': 2, - 'episode_number': 6, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # unavailable via data-playlist-url - 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', - 'only_matching': True, - }, { - # InvalidVodcrid - 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', - 'only_matching': True, - }, { - # ContentUnavailable - 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - params = extract_attributes(self._search_regex( - r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) - - ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] - hmac = params['data-video-hmac'] - headers = self.geo_verification_headers() - headers.update({ - 'Accept': 'application/vnd.itv.vod.playlist.v2+json', - 'Content-Type': 'application/json', - 'hmac': hmac.upper(), - }) - ios_playlist = self._download_json( - ios_playlist_url, video_id, data=json.dumps({ - 'user': { - 'itvUserId': '', - 'entitlements': [], - 'token': '' - }, - 'device': { - 'manufacturer': 'Safari', - 'model': '5', - 'os': { - 'name': 'Windows NT', - 'version': '6.1', - 'type': 'desktop' - } - }, - 'client': { - 'version': '4.1', - 'id': 'browser' - }, 
- 'variantAvailability': { - 'featureset': { - 'min': ['hls', 'aes', 'outband-webvtt'], - 'max': ['hls', 'aes', 'outband-webvtt'] - }, - 'platformTag': 'dotcom' - } - }).encode(), headers=headers) - video_data = ios_playlist['Playlist']['Video'] - ios_base_url = video_data.get('Base') - - formats = [] - for media_file in (video_data.get('MediaFiles') or []): - href = media_file.get('Href') - if not href: - continue - if ios_base_url: - href = ios_base_url + href - ext = determine_ext(href) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': href, - }) - self._sort_formats(formats) - - subtitles = {} - subs = video_data.get('Subtitles') or [] - for sub in subs: - if not isinstance(sub, dict): - continue - href = url_or_none(sub.get('Href')) - if not href: - continue - subtitles.setdefault('en', []).append({ - 'url': href, - 'ext': determine_ext(href, 'vtt'), - }) - - info = self._search_json_ld(webpage, video_id, default={}) - if not info: - json_ld = self._parse_json(self._search_regex( - JSON_LD_RE, webpage, 'JSON-LD', '{}', - group='json_ld'), video_id, fatal=False) - if json_ld and json_ld.get('@type') == 'BreadcrumbList': - for ile in (json_ld.get('itemListElement:') or []): - item = ile.get('item:') or {} - if item.get('@type') == 'TVEpisode': - item['@context'] = 'http://schema.org' - info = self._json_ld(item, video_id, fatal=False) or {} - break - - return merge_dicts({ - 'id': video_id, - 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), - 'formats': formats, - 'subtitles': subtitles, - 'duration': parse_duration(video_data.get('Duration')), - 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), - }, info) - - -class ITVBTCCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', - 'info_dict': { - 'id': 'btcc-2018-all-the-action-from-brands-hatch', - 'title': 'BTCC 2018: All the action from Brands Hatch', - }, - 'playlist_mincount': 9, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { - # ITV does not like some GB IP ranges, so here are some - # IP blocks it accepts - 'geo_ip_blocks': [ - '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' - ], - 'referrer': url, - }), - ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] - - title = self._og_search_title(webpage, fatal=False) - - return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py deleted file mode 100644 index b5a740a01..000000000 --- a/youtube_dl/extractor/ivi.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re -import sys - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - qualities, -) - - -class IviIE(InfoExtractor): - IE_DESC = 'ivi.ru' - IE_NAME = 'ivi' - _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' - _GEO_BYPASS 
= False - _GEO_COUNTRIES = ['RU'] - _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' - _LIGHT_URL = 'https://api.ivi.ru/light/' - - _TESTS = [ - # Single movie - { - 'url': 'http://www.ivi.ru/watch/53141', - 'md5': '6ff5be2254e796ed346251d117196cf4', - 'info_dict': { - 'id': '53141', - 'ext': 'mp4', - 'title': 'Иван Васильевич меняет профессию', - 'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', - 'duration': 5498, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Only works from Russia', - }, - # Serial's series - { - 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549', - 'md5': '221f56b35e3ed815fde2df71032f4b3e', - 'info_dict': { - 'id': '9549', - 'ext': 'mp4', - 'title': 'Двое из ларца - Дело Гольдберга (1 часть)', - 'series': 'Двое из ларца', - 'season': 'Сезон 1', - 'season_number': 1, - 'episode': 'Дело Гольдберга (1 часть)', - 'episode_number': 1, - 'duration': 2655, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Only works from Russia', - }, - { - # with MP4-HD720 format - 'url': 'http://www.ivi.ru/watch/146500', - 'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e', - 'info_dict': { - 'id': '146500', - 'ext': 'mp4', - 'title': 'Кукла', - 'description': 'md5:ffca9372399976a2d260a407cc74cce6', - 'duration': 5599, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Only works from Russia', - }, - { - 'url': 'https://www.ivi.tv/watch/33560/', - 'only_matching': True, - }, - ] - - # Sorted by quality - _KNOWN_FORMATS = ( - 'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', - 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') - - def _real_extract(self, url): - video_id = self._match_id(url) - - data = json.dumps({ - 'method': 'da.content.get', - 'params': [ - video_id, { - 'site': 's%d', - 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, - 'contentid': video_id - } - ] - }) - - bundled = hasattr(sys, 'frozen') - - for site in (353, 183): - content_data = (data % site).encode() - if site == 353: - if bundled: - continue - try: - from Cryptodome.Cipher import Blowfish - from Cryptodome.Hash import CMAC - pycryptodomex_found = True - except ImportError: - pycryptodomex_found = False - continue - - timestamp = (self._download_json( - self._LIGHT_URL, video_id, - 'Downloading timestamp JSON', data=json.dumps({ - 'method': 'da.timestamp.get', - 'params': [] - }).encode(), fatal=False) or {}).get('result') - if not timestamp: - continue - - query = { - 'ts': timestamp, - 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(), - } - else: - query = {} - - video_json = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading video JSON', data=content_data, query=query) - - error = video_json.get('error') - if error: - origin = error.get('origin') - message = error.get('message') or error.get('user_message') - extractor_msg = 'Unable to download video %s' - if origin == 'NotAllowedForLocation': - self.raise_geo_restricted(message, self._GEO_COUNTRIES) - elif origin == 'NoRedisValidData': - extractor_msg = 'Video %s does not exist' - elif site == 353: - continue - elif bundled: - raise ExtractorError( - 'This feature does not work from bundled exe. Run youtube-dl from sources.', - expected=True) - elif not pycryptodomex_found: - raise ExtractorError( - 'pycryptodomex not found. 
Please install it.', - expected=True) - elif message: - extractor_msg += ': ' + message - raise ExtractorError(extractor_msg % video_id, expected=True) - else: - break - - result = video_json['result'] - title = result['title'] - - quality = qualities(self._KNOWN_FORMATS) - - formats = [] - for f in result.get('files', []): - f_url = f.get('url') - content_format = f.get('content_format') - if not f_url or '-MDRM-' in content_format or '-FPS-' in content_format: - continue - formats.append({ - 'url': f_url, - 'format_id': content_format, - 'quality': quality(content_format), - 'filesize': int_or_none(f.get('size_in_bytes')), - }) - self._sort_formats(formats) - - compilation = result.get('compilation') - episode = title if compilation else None - - title = '%s - %s' % (compilation, title) if compilation is not None else title - - thumbnails = [{ - 'url': preview['url'], - 'id': preview.get('content_format'), - } for preview in result.get('preview', []) if preview.get('url')] - - webpage = self._download_webpage(url, video_id) - - season = self._search_regex( - r'<li[^>]+class="season active"[^>]*><a[^>]+>([^<]+)', - webpage, 'season', default=None) - season_number = int_or_none(self._search_regex( - r'<li[^>]+class="season active"[^>]*><a[^>]+data-season(?:-index)?="(\d+)"', - webpage, 'season number', default=None)) - - episode_number = int_or_none(self._search_regex( - r'[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)', - webpage, 'episode number', default=None)) - - description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description', default=None) - - return { - 'id': video_id, - 'title': title, - 'series': compilation, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'thumbnails': thumbnails, - 'description': description, - 'duration': int_or_none(result.get('duration')), - 'formats': formats, - } - - -class IviCompilationIE(InfoExtractor): - IE_DESC = 'ivi.ru compilations' - IE_NAME = 'ivi:compilation' - _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' - _TESTS = [{ - 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa', - 'info_dict': { - 'id': 'dvoe_iz_lartsa', - 'title': 'Двое из ларца (2006 - 2008)', - }, - 'playlist_mincount': 24, - }, { - 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/season1', - 'info_dict': { - 'id': 'dvoe_iz_lartsa/season1', - 'title': 'Двое из ларца (2006 - 2008) 1 сезон', - }, - 'playlist_mincount': 12, - }] - - def _extract_entries(self, html, compilation_id): - return [ - self.url_result( - 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) - for serie in re.findall( - r'<a\b[^>]+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - compilation_id = mobj.group('compilationid') - season_id = mobj.group('seasonid') - - if season_id is not None: # Season link - season_page = self._download_webpage( - url, compilation_id, 'Downloading season %s web page' % season_id) - playlist_id = '%s/season%s' % (compilation_id, season_id) - playlist_title = self._html_search_meta('title', season_page, 'title') - entries = self._extract_entries(season_page, compilation_id) - else: # Compilation link - compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page') - playlist_id = compilation_id - playlist_title 
= self._html_search_meta('title', compilation_page, 'title') - seasons = re.findall( - r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page) - if not seasons: # No seasons in this compilation - entries = self._extract_entries(compilation_page, compilation_id) - else: - entries = [] - for season_id in seasons: - season_page = self._download_webpage( - 'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), - compilation_id, 'Downloading season %s web page' % season_id) - entries.extend(self._extract_entries(season_page, compilation_id)) - - return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py deleted file mode 100644 index 3ca824f79..000000000 --- a/youtube_dl/extractor/ivideon.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) -from ..utils import qualities - - -class IvideonIE(InfoExtractor): - IE_NAME = 'ivideon' - IE_DESC = 'Ivideon TV' - _VALID_URL = r'https?://(?:www\.)?ivideon\.com/tv/(?:[^/]+/)*camera/(?P<id>\d+-[\da-f]+)/(?P<camera_id>\d+)' - _TESTS = [{ - 'url': 'https://www.ivideon.com/tv/camera/100-916ca13b5c4ad9f564266424a026386d/0/', - 'info_dict': { - 'id': '100-916ca13b5c4ad9f564266424a026386d', - 'ext': 'flv', - 'title': 're:^Касса [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'Основное предназначение - запись действий кассиров. Плюс общий вид.', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru', - 'only_matching': True, - }, { - 'url': 'https://www.ivideon.com/tv/map/22.917923/-31.816406/16/camera/100-e7bc16c7d4b5bbd633fd5350b66dfa9a/0', - 'only_matching': True, - }] - - _QUALITIES = ('low', 'mid', 'hi') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - server_id, camera_id = mobj.group('id'), mobj.group('camera_id') - camera_name, description = None, None - camera_url = compat_urlparse.urljoin( - url, '/tv/camera/%s/%s/' % (server_id, camera_id)) - - webpage = self._download_webpage(camera_url, server_id, fatal=False) - if webpage: - config_string = self._search_regex( - r'var\s+config\s*=\s*({.+?});', webpage, 'config', default=None) - if config_string: - config = self._parse_json(config_string, server_id, fatal=False) - camera_info = config.get('ivTvAppOptions', {}).get('currentCameraInfo') - if camera_info: - camera_name = camera_info.get('camera_name') - description = camera_info.get('misc', {}).get('description') - if not camera_name: - camera_name = self._html_search_meta( - 'name', webpage, 'camera name', default=None) or self._search_regex( - r'<h1[^>]+class="b-video-title"[^>]*>([^<]+)', webpage, 'camera name', default=None) - - quality = qualities(self._QUALITIES) - - formats = [{ - 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({ - 'server': server_id, - 'camera': camera_id, - 'sessionId': 'demo', - 'q': quality(format_id), - }), - 'format_id': format_id, - 'ext': 'flv', - 'quality': quality(format_id), - } for format_id in self._QUALITIES] - self._sort_formats(formats) - - return { - 'id': server_id, - 'title': self._live_title(camera_name or server_id), - 'description': description, - 'is_live': True, - 'formats': formats, - } diff --git a/youtube_dl/extractor/iwara.py 
b/youtube_dl/extractor/iwara.py
deleted file mode 100644
index 907d5fc8b..000000000
--- a/youtube_dl/extractor/iwara.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlparse
-from ..utils import (
-    int_or_none,
-    mimetype2ext,
-    remove_end,
-    url_or_none,
-)
-
-
-class IwaraIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)'
-    _TESTS = [{
-        'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD',
-        # md5 is unstable
-        'info_dict': {
-            'id': 'amVwUl1EHpAD9RD',
-            'ext': 'mp4',
-            'title': '【MMD R-18】ガールフレンド carry_me_off',
-            'age_limit': 18,
-        },
-    }, {
-        'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
-        'md5': '7e5f1f359cd51a027ba4a7b7710a50f0',
-        'info_dict': {
-            'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc',
-            'ext': 'mp4',
-            'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4',
-            'age_limit': 18,
-        },
-        'add_ie': ['GoogleDrive'],
-    }, {
-        'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq',
-        # md5 is unstable
-        'info_dict': {
-            'id': '6liAP9s2Ojc',
-            'ext': 'mp4',
-            'age_limit': 18,
-            'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)',
-            'description': 'md5:590c12c0df1443d833fbebe05da8c47a',
-            'upload_date': '20160910',
-            'uploader': 'aMMDsork',
-            'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A',
-        },
-        'add_ie': ['Youtube'],
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage, urlh = self._download_webpage_handle(url, video_id)
-
-        hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname
-        # ecchi is 'sexy' in Japanese
-        age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0
-
-        video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id)
-
-        if not video_data:
-            iframe_url = self._html_search_regex(
-                r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1',
-                webpage, 'iframe URL', group='url')
-            return {
-                '_type': 'url_transparent',
-                'url': iframe_url,
-                'age_limit': age_limit,
-            }
-
-        title = remove_end(self._html_search_regex(
-            r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
-
-        formats = []
-        for a_format in video_data:
-            format_uri = url_or_none(a_format.get('uri'))
-            if not format_uri:
-                continue
-            format_id = a_format.get('resolution')
-            height = int_or_none(self._search_regex(
-                r'(\d+)p', format_id, 'height', default=None))
-            formats.append({
-                'url': self._proto_relative_url(format_uri, 'https:'),
-                'format_id': format_id,
-                'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
-                'height': height,
-                'width': int_or_none(height / 9.0 * 16.0 if height else None),
-                'quality': 1 if format_id == 'Source' else 0,
-            })
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'age_limit': age_limit,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
deleted file mode 100644
index e9f4ed738..000000000
--- a/youtube_dl/extractor/jeuxvideo.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class JeuxVideoIE(InfoExtractor):
-    _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
-
-    _TESTS = [{
-        'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
-        'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
-        'info_dict': {
-            'id': '114765',
-            'ext': 'mp4',
-            'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité',
-            'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',
-        },
-    }, {
-        'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        title = mobj.group(1)
-        webpage = self._download_webpage(url, title)
-        title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
-        config_url = self._html_search_regex(
-            r'data-src(?:set-video)?="(/contenu/medias/video\.php.*?)"',
-            webpage, 'config URL')
-        config_url = 'http://www.jeuxvideo.com' + config_url
-
-        video_id = self._search_regex(
-            r'id=(\d+)',
-            config_url, 'video ID')
-
-        config = self._download_json(
-            config_url, title, 'Downloading JSON config')
-
-        formats = [{
-            'url': source['file'],
-            'format_id': source['label'],
-            'resolution': source['label'],
-        } for source in reversed(config['sources'])]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': self._og_search_description(webpage),
-            'thumbnail': config.get('image'),
-        }
diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py
deleted file mode 100644
index 62b28e980..000000000
--- a/youtube_dl/extractor/joj.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    int_or_none,
-    js_to_json,
-    try_get,
-)
-
-
-class JojIE(InfoExtractor):
-    _VALID_URL = r'''(?x)
-                    (?:
-                        joj:|
-                        https?://media\.joj\.sk/embed/
-                    )
-                    (?P<id>[^/?#^]+)
-                '''
-    _TESTS = [{
-        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
-        'info_dict': {
-            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
-            'ext': 'mp4',
-            'title': 'NOVÉ BÝVANIE',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 3118,
-        }
-    }, {
-        'url': 'https://media.joj.sk/embed/9i1cxv',
-        'only_matching': True,
-    }, {
-        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
-        'only_matching': True,
-    }, {
-        'url': 'joj:9i1cxv',
-        'only_matching': True,
-    }]
-
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
-                webpage)]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(
-            'https://media.joj.sk/embed/%s' % video_id, video_id)
-
-        title = self._search_regex(
-            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
-             r'<title>(?P<title>[^<]+)'), webpage, 'title',
-            default=None, group='title') or self._og_search_title(webpage)
-
-        bitrates = self._parse_json(
-            self._search_regex(
-                r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
-                default='{}'),
-            video_id, transform_source=js_to_json, fatal=False)
-
-        formats = []
-        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
-            if isinstance(format_url, compat_str):
-                height = self._search_regex(
-                    r'(\d+)[pP]\.', format_url, 'height', default=None)
-                formats.append({
-                    'url': format_url,
-                    'format_id': '%sp' % height if height else None,
-                    'height': int_or_none(height),
-                })
-        if not formats:
-            playlist = self._download_xml(
-                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
-                video_id)
-            for file_el in playlist.findall('./files/file'):
-                path = file_el.get('path')
-                if not path:
-                    continue
-                format_id = file_el.get('id') or file_el.get('label')
-                formats.append({
-                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
-                        'dat/', '', 1),
-                    'format_id': format_id,
-                    'height': int_or_none(self._search_regex(
-                        r'(\d+)[pP]', format_id or path, 'height',
-                        default=None)),
-                })
-        self._sort_formats(formats)
-
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        duration = int_or_none(self._search_regex(
-            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-        }
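A note on the height lookup in JojIE above: `_search_regex(..., default=None)` returns None whenever no `<digits>p` token occurs in the format URL, and a bare `int(None)` raises TypeError, which is why the None-safe `int_or_none` cast is needed there. Below is a minimal, self-contained sketch of that guard; `int_or_none` here is a simplified stand-in for `youtube_dl.utils.int_or_none` (the real one also takes scale/default arguments), and the example.com URLs are purely illustrative:

import re


def int_or_none(v):
    # Simplified stand-in for youtube_dl.utils.int_or_none: coerce to
    # int, returning None for None or garbage instead of raising.
    try:
        return int(v)
    except (TypeError, ValueError):
        return None


def height_from_format_url(format_url):
    # Mirrors the r'(\d+)[pP]\.' lookup in JojIE; the regex may find
    # nothing, in which case the height must stay None.
    mobj = re.search(r'(\d+)[pP]\.', format_url)
    return int_or_none(mobj.group(1) if mobj else None)


assert height_from_format_url('https://example.com/clip-720p.mp4') == 720
assert height_from_format_url('https://example.com/clip.mp4') is None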
diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py
deleted file mode 100644
index 27e0e37f6..000000000
--- a/youtube_dl/extractor/jove.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    unified_strdate
-)
-
-
-class JoveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
-    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
-    _TESTS = [
-        {
-            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
-            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
-            'info_dict': {
-                'id': '2744',
-                'ext': 'mp4',
-                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
-                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
-                'thumbnail': r're:^https?://.*\.png$',
-                'upload_date': '20110523',
-            }
-        },
-        {
-            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
-            'md5': '914aeb356f416811d911996434811beb',
-            'info_dict': {
-                'id': '51796',
-                'ext': 'mp4',
-                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
-                'description': 'md5:35ff029261900583970c4023b70f1dc9',
-                'thumbnail': r're:^https?://.*\.png$',
-                'upload_date': '20140802',
-            }
-        },
-
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        webpage = self._download_webpage(url, video_id)
-
-        chapters_id = self._html_search_regex(
-            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
-
-        chapters_xml = self._download_xml(
-            self._CHAPTERS_URL.format(video_id=chapters_id),
-            video_id, note='Downloading chapters XML',
-            errnote='Failed to download chapters XML')
-
-        video_url = chapters_xml.attrib.get('video')
-        if not video_url:
-            raise ExtractorError('Failed to get the video URL')
-
-        title = self._html_search_meta('citation_title', webpage, 'title')
-        thumbnail = self._og_search_thumbnail(webpage)
-        description = self._html_search_regex(
-            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
-            webpage, 'description', fatal=False)
-        publish_date = unified_strdate(self._html_search_meta(
-            'citation_publication_date', webpage, 'publish date', fatal=False))
-        comment_count = int(self._html_search_regex(
-            r'<meta name="num_comments" content="(\d+) Comments?"',
-            webpage, 'comment count', fatal=False))
-
-        return {
-            'id': video_id,
-            'title': title,
-            'url': video_url,
-            'thumbnail': thumbnail,
-            'description': description,
-            'upload_date': publish_date,
-            'comment_count': comment_count,
-        }
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py
deleted file mode 100644
index c34b5f5e6..000000000
--- a/youtube_dl/extractor/jwplatform.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import unsmuggle_url
-
-
-class JWPlatformIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
-    _TESTS = [{
-        'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
-        'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
-        'info_dict': {
-            'id': 'nPripu9l',
-            'ext': 'mov',
-            'title': 'Big Buck Bunny Trailer',
-            'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
-            'upload_date': '20081127',
-            'timestamp': 1227796140,
-        }
-    }, {
-        'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js',
-        'only_matching': True,
-    }]
-
-    @staticmethod
-    def _extract_url(webpage):
-        urls = JWPlatformIE._extract_urls(webpage)
-        return urls[0] if urls else None
-
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})',
-            webpage)
-
-    def _real_extract(self, url):
-        url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass({
-            'countries': smuggled_data.get('geo_countries'),
-        })
-        video_id = self._match_id(url)
-        json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id)
-        return self._parse_jwplayer_data(json_data, video_id)
diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py
deleted file mode 100644
index 31ce7a85c..000000000
--- a/youtube_dl/extractor/kakao.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_HTTPError
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    str_or_none,
-    strip_or_none,
-    try_get,
-    unified_timestamp,
-    update_url_query,
-)
-
-
-class KakaoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P<id>\d+|[^?#&]+@my)'
-    _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/'
-
-    _TESTS = [{
-        'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083',
-        'md5': '702b2fbdeb51ad82f5c904e8c0766340',
-        'info_dict': {
-            'id': '301965083',
-            'ext': 'mp4',
-            'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
-            'uploader_id': '2671005',
-            'uploader': '그랑그랑이',
-            'timestamp': 1488160199,
-            'upload_date': '20170227',
-        }
-    }, {
-        'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180',
-        'md5': 'a8917742069a4dd442516b86e7d66529',
-        'info_dict': {
-            'id': '300103180',
-            'ext': 'mp4',
-            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
-            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
-            'uploader_id': '2653210',
-            'uploader': '쇼! 음악중심',
-            'timestamp': 1485684628,
-            'upload_date': '20170129',
-        }
-    }, {
-        # geo restricted
-        'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        # str.rstrip strips a character set, not a suffix: rstrip('@my')
-        # would also eat trailing '@', 'm' or 'y' characters from plain
-        # IDs, so strip the literal '@my' suffix instead
-        display_id = video_id[:-len('@my')] if video_id.endswith('@my') else video_id
-        api_base = self._API_BASE_TMPL % video_id
-
-        player_header = {
-            'Referer': update_url_query(
-                'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, {
-                    'service': 'kakao_tv',
-                    'autoplay': '1',
-                    'profile': 'HIGH',
-                    'wmode': 'transparent',
-                })
-        }
-
-        query = {
-            'player': 'monet_html5',
-            'referer': url,
-            'uuid': '',
-            'service': 'kakao_tv',
-            'section': '',
-            'dteType': 'PC',
-            'fields': ','.join([
-                '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title',
-                'description', 'channelId', 'createTime', 'duration', 'playCount',
-                'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl',
-                'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label'])
-        }
-
-        impress = self._download_json(
-            api_base + 'impress', display_id, 'Downloading video info',
-            query=query, headers=player_header)
-
-        clip_link = impress['clipLink']
-        clip = clip_link['clip']
-
-        title = clip.get('title') or clip_link.get('displayTitle')
-
-        query.update({
-            'fields': '-*,code,message,url',
-            'tid': impress.get('tid') or '',
-        })
-
-        formats = []
-        for fmt in (clip.get('videoOutputList') or []):
-            try:
-                profile_name = fmt['profile']
-                if profile_name == 'AUDIO':
-                    continue
-                query['profile'] = profile_name
-                try:
-                    fmt_url_json = self._download_json(
-                        api_base + 'raw/videolocation', display_id,
-                        'Downloading video URL for profile %s' % profile_name,
-                        query=query, headers=player_header)
-                except ExtractorError as e:
-                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                        resp = self._parse_json(e.cause.read().decode(), video_id)
-                        if resp.get('code') == 'GeoBlocked':
-                            self.raise_geo_restricted()
-                    continue
-
-                fmt_url = fmt_url_json['url']
-                formats.append({
-                    'url': fmt_url,
-                    'format_id': profile_name,
-                    'width': int_or_none(fmt.get('width')),
-                    'height': int_or_none(fmt.get('height')),
-                    'format_note': fmt.get('label'),
-                    'filesize': int_or_none(fmt.get('filesize')),
-                    'tbr': int_or_none(fmt.get('kbps')),
-                })
-            except KeyError:
-                pass
-        self._sort_formats(formats)
-
-        return {
-            'id': display_id,
-            'title': title,
-            'description': strip_or_none(clip.get('description')),
-            'uploader': try_get(clip_link, lambda x: x['channel']['name']),
-            'uploader_id': str_or_none(clip_link.get('channelId')),
-            'thumbnail': clip.get('thumbnailUrl'),
-            'timestamp': unified_timestamp(clip_link.get('createTime')),
-            'duration': int_or_none(clip.get('duration')),
-            'view_count': int_or_none(clip.get('playCount')),
-            'like_count': int_or_none(clip.get('likeCount')),
-            'comment_count': int_or_none(clip.get('commentCount')),
-            'formats': formats,
-            'tags': clip.get('tagList'),
-        }
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
deleted file mode 100644
index c731612c4..000000000
--- a/youtube_dl/extractor/kaltura.py
+++ /dev/null
@@ -1,377 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import base64
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urlparse,
-    compat_parse_qs,
-)
-from ..utils import (
-    clean_html,
-    ExtractorError,
-    int_or_none,
-    unsmuggle_url,
-    smuggle_url,
-)
-
-
-class KalturaIE(InfoExtractor):
-    _VALID_URL = r'''(?x)
-                (?:
-                    kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
-                    https?://
-
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ - (?: - (?: - # flash player - index\.php/(?:kwidget|extwidget/preview)| - # html5 player - html5/html5lib/[^/]+/mwEmbedFrame\.php - ) - )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))? - ) - ''' - _SERVICE_URL = 'http://cdnapi.kaltura.com' - _SERVICE_BASE = '/api_v3/index.php' - # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php - _CAPTION_TYPES = { - 1: 'srt', - 2: 'ttml', - 3: 'vtt', - } - _TESTS = [ - { - 'url': 'kaltura:269692:1_1jc2y3e4', - 'md5': '3adcbdb3dcc02d647539e53f284ba171', - 'info_dict': { - 'id': '1_1jc2y3e4', - 'ext': 'mp4', - 'title': 'Straight from the Heart', - 'upload_date': '20131219', - 'uploader_id': 'mlundberg@wolfgangsvault.com', - 'description': 'The Allman Brothers Band, 12/16/1981', - 'thumbnail': 're:^https?://.*/thumbnail/.*', - 'timestamp': int, - }, - }, - { - 'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4', - 'only_matching': True, - }, - { - 'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3', - 'only_matching': True, - }, - { - 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', - 'only_matching': True, - }, - { - # video with subtitles - 'url': 'kaltura:111032:1_cw786r8q', - 'only_matching': True, - }, - { - # video with ttml subtitles (no fileExt) - 'url': 'kaltura:1926081:0_l5ye1133', - 'info_dict': { - 'id': '0_l5ye1133', - 'ext': 'mp4', - 'title': 'What Can You Do With Python?', - 'upload_date': '20160221', - 'uploader_id': 'stork', - 'thumbnail': 're:^https?://.*/thumbnail/.*', - 'timestamp': int, - 'subtitles': { - 'en': [{ - 'ext': 'ttml', - }], - }, - }, - 'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/', - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.kaltura.com/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto', - 'only_matching': True, - }, - { - 'url': 'https://www.kaltura.com:443/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto', - 'only_matching': True, - }, - { - # unavailable source format - 'url': 'kaltura:513551:1_66x4rg7o', - 'only_matching': True, - } - ] - - @staticmethod - def _extract_url(webpage): - urls = KalturaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): - # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site - finditer = ( - list(re.finditer( - r"""(?xs) - kWidget\.(?:thumb)?[Ee]mbed\( - \{.*? - (?P<q1>['"])wid(?P=q1)\s*:\s* - (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*? - (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s* - (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage)) - or list(re.finditer( - r'''(?xs) - (?P<q1>["']) - (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)* - (?P=q1).*? 
- (?: - (?: - entry_?[Ii]d| - (?P<q2>["'])entry_?[Ii]d(?P=q2) - )\s*:\s*| - \[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* - ) - (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage)) - or list(re.finditer( - r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])\s* - (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) - (?:(?!(?P=q1)).)* - [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) - (?:(?!(?P=q1)).)* - (?P=q1) - ''', webpage)) - ) - urls = [] - for mobj in finditer: - embed_info = mobj.groupdict() - for k, v in embed_info.items(): - if v: - embed_info[k] = v.strip() - url = 'kaltura:%(partner_id)s:%(id)s' % embed_info - escaped_pid = re.escape(embed_info['partner_id']) - service_mobj = re.search( - r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), - webpage) - if service_mobj: - url = smuggle_url(url, {'service_url': service_mobj.group('id')}) - urls.append(url) - return urls - - def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): - params = actions[0] - if len(actions) > 1: - for i, a in enumerate(actions[1:], start=1): - for k, v in a.items(): - params['%d:%s' % (i, k)] = v - - data = self._download_json( - (service_url or self._SERVICE_URL) + self._SERVICE_BASE, - video_id, query=params, *args, **kwargs) - - status = data if len(actions) == 1 else data[0] - if status.get('objectType') == 'KalturaAPIException': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, status['message'])) - - return data - - def _get_video_info(self, video_id, partner_id, service_url=None): - actions = [ - { - 'action': 'null', - 'apiVersion': '3.1.5', - 'clientTag': 'kdp:v3.8.5', - 'format': 1, # JSON, 2 = XML, 3 = PHP - 'service': 'multirequest', - }, - { - 'expiry': 86400, - 'service': 'session', - 'action': 'startWidgetSession', - 'widgetId': '_%s' % partner_id, - }, - { - 'action': 'get', - 'entryId': video_id, - 'service': 'baseentry', - 'ks': '{1:result:ks}', - 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', - 'responseProfile:type': 1, - }, - { - 'action': 'getbyentryid', - 'entryId': video_id, - 'service': 'flavorAsset', - 'ks': '{1:result:ks}', - }, - { - 'action': 'list', - 'filter:entryIdEqual': video_id, - 'service': 'caption_captionasset', - 'ks': '{1:result:ks}', - }, - ] - return self._kaltura_api_call( - video_id, actions, service_url, note='Downloading video info JSON') - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = re.match(self._VALID_URL, url) - partner_id, entry_id = mobj.group('partner_id', 'id') - ks = None - captions = None - if partner_id and entry_id: - _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) - else: - path, query = mobj.group('path', 'query') - if not path and not query: - raise ExtractorError('Invalid URL', expected=True) - params = {} - if query: - params = compat_parse_qs(query) - if path: - splitted_path = path.split('/') - params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) - if 'wid' in params: - partner_id = params['wid'][0][1:] - elif 'p' in params: - partner_id = params['p'][0] - elif 'partner_id' in params: - partner_id = params['partner_id'][0] - else: - raise ExtractorError('Invalid URL', expected=True) - if 'entry_id' in params: - entry_id = params['entry_id'][0] - _, info, flavor_assets, captions = 
self._get_video_info(entry_id, partner_id) - elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: - reference_id = params['flashvars[referenceId]'][0] - webpage = self._download_webpage(url, reference_id) - entry_data = self._parse_json(self._search_regex( - r'window\.kalturaIframePackageData\s*=\s*({.*});', - webpage, 'kalturaIframePackageData'), - reference_id)['entryResult'] - info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] - entry_id = info['id'] - # Unfortunately, data returned in kalturaIframePackageData lacks - # captions so we will try requesting the complete data using - # regular approach since we now know the entry_id - try: - _, info, flavor_assets, captions = self._get_video_info( - entry_id, partner_id) - except ExtractorError: - # Regular scenario failed but we already have everything - # extracted apart from captions and can process at least - # with this - pass - else: - raise ExtractorError('Invalid URL', expected=True) - ks = params.get('flashvars[ks]', [None])[0] - - source_url = smuggled_data.get('source_url') - if source_url: - referrer = base64.b64encode( - '://'.join(compat_urlparse.urlparse(source_url)[:2]) - .encode('utf-8')).decode('utf-8') - else: - referrer = None - - def sign_url(unsigned_url): - if ks: - unsigned_url += '/ks/%s' % ks - if referrer: - unsigned_url += '?referrer=%s' % referrer - return unsigned_url - - data_url = info['dataUrl'] - if '/flvclipper/' in data_url: - data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) - - formats = [] - for f in flavor_assets: - # Continue if asset is not ready - if f.get('status') != 2: - continue - # Original format that's not available (e.g. kaltura:1926081:0_c03e1b5g) - # skip for now. - if f.get('fileExt') == 'chun': - continue - # DRM-protected video, cannot be decrypted - if f.get('fileExt') == 'wvm': - continue - if not f.get('fileExt'): - # QT indicates QuickTime; some videos have broken fileExt - if f.get('containerFormat') == 'qt': - f['fileExt'] = 'mov' - else: - f['fileExt'] = 'mp4' - video_url = sign_url( - '%s/flavorId/%s' % (data_url, f['id'])) - format_id = '%(fileExt)s-%(bitrate)s' % f - # Source format may not be available (e.g. kaltura:513551:1_66x4rg7o) - if f.get('isOriginal') is True and not self._is_valid_url( - video_url, entry_id, format_id): - continue - # audio-only has no videoCodecId (e.g. 
kaltura:1926081:0_c03e1b5g - # -f mp4-56) - vcodec = 'none' if 'videoCodecId' not in f and f.get( - 'frameRate') == 0 else f.get('videoCodecId') - formats.append({ - 'format_id': format_id, - 'ext': f.get('fileExt'), - 'tbr': int_or_none(f['bitrate']), - 'fps': int_or_none(f.get('frameRate')), - 'filesize_approx': int_or_none(f.get('size'), invscale=1024), - 'container': f.get('containerFormat'), - 'vcodec': vcodec, - 'height': int_or_none(f.get('height')), - 'width': int_or_none(f.get('width')), - 'url': video_url, - }) - if '/playManifest/' in data_url: - m3u8_url = sign_url(data_url.replace( - 'format/url', 'format/applehttp')) - formats.extend(self._extract_m3u8_formats( - m3u8_url, entry_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - self._sort_formats(formats) - - subtitles = {} - if captions: - for caption in captions.get('objects', []): - # Continue if caption is not ready - if caption.get('status') != 2: - continue - if not caption.get('id'): - continue - caption_format = int_or_none(caption.get('format')) - subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ - 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), - 'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml', - }) - - return { - 'id': entry_id, - 'title': info['name'], - 'formats': formats, - 'subtitles': subtitles, - 'description': clean_html(info.get('description')), - 'thumbnail': info.get('thumbnailUrl'), - 'duration': info.get('duration'), - 'timestamp': info.get('createdAt'), - 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None, - 'view_count': info.get('plays'), - } diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py deleted file mode 100644 index c3eb74c17..000000000 --- a/youtube_dl/extractor/keezmovies.py +++ /dev/null @@ -1,133 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..aes import aes_decrypt_text -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - str_to_int, - strip_or_none, - url_or_none, -) - - -class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681', - 'md5': '2ac69cdb882055f71d82db4311732a1a', - 'info_dict': { - 'id': '18070681', - 'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money', - 'ext': 'mp4', - 'title': 'Arab wife want it so bad I see she thirsty and has tiny money.', - 'thumbnail': None, - 'view_count': int, - 'age_limit': 18, - } - }, { - 'url': 'http://www.keezmovies.com/video/18070681', - 'only_matching': True, - }] - - def _extract_info(self, url, fatal=True): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = (mobj.group('display_id') - if 'display_id' in mobj.groupdict() - else None) or mobj.group('id') - - webpage = self._download_webpage( - url, display_id, headers={'Cookie': 'age_verified=1'}) - - formats = [] - format_urls = set() - - title = None - thumbnail = None - duration = None - encrypted = False - - def extract_format(format_url, height=None): - format_url = url_or_none(format_url) - if not format_url or not format_url.startswith(('http', '//')): - return - if format_url in 
format_urls:
-                return
-            format_urls.add(format_url)
-            tbr = int_or_none(self._search_regex(
-                r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
-            if not height:
-                height = int_or_none(self._search_regex(
-                    r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
-            if encrypted:
-                format_url = aes_decrypt_text(
-                    video_url, title, 32).decode('utf-8')
-            formats.append({
-                'url': format_url,
-                'format_id': '%dp' % height if height else None,
-                'height': height,
-                'tbr': tbr,
-            })
-
-        flashvars = self._parse_json(
-            self._search_regex(
-                r'flashvars\s*=\s*({.+?});', webpage,
-                'flashvars', default='{}'),
-            display_id, fatal=False)
-
-        if flashvars:
-            title = flashvars.get('video_title')
-            thumbnail = flashvars.get('image_url')
-            duration = int_or_none(flashvars.get('video_duration'))
-            encrypted = flashvars.get('encrypted') is True
-            for key, value in flashvars.items():
-                mobj = re.search(r'quality_(\d+)[pP]', key)
-                if mobj:
-                    extract_format(value, int(mobj.group(1)))
-            video_url = flashvars.get('video_url')
-            if video_url and determine_ext(video_url, None):
-                extract_format(video_url)
-
-        video_url = self._html_search_regex(
-            r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
-            webpage, 'video url', default=None, group='url')
-        if video_url:
-            extract_format(compat_urllib_parse_unquote(video_url))
-
-        if not formats:
-            if 'title="This video is no longer available"' in webpage:
-                raise ExtractorError(
-                    'Video %s is no longer available' % video_id, expected=True)
-
-        try:
-            self._sort_formats(formats)
-        except ExtractorError:
-            if fatal:
-                raise
-
-        if not title:
-            title = self._html_search_regex(
-                r'<h1[^>]*>([^<]+)', webpage, 'title')
-
-        return webpage, {
-            'id': video_id,
-            'display_id': display_id,
-            'title': strip_or_none(title),
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'age_limit': 18,
-            'formats': formats,
-        }
-
-    def _real_extract(self, url):
-        webpage, info = self._extract_info(url, fatal=False)
-        if not info['formats']:
-            return self.url_result(url, 'Generic')
-        info['view_count'] = str_to_int(self._search_regex(
-            r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False))
-        return info
diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py
deleted file mode 100644
index 79e3026d2..000000000
--- a/youtube_dl/extractor/kinja.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urllib_parse_unquote,
-)
-from ..utils import (
-    int_or_none,
-    parse_iso8601,
-    strip_or_none,
-    try_get,
-    unescapeHTML,
-    urljoin,
-)
-
-
-class KinjaEmbedIE(InfoExtractor):
-    IE_NAME = 'kinja:embed'
-    _DOMAIN_REGEX = r'''(?:[^.]+\.)?
- (?: - avclub| - clickhole| - deadspin| - gizmodo| - jalopnik| - jezebel| - kinja| - kotaku| - lifehacker| - splinternews| - the(?:inventory|onion|root|takeout) - )\.com''' - _COMMON_REGEX = r'''/ - (?: - ajax/inset| - embed/video - )/iframe\?.*?\bid=''' - _VALID_URL = r'''(?x)https?://%s%s - (?P<type> - fb| - imgur| - instagram| - jwp(?:layer)?-video| - kinjavideo| - mcp| - megaphone| - ooyala| - soundcloud(?:-playlist)?| - tumblr-post| - twitch-stream| - twitter| - ustream-channel| - vimeo| - vine| - youtube-(?:list|video) - )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) - _TESTS = [{ - 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E', - 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE', - 'only_matching': True, - }] - _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform') - _PROVIDER_MAP = { - 'fb': ('facebook.com/video.php?v=', 'Facebook'), - 'imgur': ('imgur.com/', 'Imgur'), - 'instagram': ('instagram.com/p/', 'Instagram'), - 'jwplayer-video': _JWPLATFORM_PROVIDER, - 'jwp-video': _JWPLATFORM_PROVIDER, - 'megaphone': ('player.megaphone.fm/', 'Generic'), - 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), - 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), - 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), - 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), - 'twitch-stream': ('twitch.tv/', 'TwitchStream'), - 'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'), - 'ustream-channel': ('ustream.tv/embed/', 'Ustream'), - 'vimeo': ('vimeo.com/', 'Vimeo'), - 'vine': ('vine.co/v/', 'Vine'), - 'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'), - 'youtube-video': ('youtube.com/embed/', 'Youtube'), - } - - @staticmethod - def _extract_urls(webpage, url): - return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( - r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), - webpage)] - - def _real_extract(self, url): - video_type, video_id = re.match(self._VALID_URL, 
url).groups() - - provider = self._PROVIDER_MAP.get(video_type) - if provider: - video_id = compat_urllib_parse_unquote(video_id) - if video_type == 'tumblr-post': - video_id, blog = video_id.split('-', 1) - result_url = provider[0] % (blog, video_id) - elif video_type == 'youtube-list': - video_id, playlist_id = video_id.split('/') - result_url = provider[0] % (video_id, playlist_id) - else: - if video_type == 'ooyala': - video_id = video_id.split('/')[0] - result_url = provider[0] + video_id - return self.url_result('http://' + result_url, provider[1]) - - if video_type == 'kinjavideo': - data = self._download_json( - 'https://kinja.com/api/core/video/views/videoById', - video_id, query={'videoId': video_id})['data'] - title = data['title'] - - formats = [] - for k in ('signedPlaylist', 'streaming'): - m3u8_url = data.get(k + 'Url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - thumbnail = None - poster = data.get('poster') or {} - poster_id = poster.get('id') - if poster_id: - thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(data.get('description')), - 'formats': formats, - 'tags': data.get('tags'), - 'timestamp': int_or_none(try_get( - data, lambda x: x['postInfo']['publishTimeMillis']), 1000), - 'thumbnail': thumbnail, - 'uploader': data.get('network'), - } - else: - video_data = self._download_json( - 'https://api.vmh.univision.com/metadata/v1/content/' + video_id, - video_id)['videoMetadata'] - iptc = video_data['photoVideoMetadataIPTC'] - title = iptc['title']['en'] - fmg = video_data.get('photoVideoMetadata_fmg') or {} - tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' - data = self._download_json( - tvss_domain + '/api/v3/video-auth/url-signature-tokens', - video_id, query={'mcpids': video_id})['data'][0] - formats = [] - - rendition_url = data.get('renditionUrl') - if rendition_url: - formats = self._extract_m3u8_formats( - rendition_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - - fallback_rendition_url = data.get('fallbackRenditionUrl') - if fallback_rendition_url: - formats.append({ - 'format_id': 'fallback', - 'tbr': int_or_none(self._search_regex( - r'_(\d+)\.mp4', fallback_rendition_url, - 'bitrate', default=None)), - 'url': fallback_rendition_url, - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), - 'uploader': fmg.get('network'), - 'duration': int_or_none(iptc.get('fileDuration')), - 'formats': formats, - 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), - 'timestamp': parse_iso8601(iptc.get('dateReleased')), - } diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py deleted file mode 100644 index 9833d35eb..000000000 --- a/youtube_dl/extractor/kusi.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import random -import re - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - int_or_none, - float_or_none, - timeconvert, - update_url_query, - xpath_text, -) - - -class KUSIIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' - _TESTS = [{ - 
'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', - 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', - 'info_dict': { - 'id': '12689020', - 'ext': 'mp4', - 'title': "Turko Files: Refused to Help, It Ain't Right!", - 'duration': 223.586, - 'upload_date': '20160826', - 'timestamp': 1472233118, - 'thumbnail': r're:^https?://.*\.jpg$' - }, - }, { - 'url': 'http://kusi.com/video?clipId=12203019', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - clip_id = mobj.group('clipId') - video_id = clip_id or mobj.group('path') - - webpage = self._download_webpage(url, video_id) - - if clip_id is None: - video_id = clip_id = self._html_search_regex( - r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') - - affiliate_id = self._search_regex( - r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') - - # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf - xml_url = update_url_query('http://www.kusi.com/build.asp', { - 'buildtype': 'buildfeaturexmlrequest', - 'featureType': 'Clip', - 'featureid': clip_id, - 'affiliateno': affiliate_id, - 'clientgroupid': '1', - 'rnd': int(round(random.random() * 1000000)), - }) - - doc = self._download_xml(xml_url, video_id) - - video_title = xpath_text(doc, 'HEADLINE', fatal=True) - duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) - description = xpath_text(doc, 'ABSTRACT') - thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) - - quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') - formats = [] - for quality in quality_options: - formats.append({ - 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), - 'height': int_or_none(quality.attrib.get('height')), - 'width': int_or_none(quality.attrib.get('width')), - 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_title, - 'description': description, - 'duration': duration, - 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': creation_time, - } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py deleted file mode 100644 index cc5b2a1c1..000000000 --- a/youtube_dl/extractor/kuwo.py +++ /dev/null @@ -1,352 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - get_element_by_id, - clean_html, - ExtractorError, - InAdvancePagedList, - remove_start, -) - - -class KuwoBaseIE(InfoExtractor): - _FORMATS = [ - {'format': 'ape', 'ext': 'ape', 'preference': 100}, - {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, - {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, - {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, - {'format': 'wma', 'ext': 'wma', 'preference': 20}, - {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} - ] - - def _get_formats(self, song_id, tolerate_ip_deny=False): - formats = [] - for file_format in self._FORMATS: - query = { - 'format': file_format['ext'], - 'br': file_format.get('br', ''), - 'rid': 'MUSIC_%s' % song_id, - 'type': 'convert_url', - 'response': 'url' - } - - song_url = self._download_webpage( - 'http://antiserver.kuwo.cn/anti.s', - song_id, note='Download %s url info' % 
file_format['format'], - query=query, headers=self.geo_verification_headers(), - ) - - if song_url == 'IPDeny' and not tolerate_ip_deny: - raise ExtractorError('This song is blocked in this region', expected=True) - - if song_url.startswith('http://') or song_url.startswith('https://'): - formats.append({ - 'url': song_url, - 'format_id': file_format['format'], - 'format': file_format['format'], - 'preference': file_format['preference'], - 'abr': file_format.get('abr'), - }) - - return formats - - -class KuwoIE(KuwoBaseIE): - IE_NAME = 'kuwo:song' - IE_DESC = '酷我音乐' - _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.kuwo.cn/yinyue/635632/', - 'info_dict': { - 'id': '635632', - 'ext': 'ape', - 'title': '爱我别走', - 'creator': '张震岳', - 'upload_date': '20080122', - 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' - }, - 'skip': 'this song has been offline because of copyright issues', - }, { - 'url': 'http://www.kuwo.cn/yinyue/6446136/', - 'info_dict': { - 'id': '6446136', - 'ext': 'mp3', - 'title': '心', - 'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c', - 'creator': 'IU', - 'upload_date': '20150518', - }, - 'params': { - 'format': 'mp3-320', - }, - }, { - 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', - 'only_matching': True, - }] - - def _real_extract(self, url): - song_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle( - url, song_id, note='Download song detail info', - errnote='Unable to get song detail info') - if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: - raise ExtractorError('this song has been offline because of copyright issues', expected=True) - - song_name = self._html_search_regex( - r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name') - singer_name = remove_start(self._html_search_regex( - r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">', - webpage, 'singer name', fatal=False), '歌手') - lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) - if lrc_content == '暂无': # indicates no lyrics - lrc_content = None - - formats = self._get_formats(song_id) - self._sort_formats(formats) - - album_id = self._html_search_regex( - r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', - webpage, 'album id', fatal=False) - - publish_time = None - if album_id is not None: - album_info_page = self._download_webpage( - 'http://www.kuwo.cn/album/%s/' % album_id, song_id, - note='Download album detail info', - errnote='Unable to get album detail info') - - publish_time = self._html_search_regex( - r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page, - 'publish time', fatal=False) - if publish_time: - publish_time = publish_time.replace('-', '') - - return { - 'id': song_id, - 'title': song_name, - 'creator': singer_name, - 'upload_date': publish_time, - 'description': lrc_content, - 'formats': formats, - } - - -class KuwoAlbumIE(InfoExtractor): - IE_NAME = 'kuwo:album' - IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/' - _TEST = { - 'url': 'http://www.kuwo.cn/album/502294/', - 'info_dict': { - 'id': '502294', - 'title': 'Made\xa0Series\xa0《M》', - 'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f', - }, - 'playlist_count': 2, - } - - def _real_extract(self, url): - album_id = self._match_id(url) - - webpage = self._download_webpage( - url, album_id, note='Download album info', - errnote='Unable to get album info') - - album_name = self._html_search_regex( - 
r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage, - 'album name') - album_intro = remove_start( - clean_html(get_element_by_id('intro', webpage)), - '%s简介:' % album_name) - - entries = [ - self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', - webpage) - ] - return self.playlist_result(entries, album_id, album_name, album_intro) - - -class KuwoChartIE(InfoExtractor): - IE_NAME = 'kuwo:chart' - IE_DESC = '酷我音乐 - 排行榜' - _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' - _TEST = { - 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', - 'info_dict': { - 'id': '香港中文龙虎榜', - }, - 'playlist_mincount': 7, - } - - def _real_extract(self, url): - chart_id = self._match_id(url) - webpage = self._download_webpage( - url, chart_id, note='Download chart info', - errnote='Unable to get chart info') - - entries = [ - self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) - ] - return self.playlist_result(entries, chart_id) - - -class KuwoSingerIE(InfoExtractor): - IE_NAME = 'kuwo:singer' - IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', - 'info_dict': { - 'id': 'bruno+mars', - 'title': 'Bruno\xa0Mars', - }, - 'playlist_mincount': 329, - }, { - 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', - 'info_dict': { - 'id': 'Ali', - 'title': 'Ali', - }, - 'playlist_mincount': 95, - 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/ytdl-org/youtube-dl/jobs/78878540 - }] - - PAGE_SIZE = 15 - - def _real_extract(self, url): - singer_id = self._match_id(url) - webpage = self._download_webpage( - url, singer_id, note='Download singer info', - errnote='Unable to get singer info') - - singer_name = self._html_search_regex( - r'<h1>([^<]+)</h1>', webpage, 'singer name') - - artist_id = self._html_search_regex( - r'data-artistid="(\d+)"', webpage, 'artist id') - - page_count = int(self._html_search_regex( - r'data-page="(\d+)"', webpage, 'page count')) - - def page_func(page_num): - webpage = self._download_webpage( - 'http://www.kuwo.cn/artist/contentMusicsAjax', - singer_id, note='Download song list page #%d' % (page_num + 1), - errnote='Unable to get song list page #%d' % (page_num + 1), - query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) - - return [ - self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') - for song_url in re.findall( - r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', - webpage) - ] - - entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) - - return self.playlist_result(entries, singer_id, singer_name) - - -class KuwoCategoryIE(InfoExtractor): - IE_NAME = 'kuwo:category' - IE_DESC = '酷我音乐 - 分类' - _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' - _TEST = { - 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', - 'info_dict': { - 'id': '86375', - 'title': '八十年代精选', - 'description': '这些都是属于八十年代的回忆!', - }, - 'playlist_mincount': 24, - } - - def _real_extract(self, url): - category_id = self._match_id(url) - webpage = self._download_webpage( - url, category_id, note='Download category info', - errnote='Unable to get category info') - - category_name = self._html_search_regex( - r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name') - - category_desc = remove_start( - get_element_by_id('intro', webpage).strip(), - 
'%s简介:' % category_name) - if category_desc == '暂无': - category_desc = None - - jsonm = self._parse_json(self._html_search_regex( - r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) - - entries = [ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo') - for song in jsonm['musiclist'] - ] - return self.playlist_result(entries, category_id, category_name, category_desc) - - -class KuwoMvIE(KuwoBaseIE): - IE_NAME = 'kuwo:mv' - IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/' - _TEST = { - 'url': 'http://www.kuwo.cn/mv/6480076/', - 'info_dict': { - 'id': '6480076', - 'ext': 'mp4', - 'title': 'My HouseMV', - 'creator': '2PM', - }, - # In this video, music URLs (anti.s) are blocked outside China and - # USA, while the MV URL (mvurl) is available globally, so force the MV - # URL for consistent results in different countries - 'params': { - 'format': 'mv', - }, - } - _FORMATS = KuwoBaseIE._FORMATS + [ - {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, - {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, - ] - - def _real_extract(self, url): - song_id = self._match_id(url) - webpage = self._download_webpage( - url, song_id, note='Download mv detail info: %s' % song_id, - errnote='Unable to get mv detail info: %s' % song_id) - - mobj = re.search( - r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"', - webpage) - if mobj: - song_name = mobj.group('song') - singer_name = mobj.group('singer') - else: - raise ExtractorError('Unable to find song or singer names') - - formats = self._get_formats(song_id, tolerate_ip_deny=True) - - mv_url = self._download_webpage( - 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, - song_id, note='Download %s MV URL' % song_id) - formats.append({ - 'url': mv_url, - 'format_id': 'mv', - }) - - self._sort_formats(formats) - - return { - 'id': song_id, - 'title': song_name, - 'creator': singer_name, - 'formats': formats, - } diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py deleted file mode 100644 index c3b4ffa7e..000000000 --- a/youtube_dl/extractor/la7.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - js_to_json, - smuggle_url, -) - - -class LA7IE(InfoExtractor): - IE_NAME = 'la7.it' - _VALID_URL = r'''(?x)(https?://)?(?: - (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| - tg\.la7\.it/repliche-tgla7\?id= - )(?P<id>.+)''' - - _TESTS = [{ - # 'src' is a plain URL - 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', - 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', - 'info_dict': { - 'id': '0_42j6wd36', - 'ext': 'mp4', - 'title': 'Inc.Cool8', - 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. 
dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', - 'thumbnail': 're:^https?://.*', - 'uploader_id': 'kdla7pillole@iltrovatore.it', - 'timestamp': 1443814869, - 'upload_date': '20151002', - }, - }, { - # 'src' is a dictionary - 'url': 'http://tg.la7.it/repliche-tgla7?id=189080', - 'md5': '6b0d8888d286e39870208dfeceaf456b', - 'info_dict': { - 'id': '189080', - 'ext': 'mp4', - 'title': 'TG LA7', - }, - }, { - 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - player_data = self._parse_json( - self._search_regex( - [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], - webpage, 'player data'), - video_id, transform_source=js_to_json) - - return { - '_type': 'url_transparent', - 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], { - 'service_url': 'http://nkdam.iltrovatore.it', - }), - 'id': video_id, - 'title': player_data['title'], - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': player_data.get('poster'), - 'ie_key': 'Kaltura', - } diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py deleted file mode 100644 index cfd6b8393..000000000 --- a/youtube_dl/extractor/lbry.py +++ /dev/null @@ -1,280 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import functools -import json - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - mimetype2ext, - OnDemandPagedList, - try_get, - urljoin, -) - - -class LBRYBaseIE(InfoExtractor): - _BASE_URL_REGEX = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/' - _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' - _OPT_CLAIM_ID = '[^:/?#&]+(?::%s)?' 
% _CLAIM_ID_REGEX - _SUPPORTED_STREAM_TYPES = ['video', 'audio'] - - def _call_api_proxy(self, method, display_id, params, resource): - return self._download_json( - 'https://api.lbry.tv/api/v1/proxy', - display_id, 'Downloading %s JSON metadata' % resource, - headers={'Content-Type': 'application/json-rpc'}, - data=json.dumps({ - 'method': method, - 'params': params, - }).encode())['result'] - - def _resolve_url(self, url, display_id, resource): - return self._call_api_proxy( - 'resolve', display_id, {'urls': url}, resource)[url] - - def _permanent_url(self, url, claim_name, claim_id): - return urljoin(url, '/%s:%s' % (claim_name, claim_id)) - - def _parse_stream(self, stream, url): - stream_value = stream.get('value') or {} - stream_type = stream_value.get('stream_type') - source = stream_value.get('source') or {} - media = stream_value.get(stream_type) or {} - signing_channel = stream.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = self._permanent_url(url, channel_name, channel_claim_id) - - info = { - 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': stream_value.get('description'), - 'license': stream_value.get('license'), - 'timestamp': int_or_none(stream.get('timestamp')), - 'release_timestamp': int_or_none(stream_value.get('release_time')), - 'tags': stream_value.get('tags'), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), - } - if stream_type == 'audio': - info['vcodec'] = 'none' - else: - info.update({ - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - }) - return info - - -class LBRYIE(LBRYBaseIE): - IE_NAME = 'lbry' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) - _TESTS = [{ - # Video - 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', - 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', - 'info_dict': { - 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', - 'ext': 'mp4', - 'title': 'First day in LBRY? 
Start HERE!', - 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', - 'timestamp': 1595694354, - 'upload_date': '20200725', - 'release_timestamp': 1595340697, - 'release_date': '20200721', - 'width': 1280, - 'height': 720, - } - }, { - # Audio - 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', - 'md5': 'c94017d3eba9b49ce085a8fad6b98d00', - 'info_dict': { - 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', - 'ext': 'mp3', - 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', - 'description': 'md5:661ac4f1db09f31728931d7b88807a61', - 'timestamp': 1591312601, - 'upload_date': '20200604', - 'release_timestamp': 1591312421, - 'release_date': '20200604', - 'tags': list, - 'duration': 2570, - 'channel': 'The LBRY Foundation', - 'channel_id': '0ed629d2b9c601300cacf7eabe9da0be79010212', - 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', - 'vcodec': 'none', - } - }, { - # HLS - 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', - 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', - 'info_dict': { - 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', - 'ext': 'mp4', - 'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', - 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', - 'timestamp': 1618254123, - 'upload_date': '20210412', - 'release_timestamp': 1618254002, - 'release_date': '20210412', - 'tags': list, - 'duration': 554, - 'channel': 'Gardening In Canada', - 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', - } - }, { - 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', - 'only_matching': True, - }, { - 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", - 'only_matching': True, - }, { - 'url': 'https://lbry.tv/Episode-1:e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', - 'only_matching': True, - }, { - 'url': 'https://lbry.tv/$/embed/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', - 'only_matching': True, - }, { - 'url': 'https://lbry.tv/Episode-1:e7', - 'only_matching': True, - }, { - 'url': 'https://lbry.tv/@LBRYFoundation/Episode-1', - 'only_matching': True, - }, { - 'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', - 'only_matching': True, - }, { - 'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - if display_id.startswith('$/'): - display_id = display_id.split('/', 2)[-1].replace('/', ':') - else: - display_id = display_id.replace(':', '#') - display_id = compat_urllib_parse_unquote(display_id) - uri = 'lbry://' + display_id - result = self._resolve_url(uri, display_id, 'stream') - result_value = result['value'] - if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES: - raise ExtractorError('Unsupported URL', expected=True) - claim_id = result['claim_id'] - title = result_value['title'] - streaming_url = self._call_api_proxy( - 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] - info = self._parse_stream(result, url) - urlh = self._request_webpage( - streaming_url, display_id, note='Downloading streaming redirect url info') - if 
determine_ext(urlh.geturl()) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(info['formats']) - else: - info['url'] = streaming_url - info.update({ - 'id': claim_id, - 'title': title, - }) - return info - - -class LBRYChannelIE(LBRYBaseIE): - IE_NAME = 'lbry:channel' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?#&]|$)' % LBRYBaseIE._OPT_CLAIM_ID - _TESTS = [{ - 'url': 'https://lbry.tv/@LBRYFoundation:0', - 'info_dict': { - 'id': '0ed629d2b9c601300cacf7eabe9da0be79010212', - 'title': 'The LBRY Foundation', - 'description': 'Channel for the LBRY Foundation. Follow for updates and news.', - }, - 'playlist_count': 29, - }, { - 'url': 'https://lbry.tv/@LBRYFoundation', - 'only_matching': True, - }] - _PAGE_SIZE = 50 - - def _fetch_page(self, claim_id, url, params, page): - page += 1 - page_params = { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - } - page_params.update(params) - result = self._call_api_proxy( - 'claim_search', claim_id, page_params, 'page %d' % page) - for item in (result.get('items') or []): - stream_claim_name = item.get('name') - stream_claim_id = item.get('claim_id') - if not (stream_claim_name and stream_claim_id): - continue - - info = self._parse_stream(item, url) - info.update({ - '_type': 'url', - 'id': stream_claim_id, - 'title': try_get(item, lambda x: x['value']['title']), - 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - }) - yield info - - def _real_extract(self, url): - display_id = self._match_id(url).replace(':', '#') - result = self._resolve_url( - 'lbry://' + display_id, display_id, 'channel') - claim_id = result['claim_id'] - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - content = qs.get('content', [None])[0] - params = { - 'fee_amount': qs.get('fee_amount', ['>=0'])[0], - 'order_by': { - 'new': ['release_time'], - 'top': ['effective_amount'], - 'trending': ['trending_group', 'trending_mixed'], - }[qs.get('order', ['new'])[0]], - 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, - } - duration = qs.get('duration', [None])[0] - if duration: - params['duration'] = { - 'long': '>=1200', - 'short': '<=240', - }[duration] - language = qs.get('language', ['all'])[0] - if language != 'all': - languages = [language] - if language == 'en': - languages.append('none') - params['any_languages'] = languages - entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url, params), - self._PAGE_SIZE) - result_value = result.get('value') or {} - return self.playlist_result( - entries, claim_id, result_value.get('title'), - result_value.get('description')) diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py deleted file mode 100644 index 1b2dcef46..000000000 --- a/youtube_dl/extractor/lecturio.py +++ /dev/null @@ -1,243 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - ExtractorError, - float_or_none, - int_or_none, - str_or_none, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class LecturioBaseIE(InfoExtractor): - _API_BASE_URL = 'https://app.lecturio.com/api/en/latest/html5/' - _LOGIN_URL = 'https://app.lecturio.com/en/login' - _NETRC_MACHINE = 'lecturio' - - def _real_initialize(self): - self._login() - 
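- # The initial request to _LOGIN_URL below sets session cookies; is_logged() - # then treats any final redirect URL away from the login page as an active - # session, both before and after the credential form is submitted.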
- def _login(self): - username, password = self._get_login_info() - if username is None: - return - - # Sets some cookies - _, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Downloading login popup') - - def is_logged(url_handle): - return self._LOGIN_URL not in url_handle.geturl() - - # Already logged in - if is_logged(urlh): - return - - login_form = { - 'signin[email]': username, - 'signin[password]': password, - 'signin[remember]': 'on', - } - - response, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(login_form)) - - # Logged in successfully - if is_logged(urlh): - return - - errors = self._html_search_regex( - r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, - 'errors', default=None) - if errors: - raise ExtractorError('Unable to login: %s' % errors, expected=True) - raise ExtractorError('Unable to log in') - - -class LecturioIE(LecturioBaseIE): - _VALID_URL = r'''(?x) - https:// - (?: - app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| - (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag - ) - ''' - _TESTS = [{ - 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', - 'md5': '9a42cf1d8282a6311bf7211bbde26fde', - 'info_dict': { - 'id': '39634', - 'ext': 'mp4', - 'title': 'Important Concepts and Terms — Introduction to Microbiology', - }, - 'skip': 'Requires lecturio account credentials', - }, { - 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', - 'only_matching': True, - }, { - 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', - 'only_matching': True, - }] - - _CC_LANGS = { - 'Arabic': 'ar', - 'Bulgarian': 'bg', - 'German': 'de', - 'English': 'en', - 'Spanish': 'es', - 'Persian': 'fa', - 'French': 'fr', - 'Japanese': 'ja', - 'Polish': 'pl', - 'Pashto': 'ps', - 'Russian': 'ru', - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - nt = mobj.group('nt') or mobj.group('nt_de') - lecture_id = mobj.group('id') - display_id = nt or lecture_id - api_path = 'lectures/' + lecture_id if lecture_id else 'lecture/' + nt + '.json' - video = self._download_json( - self._API_BASE_URL + api_path, display_id) - title = video['title'].strip() - if not lecture_id: - pid = video.get('productId') or video.get('uid') - if pid: - spid = pid.split('_') - if spid and len(spid) == 2: - lecture_id = spid[1] - - formats = [] - for format_ in video['content']['media']: - if not isinstance(format_, dict): - continue - file_ = format_.get('file') - if not file_: - continue - ext = determine_ext(file_) - if ext == 'smil': - # smil contains only broken RTMP formats anyway - continue - file_url = url_or_none(file_) - if not file_url: - continue - label = str_or_none(format_.get('label')) - filesize = int_or_none(format_.get('fileSize')) - f = { - 'url': file_url, - 'format_id': label, - 'filesize': float_or_none(filesize, invscale=1000) - } - if label: - mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label) - if mobj: - f.update({ - 'format_id': mobj.group(2), - 'height': int(mobj.group(1)), - }) - formats.append(f) - self._sort_formats(formats) - - subtitles = {} - automatic_captions = {} - captions = video.get('captions') or [] - for cc in captions: - cc_url = cc.get('url') - if not cc_url: - continue - cc_label = cc.get('translatedCode') - lang = cc.get('languageCode') or self._search_regex( - r'/([a-z]{2})_', cc_url, 'lang', - 
default=cc_label.split()[0] if cc_label else 'en') - original_lang = self._search_regex( - r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', - default=None) - sub_dict = (automatic_captions - if 'auto-translated' in (cc_label or '') or original_lang - else subtitles) - sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ - 'url': cc_url, - }) - - return { - 'id': lecture_id or nt, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'automatic_captions': automatic_captions, - } - - -class LecturioCourseIE(LecturioBaseIE): - _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))' - _TESTS = [{ - 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', - 'info_dict': { - 'id': 'microbiology-introduction', - 'title': 'Microbiology: Introduction', - 'description': 'md5:13da8500c25880c6016ae1e6d78c386a', - }, - 'playlist_count': 45, - 'skip': 'Requires lecturio account credentials', - }, { - 'url': 'https://app.lecturio.com/#/course/c/6434', - 'only_matching': True, - }] - - def _real_extract(self, url): - nt, course_id = re.match(self._VALID_URL, url).groups() - display_id = nt or course_id - api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json' - course = self._download_json( - self._API_BASE_URL + api_path, display_id) - entries = [] - for lecture in course.get('lectures', []): - lecture_id = str_or_none(lecture.get('id')) - lecture_url = lecture.get('url') - if lecture_url: - lecture_url = urljoin(url, lecture_url) - else: - lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id) - entries.append(self.url_result( - lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) - return self.playlist_result( - entries, display_id, course.get('title'), - clean_html(course.get('description'))) - - -class LecturioDeCourseIE(LecturioBaseIE): - _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' - _TEST = { - 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', - 'only_matching': True, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - entries = [] - for mobj in re.finditer( - r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>', - webpage): - lecture_url = urljoin(url, mobj.group('url')) - lecture_id = mobj.group('id') - entries.append(self.url_result( - lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) - - title = self._search_regex( - r'<h1[^>]*>([^<]+)', webpage, 'title', default=None) - - return self.playlist_result(entries, display_id, title) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py deleted file mode 100644 index 7dc0ad794..000000000 --- a/youtube_dl/extractor/leeco.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import datetime -import hashlib -import re -import time - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - determine_ext, - encode_data_uri, - ExtractorError, - int_or_none, - orderedSet, - parse_iso8601, - str_or_none, - url_basename, - urshift, -) - - -class LeIE(InfoExtractor): - IE_DESC = '乐视网' - _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html' - _GEO_COUNTRIES = ['CN'] - _URL_TEMPLATE = 
'http://www.le.com/ptv/vplay/%s.html' - - _TESTS = [{ - 'url': 'http://www.le.com/ptv/vplay/22005890.html', - 'md5': 'edadcfe5406976f42f9f266057ee5e40', - 'info_dict': { - 'id': '22005890', - 'ext': 'mp4', - 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - }, - 'params': { - 'hls_prefer_native': True, - }, - }, { - 'url': 'http://www.le.com/ptv/vplay/1415246.html', - 'info_dict': { - 'id': '1415246', - 'ext': 'mp4', - 'title': '美人天下01', - 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d', - }, - 'params': { - 'hls_prefer_native': True, - }, - }, { - 'note': 'This video is available only in Mainland China, thus a proxy is needed', - 'url': 'http://www.le.com/ptv/vplay/1118082.html', - 'md5': '2424c74948a62e5f31988438979c5ad1', - 'info_dict': { - 'id': '1118082', - 'ext': 'mp4', - 'title': '与龙共舞 完整版', - 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', - }, - 'params': { - 'hls_prefer_native': True, - }, - }, { - 'url': 'http://sports.le.com/video/25737697.html', - 'only_matching': True, - }, { - 'url': 'http://www.lesports.com/match/1023203003.html', - 'only_matching': True, - }, { - 'url': 'http://sports.le.com/match/1023203003.html', - 'only_matching': True, - }] - - # ror() and calc_time_key() are reverse-engineered from an embedded swf file in LetvPlayer.swf - def ror(self, param1, param2): - _loc3_ = 0 - while _loc3_ < param2: - param1 = urshift(param1, 1) + ((param1 & 1) << 31) - _loc3_ += 1 - return param1 - - def calc_time_key(self, param1): - _loc2_ = 185025305 - return self.ror(param1, _loc2_ % 17) ^ _loc2_ - - # see M3U8Encryption class in KLetvPlayer.swf - @staticmethod - def decrypt_m3u8(encrypted_data): - if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': - return encrypted_data - encrypted_data = encrypted_data[5:] - - _loc4_ = bytearray(2 * len(encrypted_data)) - for idx, val in enumerate(encrypted_data): - b = compat_ord(val) - _loc4_[2 * idx] = b // 16 - _loc4_[2 * idx + 1] = b % 16 - idx = len(_loc4_) - 11 - _loc4_ = _loc4_[idx:] + _loc4_[:idx] - _loc7_ = bytearray(len(encrypted_data)) - for i in range(len(encrypted_data)): - _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1] - - return bytes(_loc7_) - - def _check_errors(self, play_json): - # Check for errors - playstatus = play_json['msgs']['playstatus'] - if playstatus['status'] == 0: - flag = playstatus['flag'] - if flag == 1: - self.raise_geo_restricted() - else: - raise ExtractorError('Generic error. 
flag = %d' % flag, expected=True) - - def _real_extract(self, url): - media_id = self._match_id(url) - page = self._download_webpage(url, media_id) - - play_json_flash = self._download_json( - 'http://player-pc.le.com/mms/out/video/playJson', - media_id, 'Downloading flash playJson data', query={ - 'id': media_id, - 'platid': 1, - 'splatid': 105, - 'format': 1, - 'source': 1000, - 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.le.com', - 'region': 'cn', - }, - headers=self.geo_verification_headers()) - self._check_errors(play_json_flash) - - def get_flash_urls(media_url, format_id): - nodes_data = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id, - query={ - 'm3v': 1, - 'format': 1, - 'expect': 3, - 'tss': 'ios', - }) - - req = self._request_webpage( - nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) - - m3u8_data = self.decrypt_m3u8(req.read()) - - return { - 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), - } - - extracted_formats = [] - formats = [] - playurl = play_json_flash['msgs']['playurl'] - play_domain = playurl['domain'][0] - - for format_id, format_data in playurl.get('dispatch', []).items(): - if format_id in extracted_formats: - continue - extracted_formats.append(format_id) - - media_url = play_domain + format_data[0] - for protocol, format_url in get_flash_urls(media_url, format_id).items(): - f = { - 'url': format_url, - 'ext': determine_ext(format_data[1]), - 'format_id': '%s-%s' % (protocol, format_id), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - 'quality': int_or_none(format_id), - } - - if format_id[-1:] == 'p': - f['height'] = int_or_none(format_id[:-1]) - - formats.append(f) - self._sort_formats(formats, ('height', 'quality', 'format_id')) - - publish_time = parse_iso8601(self._html_search_regex( - r'发布时间 ([^<>]+) ', page, 'publish time', default=None), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - description = self._html_search_meta('description', page, fatal=False) - - return { - 'id': media_id, - 'formats': formats, - 'title': playurl['title'], - 'thumbnail': playurl['pic'], - 'description': description, - 'timestamp': publish_time, - } - - -class LePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)' - - _TESTS = [{ - 'url': 'http://www.le.com/tv/46177.html', - 'info_dict': { - 'id': '46177', - 'title': '美人天下', - 'description': 'md5:395666ff41b44080396e59570dbac01c' - }, - 'playlist_count': 35 - }, { - 'url': 'http://tv.le.com/izt/wuzetian/index.html', - 'info_dict': { - 'id': 'wuzetian', - 'title': '武媚娘传奇', - 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' - }, - # This playlist contains some extra videos other than the drama itself - 'playlist_mincount': 96 - }, { - 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', - # This series is moved to http://www.le.com/tv/10005297.html - 'only_matching': True, - }, { - 'url': 'http://www.le.com/comic/92063.html', - 'only_matching': True, - }, { - 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url) - - def _real_extract(self, url): - playlist_id = self._match_id(url) - page = self._download_webpage(url, playlist_id) - - # Currently old domain names are still used in playlists - media_ids = orderedSet(re.findall( - 
r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page)) - entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le') - for media_id in media_ids] - - title = self._html_search_meta('keywords', page, - fatal=False).split(',')[0] - description = self._html_search_meta('description', page, fatal=False) - - return self.playlist_result(entries, playlist_id, playlist_title=title, - playlist_description=description) - - -class LetvCloudIE(InfoExtractor): - # Most of *.letv.com is changed to *.le.com on 2016/01/02 - # but yuntv.letv.com is kept, so also keep the extractor name - IE_DESC = '乐视云' - _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' - - _TESTS = [{ - 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf', - 'md5': '26450599afd64c513bc77030ad15db44', - 'info_dict': { - 'id': 'p7jnfw5hw9_467623dedf', - 'ext': 'mp4', - 'title': 'Video p7jnfw5hw9_467623dedf', - }, - }, { - 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', - 'md5': 'e03d9cc8d9c13191e1caf277e42dbd31', - 'info_dict': { - 'id': 'p7jnfw5hw9_ec93197892', - 'ext': 'mp4', - 'title': 'Video p7jnfw5hw9_ec93197892', - }, - }, { - 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', - 'md5': 'cb988699a776b22d4a41b9d43acfb3ac', - 'info_dict': { - 'id': 'p7jnfw5hw9_187060b6fd', - 'ext': 'mp4', - 'title': 'Video p7jnfw5hw9_187060b6fd', - }, - }] - - @staticmethod - def sign_data(obj): - if obj['cf'] == 'flash': - salt = '2f9d6924b33a165a6d8b5d3d42f4f987' - items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu'] - elif obj['cf'] == 'html5': - salt = 'fbeh5player12c43eccf2bec3300344' - items = ['cf', 'ran', 'uu', 'bver', 'vu'] - input_data = ''.join([item + obj[item] for item in items]) + salt - obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() - - def _get_formats(self, cf, uu, vu, media_id): - def get_play_json(cf, timestamp): - data = { - 'cf': cf, - 'ver': '2.2', - 'bver': 'firefox44.0', - 'format': 'json', - 'uu': uu, - 'vu': vu, - 'ran': compat_str(timestamp), - } - self.sign_data(data) - return self._download_json( - 'http://api.letvcloud.com/gpc.php?' 
+ compat_urllib_parse_urlencode(data), - media_id, 'Downloading playJson data for type %s' % cf) - - play_json = get_play_json(cf, time.time()) - # The server time may be different from local time - if play_json.get('code') == 10071: - play_json = get_play_json(cf, play_json['timestamp']) - - if not play_json.get('data'): - if play_json.get('message'): - raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True) - elif play_json.get('code'): - raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) - else: - raise ExtractorError('Letv cloud returned an unknown error') - - def b64decode(s): - return compat_b64decode(s).decode('utf-8') - - formats = [] - for media in play_json['data']['video_info']['media'].values(): - play_url = media['play_url'] - url = b64decode(play_url['main_url']) - decoded_url = b64decode(url_basename(url)) - formats.append({ - 'url': url, - 'ext': determine_ext(decoded_url), - 'format_id': str_or_none(play_url.get('vtype')), - 'format_note': str_or_none(play_url.get('definition')), - 'width': int_or_none(play_url.get('vwidth')), - 'height': int_or_none(play_url.get('vheight')), - }) - - return formats - - def _real_extract(self, url): - uu_mobj = re.search(r'uu=([\w]+)', url) - vu_mobj = re.search(r'vu=([\w]+)', url) - - if not uu_mobj or not vu_mobj: - raise ExtractorError('Invalid URL: %s' % url, expected=True) - - uu = uu_mobj.group(1) - vu = vu_mobj.group(1) - media_id = uu + '_' + vu - - formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id) - self._sort_formats(formats) - - return { - 'id': media_id, - 'title': 'Video %s' % media_id, - 'formats': formats, - } diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py deleted file mode 100644 index 1e3c19dfd..000000000 --- a/youtube_dl/extractor/lego.py +++ /dev/null @@ -1,149 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import uuid - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - qualities, -) - - -class LEGOIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P<locale>[a-z]{2}-[a-z]{2})/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P<id>[0-9a-f]{32})' - _TESTS = [{ - 'url': 'http://www.lego.com/en-us/videos/themes/club/blocumentary-kawaguchi-55492d823b1b4d5e985787fa8c2973b1', - 'md5': 'f34468f176cfd76488767fc162c405fa', - 'info_dict': { - 'id': '55492d82-3b1b-4d5e-9857-87fa8c2973b1_en-US', - 'ext': 'mp4', - 'title': 'Blocumentary Great Creations: Akiyuki Kawaguchi', - 'description': 'Blocumentary Great Creations: Akiyuki Kawaguchi', - }, - }, { - # geo-restricted but the contentUrl contain a valid url - 'url': 'http://www.lego.com/nl-nl/videos/themes/nexoknights/episode-20-kingdom-of-heroes-13bdc2299ab24d9685701a915b3d71e7##sp=399', - 'md5': 'c7420221f7ffd03ff056f9db7f8d807c', - 'info_dict': { - 'id': '13bdc229-9ab2-4d96-8570-1a915b3d71e7_nl-NL', - 'ext': 'mp4', - 'title': 'Aflevering 20: Helden van het koninkrijk', - 'description': 'md5:8ee499aac26d7fa8bcb0cedb7f9c3941', - 'age_limit': 5, - }, - }, { - # with subtitle - 'url': 'https://www.lego.com/nl-nl/kids/videos/classic/creative-storytelling-the-little-puppy-aa24f27c7d5242bc86102ebdc0f24cba', - 'info_dict': { - 'id': 'aa24f27c-7d52-42bc-8610-2ebdc0f24cba_nl-NL', - 'ext': 'mp4', - 'title': 'De kleine puppy', - 'description': 'md5:5b725471f849348ac73f2e12cfb4be06', - 'age_limit': 1, - 'subtitles': { - 'nl': [{ - 'ext': 
'srt', - 'url': r're:^https://.+\.srt$', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - }] - _QUALITIES = { - 'Lowest': (64, 180, 320), - 'Low': (64, 270, 480), - 'Medium': (96, 360, 640), - 'High': (128, 540, 960), - 'Highest': (128, 720, 1280), - } - - def _real_extract(self, url): - locale, video_id = re.match(self._VALID_URL, url).groups() - countries = [locale.split('-')[1].upper()] - self._initialize_geo_bypass({ - 'countries': countries, - }) - - try: - item = self._download_json( - # https://contentfeed.services.lego.com/api/v2/item/[VIDEO_ID]?culture=[LOCALE]&contentType=Video - 'https://services.slingshot.lego.com/mediaplayer/v2', - video_id, query={ - 'videoId': '%s_%s' % (uuid.UUID(video_id), locale), - }, headers=self.geo_verification_headers()) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451: - self.raise_geo_restricted(countries=countries) - raise - - video = item['Video'] - video_id = video['Id'] - title = video['Title'] - - q = qualities(['Lowest', 'Low', 'Medium', 'High', 'Highest']) - formats = [] - for video_source in item.get('VideoFormats', []): - video_source_url = video_source.get('Url') - if not video_source_url: - continue - video_source_format = video_source.get('Format') - if video_source_format == 'F4M': - formats.extend(self._extract_f4m_formats( - video_source_url, video_id, - f4m_id=video_source_format, fatal=False)) - elif video_source_format == 'M3U8': - formats.extend(self._extract_m3u8_formats( - video_source_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=video_source_format, fatal=False)) - else: - video_source_quality = video_source.get('Quality') - format_id = [] - for v in (video_source_format, video_source_quality): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'quality': q(video_source_quality), - 'url': video_source_url, - } - quality = self._QUALITIES.get(video_source_quality) - if quality: - f.update({ - 'abr': quality[0], - 'height': quality[1], - 'width': quality[2], - }), - formats.append(f) - self._sort_formats(formats) - - subtitles = {} - sub_file_id = video.get('SubFileId') - if sub_file_id and sub_file_id != '00000000-0000-0000-0000-000000000000': - net_storage_path = video.get('NetstoragePath') - invariant_id = video.get('InvariantId') - video_file_id = video.get('VideoFileId') - video_version = video.get('VideoVersion') - if net_storage_path and invariant_id and video_file_id and video_version: - subtitles.setdefault(locale[:2], []).append({ - 'url': 'https://lc-mediaplayerns-live-s.legocdn.com/public/%s/%s_%s_%s_%s_sub.srt' % (net_storage_path, invariant_id, video_file_id, locale, video_version), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video.get('Description'), - 'thumbnail': video.get('GeneratedCoverImage') or video.get('GeneratedThumbnail'), - 'duration': int_or_none(video.get('Length')), - 'formats': formats, - 'subtitles': subtitles, - 'age_limit': int_or_none(video.get('AgeFrom')), - 'season': video.get('SeasonTitle'), - 'season_number': int_or_none(video.get('Season')) or None, - 'episode_number': int_or_none(video.get('Episode')) or None, - } diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py deleted file mode 100644 index 2cf444258..000000000 --- a/youtube_dl/extractor/libsyn.py +++ /dev/null @@ -1,93 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - get_element_by_class, - 
parse_duration, - strip_or_none, - unified_strdate, -) - - -class LibsynIE(InfoExtractor): - _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' - - _TESTS = [{ - 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', - 'md5': '2a55e75496c790cdeb058e7e6c087746', - 'info_dict': { - 'id': '6385796', - 'ext': 'mp3', - 'title': "Champion Minded - Developing a Growth Mindset", - # description fetched using another request: - # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796 - # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', - 'upload_date': '20180320', - 'thumbnail': 're:^https?://.*', - }, - }, { - 'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/', - 'md5': '6c5cb21acd622d754d3b1a92b582ce42', - 'info_dict': { - 'id': '3727166', - 'ext': 'mp3', - 'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career', - 'upload_date': '20150818', - 'thumbnail': 're:^https?://.*', - } - }] - - def _real_extract(self, url): - url, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) - - data = self._parse_json(self._search_regex( - r'var\s+playlistItem\s*=\s*({.+?});', - webpage, 'JSON data block'), video_id) - - episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage) - if not episode_title: - episode_title = self._search_regex( - [r'data-title="([^"]+)"', r'<title>(.+?)</title>'], - webpage, 'episode title') - episode_title = episode_title.strip() - - podcast_title = strip_or_none(clean_html(self._search_regex( - r'<h3>([^<]+)</h3>', webpage, 'podcast title', - default=None) or get_element_by_class('podcast-title', webpage))) - - title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title - - formats = [] - for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')): - f_url = data.get(k) - if not f_url: - continue - formats.append({ - 'url': f_url, - 'format_id': format_id, - }) - - description = self._html_search_regex( - r'<p\s+id="info_text_body">(.+?)</p>', webpage, - 'description', default=None) - if description: - # Strip non-breaking and normal spaces - description = description.replace('\u00A0', ' ').strip() - release_date = unified_strdate(self._search_regex( - r'<div class="release_date">Released: ([^<]+)<', - webpage, 'release date', default=None) or data.get('release_date')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': data.get('thumbnail_url'), - 'upload_date': release_date, - 'duration': parse_duration(data.get('duration')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py deleted file mode 100644 index 42e263bfa..000000000 --- a/youtube_dl/extractor/lifenews.py +++ /dev/null @@ -1,239 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_iso8601, - remove_end, -) - - -class LifeNewsIE(InfoExtractor): - IE_NAME = 'life' - IE_DESC = 'Life.ru' - _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)' - - _TESTS = [{ - # single video 
embedded via video/source - 'url': 'https://life.ru/t/новости/98736', - 'md5': '77c95eaefaca216e32a76a343ad89d23', - 'info_dict': { - 'id': '98736', - 'ext': 'mp4', - 'title': 'Мужчина нашел дома архив оборонного завода', - 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', - 'timestamp': 1344154740, - 'upload_date': '20120805', - 'view_count': int, - } - }, { - # single video embedded via iframe - 'url': 'https://life.ru/t/новости/152125', - 'md5': '77d19a6f0886cd76bdbf44b4d971a273', - 'info_dict': { - 'id': '152125', - 'ext': 'mp4', - 'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ', - 'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ', - 'timestamp': 1427961840, - 'upload_date': '20150402', - 'view_count': int, - } - }, { - # two videos embedded via iframe - 'url': 'https://life.ru/t/новости/153461', - 'info_dict': { - 'id': '153461', - 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве', - 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', - 'timestamp': 1430825520, - 'view_count': int, - }, - 'playlist': [{ - 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', - 'info_dict': { - 'id': '153461-video1', - 'ext': 'mp4', - 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)', - 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', - 'timestamp': 1430825520, - 'upload_date': '20150505', - }, - }, { - 'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322', - 'info_dict': { - 'id': '153461-video2', - 'ext': 'mp4', - 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)', - 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', - 'timestamp': 1430825520, - 'upload_date': '20150505', - }, - }], - }, { - 'url': 'https://life.ru/t/новости/213035', - 'only_matching': True, - }, { - 'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461', - 'only_matching': True, - }, { - 'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_urls = re.findall( - r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) - - iframe_links = re.findall( - r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']', - webpage) - - if not video_urls and not iframe_links: - raise ExtractorError('No media links available for %s' % video_id) - - title = remove_end( - self._og_search_title(webpage), - ' - Life.ru') - - description = self._og_search_description(webpage) - - view_count = self._html_search_regex( - r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>', - webpage, 'view count', fatal=False, group='value') - - timestamp = parse_iso8601(self._search_regex( - r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1', - webpage, 'upload date', fatal=False, group='value')) - - common_info = { - 'description': description, - 'view_count': int_or_none(view_count), - 'timestamp': timestamp, - } - - def make_entry(video_id, video_url, 
index=None): - cur_info = dict(common_info) - cur_info.update({ - 'id': video_id if not index else '%s-video%s' % (video_id, index), - 'url': video_url, - 'title': title if not index else '%s (Видео %s)' % (title, index), - }) - return cur_info - - def make_video_entry(video_id, video_url, index=None): - video_url = compat_urlparse.urljoin(url, video_url) - return make_entry(video_id, video_url, index) - - def make_iframe_entry(video_id, video_url, index=None): - video_url = self._proto_relative_url(video_url, 'http:') - cur_info = make_entry(video_id, video_url, index) - cur_info['_type'] = 'url_transparent' - return cur_info - - if len(video_urls) == 1 and not iframe_links: - return make_video_entry(video_id, video_urls[0]) - - if len(iframe_links) == 1 and not video_urls: - return make_iframe_entry(video_id, iframe_links[0]) - - entries = [] - - if video_urls: - for num, video_url in enumerate(video_urls, 1): - entries.append(make_video_entry(video_id, video_url, num)) - - if iframe_links: - for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1): - entries.append(make_iframe_entry(video_id, iframe_link, num)) - - playlist = common_info.copy() - playlist.update(self.playlist_result(entries, video_id, title, description)) - return playlist - - -class LifeEmbedIE(InfoExtractor): - IE_NAME = 'life:embed' - _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P<id>[\da-f]{32})' - - _TESTS = [{ - 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', - 'md5': 'b889715c9e49cb1981281d0e5458fbbe', - 'info_dict': { - 'id': 'e50c2dec2867350528e2574c899b8291', - 'ext': 'mp4', - 'title': 'e50c2dec2867350528e2574c899b8291', - 'thumbnail': r're:http://.*\.jpg', - } - }, { - # with 1080p - 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - thumbnail = None - formats = [] - - def extract_m3u8(manifest_url): - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8')) - - def extract_original(original_url): - formats.append({ - 'url': original_url, - 'format_id': determine_ext(original_url, None), - 'preference': 1, - }) - - playlist = self._parse_json( - self._search_regex( - r'options\s*=\s*({.+?});', webpage, 'options', default='{}'), - video_id).get('playlist', {}) - if playlist: - master = playlist.get('master') - if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': - extract_m3u8(compat_urlparse.urljoin(url, master)) - original = playlist.get('original') - if isinstance(original, compat_str): - extract_original(original) - thumbnail = playlist.get('image') - - # Old rendition fallback - if not formats: - for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): - video_url = compat_urlparse.urljoin(url, video_url) - if determine_ext(video_url) == 'm3u8': - extract_m3u8(video_url) - else: - extract_original(video_url) - - self._sort_formats(formats) - - thumbnail = thumbnail or self._search_regex( - r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) - - return { - 'id': video_id, - 'title': video_id, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py deleted file mode 100644 index 39f74d282..000000000 --- a/youtube_dl/extractor/limelight.py +++ /dev/null @@ -1,358 +0,0 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - smuggle_url, - try_get, - unsmuggle_url, - ExtractorError, -) - - -class LimelightBaseIE(InfoExtractor): - _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' - - @classmethod - def _extract_urls(cls, webpage, source_url): - lm = { - 'Media': 'media', - 'Channel': 'channel', - 'ChannelList': 'channel_list', - } - - def smuggle(url): - return smuggle_url(url, {'source_url': source_url}) - - entries = [] - for kind, video_id in re.findall( - r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', - webpage): - entries.append(cls.url_result( - smuggle('limelight:%s:%s' % (lm[kind], video_id)), - 'Limelight%s' % kind, video_id)) - for mobj in re.finditer( - # As per [1] class attribute should be exactly equal to - # LimelightEmbeddedPlayerFlash but numerous examples seen - # that don't exactly match it (e.g. [2]). - # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage - # 2. http://www.sedona.com/FacilitatorTraining2017 - r'''(?sx) - <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*? - <param[^>]+ - name=(["\'])flashVars\2[^>]+ - value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32}) - ''', webpage): - kind, video_id = mobj.group('kind'), mobj.group('id') - entries.append(cls.url_result( - smuggle('limelight:%s:%s' % (kind, video_id)), - 'Limelight%s' % kind.capitalize(), video_id)) - # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) - for video_id in re.findall( - r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})', - webpage): - entries.append(cls.url_result( - smuggle('limelight:media:%s' % video_id), - LimelightMediaIE.ie_key(), video_id)) - return entries - - def _call_playlist_service(self, item_id, method, fatal=True, referer=None): - headers = {} - if referer: - headers['Referer'] = referer - try: - return self._download_json( - self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, - fatal=fatal, headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] - if error == 'CountryDisabled': - self.raise_geo_restricted() - raise ExtractorError(error, expected=True) - raise - - def _extract(self, item_id, pc_method, mobile_method, referer=None): - pc = self._call_playlist_service(item_id, pc_method, referer=referer) - mobile = self._call_playlist_service( - item_id, mobile_method, fatal=False, referer=referer) - return pc, mobile - - def _extract_info(self, pc, mobile, i, referer): - get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {} - pc_item = get_item(pc, 'playlistItems') - mobile_item = get_item(mobile, 'mediaList') - video_id = pc_item.get('mediaId') or mobile_item['mediaId'] - title = pc_item.get('title') or mobile_item['title'] - - formats = [] - urls = [] - for stream in pc_item.get('streams', []): - stream_url = stream.get('url') - if not stream_url or stream.get('drmProtected') or stream_url in 
urls: - continue - urls.append(stream_url) - ext = determine_ext(stream_url) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream_url, video_id, f4m_id='hds', fatal=False)) - else: - fmt = { - 'url': stream_url, - 'abr': float_or_none(stream.get('audioBitRate')), - 'fps': float_or_none(stream.get('videoFrameRate')), - 'ext': ext, - } - width = int_or_none(stream.get('videoWidthInPixels')) - height = int_or_none(stream.get('videoHeightInPixels')) - vbr = float_or_none(stream.get('videoBitRate')) - if width or height or vbr: - fmt.update({ - 'width': width, - 'height': height, - 'vbr': vbr, - }) - else: - fmt['vcodec'] = 'none' - rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url) - if rtmp: - format_id = 'rtmp' - if stream.get('videoBitRate'): - format_id += '-%d' % int_or_none(stream['videoBitRate']) - http_format_id = format_id.replace('rtmp', 'http') - - CDN_HOSTS = ( - ('delvenetworks.com', 'cpl.delvenetworks.com'), - ('video.llnw.net', 's2.content.video.llnw.net'), - ) - for cdn_host, http_host in CDN_HOSTS: - if cdn_host not in rtmp.group('host').lower(): - continue - http_url = 'http://%s/%s' % (http_host, rtmp.group('playpath')[4:]) - urls.append(http_url) - if self._is_valid_url(http_url, video_id, http_format_id): - http_fmt = fmt.copy() - http_fmt.update({ - 'url': http_url, - 'format_id': http_format_id, - }) - formats.append(http_fmt) - break - - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': format_id, - }) - formats.append(fmt) - - for mobile_url in mobile_item.get('mobileUrls', []): - media_url = mobile_url.get('mobileUrl') - format_id = mobile_url.get('targetMediaPlatform') - if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: - continue - urls.append(media_url) - ext = determine_ext(media_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - media_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': media_url, - 'format_id': format_id, - 'preference': -1, - 'ext': ext, - }) - - self._sort_formats(formats) - - subtitles = {} - for flag in mobile_item.get('flags') or []: - if flag == 'ClosedCaptions': - closed_captions = self._call_playlist_service( - video_id, 'getClosedCaptionsDetailsByMediaId', - False, referer) or [] - for cc in closed_captions: - cc_url = cc.get('webvttFileUrl') - if not cc_url: - continue - lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en') - subtitles.setdefault(lang, []).append({ - 'url': cc_url, - }) - break - - get_meta = lambda x: pc_item.get(x) or mobile_item.get(x) - - return { - 'id': video_id, - 'title': title, - 'description': get_meta('description'), - 'formats': formats, - 'duration': float_or_none(get_meta('durationInMilliseconds'), 1000), - 'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'), - 'subtitles': subtitles, - } - - -class LimelightMediaIE(LimelightBaseIE): - IE_NAME = 'limelight' - _VALID_URL = r'''(?x) - (?: - limelight:media:| - https?:// - (?: - link\.videoplatform\.limelight\.com/media/| - assets\.delvenetworks\.com/player/loader\.swf - ) - \?.*?\bmediaId= - ) - (?P<id>[a-z0-9]{32}) - ''' - _TESTS = [{ - 'url': 
'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', - 'info_dict': { - 'id': '3ffd040b522b4485b6d84effc750cd86', - 'ext': 'mp4', - 'title': 'HaP and the HB Prince Trailer', - 'description': 'md5:8005b944181778e313d95c1237ddb640', - 'thumbnail': r're:^https?://.*\.jpeg$', - 'duration': 144.23, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # video with subtitles - 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335', - 'md5': '2fa3bad9ac321e23860ca23bc2c69e3d', - 'info_dict': { - 'id': 'a3e00274d4564ec4a9b29b9466432335', - 'ext': 'mp4', - 'title': '3Play Media Overview Video', - 'thumbnail': r're:^https?://.*\.jpeg$', - 'duration': 78.101, - # TODO: extract all languages that were accessible via API - # 'subtitles': 'mincount:9', - 'subtitles': 'mincount:1', - }, - }, { - 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', - 'only_matching': True, - }] - _PLAYLIST_SERVICE_PATH = 'media' - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - video_id = self._match_id(url) - source_url = smuggled_data.get('source_url') - self._initialize_geo_bypass({ - 'countries': smuggled_data.get('geo_countries'), - }) - - pc, mobile = self._extract( - video_id, 'getPlaylistByMediaId', - 'getMobilePlaylistByMediaId', source_url) - - return self._extract_info(pc, mobile, 0, source_url) - - -class LimelightChannelIE(LimelightBaseIE): - IE_NAME = 'limelight:channel' - _VALID_URL = r'''(?x) - (?: - limelight:channel:| - https?:// - (?: - link\.videoplatform\.limelight\.com/media/| - assets\.delvenetworks\.com/player/loader\.swf - ) - \?.*?\bchannelId= - ) - (?P<id>[a-z0-9]{32}) - ''' - _TESTS = [{ - 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', - 'info_dict': { - 'id': 'ab6a524c379342f9b23642917020c082', - 'title': 'Javascript Sample Code', - 'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html', - }, - 'playlist_mincount': 3, - }, { - 'url': 'http://assets.delvenetworks.com/player/loader.swf?channelId=ab6a524c379342f9b23642917020c082', - 'only_matching': True, - }] - _PLAYLIST_SERVICE_PATH = 'channel' - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - channel_id = self._match_id(url) - source_url = smuggled_data.get('source_url') - - pc, mobile = self._extract( - channel_id, 'getPlaylistByChannelId', - 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', - source_url) - - entries = [ - self._extract_info(pc, mobile, i, source_url) - for i in range(len(pc['playlistItems']))] - - return self.playlist_result( - entries, channel_id, pc.get('title'), mobile.get('description')) - - -class LimelightChannelListIE(LimelightBaseIE): - IE_NAME = 'limelight:channel_list' - _VALID_URL = r'''(?x) - (?: - limelight:channel_list:| - https?:// - (?: - link\.videoplatform\.limelight\.com/media/| - assets\.delvenetworks\.com/player/loader\.swf - ) - \?.*?\bchannelListId= - ) - (?P<id>[a-z0-9]{32}) - ''' - _TESTS = [{ - 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', - 'info_dict': { - 'id': '301b117890c4465c8179ede21fd92e2b', - 'title': 'Website - Hero Player', - }, - 'playlist_mincount': 2, - }, { - 'url': 'https://assets.delvenetworks.com/player/loader.swf?channelListId=301b117890c4465c8179ede21fd92e2b', - 'only_matching': True, - }] - _PLAYLIST_SERVICE_PATH = 'channel_list' - - def 
_real_extract(self, url): - channel_list_id = self._match_id(url) - - channel_list = self._call_playlist_service( - channel_list_id, 'getMobileChannelListById') - - entries = [ - self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') - for channel in channel_list['channelList']] - - return self.playlist_result( - entries, channel_list_id, channel_list['title']) diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py deleted file mode 100644 index 2526daa77..000000000 --- a/youtube_dl/extractor/line.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - js_to_json, - str_or_none, -) - - -class LineTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)' - - _TESTS = [{ - 'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246', - 'info_dict': { - 'id': '793123_ep1-1', - 'ext': 'mp4', - 'title': 'Goodbye Mr.Black | EP.1-1', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 998.509, - 'view_count': int, - }, - }, { - 'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245', - 'only_matching': True, - }] - - def _real_extract(self, url): - series_id, segment = re.match(self._VALID_URL, url).groups() - video_id = '%s_%s' % (series_id, segment) - - webpage = self._download_webpage(url, video_id) - - player_params = self._parse_json(self._search_regex( - r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'), - video_id, transform_source=js_to_json) - - video_info = self._download_json( - 'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json', - video_id, query={ - 'videoId': player_params['videoId'], - 'key': player_params['key'], - }) - - stream = video_info['streams'][0] - extra_query = '?__gda__=' + stream['key']['value'] - formats = self._extract_m3u8_formats( - stream['source'] + extra_query, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - - for a_format in formats: - a_format['url'] += extra_query - - duration = None - for video in video_info.get('videos', {}).get('list', []): - encoding_option = video.get('encodingOption', {}) - abr = video['bitrate']['audio'] - vbr = video['bitrate']['video'] - tbr = abr + vbr - formats.append({ - 'url': video['source'], - 'format_id': 'http-%d' % int(tbr), - 'height': encoding_option.get('height'), - 'width': encoding_option.get('width'), - 'abr': abr, - 'vbr': vbr, - 'filesize': video.get('size'), - }) - if video.get('duration') and duration is None: - duration = video['duration'] - - self._sort_formats(formats) - - if not formats[0].get('width'): - formats[0]['vcodec'] = 'none' - - title = self._og_search_title(webpage) - - # like_count requires an additional API request https://tv.line.me/api/likeit/getCount - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'extra_param_to_segment_url': extra_query[1:], - 'duration': duration, - 'thumbnails': [{'url': thumbnail['source']} - for thumbnail in video_info.get('thumbnails', {}).get('list', [])], - 'view_count': video_info.get('meta', {}).get('count'), - } - - -class LineLiveBaseIE(InfoExtractor): - _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' - - def _parse_broadcast_item(self, item): - broadcast_id = compat_str(item['id']) - title = item['title'] - is_live = item.get('isBroadcastingNow') - - 
thumbnails = [] - for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - }) - - channel = item.get('channel') or {} - channel_id = str_or_none(channel.get('id')) - - return { - 'id': broadcast_id, - 'title': self._live_title(title) if is_live else title, - 'thumbnails': thumbnails, - 'timestamp': int_or_none(item.get('createdAt')), - 'channel': channel.get('name'), - 'channel_id': channel_id, - 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, - 'duration': int_or_none(item.get('archiveDuration')), - 'view_count': int_or_none(item.get('viewerCount')), - 'comment_count': int_or_none(item.get('chatCount')), - 'is_live': is_live, - } - - -class LineLiveIE(LineLiveBaseIE): - _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', - 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', - 'info_dict': { - 'id': '16331360', - 'title': '振りコピ講座😙😙😙', - 'ext': 'mp4', - 'timestamp': 1617095132, - 'upload_date': '20210330', - 'channel': '白川ゆめか', - 'channel_id': '4867368', - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - }, { - # archiveStatus == 'DELETED' - 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id, broadcast_id = re.match(self._VALID_URL, url).groups() - broadcast = self._download_json( - self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), - broadcast_id) - item = broadcast['item'] - info = self._parse_broadcast_item(item) - protocol = 'm3u8' if info['is_live'] else 'm3u8_native' - formats = [] - for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): - if not v: - continue - if k == 'abr': - formats.extend(self._extract_m3u8_formats( - v, broadcast_id, 'mp4', protocol, - m3u8_id='hls', fatal=False)) - continue - f = { - 'ext': 'mp4', - 'format_id': 'hls-' + k, - 'protocol': protocol, - 'url': v, - } - if not k.isdigit(): - f['vcodec'] = 'none' - formats.append(f) - if not formats: - archive_status = item.get('archiveStatus') - if archive_status != 'ARCHIVED': - raise ExtractorError('this video has been ' + archive_status.lower(), expected=True) - self._sort_formats(formats) - info['formats'] = formats - return info - - -class LineLiveChannelIE(LineLiveBaseIE): - _VALID_URL = r'https?://live\.line\.me/channels/(?P<id>\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' - _TEST = { - 'url': 'https://live.line.me/channels/5893542', - 'info_dict': { - 'id': '5893542', - 'title': 'いくらちゃん', - 'description': 'md5:c3a4af801f43b2fac0b02294976580be', - }, - 'playlist_mincount': 29 - } - - def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): - while True: - for row in (archived_broadcasts.get('rows') or []): - share_url = str_or_none(row.get('shareURL')) - if not share_url: - continue - info = self._parse_broadcast_item(row) - info.update({ - '_type': 'url', - 'url': share_url, - 'ie_key': LineLiveIE.ie_key(), - }) - yield info - if not archived_broadcasts.get('hasNextPage'): - return - archived_broadcasts = self._download_json( - self._API_BASE_URL + channel_id + '/archived_broadcasts', - channel_id, query={ - 'lastId': info['id'], - }) - - def _real_extract(self, url): - channel_id = self._match_id(url) - channel = self._download_json(self._API_BASE_URL + 
channel_id, channel_id) - return self.playlist_result( - self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), - channel_id, channel.get('title'), channel.get('information')) diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py deleted file mode 100644 index 26fc703d1..000000000 --- a/youtube_dl/extractor/linkedin.py +++ /dev/null @@ -1,182 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - urlencode_postdata, - urljoin, -) - - -class LinkedInLearningBaseIE(InfoExtractor): - _NETRC_MACHINE = 'linkedin' - _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' - - def _call_api(self, course_slug, fields, video_slug=None, resolution=None): - query = { - 'courseSlug': course_slug, - 'fields': fields, - 'q': 'slugs', - } - sub = '' - if video_slug: - query.update({ - 'videoSlug': video_slug, - 'resolution': '_%s' % resolution, - }) - sub = ' %dp' % resolution - api_url = 'https://www.linkedin.com/learning-api/detailedCourses' - return self._download_json( - api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ - 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, - }, query=query)['elements'][0] - - def _get_urn_id(self, video_data): - urn = video_data.get('urn') - if urn: - mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) - if mobj: - return mobj.group(1) - - def _get_video_id(self, video_data, course_slug, video_slug): - return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - action_url = urljoin(self._LOGIN_URL, self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', - default='https://www.linkedin.com/uas/login-submit', group='url')) - data = self._hidden_inputs(login_page) - data.update({ - 'session_key': email, - 'session_password': password, - }) - login_submit_page = self._download_webpage( - action_url, None, 'Logging in', - data=urlencode_postdata(data)) - error = self._search_regex( - r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', - login_submit_page, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - - -class LinkedInLearningIE(LinkedInLearningBaseIE): - IE_NAME = 'linkedin:learning' - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true', - 'md5': 'a1d74422ff0d5e66a792deb996693167', - 'info_dict': { - 'id': '90426', - 'ext': 'mp4', - 'title': 'Welcome', - 'timestamp': 1430396150.82, - 'upload_date': '20150430', - }, - } - - def _real_extract(self, url): - course_slug, video_slug = re.match(self._VALID_URL, url).groups() - - video_data = None - formats = [] - for width, height in ((640, 360), (960, 540), (1280, 720)): - video_data = self._call_api( - course_slug, 'selectedVideo', video_slug, height)['selectedVideo'] - - video_url_data = video_data.get('url') or {} - progressive_url = video_url_data.get('progressiveUrl') - if progressive_url: - formats.append({ - 'format_id': 'progressive-%dp' % height, - 'url': progressive_url, - 'height': height, - 'width': width, - 'source_preference': 1, - }) - - title = 
video_data['title'] - - audio_url = video_data.get('audio', {}).get('progressiveUrl') - if audio_url: - formats.append({ - 'abr': 64, - 'ext': 'm4a', - 'format_id': 'audio', - 'url': audio_url, - 'vcodec': 'none', - }) - - streaming_url = video_url_data.get('streamingUrl') - if streaming_url: - formats.extend(self._extract_m3u8_formats( - streaming_url, video_slug, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - - self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) - - return { - 'id': self._get_video_id(video_data, course_slug, video_slug), - 'title': title, - 'formats': formats, - 'thumbnail': video_data.get('defaultThumbnail'), - 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), - 'duration': int_or_none(video_data.get('durationInSeconds')), - } - - -class LinkedInLearningCourseIE(LinkedInLearningBaseIE): - IE_NAME = 'linkedin:learning:course' - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals', - 'info_dict': { - 'id': 'programming-foundations-fundamentals', - 'title': 'Programming Foundations: Fundamentals', - 'description': 'md5:76e580b017694eb89dc8e8923fff5c86', - }, - 'playlist_mincount': 61, - } - - @classmethod - def suitable(cls, url): - return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) - - def _real_extract(self, url): - course_slug = self._match_id(url) - course_data = self._call_api(course_slug, 'chapters,description,title') - - entries = [] - for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1): - chapter_title = chapter.get('title') - chapter_id = self._get_urn_id(chapter) - for video in chapter.get('videos', []): - video_slug = video.get('slug') - if not video_slug: - continue - entries.append({ - '_type': 'url_transparent', - 'id': self._get_video_id(video, course_slug, video_slug), - 'title': video.get('title'), - 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), - 'chapter': chapter_title, - 'chapter_number': chapter_number, - 'chapter_id': chapter_id, - 'ie_key': LinkedInLearningIE.ie_key(), - }) - - return self.playlist_result( - entries, course_slug, - course_data.get('title'), - course_data.get('description')) diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py deleted file mode 100644 index 7ec4a6557..000000000 --- a/youtube_dl/extractor/linuxacademy.py +++ /dev/null @@ -1,243 +0,0 @@ -from __future__ import unicode_literals - -import json -import random -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_HTTPError, - compat_str, -) -from ..utils import ( - clean_html, - ExtractorError, - js_to_json, - parse_duration, - try_get, - unified_timestamp, - urlencode_postdata, - urljoin, -) - - -class LinuxAcademyIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?linuxacademy\.com/cp/ - (?: - courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| - modules/view/id/(?P<course_id>\d+) - ) - ''' - _TESTS = [{ - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', - 'info_dict': { - 'id': '7971-2', - 'ext': 'mp4', - 'title': 'What Is Data Science', - 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', - 'timestamp': 1607387907, - 'upload_date': '20201208', - 'duration': 304, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires Linux 
Academy account credentials', - }, { - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', - 'only_matching': True, - }, { - 'url': 'https://linuxacademy.com/cp/modules/view/id/154', - 'info_dict': { - 'id': '154', - 'title': 'AWS Certified Cloud Practitioner', - 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', - 'duration': 28835, - }, - 'playlist_count': 41, - 'skip': 'Requires Linux Academy account credentials', - }] - - _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' - _ORIGIN_URL = 'https://linuxacademy.com' - _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' - _NETRC_MACHINE = 'linuxacademy' - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - def random_string(): - return ''.join([ - random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') - for _ in range(32)]) - - webpage, urlh = self._download_webpage_handle( - self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ - 'client_id': self._CLIENT_ID, - 'response_type': 'token id_token', - 'response_mode': 'web_message', - 'redirect_uri': self._ORIGIN_URL, - 'scope': 'openid email user_impersonation profile', - 'audience': self._ORIGIN_URL, - 'state': random_string(), - 'nonce': random_string(), - }) - - login_data = self._parse_json( - self._search_regex( - r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'login info', group='value'), None, - transform_source=lambda x: compat_b64decode(x).decode('utf-8') - )['extraParams'] - - login_data.update({ - 'client_id': self._CLIENT_ID, - 'redirect_uri': self._ORIGIN_URL, - 'tenant': 'lacausers', - 'connection': 'Username-Password-Authentication', - 'username': username, - 'password': password, - 'sso': 'true', - }) - - login_state_url = urlh.geturl() - - try: - login_page = self._download_webpage( - 'https://login.linuxacademy.com/usernamepassword/login', None, - 'Downloading login page', data=json.dumps(login_data).encode(), - headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://login.linuxacademy.com', - 'Referer': login_state_url, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read(), None) - message = error.get('description') or error['code'] - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message), expected=True) - raise - - callback_page, urlh = self._download_webpage_handle( - 'https://login.linuxacademy.com/login/callback', None, - 'Downloading callback page', - data=urlencode_postdata(self._hidden_inputs(login_page)), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Origin': 'https://login.linuxacademy.com', - 'Referer': login_state_url, - }) - - access_token = self._search_regex( - r'access_token=([^=&]+)', urlh.geturl(), - 'access token', default=None) - if not access_token: - access_token = self._parse_json( - self._search_regex( - r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, - 'authorization response'), None, - transform_source=js_to_json)['response']['access_token'] - - self._download_webpage( - 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' - % access_token, None, 'Downloading token validation page') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') - item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) 
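# Side note (not part of the deleted file): a minimal, self-contained sketch
# of the atob() trick that _login() above relies on. The Auth0 authorize page
# embeds its login parameters as a base64-encoded JSON blob inside an
# atob('...') call, so the extractor base64-decodes the quoted argument and
# reads 'extraParams' out of it. The page fragment here is hypothetical; the
# regex is the one from the deleted code.
import base64
import json
import re

blob = base64.b64encode(json.dumps(
    {'extraParams': {'tenant': 'lacausers'}}).encode('utf-8')).decode('ascii')
page = "<script>var config = JSON.parse(atob('%s'));</script>" % blob
login_data = json.loads(base64.b64decode(re.search(
    r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
    page).group('value')).decode('utf-8'))['extraParams']
assert login_data == {'tenant': 'lacausers'}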
- - webpage = self._download_webpage(url, item_id) - - # course path - if course_id: - module = self._parse_json( - self._search_regex( - r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'), - item_id) - entries = [] - chapter_number = None - chapter = None - chapter_id = None - for item in module['items']: - if not isinstance(item, dict): - continue - - def type_field(key): - return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() - type_fields = (type_field('name'), type_field('slug')) - # Move to next module section - if 'section' in type_fields: - chapter = item.get('course_name') - chapter_id = item.get('course_module') - chapter_number = 1 if not chapter_number else chapter_number + 1 - continue - # Skip non-lessons - if 'lesson' not in type_fields: - continue - lesson_url = urljoin(url, item.get('url')) - if not lesson_url: - continue - title = item.get('title') or item.get('lesson_name') - description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) - entries.append({ - '_type': 'url_transparent', - 'url': lesson_url, - 'ie_key': LinuxAcademyIE.ie_key(), - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), - 'duration': parse_duration(item.get('duration')), - 'chapter': chapter, - 'chapter_id': chapter_id, - 'chapter_number': chapter_number, - }) - return { - '_type': 'playlist', - 'entries': entries, - 'id': course_id, - 'title': module.get('title'), - 'description': module.get('md_desc') or clean_html(module.get('desc')), - 'duration': parse_duration(module.get('duration')), - } - - # single video path - m3u8_url = self._parse_json( - self._search_regex( - r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), - item_id)[0]['file'] - formats = self._extract_m3u8_formats( - m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) - info = { - 'id': item_id, - 'formats': formats, - } - lesson = self._parse_json( - self._search_regex( - (r'window\.lesson\s*=\s*({.+?})\s*;', - r'player\.lesson\s*=\s*({.+?})\s*;'), - webpage, 'lesson', default='{}'), item_id, fatal=False) - if lesson: - info.update({ - 'title': lesson.get('lesson_name'), - 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), - 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), - 'duration': parse_duration(lesson.get('duration')), - }) - if not info.get('title'): - info['title'] = self._search_regex( - (r'>Lecture\s*:\s*(?P<value>[^<]+)', - r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'title', group='value') - return info diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py deleted file mode 100644 index 337b1b15c..000000000 --- a/youtube_dl/extractor/litv.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - smuggle_url, - unsmuggle_url, -) - - -class LiTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)' - - _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' - - _TESTS = [{ - 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', - 'info_dict': { - 'id': 'VOD00041606', - 'title': '花千骨', - }, - 
'playlist_count': 50, - }, { - 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', - 'md5': '969e343d9244778cb29acec608e53640', - 'info_dict': { - 'id': 'VOD00041610', - 'ext': 'mp4', - 'title': '花千骨第1集', - 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', - 'episode_number': 1, - }, - 'params': { - 'noplaylist': True, - }, - 'skip': 'Georestricted to Taiwan', - }, { - 'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&', - 'md5': '88322ea132f848d6e3e18b32a832b918', - 'info_dict': { - 'id': 'VOD00044841', - 'ext': 'mp4', - 'title': '芈月傳第1集 霸星芈月降世楚國', - 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', - }, - 'skip': 'Georestricted to Taiwan', - }] - - def _extract_playlist(self, season_list, video_id, program_info, prompt=True): - episode_title = program_info['title'] - content_id = season_list['contentId'] - - if prompt: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) - - all_episodes = [ - self.url_result(smuggle_url( - self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']), - {'force_noplaylist': True})) # To prevent infinite recursion - for episode in season_list['episode']] - - return self.playlist_result(all_episodes, content_id, episode_title) - - def _real_extract(self, url): - url, data = unsmuggle_url(url, {}) - - video_id = self._match_id(url) - - noplaylist = self._downloader.params.get('noplaylist') - noplaylist_prompt = True - if 'force_noplaylist' in data: - noplaylist = data['force_noplaylist'] - noplaylist_prompt = False - - webpage = self._download_webpage(url, video_id) - - program_info = self._parse_json(self._search_regex( - r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), - video_id) - - season_list = list(program_info.get('seasonList', {}).values()) - if season_list: - if not noplaylist: - return self._extract_playlist( - season_list[0], video_id, program_info, - prompt=noplaylist_prompt) - - if noplaylist_prompt: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - # In browsers `getMainUrl` request is always issued. Usually this - # endpoint gives the same result as the data embedded in the webpage. 
-        # If georestricted, there are no embedded data, so an extra request is
-        # necessary to get the error code
-        if 'assetId' not in program_info:
-            program_info = self._download_json(
-                'https://www.litv.tv/vod/ajax/getProgramInfo', video_id,
-                query={'contentId': video_id},
-                headers={'Accept': 'application/json'})
-        video_data = self._parse_json(self._search_regex(
-            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
-            webpage, 'video data', default='{}'), video_id)
-        if not video_data:
-            payload = {
-                'assetId': program_info['assetId'],
-                'watchDevices': program_info['watchDevices'],
-                'contentType': program_info['contentType'],
-            }
-            video_data = self._download_json(
-                'https://www.litv.tv/vod/getMainUrl', video_id,
-                data=json.dumps(payload).encode('utf-8'),
-                headers={'Content-Type': 'application/json'})
-
-        if not video_data.get('fullpath'):
-            error_msg = video_data.get('errorMessage')
-            if error_msg == 'vod.error.outsideregionerror':
-                self.raise_geo_restricted('This video is available in Taiwan only')
-            if error_msg:
-                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
-            raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
-
-        formats = self._extract_m3u8_formats(
-            video_data['fullpath'], video_id, ext='mp4',
-            entry_protocol='m3u8_native', m3u8_id='hls')
-        for a_format in formats:
-            # LiTV HLS segments don't like compression
-            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
-
-        title = program_info['title'] + program_info.get('secondaryMark', '')
-        description = program_info.get('description')
-        thumbnail = program_info.get('imageFile')
-        categories = [item['name'] for item in program_info.get('category', [])]
-        episode = int_or_none(program_info.get('episode'))
-
-        return {
-            'id': video_id,
-            'formats': formats,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'categories': categories,
-            'episode_number': episode,
-        }
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
deleted file mode 100644
index e55b1a202..000000000
--- a/youtube_dl/extractor/livestream.py
+++ /dev/null
@@ -1,366 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-import itertools
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urlparse,
-)
-from ..utils import (
-    find_xpath_attr,
-    xpath_attr,
-    xpath_with_ns,
-    xpath_text,
-    orderedSet,
-    update_url_query,
-    int_or_none,
-    float_or_none,
-    parse_iso8601,
-    determine_ext,
-)
-
-
-class LivestreamIE(InfoExtractor):
-    IE_NAME = 'livestream'
-    _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'
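# Illustration (not part of the deleted file) of what the _VALID_URL above
# captures for the two URL shapes exercised by the tests below -- numeric
# account/event ids versus vanity names. The URLs come from those tests,
# trimmed of their query strings.
import re

_VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'

m = re.match(_VALID_URL, 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563')
assert (m.group('account_id'), m.group('event_id'), m.group('id')) == ('362', '3557232', '67864563')

m = re.match(_VALID_URL, 'http://new.livestream.com/tedx/cityenglish')
assert (m.group('account_name'), m.group('event_name'), m.group('id')) == ('tedx', 'cityenglish', None)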
- _TESTS = [{ - 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', - 'md5': '53274c76ba7754fb0e8d072716f2292b', - 'info_dict': { - 'id': '4719370', - 'ext': 'mp4', - 'title': 'Live from Webster Hall NYC', - 'timestamp': 1350008072, - 'upload_date': '20121012', - 'duration': 5968.0, - 'like_count': int, - 'view_count': int, - 'thumbnail': r're:^http://.*\.jpg$' - } - }, { - 'url': 'http://new.livestream.com/tedx/cityenglish', - 'info_dict': { - 'title': 'TEDCity2.0 (English)', - 'id': '2245590', - }, - 'playlist_mincount': 4, - }, { - 'url': 'http://new.livestream.com/chess24/tatasteelchess', - 'info_dict': { - 'title': 'Tata Steel Chess', - 'id': '3705884', - }, - 'playlist_mincount': 60, - }, { - 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', - 'only_matching': True, - }, { - 'url': 'http://livestream.com/bsww/concacafbeachsoccercampeonato2015', - 'only_matching': True, - }] - _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - base_ele = find_xpath_attr( - smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') - base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' - - formats = [] - video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) - - for vn in video_nodes: - tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000) - furl = ( - update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), { - 'v': '3.0.3', - 'fp': 'WIN% 14,0,0,145', - })) - if 'clipBegin' in vn.attrib: - furl += '&ssek=' + vn.attrib['clipBegin'] - formats.append({ - 'url': furl, - 'format_id': 'smil_%d' % tbr, - 'ext': 'flv', - 'tbr': tbr, - 'preference': -1000, - }) - return formats - - def _extract_video_info(self, video_data): - video_id = compat_str(video_data['id']) - - FORMAT_KEYS = ( - ('sd', 'progressive_url'), - ('hd', 'progressive_url_hd'), - ) - - formats = [] - for format_id, key in FORMAT_KEYS: - video_url = video_data.get(key) - if video_url: - ext = determine_ext(video_url) - if ext == 'm3u8': - continue - bitrate = int_or_none(self._search_regex( - r'(\d+)\.%s' % ext, video_url, 'bitrate', default=None)) - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'tbr': bitrate, - 'ext': ext, - }) - - smil_url = video_data.get('smil_url') - if smil_url: - formats.extend(self._extract_smil_formats(smil_url, video_id, fatal=False)) - - m3u8_url = video_data.get('m3u8_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - f4m_url = video_data.get('f4m_url') - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - comments = [{ - 'author_id': comment.get('author_id'), - 'author': comment.get('author', {}).get('full_name'), - 'id': comment.get('id'), - 'text': comment['text'], - 'timestamp': parse_iso8601(comment.get('created_at')), - } for comment in video_data.get('comments', {}).get('data', [])] - - return { - 'id': video_id, - 'formats': formats, - 'title': video_data['caption'], - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail_url'), - 'duration': float_or_none(video_data.get('duration'), 1000), - 'timestamp': parse_iso8601(video_data.get('publish_at')), - 
'like_count': video_data.get('likes', {}).get('total'), - 'comment_count': video_data.get('comments', {}).get('total'), - 'view_count': video_data.get('views'), - 'comments': comments, - } - - def _extract_stream_info(self, stream_info): - broadcast_id = compat_str(stream_info['broadcast_id']) - is_live = stream_info.get('is_live') - - formats = [] - smil_url = stream_info.get('play_url') - if smil_url: - formats.extend(self._extract_smil_formats(smil_url, broadcast_id)) - - m3u8_url = stream_info.get('m3u8_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, broadcast_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - rtsp_url = stream_info.get('rtsp_url') - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) - self._sort_formats(formats) - - return { - 'id': broadcast_id, - 'formats': formats, - 'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'], - 'thumbnail': stream_info.get('thumbnail_url'), - 'is_live': is_live, - } - - def _extract_event(self, event_data): - event_id = compat_str(event_data['id']) - account_id = compat_str(event_data['owner_account_id']) - feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' - - stream_info = event_data.get('stream_info') - if stream_info: - return self._extract_stream_info(stream_info) - - last_video = None - entries = [] - for i in itertools.count(1): - if last_video is None: - info_url = feed_root_url - else: - info_url = '{root}?&id={id}&newer=-1&type=video'.format( - root=feed_root_url, id=last_video) - videos_info = self._download_json( - info_url, event_id, 'Downloading page {0}'.format(i))['data'] - videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] - if not videos_info: - break - for v in videos_info: - v_id = compat_str(v['id']) - entries.append(self.url_result( - 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), - 'Livestream', v_id, v.get('caption'))) - last_video = videos_info[-1]['id'] - return self.playlist_result(entries, event_id, event_data['full_name']) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - event = mobj.group('event_id') or mobj.group('event_name') - account = mobj.group('account_id') or mobj.group('account_name') - api_url = self._API_URL_TEMPLATE % (account, event) - if video_id: - video_data = self._download_json( - api_url + '/videos/%s' % video_id, video_id) - return self._extract_video_info(video_data) - else: - event_data = self._download_json(api_url, video_id) - return self._extract_event(event_data) - - -# The original version of Livestream uses a different system -class LivestreamOriginalIE(InfoExtractor): - IE_NAME = 'livestream:original' - _VALID_URL = r'''(?x)https?://original\.livestream\.com/ - (?P<user>[^/\?#]+)(?:/(?P<type>video|folder) - (?:(?:\?.*?Id=|/)(?P<id>.*?)(&|$))?)? 
- ''' - _TESTS = [{ - 'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', - 'info_dict': { - 'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', - 'ext': 'mp4', - 'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', - 'duration': 771.301, - 'view_count': int, - }, - }, { - 'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3', - 'info_dict': { - 'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3', - }, - 'playlist_mincount': 4, - }, { - # live stream - 'url': 'http://original.livestream.com/znsbahamas', - 'only_matching': True, - }] - - def _extract_video_info(self, user, video_id): - api_url = 'http://x%sx.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id=%s' % (user, video_id) - info = self._download_xml(api_url, video_id) - - item = info.find('channel').find('item') - title = xpath_text(item, 'title') - media_ns = {'media': 'http://search.yahoo.com/mrss'} - thumbnail_url = xpath_attr( - item, xpath_with_ns('media:thumbnail', media_ns), 'url') - duration = float_or_none(xpath_attr( - item, xpath_with_ns('media:content', media_ns), 'duration')) - ls_ns = {'ls': 'http://api.channel.livestream.com/2.0'} - view_count = int_or_none(xpath_text( - item, xpath_with_ns('ls:viewsCount', ls_ns))) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail_url, - 'duration': duration, - 'view_count': view_count, - } - - def _extract_video_formats(self, video_data, video_id): - formats = [] - - progressive_url = video_data.get('progressiveUrl') - if progressive_url: - formats.append({ - 'url': progressive_url, - 'format_id': 'http', - }) - - m3u8_url = video_data.get('httpUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - rtsp_url = video_data.get('rtspUrl') - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) - - self._sort_formats(formats) - return formats - - def _extract_folder(self, url, folder_id): - webpage = self._download_webpage(url, folder_id) - paths = orderedSet(re.findall( - r'''(?x)(?: - <li\s+class="folder">\s*<a\s+href="| - <a\s+href="(?=https?://livestre\.am/) - )([^"]+)"''', webpage)) - - entries = [{ - '_type': 'url', - 'url': compat_urlparse.urljoin(url, p), - } for p in paths] - - return self.playlist_result(entries, folder_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user = mobj.group('user') - url_type = mobj.group('type') - content_id = mobj.group('id') - if url_type == 'folder': - return self._extract_folder(url, content_id) - else: - # this url is used on mobile devices - stream_url = 'http://x%sx.api.channel.livestream.com/3.0/getstream.json' % user - info = {} - if content_id: - stream_url += '?id=%s' % content_id - info = self._extract_video_info(user, content_id) - else: - content_id = user - webpage = self._download_webpage(url, content_id) - info = { - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._search_regex(r'channelLogo\.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None), - } - video_data = self._download_json(stream_url, content_id) - is_live = video_data.get('isLive') - info.update({ - 'id': content_id, - 'title': self._live_title(info['title']) if is_live else info['title'], - 'formats': self._extract_video_formats(video_data, content_id), - 'is_live': is_live, - }) - return info - - 
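# A toy version (not part of the deleted file) of the namespaced XPath
# lookups that _extract_video_info() above performs on the clipdetails API
# response. MRSS elements such as media:thumbnail live in their own XML
# namespace and have to be addressed through a prefix map; the values here
# are hypothetical stand-ins for the real API payload.
import xml.etree.ElementTree as ET

doc = ET.fromstring(
    '<rss xmlns:media="http://search.yahoo.com/mrss">'
    '<channel><item><title>Spark 1</title>'
    '<media:thumbnail url="http://example.com/thumb.jpg"/>'
    '<media:content duration="771.301"/>'
    '</item></channel></rss>')
item = doc.find('./channel/item')
ns = {'media': 'http://search.yahoo.com/mrss'}
assert item.find('media:thumbnail', ns).get('url') == 'http://example.com/thumb.jpg'
assert float(item.find('media:content', ns).get('duration')) == 771.301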
-# The server doesn't support HEAD request, the generic extractor can't detect -# the redirection -class LivestreamShortenerIE(InfoExtractor): - IE_NAME = 'livestream:shortener' - IE_DESC = False # Do not list - _VALID_URL = r'https?://livestre\.am/(?P<id>.+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - id = mobj.group('id') - webpage = self._download_webpage(url, id) - - return self.url_result(self._og_search_url(webpage)) diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py deleted file mode 100644 index 3e71852aa..000000000 --- a/youtube_dl/extractor/lnkgo.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - compat_str, - int_or_none, - parse_iso8601, -) - - -class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?' - _TESTS = [{ - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', - 'info_dict': { - 'id': '10809', - 'ext': 'mp4', - 'title': "Put'ka: Trys Klausimai", - 'upload_date': '20161216', - 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.', - 'age_limit': 18, - 'duration': 117, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1481904000, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', - 'info_dict': { - 'id': '10467', - 'ext': 'mp4', - 'title': 'Nėrdas: Kompiuterio Valymas', - 'upload_date': '20150113', - 'description': 'md5:7352d113a242a808676ff17e69db6a69', - 'age_limit': 18, - 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421164800, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', - 'only_matching': True, - }] - _AGE_LIMITS = { - 'N-7': 7, - 'N-14': 14, - 'S': 18, - } - _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' - - def _real_extract(self, url): - display_id, video_id = re.match(self._VALID_URL, url).groups() - - video_info = self._download_json( - 'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'), - display_id)['videoConfig']['videoInfo'] - - video_id = compat_str(video_info['id']) - title = video_info['title'] - prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' - formats = self._extract_m3u8_formats( - self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), - video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - - poster_image = video_info.get('posterImage') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None, - 'duration': int_or_none(video_info.get('duration')), - 'description': clean_html(video_info.get('htmlDescription')), - 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), - 'timestamp': parse_iso8601(video_info.get('airDate')), - 'view_count': int_or_none(video_info.get('viewsCount')), - } diff --git a/youtube_dl/extractor/localnews8.py b/youtube_dl/extractor/localnews8.py deleted file mode 100644 index aad396135..000000000 --- a/youtube_dl/extractor/localnews8.py +++ /dev/null @@ -1,47 +0,0 
@@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class LocalNews8IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304',
-        'md5': 'be4d48aea61aa2bde7be2ee47691ad20',
-        'info_dict': {
-            'id': '35183304',
-            'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
-            'ext': 'mp4',
-            'title': 'Rexburg business turns carbon fiber scraps into wedding ring',
-            'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.',
-            'duration': 153,
-            'timestamp': 1441844822,
-            'upload_date': '20150910',
-            'uploader_id': 'api',
-        }
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        webpage = self._download_webpage(url, display_id)
-
-        partner_id = self._search_regex(
-            r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1',
-            webpage, 'partner id', group='id')
-        kaltura_id = self._search_regex(
-            r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
-            webpage, 'video id', group='id')
-
-        return {
-            '_type': 'url_transparent',
-            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
-            'ie_key': 'Kaltura',
-            'id': video_id,
-            'display_id': display_id,
-        }
diff --git a/youtube_dl/extractor/lovehomeporn.py b/youtube_dl/extractor/lovehomeporn.py
deleted file mode 100644
index 8f65a3c03..000000000
--- a/youtube_dl/extractor/lovehomeporn.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .nuevo import NuevoBaseIE
-
-
-class LoveHomePornIE(NuevoBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?lovehomeporn\.com/video/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
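# Sketch (not part of the deleted files): LocalNews8IE above never downloads
# media itself. It scrapes the Kaltura partner and entry ids out of the page
# and returns a 'url_transparent' result, so the Kaltura extractor resolves
# the actual formats while the ids set here survive into the final metadata.
# The ids below are hypothetical; only the internal
# 'kaltura:<partner>:<entry>' URL scheme comes from the code above.
partner_id, kaltura_id = '1234', '0_abc123'
handoff = {
    '_type': 'url_transparent',
    'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
    'ie_key': 'Kaltura',
    'id': '35183304',
    'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
}
assert handoff['url'] == 'kaltura:1234:0_abc123'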
- _TEST = { - 'url': 'http://lovehomeporn.com/video/48483/stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick#menu', - 'info_dict': { - 'id': '48483', - 'display_id': 'stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick', - 'ext': 'mp4', - 'title': 'Stunning busty brunette girlfriend sucking and riding a big dick', - 'age_limit': 18, - 'duration': 238.47, - }, - 'params': { - 'skip_download': True, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - info = self._extract_nuevo( - 'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id, - video_id) - info.update({ - 'display_id': display_id, - 'age_limit': 18 - }) - return info diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py deleted file mode 100644 index 89d549858..000000000 --- a/youtube_dl/extractor/lrt.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - merge_dicts, -) - - -class LRTIE(InfoExtractor): - IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' - _TESTS = [{ - # m3u8 download - 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', - 'md5': '85cb2bb530f31d91a9c65b479516ade4', - 'info_dict': { - 'id': '2000127261', - 'ext': 'mp4', - 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', - 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', - 'duration': 3035, - 'timestamp': 1604079000, - 'upload_date': '20201030', - }, - }, { - # direct mp3 download - 'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/', - 'md5': '389da8ca3cad0f51d12bed0c844f6a0a', - 'info_dict': { - 'id': '1013074524', - 'ext': 'mp3', - 'title': 'Kita tema 2016-09-05 15:05', - 'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5', - 'duration': 3008, - 'view_count': int, - 'like_count': int, - }, - }] - - def _extract_js_var(self, webpage, var_name, default): - return self._search_regex( - r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, - webpage, var_name.replace('_', ' '), default, group=2) - - def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) - - media_url = self._extract_js_var(webpage, 'main_url', path) - media = self._download_json(self._extract_js_var( - webpage, 'media_info_url', - 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), - video_id, query={'url': media_url}) - jw_data = self._parse_jwplayer_data( - media['playlist_item'], video_id, base_url=url) - - json_ld_data = self._search_json_ld(webpage, video_id) - - tags = [] - for tag in (media.get('tags') or []): - tag_name = tag.get('name') - if not tag_name: - continue - tags.append(tag_name) - - clean_info = { - 'description': clean_html(media.get('content')), - 'tags': tags, - } - - return merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py deleted file mode 100644 index b3d8653d0..000000000 --- a/youtube_dl/extractor/lynda.py +++ /dev/null @@ -1,341 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - int_or_none, - 
urlencode_postdata, -) - - -class LyndaBaseIE(InfoExtractor): - _SIGNIN_URL = 'https://www.lynda.com/signin/lynda' - _PASSWORD_URL = 'https://www.lynda.com/signin/password' - _USER_URL = 'https://www.lynda.com/signin/user' - _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' - _NETRC_MACHINE = 'lynda' - - def _real_initialize(self): - self._login() - - @staticmethod - def _check_error(json_string, key_or_keys): - keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys - for key in keys: - error = json_string.get(key) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url): - action_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html, - 'post url', default=fallback_action_url, group='url') - - if not action_url.startswith('http'): - action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url) - - form_data = self._hidden_inputs(form_html) - form_data.update(extra_form_data) - - response = self._download_json( - action_url, None, note, - data=urlencode_postdata(form_data), - headers={ - 'Referer': referrer_url, - 'X-Requested-With': 'XMLHttpRequest', - }, expected_status=(418, 500, )) - - self._check_error(response, ('email', 'password', 'ErrorMessage')) - - return response, action_url - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - # Step 1: download signin page - signin_page = self._download_webpage( - self._SIGNIN_URL, None, 'Downloading signin page') - - # Already logged in - if any(re.search(p, signin_page) for p in ( - r'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): - return - - # Step 2: submit email - signin_form = self._search_regex( - r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)', - signin_page, 'signin form') - signin_page, signin_url = self._login_step( - signin_form, self._PASSWORD_URL, {'email': username}, - 'Submitting email', self._SIGNIN_URL) - - # Step 3: submit password - password_form = signin_page['body'] - self._login_step( - password_form, self._USER_URL, {'email': username, 'password': password}, - 'Submitting password', signin_url) - - -class LyndaIE(LyndaBaseIE): - IE_NAME = 'lynda' - IE_DESC = 'lynda.com videos' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?(?:lynda\.com|educourse\.ga)/ - (?: - (?:[^/]+/){2,3}(?P<course_id>\d+)| - player/embed - )/ - (?P<id>\d+) - ''' - - _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' - - _TESTS = [{ - 'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - # md5 is unstable - 'info_dict': { - 'id': '114408', - 'ext': 'mp4', - 'title': 'Using the exercise files', - 'duration': 68 - } - }, { - 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', - 'only_matching': True, - }, { - 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - 'only_matching': True, - }, { - 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', - 'only_matching': True, - }, { - # Status="NotFound", Message="Transcript not found" - 'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html', - 'only_matching': True, - }] - - def _raise_unavailable(self, video_id): - self.raise_login_required( - 'Video %s is only available for 
members' % video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - course_id = mobj.group('course_id') - - query = { - 'videoId': video_id, - 'type': 'video', - } - - video = self._download_json( - 'https://www.lynda.com/ajax/player', video_id, - 'Downloading video JSON', fatal=False, query=query) - - # Fallback scenario - if not video: - query['courseId'] = course_id - - play = self._download_json( - 'https://www.lynda.com/ajax/course/%s/%s/play' - % (course_id, video_id), video_id, 'Downloading play JSON') - - if not play: - self._raise_unavailable(video_id) - - formats = [] - for formats_dict in play: - urls = formats_dict.get('urls') - if not isinstance(urls, dict): - continue - cdn = formats_dict.get('name') - for format_id, format_url in urls.items(): - if not format_url: - continue - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id, - 'height': int_or_none(format_id), - }) - self._sort_formats(formats) - - conviva = self._download_json( - 'https://www.lynda.com/ajax/player/conviva', video_id, - 'Downloading conviva JSON', query=query) - - return { - 'id': video_id, - 'title': conviva['VideoTitle'], - 'description': conviva.get('VideoDescription'), - 'release_year': int_or_none(conviva.get('ReleaseYear')), - 'duration': int_or_none(conviva.get('Duration')), - 'creator': conviva.get('Author'), - 'formats': formats, - } - - if 'Status' in video: - raise ExtractorError( - 'lynda returned error: %s' % video['Message'], expected=True) - - if video.get('HasAccess') is False: - self._raise_unavailable(video_id) - - video_id = compat_str(video.get('ID') or video_id) - duration = int_or_none(video.get('DurationInSeconds')) - title = video['Title'] - - formats = [] - - fmts = video.get('Formats') - if fmts: - formats.extend([{ - 'url': f['Url'], - 'ext': f.get('Extension'), - 'width': int_or_none(f.get('Width')), - 'height': int_or_none(f.get('Height')), - 'filesize': int_or_none(f.get('FileSize')), - 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None, - } for f in fmts if f.get('Url')]) - - prioritized_streams = video.get('PrioritizedStreams') - if prioritized_streams: - for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): - formats.extend([{ - 'url': video_url, - 'height': int_or_none(format_id), - 'format_id': '%s-%s' % (prioritized_stream_id, format_id), - } for format_id, video_url in prioritized_stream.items()]) - - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = self.extract_subtitles(video_id) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats - } - - def _fix_subtitles(self, subs): - srt = '' - seq_counter = 0 - for pos in range(0, len(subs) - 1): - seq_current = subs[pos] - m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) - if m_current is None: - continue - seq_next = subs[pos + 1] - m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) - if m_next is None: - continue - appear_time = m_current.group('timecode') - disappear_time = m_next.group('timecode') - text = seq_current['Caption'].strip() - if text: - seq_counter += 1 - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text) - if srt: - return srt - - def _get_subtitles(self, video_id): - url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id - subs = self._download_webpage( - 
url, video_id, 'Downloading subtitles JSON', fatal=False) - if not subs or 'Status="NotFound"' in subs: - return {} - subs = self._parse_json(subs, video_id, fatal=False) - if not subs: - return {} - fixed_subs = self._fix_subtitles(subs) - if fixed_subs: - return {'en': [{'ext': 'srt', 'data': fixed_subs}]} - return {} - - -class LyndaCourseIE(LyndaBaseIE): - IE_NAME = 'lynda:course' - IE_DESC = 'lynda.com online courses' - - # Course link equals to welcome/introduction video link of same course - # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html' - - _TESTS = [{ - 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', - 'only_matching': True, - }, { - 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_path = mobj.group('coursepath') - course_id = mobj.group('courseid') - - item_template = 'https://www.lynda.com/%s/%%s-4.html' % course_path - - course = self._download_json( - 'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, - course_id, 'Downloading course JSON', fatal=False) - - if not course: - webpage = self._download_webpage(url, course_id) - entries = [ - self.url_result( - item_template % video_id, ie=LyndaIE.ie_key(), - video_id=video_id) - for video_id in re.findall( - r'data-video-id=["\'](\d+)', webpage)] - return self.playlist_result( - entries, course_id, - self._og_search_title(webpage, fatal=False), - self._og_search_description(webpage)) - - if course.get('Status') == 'NotFound': - raise ExtractorError( - 'Course %s does not exist' % course_id, expected=True) - - unaccessible_videos = 0 - entries = [] - - # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided - # by single video API anymore - - for chapter in course['Chapters']: - for video in chapter.get('Videos', []): - if video.get('HasAccess') is False: - unaccessible_videos += 1 - continue - video_id = video.get('ID') - if video_id: - entries.append({ - '_type': 'url_transparent', - 'url': item_template % video_id, - 'ie_key': LyndaIE.ie_key(), - 'chapter': chapter.get('Title'), - 'chapter_number': int_or_none(chapter.get('ChapterIndex')), - 'chapter_id': compat_str(chapter.get('ID')), - }) - - if unaccessible_videos > 0: - self._downloader.report_warning( - '%s videos are only available for members (or paid members) and will not be downloaded. 
' - % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT) - - course_title = course.get('Title') - course_description = course.get('Description') - - return self.playlist_result(entries, course_id, course_title, course_description) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py deleted file mode 100644 index 65cc474db..000000000 --- a/youtube_dl/extractor/mailru.py +++ /dev/null @@ -1,329 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import json -import re - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, - parse_duration, - remove_end, - try_get, -) - - -class MailRuIE(InfoExtractor): - IE_NAME = 'mailru' - IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|m)\.)?my\.mail\.ru/+ - (?: - video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)| - (?:(?P<idv2prefix>(?:[^/]+/+){2})video/(?P<idv2suffix>[^/]+/\d+))\.html| - (?:video/embed|\+/video/meta)/(?P<metaid>\d+) - ) - ''' - _TESTS = [ - { - 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', - 'md5': 'dea205f03120046894db4ebb6159879a', - 'info_dict': { - 'id': '46301138_76', - 'ext': 'mp4', - 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', - 'timestamp': 1393235077, - 'upload_date': '20140224', - 'uploader': 'sonypicturesrus', - 'uploader_id': 'sonypicturesrus@mail.ru', - 'duration': 184, - }, - 'skip': 'Not accessible from Travis CI server', - }, - { - 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', - 'md5': '00a91a58c3402204dcced523777b475f', - 'info_dict': { - 'id': '46843144_1263', - 'ext': 'mp4', - 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', - 'timestamp': 1397039888, - 'upload_date': '20140409', - 'uploader': 'hitech', - 'uploader_id': 'hitech@corp.mail.ru', - 'duration': 245, - }, - 'skip': 'Not accessible from Travis CI server', - }, - { - # only available via metaUrl API - 'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html', - 'md5': '3b26d2491c6949d031a32b96bd97c096', - 'info_dict': { - 'id': '56664382_502', - 'ext': 'mp4', - 'title': ':8336', - 'timestamp': 1449094163, - 'upload_date': '20151202', - 'uploader': '720pizle@mail.ru', - 'uploader_id': '720pizle@mail.ru', - 'duration': 6001, - }, - 'skip': 'Not accessible from Travis CI server', - }, - { - 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html', - 'only_matching': True, - }, - { - 'url': 'https://my.mail.ru/video/embed/7949340477499637815', - 'only_matching': True, - }, - { - 'url': 'http://my.mail.ru/+/video/meta/7949340477499637815', - 'only_matching': True, - }, - { - 'url': 'https://my.mail.ru//list/sinyutin10/video/_myvideo/4.html', - 'only_matching': True, - }, - { - 'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - meta_id = mobj.group('metaid') - - video_id = None - if meta_id: - meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id - else: - video_id = mobj.group('idv1') - if not video_id: - video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') - webpage = self._download_webpage(url, video_id) - page_config = self._parse_json(self._search_regex( - r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', - webpage, 'page config', default='{}'), video_id, fatal=False) - if page_config: - meta_url = page_config.get('metaUrl') or 
page_config.get('video', {}).get('metaUrl') - else: - meta_url = None - - video_data = None - if meta_url: - video_data = self._download_json( - meta_url, video_id or meta_id, 'Downloading video meta JSON', - fatal=not video_id) - - # Fallback old approach - if not video_data: - video_data = self._download_json( - 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, - video_id, 'Downloading video JSON') - - headers = {} - - video_key = self._get_cookies('https://my.mail.ru').get('video_key') - if video_key: - headers['Cookie'] = 'video_key=%s' % video_key.value - - formats = [] - for f in video_data['videos']: - video_url = f.get('url') - if not video_url: - continue - format_id = f.get('key') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'height': height, - 'http_headers': headers, - }) - self._sort_formats(formats) - - meta_data = video_data['meta'] - title = remove_end(meta_data['title'], '.mp4') - - author = video_data.get('author') - uploader = author.get('name') - uploader_id = author.get('id') or author.get('email') - view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count')) - - acc_id = meta_data.get('accId') - item_id = meta_data.get('itemId') - content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id - - thumbnail = meta_data.get('poster') - duration = int_or_none(meta_data.get('duration')) - timestamp = int_or_none(meta_data.get('timestamp')) - - return { - 'id': content_id, - 'title': title, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } - - -class MailRuMusicSearchBaseIE(InfoExtractor): - def _search(self, query, url, audio_id, limit=100, offset=0): - search = self._download_json( - 'https://my.mail.ru/cgi-bin/my/ajax', audio_id, - 'Downloading songs JSON page %d' % (offset // limit + 1), - headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }, query={ - 'xemail': '', - 'ajax_call': '1', - 'func_name': 'music.search', - 'mna': '', - 'mnb': '', - 'arg_query': query, - 'arg_extended': '1', - 'arg_search_params': json.dumps({ - 'music': { - 'limit': limit, - 'offset': offset, - }, - }), - 'arg_limit': limit, - 'arg_offset': offset, - }) - return next(e for e in search if isinstance(e, dict)) - - @staticmethod - def _extract_track(t, fatal=True): - audio_url = t['URL'] if fatal else t.get('URL') - if not audio_url: - return - - audio_id = t['File'] if fatal else t.get('File') - if not audio_id: - return - - thumbnail = t.get('AlbumCoverURL') or t.get('FiledAlbumCover') - uploader = t.get('OwnerName') or t.get('OwnerName_Text_HTML') - uploader_id = t.get('UploaderID') - duration = int_or_none(t.get('DurationInSeconds')) or parse_duration( - t.get('Duration') or t.get('DurationStr')) - view_count = int_or_none(t.get('PlayCount') or t.get('PlayCount_hr')) - - track = t.get('Name') or t.get('Name_Text_HTML') - artist = t.get('Author') or t.get('Author_Text_HTML') - - if track: - title = '%s - %s' % (artist, track) if artist else track - else: - title = audio_id - - return { - 'extractor_key': MailRuMusicIE.ie_key(), - 'id': audio_id, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'view_count': view_count, - 'vcodec': 'none', - 'abr': 
int_or_none(t.get('BitRate')), - 'track': track, - 'artist': artist, - 'album': t.get('Album'), - 'url': audio_url, - } - - -class MailRuMusicIE(MailRuMusicSearchBaseIE): - IE_NAME = 'mailru:music' - IE_DESC = 'Музыка@Mail.Ru' - _VALID_URL = r'https?://my\.mail\.ru/+music/+songs/+[^/?#&]+-(?P<id>[\da-f]+)' - _TESTS = [{ - 'url': 'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893', - 'md5': '0f8c22ef8c5d665b13ac709e63025610', - 'info_dict': { - 'id': '4e31f7125d0dfaef505d947642366893', - 'ext': 'mp3', - 'title': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017 - М8Л8ТХ', - 'uploader': 'Игорь Мудрый', - 'uploader_id': '1459196328', - 'duration': 280, - 'view_count': int, - 'vcodec': 'none', - 'abr': 320, - 'track': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017', - 'artist': 'М8Л8ТХ', - }, - }] - - def _real_extract(self, url): - audio_id = self._match_id(url) - - webpage = self._download_webpage(url, audio_id) - - title = self._og_search_title(webpage) - music_data = self._search(title, url, audio_id)['MusicData'] - t = next(t for t in music_data if t.get('File') == audio_id) - - info = self._extract_track(t) - info['title'] = title - return info - - -class MailRuMusicSearchIE(MailRuMusicSearchBaseIE): - IE_NAME = 'mailru:music:search' - IE_DESC = 'Музыка@Mail.Ru' - _VALID_URL = r'https?://my\.mail\.ru/+music/+search/+(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://my.mail.ru/music/search/black%20shadow', - 'info_dict': { - 'id': 'black shadow', - }, - 'playlist_mincount': 532, - }] - - def _real_extract(self, url): - query = compat_urllib_parse_unquote(self._match_id(url)) - - entries = [] - - LIMIT = 100 - offset = 0 - - for _ in itertools.count(1): - search = self._search(query, url, query, LIMIT, offset) - - music_data = search.get('MusicData') - if not music_data or not isinstance(music_data, list): - break - - for t in music_data: - track = self._extract_track(t, fatal=False) - if track: - entries.append(track) - - total = try_get( - search, lambda x: x['Results']['music']['Total'], int) - - if total is not None: - if offset > total: - break - - offset += LIMIT - - return self.playlist_result(entries, query) diff --git a/youtube_dl/extractor/massengeschmacktv.py b/youtube_dl/extractor/massengeschmacktv.py deleted file mode 100644 index cfcc6b224..000000000 --- a/youtube_dl/extractor/massengeschmacktv.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - int_or_none, - js_to_json, - mimetype2ext, - parse_filesize, -) - - -class MassengeschmackTVIE(InfoExtractor): - IE_NAME = 'massengeschmack.tv' - _VALID_URL = r'https?://(?:www\.)?massengeschmack\.tv/play/(?P<id>[^?&#]+)' - - _TEST = { - 'url': 'https://massengeschmack.tv/play/fktv202', - 'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3', - 'info_dict': { - 'id': 'fktv202', - 'ext': 'mp4', - 'title': 'Fernsehkritik-TV - Folge 202', - }, - } - - def _real_extract(self, url): - episode = self._match_id(url) - - webpage = self._download_webpage(url, episode) - title = clean_html(self._html_search_regex( - '<h3>([^<]+)</h3>', webpage, 'title')) - thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) - sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) - - formats = [] - for source in sources: - furl = 
source.get('src') - if not furl: - continue - furl = self._proto_relative_url(furl) - ext = determine_ext(furl) or mimetype2ext(source.get('type')) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - furl, episode, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': furl, - 'format_id': determine_ext(furl), - }) - - for (durl, format_id, width, height, filesize) in re.findall(r'''(?x) - <a[^>]+?href="(?P<url>(?:https:)?//[^"]+)".*? - <strong>(?P<format_id>.+?)</strong>.*? - <small>(?:(?P<width>\d+)x(?P<height>\d+))?\s+?\((?P<filesize>[\d,]+\s*[GM]iB)\)</small> - ''', webpage): - formats.append({ - 'url': durl, - 'format_id': format_id, - 'width': int_or_none(width), - 'height': int_or_none(height), - 'filesize': parse_filesize(filesize), - 'vcodec': 'none' if format_id.startswith('Audio') else None, - }) - - self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) - - return { - 'id': episode, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py deleted file mode 100644 index dc6aa9819..000000000 --- a/youtube_dl/extractor/mdr.py +++ /dev/null @@ -1,195 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - parse_iso8601, - url_or_none, - xpath_text, -) - - -class MDRIE(InfoExtractor): - IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html' - - _GEO_COUNTRIES = ['DE'] - - _TESTS = [{ - # MDR regularly deletes its videos - 'url': 'http://www.mdr.de/fakt/video189002.html', - 'only_matching': True, - }, { - # audio - 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', - 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', - 'info_dict': { - 'id': '1312272', - 'ext': 'mp3', - 'title': 'Feuilleton vom 30. Oktober 2015', - 'duration': 250, - 'uploader': 'MITTELDEUTSCHER RUNDFUNK', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'duration': 134, - 'uploader': 'KIKA', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1482541200, - 'upload_date': '20161224', - 'duration': 4628, - 'uploader': 'KIKA', - }, - }, { - # audio with alternative playerURL pattern - 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', - 'info_dict': { - 'id': '100', - 'ext': 'mp4', - 'title': 'Feature: Operation Mindfuck - Robert Anton Wilson', - 'duration': 3239, - 'uploader': 'MITTELDEUTSCHER RUNDFUNK', - }, - }, { - # empty bitrateVideo and bitrateAudio - 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', - 'info_dict': { - 'id': '128372', - 'ext': 'mp4', - 'title': 'Der kleine Wichtel kehrt zurück', - 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', - 'duration': 4876, - 'timestamp': 1607823300, - 'upload_date': '20201213', - 'uploader': 'ZDF', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'only_matching': True, - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'only_matching': True, - }, { - 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = self._search_regex( - r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>.+?-avCustom\.xml)\1', - webpage, 'data url', group='url').replace(r'\/', '/') - - doc = self._download_xml( - compat_urlparse.urljoin(url, data_url), video_id) - - title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) - - type_ = xpath_text(doc, './type', default=None) - - formats = [] - processed_urls = [] - for asset in doc.findall('./assets/asset'): - for source in ( - 'download', - 'progressiveDownload', - 'dynamicHttpStreamingRedirector', - 'adaptiveHttpStreamingRedirector'): - url_el = asset.find('./%sUrl' % source) - if url_el is None: - continue - - video_url = url_or_none(url_el.text) - if not video_url or video_url in processed_urls: - continue - - processed_urls.append(video_url) - - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=0, m3u8_id='HLS', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, - preference=0, f4m_id='HDS', fatal=False)) - else: - media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') - vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) - abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) - - format_id = [media_type] - if vbr or abr: - format_id.append(compat_str(vbr or abr)) - - f = { - 'url': video_url, - 'format_id': '-'.join(format_id), - 'filesize': filesize, - 'abr': abr, - 'vbr': vbr, - } - - if vbr: - f.update({ - 'width': int_or_none(xpath_text(asset, './frameWidth', 
'width')), - 'height': int_or_none(xpath_text(asset, './frameHeight', 'height')), - }) - - if type_ == 'audio': - f['vcodec'] = 'none' - - formats.append(f) - - self._sort_formats(formats) - - description = xpath_text(doc, './broadcast/broadcastDescription', 'description') - timestamp = parse_iso8601( - xpath_text( - doc, [ - './broadcast/broadcastDate', - './broadcast/broadcastStartDate', - './broadcast/broadcastEndDate'], - 'timestamp', default=None)) - duration = parse_duration(xpath_text(doc, './duration', 'duration')) - uploader = xpath_text(doc, './rights', 'uploader') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'duration': duration, - 'uploader': uploader, - 'formats': formats, - } diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py deleted file mode 100644 index 67bb4debb..000000000 --- a/youtube_dl/extractor/medaltv.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - str_or_none, - try_get, -) - - -class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', - 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', - 'info_dict': { - 'id': '2mA60jWAGQCBH', - 'ext': 'mp4', - 'title': 'Quad Cold', - 'description': 'Medal,https://medal.tv/desktop/', - 'uploader': 'MowgliSB', - 'timestamp': 1603165266, - 'upload_date': '20201020', - 'uploader_id': '10619174', - } - }, { - 'url': 'https://medal.tv/clips/2um24TWdty0NA', - 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', - 'info_dict': { - 'id': '2um24TWdty0NA', - 'ext': 'mp4', - 'title': 'u tk me i tk u bigger', - 'description': 'Medal,https://medal.tv/desktop/', - 'uploader': 'Mimicc', - 'timestamp': 1605580939, - 'upload_date': '20201117', - 'uploader_id': '5156321', - } - }, { - 'url': 'https://medal.tv/clips/37rMeFpryCC-9', - 'only_matching': True, - }, { - 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - hydration_data = self._parse_json(self._search_regex( - r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', - webpage, 'hydration data', default='{}'), video_id) - - clip = try_get( - hydration_data, lambda x: x['clips'][video_id], dict) or {} - if not clip: - raise ExtractorError( - 'Could not find video information.', video_id=video_id) - - title = clip['contentTitle'] - - source_width = int_or_none(clip.get('sourceWidth')) - source_height = int_or_none(clip.get('sourceHeight')) - - aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 - - def add_item(container, item_url, height, id_key='format_id', item_id=None): - item_id = item_id or '%dp' % height - if item_id not in item_url: - return - width = int(round(aspect_ratio * height)) - container.append({ - 'url': item_url, - id_key: item_id, - 'width': width, - 'height': height - }) - - formats = [] - thumbnails = [] - for k, v in clip.items(): - if not (v and isinstance(v, compat_str)): - continue - mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) - if not mobj: - continue - prefix = mobj.group(1) - height = int_or_none(mobj.group(2)) - if prefix == 'contentUrl': - add_item( - formats, v, height or 
source_height, - item_id=None if height else 'source') - elif prefix == 'thumbnail': - add_item(thumbnails, v, height, 'id') - - error = clip.get('error') - if not formats and error: - if error == 404: - raise ExtractorError( - 'That clip does not exist.', - expected=True, video_id=video_id) - else: - raise ExtractorError( - 'An unknown error occurred ({0}).'.format(error), - video_id=video_id) - - self._sort_formats(formats) - - # Necessary because the id of the author is not known in advance. - # Won't raise an issue if no profile can be found as this is optional. - author = try_get( - hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} - author_id = str_or_none(author.get('id')) - author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': clip.get('contentDescription'), - 'uploader': author.get('displayName'), - 'timestamp': float_or_none(clip.get('created'), 1000), - 'uploader_id': author_id, - 'uploader_url': author_url, - 'duration': int_or_none(clip.get('videoLengthSeconds')), - 'view_count': int_or_none(clip.get('views')), - 'like_count': int_or_none(clip.get('likes')), - 'comment_count': int_or_none(clip.get('comments')), - } diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py deleted file mode 100644 index 2c16fc9e2..000000000 --- a/youtube_dl/extractor/mediaset.py +++ /dev/null @@ -1,182 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .theplatform import ThePlatformBaseIE -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - int_or_none, - update_url_query, -) - - -class MediasetIE(ThePlatformBaseIE): - _TP_TLD = 'eu' - _VALID_URL = r'''(?x) - (?: - mediaset:| - https?:// - (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ - (?: - (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| - player/index\.html\?.*?\bprogramGuid= - ) - )(?P<id>[0-9A-Z]{16,}) - ''' - _TESTS = [{ - # full episode - 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824', - 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', - 'info_dict': { - 'id': 'FAFU000000661824', - 'ext': 'mp4', - 'title': 'Quarta puntata', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1414.26, - 'upload_date': '20161107', - 'series': 'Hello Goodbye', - 'timestamp': 1478532900, - 'uploader': 'Rete 4', - 'uploader_id': 'R4', - }, - }, { - 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', - 'md5': '288532f0ad18307705b01e581304cd7b', - 'info_dict': { - 'id': 'F309013801000501', - 'ext': 'mp4', - 'title': 'Puntata del 25 maggio', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 6565.007, - 'upload_date': '20180526', - 'series': 'Matrix', - 'timestamp': 1527326245, - 'uploader': 'Canale 5', - 'uploader_id': 'C5', - }, - }, { - # clip - 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', - 'only_matching': True, - }, { - # iframe simple - 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924', - 'only_matching': True, - }, { - # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) - 'url': 
'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', - 'only_matching': True, - }, { - 'url': 'mediaset:FAFU000000665924', - 'only_matching': True, - }, { - 'url': 'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295', - 'only_matching': True, - }, { - 'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02', - 'only_matching': True, - }, { - 'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01', - 'only_matching': True, - }, { - 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', - 'only_matching': True, - }, { - 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(ie, webpage): - def _qs(url): - return compat_parse_qs(compat_urllib_parse_urlparse(url).query) - - def _program_guid(qs): - return qs.get('programGuid', [None])[0] - - entries = [] - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', - webpage): - embed_url = mobj.group('url') - embed_qs = _qs(embed_url) - program_guid = _program_guid(embed_qs) - if program_guid: - entries.append(embed_url) - continue - video_id = embed_qs.get('id', [None])[0] - if not video_id: - continue - urlh = ie._request_webpage( - embed_url, video_id, note='Following embed URL redirect') - embed_url = urlh.geturl() - program_guid = _program_guid(_qs(embed_url)) - if program_guid: - entries.append(embed_url) - return entries - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - for video in smil.findall(self._xpath_ns('.//video', namespace)): - video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) - return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) - - def _real_extract(self, url): - guid = self._match_id(url) - tp_path = 'PR1GhC/media/guid/2702976343/' + guid - info = self._extract_theplatform_metadata(tp_path, guid) - - formats = [] - subtitles = {} - first_e = None - for asset_type in ('SD', 'HD'): - # TODO: fixup ISM+none manifest URLs - for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): - try: - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { - 'mbr': 'true', - 'formats': f, - 'assetTypes': asset_type, - }), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type)) - except ExtractorError as e: - if not first_e: - first_e = e - break - for tp_f in tp_formats: - tp_f['quality'] = 1 if asset_type == 'HD' else 0 - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - if first_e and not formats: - raise first_e - self._sort_formats(formats) - - fields = [] - for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))): - fields.extend(templ % repl for repl in repls) - feed_data = self._download_json( - 
'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid, - guid, fatal=False, query={'fields': ','.join(fields)}) - if feed_data: - publish_info = feed_data.get('mediasetprogram$publishInfo') or {} - info.update({ - 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), - 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), - 'series': feed_data.get('mediasetprogram$brandTitle'), - 'uploader': publish_info.get('description'), - 'uploader_id': publish_info.get('channel'), - 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), - }) - - info.update({ - 'id': guid, - 'formats': formats, - 'subtitles': subtitles, - }) - return info diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py deleted file mode 100644 index d6eb15740..000000000 --- a/youtube_dl/extractor/mediasite.py +++ /dev/null @@ -1,366 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - float_or_none, - mimetype2ext, - str_or_none, - try_get, - unescapeHTML, - unsmuggle_url, - url_or_none, - urljoin, -) - - -_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' - - -class MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE - _TESTS = [ - { - 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', - 'info_dict': { - 'id': '2db6c271681e4f199af3c60d1f82869b1d', - 'ext': 'mp4', - 'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles', - 'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.', - 'timestamp': 1474268400.0, - 'upload_date': '20160919', - }, - }, - { - 'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb', - 'info_dict': { - 'id': '90bb363295d945d6b548c867d01181361d', - 'ext': 'mp4', - 'upload_date': '20150429', - 'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity', - 'timestamp': 1430311380.0, - }, - }, - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', - 'md5': '481fda1c11f67588c0d9d8fbdced4e39', - 'info_dict': { - 'id': '585a43626e544bdd97aeb71a0ec907a01d', - 'ext': 'mp4', - 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$', - 'duration': 7713.088, - 'timestamp': 1413309600, - 'upload_date': '20141014', - }, - }, - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', - 'md5': 'ef1fdded95bdf19b12c5999949419c92', - 'info_dict': { - 'id': '86a9ea9f53e149079fbdb4202b521ed21d', - 'ext': 'wmv', - 'title': '64ste Vakantiecursus: Afvalwater', - 'description': 'md5:7fd774865cc69d972f542b157c328305', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 10853, - 'timestamp': 1326446400, - 'upload_date': '20120113', - }, - }, - { - 'url': 
'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', - 'md5': '9422edc9b9a60151727e4b6d8bef393d', - 'info_dict': { - 'id': '24aace4429fc450fb5b38cdbf424a66e1d', - 'ext': 'mp4', - 'title': 'Xyce Software Training - Section 1', - 'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120409', - 'timestamp': 1333983600, - 'duration': 7794, - } - }, - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', - 'only_matching': True, - }, - { - 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', - 'only_matching': True, - }, - { - # dashed id - 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', - 'only_matching': True, - } - ] - - # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) - _STREAM_TYPES = { - 0: 'video1', # the main video - 2: 'slide', - 3: 'presentation', - 4: 'video2', # screencast? - 5: 'video3', - } - - @staticmethod - def _extract_urls(webpage): - return [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer( - r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, - webpage)] - - def _real_extract(self, url): - url, data = unsmuggle_url(url, {}) - mobj = re.match(self._VALID_URL, url) - resource_id = mobj.group('id') - query = mobj.group('query') - - webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? - redirect_url = urlh.geturl() - - # XXX: might have also extracted UrlReferrer and QueryString from the html - service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( - r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id, - default='/Mediasite/PlayerService/PlayerService.svc/json')) - - player_options = self._download_json( - '%s/GetPlayerOptions' % service_path, resource_id, - headers={ - 'Content-type': 'application/json; charset=utf-8', - 'X-Requested-With': 'XMLHttpRequest', - }, - data=json.dumps({ - 'getPlayerOptionsRequest': { - 'ResourceId': resource_id, - 'QueryString': query, - 'UrlReferrer': data.get('UrlReferrer', ''), - 'UseScreenReader': False, - } - }).encode('utf-8'))['d'] - - presentation = player_options['Presentation'] - title = presentation['Title'] - - if presentation is None: - raise ExtractorError( - 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], - expected=True) - - thumbnails = [] - formats = [] - for snum, Stream in enumerate(presentation['Streams']): - stream_type = Stream.get('StreamType') - if stream_type is None: - continue - - video_urls = Stream.get('VideoUrls') - if not isinstance(video_urls, list): - video_urls = [] - - stream_id = self._STREAM_TYPES.get( - stream_type, 'type%u' % stream_type) - - stream_formats = [] - for unum, VideoUrl in enumerate(video_urls): - video_url = url_or_none(VideoUrl.get('Location')) - if not video_url: - continue - # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS - - media_type = VideoUrl.get('MediaType') - if media_type == 'SS': - stream_formats.extend(self._extract_ism_formats( - video_url, resource_id, - ism_id='%s-%u.%u' % (stream_id, snum, unum), - fatal=False)) - elif media_type == 'Dash': - stream_formats.extend(self._extract_mpd_formats( - video_url, resource_id, - mpd_id='%s-%u.%u' % (stream_id, snum, unum), - fatal=False)) - else: - stream_formats.append({ - 'format_id': '%s-%u.%u' % 
(stream_id, snum, unum), - 'url': video_url, - 'ext': mimetype2ext(VideoUrl.get('MimeType')), - }) - - # TODO: if Stream['HasSlideContent']: - # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) - # from Stream['Slides'] - # this will require writing a custom downloader... - - # disprefer 'secondary' streams - if stream_type != 0: - for fmt in stream_formats: - fmt['preference'] = -1 - - thumbnail_url = Stream.get('ThumbnailUrl') - if thumbnail_url: - thumbnails.append({ - 'id': '%s-%u' % (stream_id, snum), - 'url': urljoin(redirect_url, thumbnail_url), - 'preference': -1 if stream_type != 0 else 0, - }) - formats.extend(stream_formats) - - self._sort_formats(formats) - - # XXX: Presentation['Presenters'] - # XXX: Presentation['Transcript'] - - return { - 'id': resource_id, - 'title': title, - 'description': presentation.get('Description'), - 'duration': float_or_none(presentation.get('Duration'), 1000), - 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), - 'formats': formats, - 'thumbnails': thumbnails, - } - - -class MediasiteCatalogIE(InfoExtractor): - _VALID_URL = r'''(?xi) - (?P<url>https?://[^/]+/Mediasite) - /Catalog/Full/ - (?P<catalog_id>{0}) - (?: - /(?P<current_folder_id>{0}) - /(?P<root_dynamic_folder_id>{0}) - )? - '''.format(_ID_RE) - _TESTS = [{ - 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', - 'info_dict': { - 'id': '631f9e48530d454381549f955d08c75e21', - 'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically', - }, - 'playlist_count': 6, - 'expected_warnings': ['is not a supported codec'], - }, { - # with CurrentFolderId and RootDynamicFolderId - 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', - 'info_dict': { - 'id': '9518c4a6c5cf4993b21cbd53e828a92521', - 'title': 'IUSM Family and Friends Sessions', - }, - 'playlist_count': 2, - }, { - 'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21', - 'only_matching': True, - }, { - # no AntiForgeryToken - 'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21', - 'only_matching': True, - }, { - 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', - 'only_matching': True, - }, { - # dashed id - 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - mediasite_url = mobj.group('url') - catalog_id = mobj.group('catalog_id') - current_folder_id = mobj.group('current_folder_id') or catalog_id - root_dynamic_folder_id = mobj.group('root_dynamic_folder_id') - - webpage = self._download_webpage(url, catalog_id) - - # AntiForgeryToken is optional (e.g. [1]) - # 1. 
https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21 - anti_forgery_token = self._search_regex( - r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'anti forgery token', default=None, group='value') - if anti_forgery_token: - anti_forgery_header = self._search_regex( - r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'anti forgery header name', - default='X-SOFO-AntiForgeryHeader', group='value') - - data = { - 'IsViewPage': True, - 'IsNewFolder': True, - 'AuthTicket': None, - 'CatalogId': catalog_id, - 'CurrentFolderId': current_folder_id, - 'RootDynamicFolderId': root_dynamic_folder_id, - 'ItemsPerPage': 1000, - 'PageIndex': 0, - 'PermissionMask': 'Execute', - 'CatalogSearchType': 'SearchInFolder', - 'SortBy': 'Date', - 'SortDirection': 'Descending', - 'StartDate': None, - 'EndDate': None, - 'StatusFilterList': None, - 'PreviewKey': None, - 'Tags': [], - } - - headers = { - 'Content-Type': 'application/json; charset=UTF-8', - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - } - if anti_forgery_token: - headers[anti_forgery_header] = anti_forgery_token - - catalog = self._download_json( - '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, - catalog_id, data=json.dumps(data).encode(), headers=headers) - - entries = [] - for video in catalog['PresentationDetailsList']: - if not isinstance(video, dict): - continue - video_id = str_or_none(video.get('Id')) - if not video_id: - continue - entries.append(self.url_result( - '%s/Play/%s' % (mediasite_url, video_id), - ie=MediasiteIE.ie_key(), video_id=video_id)) - - title = try_get( - catalog, lambda x: x['CurrentFolder']['Name'], compat_str) - - return self.playlist_result(entries, catalog_id, title,) - - -class MediasiteNamedCatalogIE(InfoExtractor): - _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - mediasite_url = mobj.group('url') - catalog_name = mobj.group('catalog_name') - - webpage = self._download_webpage(url, catalog_name) - - catalog_id = self._search_regex( - r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') - - return self.url_result( - '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), - ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py deleted file mode 100644 index 9e92416d1..000000000 --- a/youtube_dl/extractor/metacafe.py +++ /dev/null @@ -1,287 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse, - compat_urllib_parse_unquote, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - get_element_by_attribute, - mimetype2ext, -) - - -class MetacafeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = 'metacafe' - _TESTS = [ - # Youtube video - { - 'add_ie': ['Youtube'], - 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/', - 'info_dict': { - 'id': 
'_aUehQsCQtM', - 'ext': 'mp4', - 'upload_date': '20090102', - 'title': 'The Electric Company | "Short I" | PBS KIDS GO!', - 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8', - 'uploader': 'PBS', - 'uploader_id': 'PBS' - } - }, - # Normal metacafe video - { - 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', - 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad', - 'info_dict': { - 'id': '11121940', - 'ext': 'mp4', - 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4', - 'uploader': 'ign', - 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', - }, - 'skip': 'Page is temporarily unavailable.', - }, - # metacafe video with family filter - { - 'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/', - 'md5': 'b06082c5079bbdcde677a6291fbdf376', - 'info_dict': { - 'id': '2155630', - 'ext': 'mp4', - 'title': 'Adult Art By David Hart 156', - 'uploader': '63346', - 'description': 'md5:9afac8fc885252201ad14563694040fc', - }, - 'params': { - 'skip_download': True, - }, - }, - # AnyClip video - { - 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/', - 'info_dict': { - 'id': 'an-dVVXnuY7Jh77J', - 'ext': 'mp4', - 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', - 'uploader': 'AnyClip', - 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', - }, - }, - # age-restricted video - { - 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', - 'md5': '98dde7c1a35d02178e8ab7560fe8bd09', - 'info_dict': { - 'id': '5186653', - 'ext': 'mp4', - 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', - 'uploader': 'Dwayne Pipe', - 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b', - 'age_limit': 18, - }, - }, - # cbs video - { - 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/', - 'info_dict': { - 'id': '8VD4r_Zws8VP', - 'ext': 'flv', - 'title': 'Open: This is Face the Nation, February 9', - 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476', - 'duration': 96, - 'uploader': 'CBSI-NEW', - 'upload_date': '20140209', - 'timestamp': 1391959800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - # Movieclips.com video - { - 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', - 'info_dict': { - 'id': 'mv-Wy7ZU', - 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.', - 'uploader': 'movie_trailers', - 'duration': 176, - }, - 'params': { - 'skip_download': 'requires rtmpdump', - } - } - ] - - def report_disclaimer(self): - self.to_screen('Retrieving disclaimer') - - def _real_extract(self, url): - # Extract id and simplified title from URL - video_id, display_id = re.match(self._VALID_URL, url).groups() - - # the video may come from an external site - m_external = re.match(r'^(\w{2})-(.*)$', video_id) - if m_external is not None: - prefix, ext_id = m_external.groups() - # Check if video comes from YouTube - if prefix == 'yt': - return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube') - # CBS videos use theplatform.com - if prefix == 'cb': - return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - - headers = { - # 
Disable family filter - 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) - } - - # AnyClip videos require the flashversion cookie so that we get the link - # to the mp4 file - if video_id.startswith('an-'): - headers['Cookie'] += 'flashVersion=0; ' - - # Retrieve video webpage to extract further information - webpage = self._download_webpage(url, video_id, headers=headers) - - error = get_element_by_attribute( - 'class', 'notfound-page-title', webpage) - if error: - raise ExtractorError(error, expected=True) - - video_title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - video_url = None - mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse_unquote(mobj.group(1)) - video_ext = determine_ext(mediaURL) - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - if video_url is None: - mobj = re.search(r'<video src="([^"]+)"', webpage) - if mobj: - video_url = mobj.group(1) - video_ext = 'mp4' - if video_url is None: - flashvars = self._search_regex( - r' name="flashvars" value="(.*?)"', webpage, 'flashvars', - default=None) - if flashvars: - vardict = compat_parse_qs(flashvars) - if 'mediaData' not in vardict: - raise ExtractorError('Unable to extract media URL') - mobj = re.search( - r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - video_ext = determine_ext(video_url) - if video_url is None: - player_url = self._search_regex( - r"swfobject\.embedSWF\('([^']+)'", - webpage, 'config URL', default=None) - if player_url: - config_url = self._search_regex( - r'config=(.+)$', player_url, 'config URL') - config_doc = self._download_xml( - config_url, video_id, - note='Downloading video config') - smil_url = config_doc.find('.//properties').attrib['smil_file'] - smil_doc = self._download_xml( - smil_url, video_id, - note='Downloading SMIL document') - base_url = smil_doc.find('./head/meta').attrib['base'] - video_url = [] - for vn in smil_doc.findall('.//video'): - br = int(vn.attrib['system-bitrate']) - play_path = vn.attrib['src'] - video_url.append({ - 'format_id': 'smil-%d' % br, - 'url': base_url, - 'play_path': play_path, - 'page_url': url, - 'player_url': player_url, - 'ext': play_path.partition(':')[0], - }) - if video_url is None: - flashvars = self._parse_json(self._search_regex( - r'flashvars\s*=\s*({.*});', webpage, 'flashvars', - default=None), video_id, fatal=False) - if flashvars: - video_url = [] - for source in flashvars.get('sources'): - source_url = source.get('src') - if not source_url: - continue - ext = mimetype2ext(source.get('type')) or determine_ext(source_url) - if ext == 'm3u8': - video_url.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - video_url.append({ - 'url': source_url, - 'ext': ext, - }) - - if video_url is None: - raise ExtractorError('Unsupported video type') - - description = self._html_search_meta( - ['og:description', 
'twitter:description', 'description'], - webpage, 'title', fatal=False) - thumbnail = self._html_search_meta( - ['og:image', 'twitter:image'], webpage, 'title', fatal=False) - video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', - webpage, 'uploader nickname', fatal=False) - duration = int_or_none( - self._html_search_meta('video:duration', webpage, default=None)) - age_limit = ( - 18 - if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) - else 0) - - if isinstance(video_url, list): - formats = video_url - else: - formats = [{ - 'url': video_url, - 'ext': video_ext, - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'description': description, - 'uploader': video_uploader, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': age_limit, - 'formats': formats, - 'duration': duration, - } diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py deleted file mode 100644 index 7d468d78b..000000000 --- a/youtube_dl/extractor/metacritic.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - fix_xml_ampersands, -) - - -class MetacriticIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacritic\.com/.+?/trailers/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', - 'info_dict': { - 'id': '3698222', - 'ext': 'mp4', - 'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', - 'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', - 'duration': 221, - }, - 'skip': 'Not providing trailers anymore', - }, { - 'url': 'http://www.metacritic.com/game/playstation-4/tales-from-the-borderlands-a-telltale-game-series/trailers/5740315', - 'info_dict': { - 'id': '5740315', - 'ext': 'mp4', - 'title': 'Tales from the Borderlands - Finale: The Vault of the Traveler', - 'description': 'In the final episode of the season, all hell breaks loose. 
Jack is now in control of Helios\' systems, and he\'s ready to reclaim his rightful place as king of Hyperion (with or without you).', - 'duration': 114, - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - # The xml is not well formatted, there are raw '&' - info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, - video_id, 'Downloading info xml', transform_source=fix_xml_ampersands) - - clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) - formats = [] - for videoFile in clip.findall('httpURI/videoFile'): - rate_str = videoFile.find('rate').text - video_url = videoFile.find('filePath').text - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'format_id': rate_str, - 'tbr': int(rate_str), - }) - self._sort_formats(formats) - - description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', - webpage, 'description', flags=re.DOTALL) - - return { - 'id': video_id, - 'title': clip.find('title').text, - 'formats': formats, - 'description': description, - 'duration': int(clip.find('duration').text), - } diff --git a/youtube_dl/extractor/mgoon.py b/youtube_dl/extractor/mgoon.py deleted file mode 100644 index 7bb473900..000000000 --- a/youtube_dl/extractor/mgoon.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - qualities, - unified_strdate, -) - - -class MgoonIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| - video\.mgoon\.com)/(?P<id>[0-9]+)''' - _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' - _TESTS = [ - { - 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148', - 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d', - 'info_dict': { - 'id': '5582148', - 'uploader_id': 'hi6618', - 'duration': 240.419, - 'upload_date': '20131220', - 'ext': 'mp4', - 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - 'url': 'http://www.mgoon.com/play/view/5582148', - 'only_matching': True, - }, - { - 'url': 'http://video.mgoon.com/5582148', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - data = self._download_json(self._API_URL.format(video_id), video_id) - - if data.get('errorInfo', {}).get('code') != 'NONE': - raise ExtractorError('%s encountered an error: %s' % ( - self.IE_NAME, data['errorInfo']['message']), expected=True) - - v_info = data['videoInfo'] - title = v_info.get('v_title') - thumbnail = v_info.get('v_thumbnail') - duration = v_info.get('v_duration') - upload_date = unified_strdate(v_info.get('v_reg_date')) - uploader_id = data.get('userInfo', {}).get('u_alias') - if duration: - duration /= 1000.0 - - age_limit = None - if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT': - age_limit = 18 - - formats = [] - get_quality = qualities(['360p', '480p', '720p', '1080p']) - for fmt in data['videoFiles']: - formats.append({ - 'format_id': fmt['label'], - 'quality': get_quality(fmt['label']), - 'url': fmt['url'], - 'ext': fmt['format'], - - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'age_limit': 
age_limit, - } diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py deleted file mode 100644 index 8e0aee0e6..000000000 --- a/youtube_dl/extractor/microsoftvirtualacademy.py +++ /dev/null @@ -1,195 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_xpath, -) -from ..utils import ( - int_or_none, - parse_duration, - smuggle_url, - unsmuggle_url, - xpath_text, -) - - -class MicrosoftVirtualAcademyBaseIE(InfoExtractor): - def _extract_base_url(self, course_id, display_id): - return self._download_json( - 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, - display_id, 'Downloading course base URL') - - def _extract_chapter_and_title(self, title): - if not title: - return None, None - m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title) - return (int(m.group('chapter')), m.group('title')) if m else (None, title) - - -class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva' - IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', - 'md5': '7826c44fc31678b12ad8db11f6b5abb9', - 'info_dict': { - 'id': 'gfVXISmEB_6804984382', - 'ext': 'mp4', - 'title': 'Course Introduction', - 'formats': 'mincount:3', - 'subtitles': { - 'en': [{ - 'ext': 'ttml', - }], - }, - } - }, { - 'url': 'mva:11788:gfVXISmEB_6804984382', - 'only_matching': True, - }] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('course_id') - video_id = mobj.group('id') - - base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) - - settings = self._download_xml( - '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), - video_id, 'Downloading video settings XML') - - _, title = self._extract_chapter_and_title(xpath_text( - settings, './/Title', 'title', fatal=True)) - - formats = [] - - for sources in settings.findall(compat_xpath('.//MediaSources')): - sources_type = sources.get('videoType') - for source in sources.findall(compat_xpath('./MediaSource')): - video_url = source.text - if not video_url or not video_url.startswith('http'): - continue - if sources_type == 'smoothstreaming': - formats.extend(self._extract_ism_formats( - video_url, video_id, 'mss', fatal=False)) - continue - video_mode = source.get('videoMode') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) - codec = source.get('codec') - acodec, vcodec = [None] * 2 - if codec: - codecs = codec.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs - elif len(codecs) == 1: - vcodec = codecs[0] - formats.append({ - 'url': video_url, - 'format_id': video_mode, - 'height': height, - 'acodec': acodec, - 'vcodec': vcodec, - }) - self._sort_formats(formats) - - subtitles = {} - for source in settings.findall(compat_xpath('.//MarkerResourceSource')): - subtitle_url = source.text - if not subtitle_url: - continue - subtitles.setdefault('en', []).append({ - 'url': '%s/%s' % (base_url, subtitle_url), - 'ext': source.get('type'), - }) - - return { - 'id': video_id, 
- 'title': title, - 'subtitles': subtitles, - 'formats': formats - } - - -class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva:course' - IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'info_dict': { - 'id': '11788', - 'title': 'Microsoft Azure Fundamentals: Virtual Machines', - }, - 'playlist_count': 36, - }, { - # with emphasized chapters - 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', - 'info_dict': { - 'id': '16335', - 'title': 'Developing Windows 10 Games with Construct 2', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'only_matching': True, - }, { - 'url': 'mva:course:11788', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MicrosoftVirtualAcademyIE.suitable(url) else super( - MicrosoftVirtualAcademyCourseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('id') - display_id = mobj.group('display_id') - - base_url = self._extract_base_url(course_id, display_id) - - manifest = self._download_json( - '%s/imsmanifestlite.json' % base_url, - display_id, 'Downloading course manifest JSON')['manifest'] - - organization = manifest['organizations']['organization'][0] - - entries = [] - for chapter in organization['item']: - chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) - chapter_id = chapter.get('@identifier') - for item in chapter.get('item', []): - item_id = item.get('@identifier') - if not item_id: - continue - metadata = item.get('resource', {}).get('metadata') or {} - if metadata.get('learningresourcetype') != 'Video': - continue - _, title = self._extract_chapter_and_title(item.get('title')) - duration = parse_duration(metadata.get('duration')) - description = metadata.get('description') - entries.append({ - '_type': 'url_transparent', - 'url': smuggle_url( - 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), - 'title': title, - 'description': description, - 'duration': duration, - 'chapter': chapter_title, - 'chapter_number': chapter_number, - 'chapter_id': chapter_id, - }) - - title = organization.get('title') or manifest.get('metadata', {}).get('title') - - return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py deleted file mode 100644 index 636731195..000000000 --- a/youtube_dl/extractor/minoto.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_codecs, -) - - -class MinotoIE(InfoExtractor): - _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - player_id = mobj.group('player_id') or '1' - video_id = mobj.group('id') - video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id) - video_metadata = video_data['video-metadata'] - 
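A note on the HLS branch just below: list.extend() accepts exactly one iterable argument, so the m3u8 helper self._extract_m3u8_formats(...) has to be called first and the format list it returns extended in; handing the helper's arguments straight to extend() raises a TypeError on the first HLS entry instead of collecting any formats. A minimal self-contained sketch of that contract (the format dict and URLs are illustrative stand-ins, not real Minoto data):

formats = []
# Correct: one iterable, standing in for the list returned by
# _extract_m3u8_formats(fmt_url, video_id, 'mp4', ...).
formats.extend([{'format_id': 'hls-720', 'url': 'https://example.com/v.m3u8'}])
try:
    # Passing the helper's arguments directly to extend() cannot work.
    formats.extend('https://example.com/v.m3u8', 'video-id', 'mp4')
except TypeError as err:
    print(err)  # extend() takes exactly one argument (3 given)
print(len(formats))  # 1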
formats = []
-        for fmt in video_data['video-files']:
-            fmt_url = fmt.get('url')
-            if not fmt_url:
-                continue
-            container = fmt.get('container')
-            if container == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
-            else:
-                fmt_profile = fmt.get('profile') or {}
-                formats.append({
-                    'format_id': fmt_profile.get('name-short'),
-                    'format_note': fmt_profile.get('name'),
-                    'url': fmt_url,
-                    'container': container,
-                    'tbr': int_or_none(fmt.get('bitrate')),
-                    'filesize': int_or_none(fmt.get('filesize')),
-                    'width': int_or_none(fmt.get('width')),
-                    'height': int_or_none(fmt.get('height')),
-                    'codecs': parse_codecs(fmt.get('codecs')),
-                })
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': video_metadata['title'],
-            'description': video_metadata.get('description'),
-            'thumbnail': video_metadata.get('video-poster', {}).get('url'),
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
deleted file mode 100644
index e1506a745..000000000
--- a/youtube_dl/extractor/mit.py
+++ /dev/null
@@ -1,132 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-import json
-
-from .common import InfoExtractor
-from .youtube import YoutubeIE
-from ..utils import (
-    clean_html,
-    ExtractorError,
-    get_element_by_id,
-)
-
-
-class TechTVMITIE(InfoExtractor):
-    IE_NAME = 'techtv.mit.edu'
-    _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
-
-    _TEST = {
-        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
-        'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
-        'info_dict': {
-            'id': '25418',
-            'ext': 'mp4',
-            'title': 'MIT DNA and Protein Sets',
-            'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        raw_page = self._download_webpage(
-            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
-        clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
-
-        base_url = self._proto_relative_url(self._search_regex(
-            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
-        formats_json = self._search_regex(
-            r'bitrates: (\[.+?\])', raw_page, 'video formats')
-        formats_mit = json.loads(formats_json)
-        formats = [
-            {
-                'format_id': f['label'],
-                'url': base_url + f['url'].partition(':')[2],
-                'ext': f['url'].partition(':')[0],
-                'format': f['label'],
-                'width': f['width'],
-                'vbr': f['bitrate'],
-            }
-            for f in formats_mit
-        ]
-
-        title = get_element_by_id('edit-title', clean_page)
-        description = clean_html(get_element_by_id('edit-description', clean_page))
-        thumbnail = self._search_regex(
-            r'playlist:.*?url: \'(.+?)\'',
-            raw_page, 'thumbnail', flags=re.DOTALL)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': description,
-            'thumbnail': thumbnail,
-        }
-
-
-class OCWMITIE(InfoExtractor):
-    IE_NAME = 'ocw.mit.edu'
-    _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
-    _BASE_URL = 'http://ocw.mit.edu/'
-
-    _TESTS = [
-        {
-            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
-            'info_dict': {
-                'id': 'EObHWIEKGjA',
-                'ext': 'webm',
-                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
-                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
-                'upload_date':
'20121109', - 'uploader_id': 'MIT', - 'uploader': 'MIT OpenCourseWare', - } - }, - { - 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/', - 'info_dict': { - 'id': '7K1sB05pE0A', - 'ext': 'mp4', - 'title': 'Session 1: Introduction to Derivatives', - 'upload_date': '20090818', - 'uploader_id': 'MIT', - 'uploader': 'MIT OpenCourseWare', - 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', - } - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - topic = mobj.group('topic') - - webpage = self._download_webpage(url, topic) - title = self._html_search_meta('WT.cg_s', webpage) - description = self._html_search_meta('Description', webpage) - - # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file) - embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage) - if embed_chapter_media: - metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) - metadata = re.split(r', ?', metadata) - yt = metadata[1] - else: - # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) - embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) - if embed_media: - metadata = re.sub(r'[\'"]', '', embed_media.group(1)) - metadata = re.split(r', ?', metadata) - yt = metadata[1] - else: - raise ExtractorError('Unable to find embedded YouTube video.') - video_id = YoutubeIE.extract_id(yt) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'description': description, - 'url': yt, - 'ie_key': 'Youtube', - } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py deleted file mode 100644 index 69319857d..000000000 --- a/youtube_dl/extractor/mixcloud.py +++ /dev/null @@ -1,356 +0,0 @@ -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_chr, - compat_ord, - compat_str, - compat_urllib_parse_unquote, - compat_zip -) -from ..utils import ( - int_or_none, - parse_iso8601, - strip_or_none, - try_get, -) - - -class MixcloudBaseIE(InfoExtractor): - def _call_api(self, object_type, object_fields, display_id, username, slug=None): - lookup_key = object_type + 'Lookup' - return self._download_json( - 'https://www.mixcloud.com/graphql', display_id, query={ - 'query': '''{ - %s(lookup: {username: "%s"%s}) { - %s - } -}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields) - })['data'][lookup_key] - - -class MixcloudIE(MixcloudBaseIE): - _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' - IE_NAME = 'mixcloud' - - _TESTS = [{ - 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', - 'info_dict': { - 'id': 'dholbach_cryptkeeper', - 'ext': 'm4a', - 'title': 'Cryptkeeper', - 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', - 'uploader': 'Daniel Holbach', - 'uploader_id': 'dholbach', - 'thumbnail': r're:https?://.*\.jpg', - 'view_count': int, - 'timestamp': 1321359578, - 'upload_date': '20111115', - }, - }, { - 'url': 
'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
-        'info_dict': {
-            'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
-            'ext': 'mp3',
-            'title': 'Caribou 7 inch Vinyl Mix & Chat',
-            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
-            'uploader': 'Gilles Peterson Worldwide',
-            'uploader_id': 'gillespeterson',
-            'thumbnail': 're:https?://.*',
-            'view_count': int,
-            'timestamp': 1422987057,
-            'upload_date': '20150203',
-        },
-    }, {
-        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
-        'only_matching': True,
-    }]
-    _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'
-
-    @staticmethod
-    def _decrypt_xor_cipher(key, ciphertext):
-        """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
-        return ''.join([
-            compat_chr(compat_ord(ch) ^ compat_ord(k))
-            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
-
-    def _real_extract(self, url):
-        username, slug = re.match(self._VALID_URL, url).groups()
-        username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
-        track_id = '%s_%s' % (username, slug)
-
-        cloudcast = self._call_api('cloudcast', '''audioLength
-    comments(first: 100) {
-      edges {
-        node {
-          comment
-          created
-          user {
-            displayName
-            username
-          }
-        }
-      }
-      totalCount
-    }
-    description
-    favorites {
-      totalCount
-    }
-    featuringArtistList
-    isExclusive
-    name
-    owner {
-      displayName
-      url
-      username
-    }
-    picture(width: 1024, height: 1024) {
-      url
-    }
-    plays
-    publishDate
-    reposts {
-      totalCount
-    }
-    streamInfo {
-      dashUrl
-      hlsUrl
-      url
-    }
-    tags {
-      tag {
-        name
-      }
-    }''', track_id, username, slug)
-
-        title = cloudcast['name']
-
-        stream_info = cloudcast['streamInfo']
-        formats = []
-
-        for url_key in ('url', 'hlsUrl', 'dashUrl'):
-            format_url = stream_info.get(url_key)
-            if not format_url:
-                continue
-            decrypted = self._decrypt_xor_cipher(
-                self._DECRYPTION_KEY, compat_b64decode(format_url))
-            if url_key == 'hlsUrl':
-                formats.extend(self._extract_m3u8_formats(
-                    decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
-            elif url_key == 'dashUrl':
-                formats.extend(self._extract_mpd_formats(
-                    decrypted, track_id, mpd_id='dash', fatal=False))
-            else:
-                formats.append({
-                    'format_id': 'http',
-                    'url': decrypted,
-                    'downloader_options': {
-                        # Mixcloud starts throttling at >~5M
-                        'http_chunk_size': 5242880,
-                    },
-                })
-
-        if not formats and cloudcast.get('isExclusive'):
-            self.raise_login_required()
-
-        self._sort_formats(formats)
-
-        comments = []
-        for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []):
-            node = edge.get('node') or {}
-            text = strip_or_none(node.get('comment'))
-            if not text:
-                continue
-            user = node.get('user') or {}
-            comments.append({
-                'author': user.get('displayName'),
-                'author_id': user.get('username'),
-                'text': text,
-                'timestamp': parse_iso8601(node.get('created')),
-            })
-
-        tags = []
-        for t in cloudcast.get('tags'):
-            tag = try_get(t, lambda x: x['tag']['name'], compat_str)
-            if not tag:
-                continue
-            tags.append(tag)
-
-        get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount']))
-
-        owner = cloudcast.get('owner') or {}
-
-        return {
-            'id': track_id,
-            'title': title,
-            'formats': formats,
-            'description': cloudcast.get('description'),
-            'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str),
-            'uploader': owner.get('displayName'),
-            'timestamp': parse_iso8601(cloudcast.get('publishDate')),
-            'uploader_id':
owner.get('username'), - 'uploader_url': owner.get('url'), - 'duration': int_or_none(cloudcast.get('audioLength')), - 'view_count': int_or_none(cloudcast.get('plays')), - 'like_count': get_count('favorites'), - 'repost_count': get_count('reposts'), - 'comment_count': get_count('comments'), - 'comments': comments, - 'tags': tags, - 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, - } - - -class MixcloudPlaylistBaseIE(MixcloudBaseIE): - def _get_cloudcast(self, node): - return node - - def _get_playlist_title(self, title, slug): - return title - - def _real_extract(self, url): - username, slug = re.match(self._VALID_URL, url).groups() - username = compat_urllib_parse_unquote(username) - if not slug: - slug = 'uploads' - else: - slug = compat_urllib_parse_unquote(slug) - playlist_id = '%s_%s' % (username, slug) - - is_playlist_type = self._ROOT_TYPE == 'playlist' - playlist_type = 'items' if is_playlist_type else slug - list_filter = '' - - has_next_page = True - entries = [] - while has_next_page: - playlist = self._call_api( - self._ROOT_TYPE, '''%s - %s - %s(first: 100%s) { - edges { - node { - %s - } - } - pageInfo { - endCursor - hasNextPage - } - }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE), - playlist_id, username, slug if is_playlist_type else None) - - items = playlist.get(playlist_type) or {} - for edge in items.get('edges', []): - cloudcast = self._get_cloudcast(edge.get('node') or {}) - cloudcast_url = cloudcast.get('url') - if not cloudcast_url: - continue - slug = try_get(cloudcast, lambda x: x['slug'], compat_str) - owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) - video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None - entries.append(self.url_result( - cloudcast_url, MixcloudIE.ie_key(), video_id)) - - page_info = items['pageInfo'] - has_next_page = page_info['hasNextPage'] - list_filter = ', after: "%s"' % page_info['endCursor'] - - return self.playlist_result( - entries, playlist_id, - self._get_playlist_title(playlist[self._TITLE_KEY], slug), - playlist.get(self._DESCRIPTION_KEY)) - - -class MixcloudUserIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$' - IE_NAME = 'mixcloud:user' - - _TESTS = [{ - 'url': 'http://www.mixcloud.com/dholbach/', - 'info_dict': { - 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', - }, - 'playlist_mincount': 36, - }, { - 'url': 'http://www.mixcloud.com/dholbach/uploads/', - 'info_dict': { - 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', - }, - 'playlist_mincount': 36, - }, { - 'url': 'http://www.mixcloud.com/dholbach/favorites/', - 'info_dict': { - 'id': 'dholbach_favorites', - 'title': 'Daniel Holbach (favorites)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', - }, - # 'params': { - # 'playlist_items': '1-100', - # }, - 'playlist_mincount': 396, - }, { - 'url': 'http://www.mixcloud.com/dholbach/listens/', - 'info_dict': { - 'id': 'dholbach_listens', - 'title': 'Daniel Holbach (listens)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', - }, - # 'params': { - # 'playlist_items': '1-100', - # }, - 'playlist_mincount': 1623, - 'skip': 'Large list', - }, { - 'url': 'https://www.mixcloud.com/FirstEar/stream/', - 'info_dict': { - 'id': 'FirstEar_stream', - 'title': 
'First Ear (stream)', - 'description': 'Curators of good music\r\n\r\nfirstearmusic.com', - }, - 'playlist_mincount': 271, - }] - - _TITLE_KEY = 'displayName' - _DESCRIPTION_KEY = 'biog' - _ROOT_TYPE = 'user' - _NODE_TEMPLATE = '''slug - url - owner { username }''' - - def _get_playlist_title(self, title, slug): - return '%s (%s)' % (title, slug) - - -class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$' - IE_NAME = 'mixcloud:playlist' - - _TESTS = [{ - 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', - 'info_dict': { - 'id': 'maxvibes_jazzcat-on-ness-radio', - 'title': 'Ness Radio sessions', - }, - 'playlist_mincount': 59, - }] - _TITLE_KEY = 'name' - _DESCRIPTION_KEY = 'description' - _ROOT_TYPE = 'playlist' - _NODE_TEMPLATE = '''cloudcast { - slug - url - owner { username } - }''' - - def _get_cloudcast(self, node): - return node.get('cloudcast') or {} diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py deleted file mode 100644 index eb9b4ce7c..000000000 --- a/youtube_dl/extractor/moevideo.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, -) - - -class MoeVideoIE(InfoExtractor): - IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net' - _VALID_URL = r'''(?x) - https?://(?P<host>(?:www\.)? - (?:(?:moevideo|playreplay|videochart)\.net|thesame\.tv))/ - (?:video|framevideo|embed)/(?P<id>[0-9a-z]+\.[0-9A-Za-z]+)''' - _API_URL = 'http://api.letitbit.net/' - _API_KEY = 'tVL0gjqo5' - _TESTS = [ - { - 'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29', - 'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a', - 'info_dict': { - 'id': '00297.0036103fe3d513ef27915216fd29', - 'ext': 'flv', - 'title': 'Sink cut out machine', - 'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8', - 'thumbnail': r're:^https?://.*\.jpg$', - 'width': 540, - 'height': 360, - 'duration': 179, - 'filesize': 17822500, - }, - 'skip': 'Video has been removed', - }, - { - 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a', - 'md5': '74f0a014d5b661f0f0e2361300d1620e', - 'info_dict': { - 'id': '77107.7f325710a627383d40540d8e991a', - 'ext': 'flv', - 'title': 'Operacion Condor.', - 'description': 'md5:7e68cb2fcda66833d5081c542491a9a3', - 'thumbnail': r're:^https?://.*\.jpg$', - 'width': 480, - 'height': 296, - 'duration': 6027, - 'filesize': 588257923, - }, - 'skip': 'Video has been removed', - }, - ] - - def _real_extract(self, url): - host, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage( - 'http://%s/video/%s' % (host, video_id), - video_id, 'Downloading webpage') - - title = self._og_search_title(webpage) - - embed_webpage = self._download_webpage( - 'http://%s/embed/%s' % (host, video_id), - video_id, 'Downloading embed webpage') - video = self._parse_json(self._search_regex( - r'mvplayer\("#player"\s*,\s*({.+})', - embed_webpage, 'mvplayer'), video_id)['video'] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': video.get('poster') or self._og_search_thumbnail(webpage), - 'description': clean_html(self._og_search_description(webpage)), - 'duration': int_or_none(self._og_search_property('video:duration', webpage)), - 'url': video['ourUrl'], - } diff --git a/youtube_dl/extractor/mojvideo.py b/youtube_dl/extractor/mojvideo.py 
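The mojvideo extractor deleted below deliberately avoids an XML parser: its player API returns malformed XML, so the code regex-scrapes only the handful of tags it needs. A minimal standalone sketch of that fallback technique; the helper name and the plain ValueError are illustrative, not from the original file:

import re

def scrape_player_api(playerapi):
    # The endpoint's XML is malformed, so a real parser would choke;
    # pull out only the tags we need with regexes instead.
    if '<error>true</error>' in playerapi:
        error = re.search(r'<errordesc>([^<]*)</errordesc>', playerapi)
        raise ValueError(error.group(1) if error else 'unknown player API error')

    def tag(name):
        mobj = re.search(r'<%s>([^<]+)</%s>' % (name, name), playerapi)
        return mobj.group(1) if mobj else None

    # e.g. scrape_player_api(downloaded_body)['url'] gives the direct video URL
    return {
        'title': tag('title'),
        'url': tag('file'),
        'thumbnail': tag('preview'),
        'duration': tag('duration'),
    }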
deleted file mode 100644 index 165e658c9..000000000 --- a/youtube_dl/extractor/mojvideo.py +++ /dev/null @@ -1,58 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, -) - - -class MojvideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mojvideo\.com/video-(?P<display_id>[^/]+)/(?P<id>[a-f0-9]+)' - _TEST = { - 'url': 'http://www.mojvideo.com/video-v-avtu-pred-mano-rdecelaska-alfi-nipic/3d1ed4497707730b2906', - 'md5': 'f7fd662cc8ce2be107b0d4f2c0483ae7', - 'info_dict': { - 'id': '3d1ed4497707730b2906', - 'display_id': 'v-avtu-pred-mano-rdecelaska-alfi-nipic', - 'ext': 'mp4', - 'title': 'V avtu pred mano rdečelaska - Alfi Nipič', - 'thumbnail': r're:^http://.*\.jpg$', - 'duration': 242, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - # XML is malformed - playerapi = self._download_webpage( - 'http://www.mojvideo.com/playerapi.php?v=%s&t=1' % video_id, display_id) - - if '<error>true</error>' in playerapi: - error_desc = self._html_search_regex( - r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) - - title = self._html_search_regex( - r'<title>([^<]+)</title>', playerapi, 'title') - video_url = self._html_search_regex( - r'<file>([^<]+)</file>', playerapi, 'video URL') - thumbnail = self._html_search_regex( - r'<preview>([^<]+)</preview>', playerapi, 'thumbnail', fatal=False) - duration = parse_duration(self._html_search_regex( - r'<duration>([^<]+)</duration>', playerapi, 'duration', fatal=False)) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - } diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py deleted file mode 100644 index 0093bcd6c..000000000 --- a/youtube_dl/extractor/morningstar.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class MorningstarIE(InfoExtractor): - IE_DESC = 'morningstar.com' - _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', - 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', - 'info_dict': { - 'id': '615869', - 'ext': 'mp4', - 'title': 'Get Ahead of the Curve on 2013 Taxes', - 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", - 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' - } - }, { - 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title') - video_url = self._html_search_regex( - r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'<input type="hidden" id="hidSnapshot" value="([^"]+)"', - webpage, 'thumbnail', fatal=False) - description = self._html_search_regex( - r'<div 
id="mstarDeck".*?>(.*?)</div>', - webpage, 'description', fatal=False) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py deleted file mode 100644 index ef1e081f2..000000000 --- a/youtube_dl/extractor/motherless.py +++ /dev/null @@ -1,232 +0,0 @@ -from __future__ import unicode_literals - -import datetime -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - InAdvancePagedList, - orderedSet, - str_to_int, - unified_strdate, -) - - -class MotherlessIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' - _TESTS = [{ - 'url': 'http://motherless.com/AC3FFE1', - 'md5': '310f62e325a9fafe64f68c0bccb6e75f', - 'info_dict': { - 'id': 'AC3FFE1', - 'ext': 'mp4', - 'title': 'Fucked in the ass while playing PS3', - 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], - 'upload_date': '20100913', - 'uploader_id': 'famouslyfuckedup', - 'thumbnail': r're:https?://.*\.jpg', - 'age_limit': 18, - } - }, { - 'url': 'http://motherless.com/532291B', - 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', - 'info_dict': { - 'id': '532291B', - 'ext': 'mp4', - 'title': 'Amazing girl playing the omegle game, PERFECT!', - 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', - 'game', 'hairy'], - 'upload_date': '20140622', - 'uploader_id': 'Sulivana7x', - 'thumbnail': r're:https?://.*\.jpg', - 'age_limit': 18, - }, - 'skip': '404', - }, { - 'url': 'http://motherless.com/g/cosplay/633979F', - 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', - 'info_dict': { - 'id': '633979F', - 'ext': 'mp4', - 'title': 'Turtlette', - 'categories': ['superheroine heroine superher'], - 'upload_date': '20140827', - 'uploader_id': 'shade0230', - 'thumbnail': r're:https?://.*\.jpg', - 'age_limit': 18, - } - }, { - # no keywords - 'url': 'http://motherless.com/8B4BBC1', - 'only_matching': True, - }, { - # see https://motherless.com/videos/recent for recent videos with - # uploaded date in "ago" format - 'url': 'https://motherless.com/3C3E2CF', - 'info_dict': { - 'id': '3C3E2CF', - 'ext': 'mp4', - 'title': 'a/ Hot Teens', - 'categories': list, - 'upload_date': '20210104', - 'uploader_id': 'yonbiw', - 'thumbnail': r're:https?://.*\.jpg', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if any(p in webpage for p in ( - '<title>404 - MOTHERLESS.COM<', - ">The page you're looking for cannot be found.<")): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - if '>The content you are trying to view is for friends only.' 
in webpage: - raise ExtractorError('Video %s is for friends only' % video_id, expected=True) - - title = self._html_search_regex( - (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>', - r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title') - video_url = (self._html_search_regex( - (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', - r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), - webpage, 'video URL', default=None, group='url') - or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) - age_limit = self._rta_search(webpage) - view_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'), - webpage, 'view count', fatal=False)) - like_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Favorites<', - r'<strong>Favorited</strong>\s+([^<]+)<'), - webpage, 'like count', fatal=False)) - - upload_date = unified_strdate(self._search_regex( - r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage, - 'upload date', default=None)) - if not upload_date: - uploaded_ago = self._search_regex( - r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago', - default=None) - if uploaded_ago: - delta = int(uploaded_ago[:-1]) - _AGO_UNITS = { - 'h': 'hours', - 'd': 'days', - } - kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} - upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') - - comment_count = webpage.count('class="media-comment-contents"') - uploader_id = self._html_search_regex( - r'"thumb-member-username">\s+<a href="/m/([^"]+)"', - webpage, 'uploader_id') - - categories = self._html_search_meta('keywords', webpage, default=None) - if categories: - categories = [cat.strip() for cat in categories.split(',')] - - return { - 'id': video_id, - 'title': title, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'thumbnail': self._og_search_thumbnail(webpage), - 'categories': categories, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, - 'age_limit': age_limit, - 'url': video_url, - } - - -class MotherlessGroupIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' - _TESTS = [{ - 'url': 'http://motherless.com/g/movie_scenes', - 'info_dict': { - 'id': 'movie_scenes', - 'title': 'Movie Scenes', - 'description': 'Hot and sexy scenes from "regular" movies... ' - 'Beautiful actresses fully nude... A looot of ' - 'skin! :)Enjoy!', - }, - 'playlist_mincount': 662, - }, { - 'url': 'http://motherless.com/gv/sex_must_be_funny', - 'info_dict': { - 'id': 'sex_must_be_funny', - 'title': 'Sex must be funny', - 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' - 'any kind!' 
- }, - 'playlist_mincount': 9, - }] - - @classmethod - def suitable(cls, url): - return (False if MotherlessIE.suitable(url) - else super(MotherlessGroupIE, cls).suitable(url)) - - def _extract_entries(self, webpage, base): - entries = [] - for mobj in re.finditer( - r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', - webpage): - video_url = compat_urlparse.urljoin(base, mobj.group('href')) - if not MotherlessIE.suitable(video_url): - continue - video_id = MotherlessIE._match_id(video_url) - title = mobj.group('title') - entries.append(self.url_result( - video_url, ie=MotherlessIE.ie_key(), video_id=video_id, - video_title=title)) - # Alternative fallback - if not entries: - entries = [ - self.url_result( - compat_urlparse.urljoin(base, '/' + entry_id), - ie=MotherlessIE.ie_key(), video_id=entry_id) - for entry_id in orderedSet(re.findall( - r'data-codename=["\']([A-Z0-9]+)', webpage))] - return entries - - def _real_extract(self, url): - group_id = self._match_id(url) - page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) - webpage = self._download_webpage(page_url, group_id) - title = self._search_regex( - r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) - description = self._html_search_meta( - 'description', webpage, fatal=False) - page_count = self._int(self._search_regex( - r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', - webpage, 'page_count'), 'page_count') - PAGE_SIZE = 80 - - def _get_page(idx): - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) - for entry in self._extract_entries(webpage, url): - yield entry - - playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) - - return { - '_type': 'playlist', - 'id': group_id, - 'title': title, - 'description': description, - 'entries': playlist - } diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py deleted file mode 100644 index 85cc6e22f..000000000 --- a/youtube_dl/extractor/moviezine.py +++ /dev/null @@ -1,45 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class MoviezineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?moviezine\.se/video/(?P<id>[^?#]+)' - - _TEST = { - 'url': 'http://www.moviezine.se/video/205866', - 'info_dict': { - 'id': '205866', - 'ext': 'mp4', - 'title': 'Oculus - Trailer 1', - 'description': 'md5:40cc6790fc81d931850ca9249b40e8a4', - 'thumbnail': r're:http://.*\.jpg', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player') - - formats = [{ - 'format_id': 'sd', - 'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'), - 'quality': 0, - 'ext': 'mp4', - }] - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'), - 'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'), - 'formats': formats, - 'description': self._og_search_description(webpage), - } diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py deleted file mode 100644 index e59b0b7b0..000000000 --- a/youtube_dl/extractor/msn.py +++ /dev/null @@ -1,171 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from 
.common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - unescapeHTML, -) - - -class MSNIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' - _TESTS = [{ - 'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d', - 'md5': '087548191d273c5c55d05028f8d2cbcd', - 'info_dict': { - 'id': 'BBPxU6d', - 'display_id': '7-ways-to-get-rid-of-chest-congestion', - 'ext': 'mp4', - 'title': 'Seven ways to get rid of chest congestion', - 'description': '7 Ways to Get Rid of Chest Congestion', - 'duration': 88, - 'uploader': 'Health', - 'uploader_id': 'BBPrMqa', - }, - }, { - # Article, multiple Dailymotion Embeds - 'url': 'https://www.msn.com/en-in/money/sports/hottest-football-wags-greatest-footballers-turned-managers-and-more/ar-BBpc7Nl', - 'info_dict': { - 'id': 'BBpc7Nl', - }, - 'playlist_mincount': 4, - }, { - 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', - 'only_matching': True, - }, { - 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', - 'only_matching': True, - }, { - # geo restricted - 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', - 'only_matching': True, - }, { - 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', - 'only_matching': True, - }, { - # Vidible(AOL) Embed - 'url': 'https://www.msn.com/en-us/money/other/jupiter-is-about-to-come-so-close-you-can-see-its-moons-with-binoculars/vi-AACqsHR', - 'only_matching': True, - }, { - # Dailymotion Embed - 'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L', - 'only_matching': True, - }, { - # YouTube Embed - 'url': 'https://www.msn.com/en-in/money/news/meet-vikram-%E2%80%94-chandrayaan-2s-lander/vi-AAGUr0v', - 'only_matching': True, - }, { - # NBCSports Embed - 'url': 'https://www.msn.com/en-us/money/football_nfl/week-13-preview-redskins-vs-panthers/vi-BBXsCDb', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id, page_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, display_id) - - entries = [] - for _, metadata in re.findall(r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', webpage): - video = self._parse_json(unescapeHTML(metadata), display_id) - - provider_id = video.get('providerId') - player_name = video.get('playerName') - if player_name and provider_id: - entry = None - if player_name == 'AOL': - if provider_id.startswith('http'): - provider_id = self._search_regex( - r'https?://delivery\.vidible\.tv/video/redirect/([0-9a-f]{24})', - provider_id, 'vidible id') - entry = self.url_result( - 'aol-video:' + provider_id, 'Aol', provider_id) - elif player_name == 'Dailymotion': - entry = self.url_result( - 'https://www.dailymotion.com/video/' + provider_id, - 'Dailymotion', provider_id) - elif player_name == 'YouTube': - entry = self.url_result( - provider_id, 'Youtube', provider_id) - elif player_name == 'NBCSports': - entry = self.url_result( - 'http://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/' + provider_id, - 'NBCSportsVPlayer', provider_id) - if entry: - 
entries.append(entry) - continue - - video_id = video['uuid'] - title = video['title'] - - formats = [] - for file_ in video.get('videoFiles', []): - format_url = file_.get('url') - if not format_url: - continue - if 'format=m3u8-aapl' in format_url: - # m3u8_native should not be used here until - # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - m3u8_id='hls', fatal=False)) - elif 'format=mpd-time-csf' in format_url: - formats.extend(self._extract_mpd_formats( - format_url, display_id, 'dash', fatal=False)) - elif '.ism' in format_url: - if format_url.endswith('.ism'): - format_url += '/manifest' - formats.extend(self._extract_ism_formats( - format_url, display_id, 'mss', fatal=False)) - else: - format_id = file_.get('formatCode') - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'format_id': format_id, - 'width': int_or_none(file_.get('width')), - 'height': int_or_none(file_.get('height')), - 'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)), - 'preference': 1 if format_id == '1001' else None, - }) - self._sort_formats(formats) - - subtitles = {} - for file_ in video.get('files', []): - format_url = file_.get('url') - format_code = file_.get('formatCode') - if not format_url or not format_code: - continue - if compat_str(format_code) == '3100': - subtitles.setdefault(file_.get('culture', 'en'), []).append({ - 'ext': determine_ext(format_url, 'ttml'), - 'url': format_url, - }) - - entries.append({ - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video.get('description'), - 'thumbnail': video.get('headlineImage', {}).get('url'), - 'duration': int_or_none(video.get('durationSecs')), - 'uploader': video.get('sourceFriendly'), - 'uploader_id': video.get('providerId'), - 'creator': video.get('creator'), - 'subtitles': subtitles, - 'formats': formats, - }) - - if not entries: - error = unescapeHTML(self._search_regex( - r'data-error=(["\'])(?P<error>.+?)\1', - webpage, 'error', group='error')) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - return self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py deleted file mode 100644 index 5a5205c0e..000000000 --- a/youtube_dl/extractor/mtv.py +++ /dev/null @@ -1,488 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_xpath, -) -from ..utils import ( - ExtractorError, - find_xpath_attr, - fix_xml_ampersands, - float_or_none, - HEADRequest, - RegexNotFoundError, - sanitized_Request, - strip_or_none, - timeconvert, - try_get, - unescapeHTML, - update_url_query, - url_basename, - xpath_text, -) - - -def _media_xml_tag(tag): - return '{http://search.yahoo.com/mrss/}%s' % tag - - -class MTVServicesInfoExtractor(InfoExtractor): - _MOBILE_TEMPLATE = None - _LANG = None - - @staticmethod - def _id_from_uri(uri): - return uri.split(':')[-1] - - @staticmethod - def _remove_template_parameter(url): - # Remove the templates, like &device={device} - return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) - - def _get_feed_url(self, uri): - return self._FEED_URL - - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - thumb_node = itemdoc.find(search_path) - if thumb_node is None: - return None - return thumb_node.get('url') or 
thumb_node.text or None - - def _extract_mobile_video_formats(self, mtvn_id): - webpage_url = self._MOBILE_TEMPLATE % mtvn_id - req = sanitized_Request(webpage_url) - # Otherwise we get a webpage that would execute some javascript - req.add_header('User-Agent', 'curl/7') - webpage = self._download_webpage(req, mtvn_id, - 'Downloading mobile page') - metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url')) - req = HEADRequest(metrics_url) - response = self._request_webpage(req, mtvn_id, 'Resolving url') - url = response.geturl() - # Transform the url to get the best quality: - url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1) - return [{'url': url, 'ext': 'mp4'}] - - def _extract_video_formats(self, mdoc, mtvn_id, video_id): - if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None: - if mtvn_id is not None and self._MOBILE_TEMPLATE is not None: - self.to_screen('The normal version is not available from your ' - 'country, trying with the mobile version') - return self._extract_mobile_video_formats(mtvn_id) - raise ExtractorError('This video is not available from your country.', - expected=True) - - formats = [] - for rendition in mdoc.findall('.//rendition'): - if rendition.get('method') == 'hls': - hls_url = rendition.find('./src').text - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - # fms - try: - _, _, ext = rendition.attrib['type'].partition('/') - rtmp_video_url = rendition.find('./src').text - if 'error_not_available.swf' in rtmp_video_url: - raise ExtractorError( - '%s said: video is not available' % self.IE_NAME, - expected=True) - if rtmp_video_url.endswith('siteunavail.png'): - continue - formats.extend([{ - 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, - 'url': rtmp_video_url, - 'format_id': '-'.join(filter(None, [ - 'rtmp' if rtmp_video_url.startswith('rtmp') else None, - rendition.get('bitrate')])), - 'width': int(rendition.get('width')), - 'height': int(rendition.get('height')), - }]) - except (KeyError, TypeError): - raise ExtractorError('Invalid rendition field.') - if formats: - self._sort_formats(formats) - return formats - - def _extract_subtitles(self, mdoc, mtvn_id): - subtitles = {} - for transcript in mdoc.findall('.//transcript'): - if transcript.get('kind') != 'captions': - continue - lang = transcript.get('srclang') - for typographic in transcript.findall('./typographic'): - sub_src = typographic.get('src') - if not sub_src: - continue - ext = typographic.get('format') - if ext == 'cea-608': - ext = 'scc' - subtitles.setdefault(lang, []).append({ - 'url': compat_str(sub_src), - 'ext': ext - }) - return subtitles - - def _get_video_info(self, itemdoc, use_hls=True): - uri = itemdoc.find('guid').text - video_id = self._id_from_uri(uri) - self.report_extraction(video_id) - content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) - mediagen_url = self._remove_template_parameter(content_el.attrib['url']) - mediagen_url = mediagen_url.replace('device={device}', '') - if 'acceptMethods' not in mediagen_url: - mediagen_url += '&' if '?' in mediagen_url else '?' 
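        # mediagen honours this 'acceptMethods' query parameter: 'hls'
        # requests m3u8 renditions, 'fms' requests RTMP ones; both kinds
        # are handled by _extract_video_formats() above.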
- mediagen_url += 'acceptMethods=' - mediagen_url += 'hls' if use_hls else 'fms' - - mediagen_doc = self._download_xml( - mediagen_url, video_id, 'Downloading video urls', fatal=False) - - if mediagen_doc is False: - return None - - item = mediagen_doc.find('./video/item') - if item is not None and item.get('type') == 'text': - message = '%s returned error: ' % self.IE_NAME - if item.get('code') is not None: - message += '%s - ' % item.get('code') - message += item.text - raise ExtractorError(message, expected=True) - - description = strip_or_none(xpath_text(itemdoc, 'description')) - - timestamp = timeconvert(xpath_text(itemdoc, 'pubDate')) - - title_el = None - if title_el is None: - title_el = find_xpath_attr( - itemdoc, './/{http://search.yahoo.com/mrss/}category', - 'scheme', 'urn:mtvn:video_title') - if title_el is None: - title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title')) - if title_el is None: - title_el = itemdoc.find(compat_xpath('.//title')) - if title_el.text is None: - title_el = None - - title = title_el.text - if title is None: - raise ExtractorError('Could not find video title') - title = title.strip() - - # This a short id that's used in the webpage urls - mtvn_id = None - mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category', - 'scheme', 'urn:mtvn:id') - if mtvn_id_node is not None: - mtvn_id = mtvn_id_node.text - - formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) - - # Some parts of complete video may be missing (e.g. missing Act 3 in - # http://www.southpark.de/alle-episoden/s14e01-sexual-healing) - if not formats: - return None - - self._sort_formats(formats) - - return { - 'title': title, - 'formats': formats, - 'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id), - 'id': video_id, - 'thumbnail': self._get_thumbnail_url(uri, itemdoc), - 'description': description, - 'duration': float_or_none(content_el.attrib.get('duration')), - 'timestamp': timestamp, - } - - def _get_feed_query(self, uri): - data = {'uri': uri} - if self._LANG: - data['lang'] = self._LANG - return data - - def _get_videos_info(self, uri, use_hls=True): - video_id = self._id_from_uri(uri) - feed_url = self._get_feed_url(uri) - info_url = update_url_query(feed_url, self._get_feed_query(uri)) - return self._get_videos_info_from_url(info_url, video_id, use_hls) - - def _get_videos_info_from_url(self, url, video_id, use_hls=True): - idoc = self._download_xml( - url, video_id, - 'Downloading info', transform_source=fix_xml_ampersands) - - title = xpath_text(idoc, './channel/title') - description = xpath_text(idoc, './channel/description') - - entries = [] - for item in idoc.findall('.//item'): - info = self._get_video_info(item, use_hls) - if info: - entries.append(info) - - return self.playlist_result( - entries, playlist_title=title, playlist_description=description) - - def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): - triforce_feed = self._parse_json(self._search_regex( - r'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n', webpage, - 'triforce feed', default='{}'), video_id, fatal=False) - - data_zone = self._search_regex( - r'data-zone=(["\'])(?P<zone>.+?_lc_promo.*?)\1', webpage, - 'data zone', default=data_zone, group='zone') - - feed_url = try_get( - triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'], - compat_str) - if not feed_url: - return - - feed = self._download_json(feed_url, video_id, fatal=False) - if not feed: - return - - return try_get(feed, lambda x: 
x['result']['data']['id'], compat_str) - - @staticmethod - def _extract_child_with_type(parent, t): - for c in parent['children']: - if c.get('type') == t: - return c - - def _extract_mgid(self, webpage): - try: - # the url can be http://media.mtvnservices.com/fb/{mgid}.swf - # or http://media.mtvnservices.com/{mgid} - og_url = self._og_search_video_url(webpage) - mgid = url_basename(og_url) - if mgid.endswith('.swf'): - mgid = mgid[:-4] - except RegexNotFoundError: - mgid = None - - if mgid is None or ':' not in mgid: - mgid = self._search_regex( - [r'data-mgid="(.*?)"', r'swfobject\.embedSWF\(".*?(mgid:.*?)"'], - webpage, 'mgid', default=None) - - if not mgid: - sm4_embed = self._html_search_meta( - 'sm4:video:embed', webpage, 'sm4 embed', default='') - mgid = self._search_regex( - r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) - - if not mgid: - mgid = self._extract_triforce_mgid(webpage) - - if not mgid: - data = self._parse_json(self._search_regex( - r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) - main_container = self._extract_child_with_type(data, 'MainContainer') - ab_testing = self._extract_child_with_type(main_container, 'ABTesting') - video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') - mgid = video_player['props']['media']['video']['config']['uri'] - - return mgid - - def _real_extract(self, url): - title = url_basename(url) - webpage = self._download_webpage(url, title) - mgid = self._extract_mgid(webpage) - videos_info = self._get_videos_info(mgid) - return videos_info - - -class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): - IE_NAME = 'mtvservices:embedded' - _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' - - _TEST = { - # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ - 'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906', - 'md5': 'cb349b21a7897164cede95bd7bf3fbb9', - 'info_dict': { - 'id': '1043906', - 'ext': 'mp4', - 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', - 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', - 'timestamp': 1400126400, - 'upload_date': '20140515', - }, - } - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) - if mobj: - return mobj.group('url') - - def _get_feed_url(self, uri): - video_id = self._id_from_uri(uri) - config = self._download_json( - 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) - return self._remove_template_parameter(config['feedWithQueryParams']) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - mgid = mobj.group('mgid') - return self._get_videos_info(mgid) - - -class MTVIE(MTVServicesInfoExtractor): - IE_NAME = 'mtv' - _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)' - _FEED_URL = 'http://www.mtv.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer', - 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', - 'info_dict': { - 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', - 'ext': 'mp4', - 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', - 'description': '"Unlocking the Truth" premieres August 17th 
at 11/10c.', - 'timestamp': 1468846800, - 'upload_date': '20160718', - }, - }, { - 'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101', - 'only_matching': True, - }, { - 'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713', - 'only_matching': True, - }] - - -class MTVJapanIE(MTVServicesInfoExtractor): - IE_NAME = 'mtvjapan' - _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P<id>[0-9a-z]+)' - - _TEST = { - 'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade', - 'info_dict': { - 'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5', - 'ext': 'mp4', - 'title': '【Fresh Info】Cadillac ESCALADE Sport Edition', - }, - 'params': { - 'skip_download': True, - }, - } - _GEO_COUNTRIES = ['JP'] - _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - - def _get_feed_query(self, uri): - return { - 'arcEp': 'mtvjapan.com', - 'mgid': uri, - } - - -class MTVVideoIE(MTVServicesInfoExtractor): - IE_NAME = 'mtv:video' - _VALID_URL = r'''(?x)^https?:// - (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$| - m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))''' - - _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' - - _TESTS = [ - { - 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', - 'info_dict': { - 'id': '853555', - 'ext': 'mp4', - 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', - 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', - 'timestamp': 1352610000, - 'upload_date': '20121111', - }, - }, - ] - - def _get_thumbnail_url(self, uri, itemdoc): - return 'http://mtv.mtvnimages.com/uri/' + uri - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - uri = mobj.groupdict().get('mgid') - if uri is None: - webpage = self._download_webpage(url, video_id) - - # Some videos come from Vevo.com - m_vevo = re.search( - r'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";', webpage) - if m_vevo: - vevo_id = m_vevo.group(1) - self.to_screen('Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - - uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') - return self._get_videos_info(uri) - - -class MTVDEIE(MTVServicesInfoExtractor): - IE_NAME = 'mtv.de' - _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)' - _TESTS = [{ - 'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum', - 'info_dict': { - 'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5', - 'ext': 'mp4', - 'title': 'Traum', - 'description': 'Traum', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Blocked at Travis CI', - }, { - # mediagen URL without query (e.g. 
http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) - 'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1', - 'info_dict': { - 'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285', - 'ext': 'mp4', - 'title': 'Teen Mom 2', - 'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Blocked at Travis CI', - }, { - 'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3', - 'info_dict': { - 'id': 'local_playlist-4e760566473c4c8c5344', - 'ext': 'mp4', - 'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1', - 'description': 'MTV Movies Supercut', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', - }] - _GEO_COUNTRIES = ['DE'] - _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - - def _get_feed_query(self, uri): - return { - 'arcEp': 'mtv.de', - 'mgid': uri, - } diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py deleted file mode 100644 index 2cc2bf229..000000000 --- a/youtube_dl/extractor/muenchentv.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - js_to_json, -) - - -class MuenchenTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?muenchen\.tv/livestream' - IE_DESC = 'münchen.tv' - _TEST = { - 'url': 'http://www.muenchen.tv/livestream/', - 'info_dict': { - 'id': '5334', - 'display_id': 'live', - 'ext': 'mp4', - 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - 'thumbnail': r're:^https?://.*\.jpg$' - }, - 'params': { - 'skip_download': True, - } - } - - def _real_extract(self, url): - display_id = 'live' - webpage = self._download_webpage(url, display_id) - - title = self._live_title(self._og_search_title(webpage)) - - data_js = self._search_regex( - r'(?s)\nplaylist:\s*(\[.*?}\]),', - webpage, 'playlist configuration') - data_json = js_to_json(data_js) - data = json.loads(data_json)[0] - - video_id = data['mediaid'] - thumbnail = data.get('image') - - formats = [] - for format_num, s in enumerate(data['sources']): - ext = determine_ext(s['file'], None) - label_str = s.get('label') - if label_str is None: - label_str = '_%d' % format_num - - if ext is None: - format_id = label_str - else: - format_id = '%s-%s' % (ext, label_str) - - formats.append({ - 'url': s['file'], - 'tbr': int_or_none(s.get('label')), - 'ext': 'mp4', - 'format_id': format_id, - 'preference': -100 if '.smil' in s['file'] else 0, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'is_live': True, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/mychannels.py b/youtube_dl/extractor/mychannels.py deleted file mode 100644 index b1ffe7848..000000000 --- a/youtube_dl/extractor/mychannels.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class MyChannelsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', - 'md5': 'b8993daad4262dd68d89d651c0c52c45', - 'info_dict': { - 'id': 'wUUDZZep6vQD', - 
'ext': 'mp4', - 'title': 'Miss Holland joins VOTE LEAVE', - 'description': 'Miss Holland | #13 Not a potato', - 'uploader': 'Miss Holland', - } - } - - def _real_extract(self, url): - id_type, url_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, url_id) - video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') - - def extract_data_val(attr, fatal=False): - return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) - minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') - - return { - '_type': 'url_transparent', - 'url': 'minoto:%s' % minoto_id, - 'id': url_id, - 'title': extract_data_val('title', True), - 'description': extract_data_val('description'), - 'thumbnail': extract_data_val('image'), - 'uploader': extract_data_val('channel'), - } diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py deleted file mode 100644 index e164d5940..000000000 --- a/youtube_dl/extractor/myspace.py +++ /dev/null @@ -1,212 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, -) - - -class MySpaceIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - myspace\.com/[^/]+/ - (?P<mediatype> - video/[^/]+/(?P<video_id>\d+)| - music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) - ) - ''' - - _TESTS = [{ - 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', - 'md5': '9c1483c106f4a695c47d2911feed50a7', - 'info_dict': { - 'id': '109594919', - 'ext': 'mp4', - 'title': 'Little Big Town', - 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', - 'uploader': 'Five Minutes to the Stage', - 'uploader_id': 'fiveminutestothestage', - 'timestamp': 1414108751, - 'upload_date': '20141023', - }, - }, { - # songs - 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', - 'md5': '1d7ee4604a3da226dd69a123f748b262', - 'info_dict': { - 'id': '93388656', - 'ext': 'm4a', - 'title': 'Of weakened soul...', - 'uploader': 'Killsorrow', - 'uploader_id': 'killsorrow', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', - 'info_dict': { - 'id': 'xqds0B_meys', - 'ext': 'webm', - 'title': 'Three Days Grace - Animal I Have Become', - 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', - 'uploader': 'ThreeDaysGraceVEVO', - 'uploader_id': 'ThreeDaysGraceVEVO', - 'upload_date': '20091002', - }, - }, { - 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', - 'only_matching': True, - }, { - 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') or mobj.group('song_id') - is_song = mobj.group('mediatype').startswith('music/song') - webpage = self._download_webpage(url, video_id) - player_url = self._search_regex( - r'videoSwf":"([^"?]*)', webpage, 'player URL', fatal=False) - - def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width=None, height=None): - formats = [] - vcodec = 'none' if is_song else None - if 
hls_stream_url: - formats.append({ - 'format_id': 'hls', - 'url': hls_stream_url, - 'protocol': 'm3u8_native', - 'ext': 'm4a' if is_song else 'mp4', - 'vcodec': vcodec, - }) - if stream_url and player_url: - rtmp_url, play_path = stream_url.split(';', 1) - formats.append({ - 'format_id': 'rtmp', - 'url': rtmp_url, - 'play_path': play_path, - 'player_url': player_url, - 'protocol': 'rtmp', - 'ext': 'flv', - 'width': width, - 'height': height, - 'vcodec': vcodec, - }) - if http_stream_url: - formats.append({ - 'format_id': 'http', - 'url': http_stream_url, - 'width': width, - 'height': height, - 'vcodec': vcodec, - }) - return formats - - if is_song: - # songs don't store any useful info in the 'context' variable - song_data = self._search_regex( - r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id, - webpage, 'song_data', default=None, group=0) - if song_data is None: - # some songs in an album are not playable - self.report_warning( - '%s: No downloadable song on this page' % video_id) - return - - def search_data(name): - return self._search_regex( - r'''data-%s=([\'"])(?P<data>.*?)\1''' % name, - song_data, name, default='', group='data') - formats = formats_from_stream_urls( - search_data('stream-url'), search_data('hls-stream-url'), - search_data('http-stream-url')) - if not formats: - vevo_id = search_data('vevo-id') - youtube_id = search_data('youtube-id') - if vevo_id: - self.to_screen('Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - elif youtube_id: - self.to_screen('Youtube video detected: %s' % youtube_id) - return self.url_result(youtube_id, ie='Youtube') - else: - raise ExtractorError( - 'Found song but don\'t know how to download it') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'uploader': search_data('artist-name'), - 'uploader_id': search_data('artist-username'), - 'thumbnail': self._og_search_thumbnail(webpage), - 'duration': int_or_none(search_data('duration')), - 'formats': formats, - } - else: - video = self._parse_json(self._search_regex( - r'context = ({.*?});', webpage, 'context'), - video_id)['video'] - formats = formats_from_stream_urls( - video.get('streamUrl'), video.get('hlsStreamUrl'), - video.get('mp4StreamUrl'), int_or_none(video.get('width')), - int_or_none(video.get('height'))) - self._sort_formats(formats) - return { - 'id': video_id, - 'title': video['title'], - 'description': video.get('description'), - 'thumbnail': video.get('imageUrl'), - 'uploader': video.get('artistName'), - 'uploader_id': video.get('artistUsername'), - 'duration': int_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('dateAdded')), - 'formats': formats, - } - - -class MySpaceAlbumIE(InfoExtractor): - IE_NAME = 'MySpace:album' - _VALID_URL = r'https?://myspace\.com/([^/]+)/music/album/(?P<title>.*-)(?P<id>\d+)' - - _TESTS = [{ - 'url': 'https://myspace.com/starset2/music/album/transmissions-19455773', - 'info_dict': { - 'title': 'Transmissions', - 'id': '19455773', - }, - 'playlist_count': 14, - 'skip': 'this album is only available in some countries', - }, { - 'url': 'https://myspace.com/killsorrow/music/album/the-demo-18596029', - 'info_dict': { - 'title': 'The Demo', - 'id': '18596029', - }, - 'playlist_count': 5, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - display_id = mobj.group('title') + playlist_id - webpage = self._download_webpage(url, display_id) - tracks_paths = 
re.findall(r'"music:song" content="(.*?)"', webpage) - if not tracks_paths: - raise ExtractorError( - '%s: No songs found, try using proxy' % display_id, - expected=True) - entries = [ - self.url_result(t_path, ie=MySpaceIE.ie_key()) - for t_path in tracks_paths] - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': display_id, - 'title': self._og_search_title(webpage), - 'entries': entries, - } diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py deleted file mode 100644 index 61fc59126..000000000 --- a/youtube_dl/extractor/naver.py +++ /dev/null @@ -1,166 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - dict_get, - ExtractorError, - int_or_none, - parse_duration, - try_get, - update_url_query, -) - - -class NaverBaseIE(InfoExtractor): - _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - - def _extract_video_info(self, video_id, vid, key): - video_data = self._download_json( - 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, - video_id, query={ - 'key': key, - }) - meta = video_data['meta'] - title = meta['subject'] - formats = [] - get_list = lambda x: try_get(video_data, lambda y: y[x + 's']['list'], list) or [] - - def extract_formats(streams, stream_type, query={}): - for stream in streams: - stream_url = stream.get('source') - if not stream_url: - continue - stream_url = update_url_query(stream_url, query) - encoding_option = stream.get('encodingOption', {}) - bitrate = stream.get('bitrate', {}) - formats.append({ - 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), - 'url': stream_url, - 'width': int_or_none(encoding_option.get('width')), - 'height': int_or_none(encoding_option.get('height')), - 'vbr': int_or_none(bitrate.get('video')), - 'abr': int_or_none(bitrate.get('audio')), - 'filesize': int_or_none(stream.get('size')), - 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, - }) - - extract_formats(get_list('video'), 'H264') - for stream_set in video_data.get('streams', []): - query = {} - for param in stream_set.get('keys', []): - query[param['name']] = param['value'] - stream_type = stream_set.get('type') - videos = stream_set.get('videos') - if videos: - extract_formats(videos, stream_type, query) - elif stream_type == 'HLS': - stream_url = stream_set.get('source') - if not stream_url: - continue - formats.extend(self._extract_m3u8_formats( - update_url_query(stream_url, query), video_id, - 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) - self._sort_formats(formats) - - replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' 
+ y, x) - - def get_subs(caption_url): - if re.search(self._CAPTION_EXT_RE, caption_url): - return [{ - 'url': replace_ext(caption_url, 'ttml'), - }, { - 'url': replace_ext(caption_url, 'vtt'), - }] - else: - return [{'url': caption_url}] - - automatic_captions = {} - subtitles = {} - for caption in get_list('caption'): - caption_url = caption.get('source') - if not caption_url: - continue - sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles - sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url)) - - user = meta.get('user', {}) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'automatic_captions': automatic_captions, - 'thumbnail': try_get(meta, lambda x: x['cover']['source']), - 'view_count': int_or_none(meta.get('count')), - 'uploader_id': user.get('id'), - 'uploader': user.get('name'), - 'uploader_url': user.get('url'), - } - - -class NaverIE(NaverBaseIE): - _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P<id>\d+)' - _GEO_BYPASS = False - _TESTS = [{ - 'url': 'http://tv.naver.com/v/81652', - 'info_dict': { - 'id': '81652', - 'ext': 'mp4', - 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', - 'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', - 'timestamp': 1378200754, - 'upload_date': '20130903', - 'uploader': '메가스터디, 합격불변의 법칙', - 'uploader_id': 'megastudy', - }, - }, { - 'url': 'http://tv.naver.com/v/395837', - 'md5': '8a38e35354d26a17f73f4e90094febd3', - 'info_dict': { - 'id': '395837', - 'ext': 'mp4', - 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', - 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', - 'timestamp': 1432030253, - 'upload_date': '20150519', - 'uploader': '4가지쇼 시즌2', - 'uploader_id': 'wrappinguser29', - }, - 'skip': 'Georestricted', - }, { - 'url': 'http://tvcast.naver.com/v/81652', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - content = self._download_json( - 'https://tv.naver.com/api/json/v/' + video_id, - video_id, headers=self.geo_verification_headers()) - player_info_json = content.get('playerInfoJson') or {} - current_clip = player_info_json.get('currentClip') or {} - - vid = current_clip.get('videoId') - in_key = current_clip.get('inKey') - - if not vid or not in_key: - player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) - if player_auth == 'notCountry': - self.raise_geo_restricted(countries=['KR']) - elif player_auth == 'notLogin': - self.raise_login_required() - raise ExtractorError('couldn\'t extract vid and key') - info = self._extract_video_info(video_id, vid, in_key) - info.update({ - 'description': clean_html(current_clip.get('description')), - 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), - 'duration': parse_duration(current_clip.get('displayPlayTime')), - 'like_count': int_or_none(current_clip.get('recommendPoint')), - 'age_limit': 19 if current_clip.get('adult') else None, - }) - return info diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py deleted file mode 100644 index fbc7adaf4..000000000 --- a/youtube_dl/extractor/nba.py +++ /dev/null @@ -1,428 +0,0 @@ -from __future__ import unicode_literals - -import functools -import re - -from .turner import TurnerBaseIE -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) -from ..utils import ( - int_or_none, - merge_dicts, - OnDemandPagedList, - parse_duration, - 
parse_iso8601,
-    try_get,
-    update_url_query,
-    urljoin,
-)
-
-
-class NBACVPBaseIE(TurnerBaseIE):
-    def _extract_nba_cvp_info(self, path, video_id, fatal=False):
-        return self._extract_cvp_info(
-            'http://secure.nba.com/%s' % path, video_id, {
-                'default': {
-                    'media_src': 'http://nba.cdn.turner.com/nba/big',
-                },
-                'm3u8': {
-                    'media_src': 'http://nbavod-f.akamaihd.net',
-                },
-            }, fatal=fatal)
-
-
-class NBAWatchBaseIE(NBACVPBaseIE):
-    _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/'
-
-    def _extract_video(self, filter_key, filter_value):
-        video = self._download_json(
-            'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch',
-            filter_value, query={
-                'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName',
-                'q': filter_key + ':' + filter_value,
-                'wt': 'json',
-            })['response']['docs'][0]
-
-        video_id = str(video['pid'])
-        title = video['name']
-
-        formats = []
-        m3u8_url = (self._download_json(
-            'https://watch.nba.com/service/publishpoint', video_id, query={
-                'type': 'video',
-                'format': 'json',
-                'id': video_id,
-            }, headers={
-                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
-            }, fatal=False) or {}).get('path')
-        if m3u8_url:
-            m3u8_formats = self._extract_m3u8_formats(
-                re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4',
-                'm3u8_native', m3u8_id='hls', fatal=False)
-            formats.extend(m3u8_formats)
-            for f in m3u8_formats:
-                http_f = f.copy()
-                http_f.update({
-                    'format_id': http_f['format_id'].replace('hls-', 'http-'),
-                    'protocol': 'http',
-                    'url': http_f['url'].replace('.m3u8', ''),
-                })
-                formats.append(http_f)
-
-        info = {
-            'id': video_id,
-            'title': title,
-            'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')),
-            'description': video.get('description'),
-            'duration': int_or_none(video.get('runtime')),
-            'timestamp': parse_iso8601(video.get('releaseDate')),
-            'tags': video.get('tags'),
-        }
-
-        seo_name = video.get('seoName')
-        if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name):
-            base_path = ''
-            if seo_name.startswith('teams/'):
-                base_path += seo_name.split('/')[1] + '/'
-            base_path += 'video/'
-            cvp_info = self._extract_nba_cvp_info(
-                base_path + seo_name + '.xml', video_id, False)
-            if cvp_info:
-                formats.extend(cvp_info['formats'])
-                info = merge_dicts(info, cvp_info)
-
-        self._sort_formats(formats)
-        info['formats'] = formats
-        return info
-
-
-class NBAWatchEmbedIE(NBAWatchBaseIE):
-    IE_NAME = 'nba:watch:embed'
-    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'http://watch.nba.com/embed?id=659395',
-        'md5': 'b7e3f9946595f4ca0a13903ce5edd120',
-        'info_dict': {
-            'id': '659395',
-            'ext': 'mp4',
-            'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
-            'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. 
the Clippers, 4/15/2017', - 'timestamp': 1492228800, - 'upload_date': '20170415', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_video('pid', video_id) - - -class NBAWatchIE(NBAWatchBaseIE): - IE_NAME = 'nba:watch' - _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', - 'info_dict': { - 'id': '70946', - 'ext': 'mp4', - 'title': 'Thunder vs. Nets', - 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', - 'duration': 181, - 'timestamp': 1354597200, - 'upload_date': '20121204', - }, - }, { - 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', - 'only_matching': True, - }, { - 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', - 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', - 'info_dict': { - 'id': '330865', - 'ext': 'mp4', - 'title': 'Hawks vs. Cavaliers Game 1', - 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', - 'duration': 228, - 'timestamp': 1432094400, - 'upload_date': '20150521', - }, - }, { - 'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115', - 'only_matching': True, - }, { - # only CVP mp4 format available - 'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106', - 'only_matching': True, - }, { - 'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0] - if collection_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % display_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id) - return self.url_result( - 'https://www.nba.com/watch/list/collection/' + collection_id, - NBAWatchCollectionIE.ie_key(), collection_id) - return self._extract_video('seoName', display_id) - - -class NBAWatchCollectionIE(NBAWatchBaseIE): - IE_NAME = 'nba:watch:collection' - _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://watch.nba.com/list/collection/season-preview-2020', - 'info_dict': { - 'id': 'season-preview-2020', - }, - 'playlist_mincount': 43, - }] - _PAGE_SIZE = 100 - - def _fetch_page(self, collection_id, page): - page += 1 - videos = self._download_json( - 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, - collection_id, 'Downloading page %d JSON metadata' % page, query={ - 'count': self._PAGE_SIZE, - 'page': page, - })['results']['videos'] - for video in videos: - program = video.get('program') or {} - seo_name = program.get('seoName') or program.get('slug') - if not seo_name: - continue - yield { - '_type': 'url', - 'id': program.get('id'), - 'title': program.get('title') or video.get('title'), - 'url': 'https://www.nba.com/watch/video/' + seo_name, - 'thumbnail': video.get('image'), - 'description': program.get('description') or video.get('description'), - 'duration': 
parse_duration(program.get('runtimeHours')),
-                'timestamp': parse_iso8601(video.get('releaseDate')),
-            }
-
-    def _real_extract(self, url):
-        collection_id = self._match_id(url)
-        entries = OnDemandPagedList(
-            functools.partial(self._fetch_page, collection_id),
-            self._PAGE_SIZE)
-        return self.playlist_result(entries, collection_id)
-
-
-class NBABaseIE(NBACVPBaseIE):
-    _VALID_URL_BASE = r'''(?x)
-        https?://(?:www\.)?nba\.com/
-        (?P<team>
-            blazers|
-            bucks|
-            bulls|
-            cavaliers|
-            celtics|
-            clippers|
-            grizzlies|
-            hawks|
-            heat|
-            hornets|
-            jazz|
-            kings|
-            knicks|
-            lakers|
-            magic|
-            mavericks|
-            nets|
-            nuggets|
-            pacers|
-            pelicans|
-            pistons|
-            raptors|
-            rockets|
-            sixers|
-            spurs|
-            suns|
-            thunder|
-            timberwolves|
-            warriors|
-            wizards
-        )
-        (?:/play\#)?/'''
-    _CHANNEL_PATH_REGEX = r'video/channel|series'
-
-    def _embed_url_result(self, team, content_id):
-        return self.url_result(update_url_query(
-            'https://secure.nba.com/assets/amp/include/video/iframe.html', {
-                'contentId': content_id,
-                'team': team,
-            }), NBAEmbedIE.ie_key())
-
-    def _call_api(self, team, content_id, query, resource):
-        return self._download_json(
-            'https://api.nba.net/2/%s/video,imported_video,wsc/' % team,
-            content_id, 'Downloading %s JSON metadata' % resource,
-            query=query, headers={
-                'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b',
-            })['response']['result']
-
-    def _extract_video(self, video, team, extract_all=True):
-        video_id = compat_str(video['nid'])
-        team = video['brand']
-
-        info = {
-            'id': video_id,
-            'title': video.get('title') or video.get('headline') or video['shortHeadline'],
-            'description': video.get('description'),
-            'timestamp': parse_iso8601(video.get('published')),
-        }
-
-        subtitles = {}
-        captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {}
-        for caption_url in captions.values():
-            subtitles.setdefault('en', []).append({'url': caption_url})
-
-        formats = []
-        mp4_url = video.get('mp4')
-        if mp4_url:
-            formats.append({
-                'url': mp4_url,
-            })
-
-        if extract_all:
-            source_url = video.get('videoSource')
-            if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'):
-                formats.append({
-                    'format_id': 'source',
-                    'url': source_url,
-                    'preference': 1,
-                })
-
-            m3u8_url = video.get('m3u8')
-            if m3u8_url:
-                if '.akamaihd.net/i/' in m3u8_url:
-                    formats.extend(self._extract_akamai_formats(
-                        m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'}))
-                else:
-                    formats.extend(self._extract_m3u8_formats(
-                        m3u8_url, video_id, 'mp4',
-                        'm3u8_native', m3u8_id='hls', fatal=False))
-
-            content_xml = video.get('contentXml')
-            if team and content_xml:
-                cvp_info = self._extract_nba_cvp_info(
-                    team + content_xml, video_id, fatal=False)
-                if cvp_info:
-                    formats.extend(cvp_info['formats'])
-                    subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles'])
-                    info = merge_dicts(info, cvp_info)
-
-            self._sort_formats(formats)
-        else:
-            info.update(self._embed_url_result(team, video['videoId']))
-
-        info.update({
-            'formats': formats,
-            'subtitles': subtitles,
-        })
-
-        return info
-
-    def _real_extract(self, url):
-        team, display_id = re.match(self._VALID_URL, url).groups()
-        if '/play#/' in url:
-            display_id = compat_urllib_parse_unquote(display_id)
-        else:
-            webpage = self._download_webpage(url, display_id)
-            display_id = self._search_regex(
-                self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id')
-        return self._extract_url_results(team, display_id)
-
-
-class NBAEmbedIE(NBABaseIE):
-    IE_NAME = 'nba:embed'
- 
_VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)'
-    _TESTS = [{
-        'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=',
-        'only_matching': True,
-    }, {
-        'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
-        content_id = qs['contentId'][0]
-        team = qs.get('team', [None])[0]
-        if not team:
-            return self.url_result(
-                'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key())
-        video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0]
-        return self._extract_video(video, team)
-
-
-class NBAIE(NBABaseIE):
-    IE_NAME = 'nba'
-    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
-    _TESTS = [{
-        'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774',
-        'info_dict': {
-            'id': '45039',
-            'ext': 'mp4',
-            'title': 'AND WE BACK.',
-            'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.',
-            'duration': 94,
-            'timestamp': 1607112000,
-            'upload_date': '20201218',
-        },
-    }, {
-        'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0',
-        'only_matching': True,
-    }]
-    _CONTENT_ID_REGEX = r'videoID'
-
-    def _extract_url_results(self, team, content_id):
-        return self._embed_url_result(team, content_id)
-
-
-class NBAChannelIE(NBABaseIE):
-    IE_NAME = 'nba:channel'
-    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
-    _TESTS = [{
-        'url': 'https://www.nba.com/blazers/video/channel/summer_league',
-        'info_dict': {
-            'title': 'Summer League',
-        },
-        'playlist_mincount': 138,
-    }, {
-        'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date',
-        'only_matching': True,
-    }]
-    _CONTENT_ID_REGEX = r'videoSubCategory'
-    _PAGE_SIZE = 100
-
-    def _fetch_page(self, team, channel, page):
-        results = self._call_api(team, channel, {
-            'channels': channel,
-            'count': self._PAGE_SIZE,
-            'offset': page * self._PAGE_SIZE,
-        }, 'page %d' % (page + 1))
-        for video in results:
-            yield self._extract_video(video, team, False)
-
-    def _extract_url_results(self, team, content_id):
-        entries = OnDemandPagedList(
-            functools.partial(self._fetch_page, team, content_id),
-            self._PAGE_SIZE)
-        return self.playlist_result(entries, playlist_title=content_id)
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
deleted file mode 100644
index 0d77648c2..000000000
--- a/youtube_dl/extractor/nbc.py
+++ /dev/null
@@ -1,525 +0,0 @@
-from __future__ import unicode_literals
-
-import base64
-import json
-import re
-
-from .common import InfoExtractor
-from .theplatform import ThePlatformIE
-from .adobepass import AdobePassIE
-from ..compat import compat_urllib_parse_unquote
-from ..utils import (
-    int_or_none,
- 
parse_duration, - smuggle_url, - try_get, - unified_timestamp, - update_url_query, -) - - -class NBCIE(AdobePassIE): - _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' - - _TESTS = [ - { - 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237', - 'info_dict': { - 'id': '2848237', - 'ext': 'mp4', - 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', - 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', - 'timestamp': 1424246400, - 'upload_date': '20150218', - 'uploader': 'NBCU-COM', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', - 'info_dict': { - 'id': '2832821', - 'ext': 'mp4', - 'title': 'Star Wars Teaser', - 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', - 'timestamp': 1417852800, - 'upload_date': '20141206', - 'uploader': 'NBCU-COM', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Only works from US', - }, - { - # HLS streams requires the 'hdnea3' cookie - 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', - 'info_dict': { - 'id': '101528f5a9e8127b107e98c5e6ce4638', - 'ext': 'mp4', - 'title': 'Goliath', - 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.', - 'timestamp': 1237100400, - 'upload_date': '20090315', - 'uploader': 'NBCU-COM', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from US', - }, - { - 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', - 'only_matching': True, - }, - { - # Percent escaped url - 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - permalink, video_id = re.match(self._VALID_URL, url).groups() - permalink = 'http' + compat_urllib_parse_unquote(permalink) - video_data = self._download_json( - 'https://friendship.nbc.co/v2/graphql', video_id, query={ - 'query': '''query bonanzaPage( - $app: NBCUBrands! = nbc - $name: String! - $oneApp: Boolean - $platform: SupportedPlatforms! = web - $type: EntityPageType! = VIDEO - $userId: String! -) { - bonanzaPage( - app: $app - name: $name - oneApp: $oneApp - platform: $platform - type: $type - userId: $userId - ) { - metadata { - ... 
on VideoPageData { - description - episodeNumber - keywords - locked - mpxAccountId - mpxGuid - rating - resourceId - seasonNumber - secondaryTitle - seriesShortTitle - } - } - } -}''', - 'variables': json.dumps({ - 'name': permalink, - 'oneApp': True, - 'userId': '0', - }), - })['data']['bonanzaPage']['metadata'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - video_id = video_data['mpxGuid'] - title = video_data['secondaryTitle'] - if video_data.get('locked'): - resource = self._get_mvpd_resource( - video_data.get('resourceId') or 'nbcentertainment', - title, video_id, video_data.get('rating')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, 'nbcentertainment', resource) - theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id), - query), {'force_smil_url': True}) - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': theplatform_url, - 'description': video_data.get('description'), - 'tags': video_data.get('keywords'), - 'season_number': int_or_none(video_data.get('seasonNumber')), - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'episode': title, - 'series': video_data.get('seriesShortTitle'), - 'ie_key': 'ThePlatform', - } - - -class NBCSportsVPlayerIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' - _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' - - _TESTS = [{ - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', - 'info_dict': { - 'id': '9CsDKds0kvHI', - 'ext': 'mp4', - 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', - 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', - 'timestamp': 1426270238, - 'upload_date': '20150313', - 'uploader': 'NBCU-SPORTS', - } - }, { - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', - 'only_matching': True, - }, { - 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - iframe_m = re.search( - r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) - if iframe_m: - return iframe_m.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage).replace( - 'vplayer.nbcsports.com', 'player.theplatform.com') - return self.url_result(theplatform_url, 'ThePlatform') - - -class NBCSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P<id>[0-9a-z-]+)' - - _TESTS = [{ - # iframe src - 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', - 'info_dict': { - 'id': 'PHJSaFWbrTY9', - 'ext': 'mp4', - 'title': 'Tom Izzo, Michigan St. 
has \'so much respect\' for Duke', - 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', - 'uploader': 'NBCU-SPORTS', - 'upload_date': '20150330', - 'timestamp': 1427726529, - } - }, { - # data-mpx-src - 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', - 'only_matching': True, - }, { - # data-src - 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - return self.url_result( - NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') - - -class NBCSportsStreamIE(AdobePassIE): - _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P<id>\d+)' - _TEST = { - 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559', - 'info_dict': { - 'id': '206559', - 'ext': 'mp4', - 'title': 'Amgen Tour of California Women\'s Recap', - 'description': 'md5:66520066b3b5281ada7698d0ea2aa894', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Requires Adobe Pass Authentication', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - live_source = self._download_json( - 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, - video_id) - video_source = live_source['videoSources'][0] - title = video_source['title'] - source_url = None - for k in ('source', 'msl4source', 'iossource', 'hlsv4'): - sk = k + 'Url' - source_url = video_source.get(sk) or video_source.get(sk + 'Alt') - if source_url: - break - else: - source_url = video_source['ottStreamUrl'] - is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live' - resource = self._get_mvpd_resource('nbcsports', title, video_id, '') - token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource) - tokenized_url = self._download_json( - 'https://token.playmakerservices.com/cdn', - video_id, data=json.dumps({ - 'requestorId': 'nbcsports', - 'pid': video_id, - 'application': 'NBCSports', - 'version': 'v1', - 'platform': 'desktop', - 'cdn': 'akamai', - 'url': video_source['sourceUrl'], - 'token': base64.b64encode(token.encode()).decode(), - 'resourceId': base64.b64encode(resource.encode()).decode(), - }).encode())['tokenizedUrl'] - formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': live_source.get('description'), - 'formats': formats, - 'is_live': is_live, - } - - -class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' - - _TESTS = [ - { - 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', - 'info_dict': { - 'id': '269389891880', - 'ext': 'mp4', - 'title': 'How Twitter Reacted To The Snowden Interview', - 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', - 'timestamp': 1401363060, - 'upload_date': '20140529', - }, - }, - { - 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', - 'md5': 'fdbf39ab73a72df5896b6234ff98518a', - 'info_dict': { - 'id': '529953347624', - 'ext': 'mp4', - 'title': 'FULL EPISODE: Family Business', - 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', - }, - 'skip': 'This page is unavailable.', - }, - { - 'url': 
'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '8eb831eca25bfa7d25ddd83e85946548', - 'info_dict': { - 'id': '394064451844', - 'ext': 'mp4', - 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', - 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', - 'timestamp': 1423104900, - 'upload_date': '20150205', - }, - }, - { - 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', - 'info_dict': { - 'id': 'n431456', - 'ext': 'mp4', - 'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'", - 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', - 'upload_date': '20150922', - 'timestamp': 1442917800, - }, - }, - { - 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', - 'md5': '118d7ca3f0bea6534f119c68ef539f71', - 'info_dict': { - 'id': '669831235788', - 'ext': 'mp4', - 'title': 'See the aurora borealis from space in stunning new NASA video', - 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', - 'upload_date': '20160420', - 'timestamp': 1461152093, - }, - }, - { - 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', - 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', - 'info_dict': { - 'id': '314487875924', - 'ext': 'mp4', - 'title': 'The chaotic GOP immigration vote', - 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406937606, - 'upload_date': '20140802', - }, - }, - { - 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', - 'only_matching': True, - }, - { - # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html - 'url': 'http://www.nbcnews.com/widget/video-embed/701714499682', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - data = self._parse_json(self._search_regex( - r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', - webpage, 'bootstrap json'), video_id)['props']['initialState'] - video_data = try_get(data, lambda x: x['video']['current'], dict) - if not video_data: - video_data = data['article']['content'][0]['primaryMedia']['video'] - title = video_data['headline']['primary'] - - formats = [] - for va in video_data.get('videoAssets', []): - public_url = va.get('publicUrl') - if not public_url: - continue - if '://link.theplatform.com/' in public_url: - public_url = update_url_query(public_url, {'format': 'redirect'}) - format_id = va.get('format') - if format_id == 'M3U': - formats.extend(self._extract_m3u8_formats( - public_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - continue - tbr = int_or_none(va.get('bitrate'), 1000) - if tbr: - format_id += '-%d' % tbr - formats.append({ - 'format_id': format_id, - 'url': public_url, - 'width': int_or_none(va.get('width')), - 'height': int_or_none(va.get('height')), - 'tbr': tbr, - 'ext': 'mp4', - }) - self._sort_formats(formats) - - subtitles = {} - closed_captioning = video_data.get('closedCaptioning') - if closed_captioning: - for cc_url in closed_captioning.values(): - if not cc_url: - continue - 
subtitles.setdefault('en', []).append({ - 'url': cc_url, - }) - - return { - 'id': video_id, - 'title': title, - 'description': try_get(video_data, lambda x: x['description']['primary']), - 'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datePublished')), - 'formats': formats, - 'subtitles': subtitles, - } - - -class NBCOlympicsIE(InfoExtractor): - IE_NAME = 'nbcolympics' - _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P<id>[a-z-]+)' - - _TEST = { - # Geo-restricted to US - 'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold', - 'md5': '54fecf846d05429fbaa18af557ee523a', - 'info_dict': { - 'id': 'WjTBzDXx5AUq', - 'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold', - 'ext': 'mp4', - 'title': 'Rose\'s son Leo was in tears after his dad won gold', - 'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.', - 'timestamp': 1471274964, - 'upload_date': '20160815', - 'uploader': 'NBCU-SPORTS', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - - iframe_url = drupal_settings['vod']['iframe_url'] - theplatform_url = iframe_url.replace( - 'vplayer.nbcolympics.com', 'player.theplatform.com') - - return { - '_type': 'url_transparent', - 'url': theplatform_url, - 'ie_key': ThePlatformIE.ie_key(), - 'display_id': display_id, - } - - -class NBCOlympicsStreamIE(AdobePassIE): - IE_NAME = 'nbcolympics:stream' - _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)' - _TEST = { - 'url': 'http://stream.nbcolympics.com/2018-winter-olympics-nbcsn-evening-feb-8', - 'info_dict': { - 'id': '203493', - 'ext': 'mp4', - 'title': 're:Curling, Alpine, Luge [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - _DATA_URL_TEMPLATE = 'http://stream.nbcolympics.com/data/%s_%s.json' - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid') - resource = self._search_regex( - r"resource\s*=\s*'(.+)';", webpage, - 'resource').replace("' + pid + '", pid) - event_config = self._download_json( - self._DATA_URL_TEMPLATE % ('event_config', pid), - pid)['eventConfig'] - title = self._live_title(event_config['eventTitle']) - source_url = self._download_json( - self._DATA_URL_TEMPLATE % ('live_sources', pid), - pid)['videoSources'][0]['sourceUrl'] - media_token = self._extract_mvpd_auth( - url, pid, event_config.get('requestorId', 'NBCOlympics'), resource) - formats = self._extract_m3u8_formats(self._download_webpage( - 'http://sp.auth.adobe.com/tvs/v1/sign', pid, query={ - 'cdn': 'akamai', - 'mediaToken': base64.b64encode(media_token.encode()), - 'resource': base64.b64encode(resource.encode()), - 'url': source_url, - }), pid, 'mp4') - self._sort_formats(formats) - - return { - 'id': pid, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'is_live': True, - } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py deleted file mode 100644 index ddd828d92..000000000 --- 
a/youtube_dl/extractor/ndr.py +++ /dev/null @@ -1,440 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - merge_dicts, - parse_iso8601, - qualities, - try_get, - urljoin, -) - - -class NDRBaseIE(InfoExtractor): - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = next(group for group in mobj.groups() if group) - webpage = self._download_webpage(url, display_id) - return self._extract_embed(webpage, display_id) - - -class NDRIE(NDRBaseIE): - IE_NAME = 'ndr' - IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' - _TESTS = [{ - # httpVideo, same content id - 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', - 'md5': '6515bc255dc5c5f8c85bbc38e035a659', - 'info_dict': { - 'id': 'hafengeburtstag988', - 'display_id': 'Party-Poette-und-Parade', - 'ext': 'mp4', - 'title': 'Party, Pötte und Parade', - 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', - 'uploader': 'ndrtv', - 'timestamp': 1431108900, - 'upload_date': '20150510', - 'duration': 3498, - }, - 'params': { - 'skip_download': True, - }, - }, { - # httpVideo, different content id - 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', - 'md5': '1043ff203eab307f0c51702ec49e9a71', - 'info_dict': { - 'id': 'osna272', - 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', - 'ext': 'mp4', - 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', - 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', - 'uploader': 'ndrtv', - 'timestamp': 1442059200, - 'upload_date': '20150912', - 'duration': 510, - }, - 'params': { - 'skip_download': True, - }, - }, { - # httpAudio, same content id - 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'info_dict': { - 'id': 'audio51535', - 'display_id': 'La-Valette-entgeht-der-Hinrichtung', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', - 'uploader': 'ndrinfo', - 'timestamp': 1290626100, - 'upload_date': '20140729', - 'duration': 884, - }, - 'params': { - 'skip_download': True, - }, - }, { - # with subtitles - 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', - 'info_dict': { - 'id': 'extra18674', - 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', - 'ext': 'mp4', - 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', - 'uploader': 'ndrtv', - 'upload_date': '20201113', - 'duration': 1749, - 'subtitles': { - 'de': [{ - 'ext': 'ttml', - 'url': r're:^https://www\.ndr\.de.+', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', - 'only_matching': True, - }] - - def _extract_embed(self, webpage, display_id): - embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', - default=None) or self._search_regex( - r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'embed URL', group='url') - description = self._search_regex( - r'<p[^>]+itemprop="description">([^<]+)</p>', - webpage, 'description', 
default=None) or self._og_search_description(webpage) - timestamp = parse_iso8601( - self._search_regex( - r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', default=None)) - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts({ - '_type': 'url_transparent', - 'url': embed_url, - 'display_id': display_id, - 'description': description, - 'timestamp': timestamp, - }, info) - - -class NJoyIE(NDRBaseIE): - IE_NAME = 'njoy' - IE_DESC = 'N-JOY' - _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' - _TESTS = [{ - # httpVideo, same content id - 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', - 'md5': 'cb63be60cd6f9dd75218803146d8dc67', - 'info_dict': { - 'id': 'comedycontest2480', - 'display_id': 'Benaissa-beim-NDR-Comedy-Contest', - 'ext': 'mp4', - 'title': 'Benaissa beim NDR Comedy Contest', - 'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39', - 'uploader': 'ndrtv', - 'upload_date': '20141129', - 'duration': 654, - }, - 'params': { - 'skip_download': True, - }, - }, { - # httpVideo, different content id - 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', - 'md5': '417660fffa90e6df2fda19f1b40a64d8', - 'info_dict': { - 'id': 'dockville882', - 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', - 'ext': 'mp4', - 'title': '"Ich hab noch nie" mit Felix Jaehn', - 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', - 'uploader': 'njoy', - 'upload_date': '20150822', - 'duration': 211, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html', - 'only_matching': True, - }] - - def _extract_embed(self, webpage, display_id): - video_id = self._search_regex( - r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') - description = self._search_regex( - r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', - webpage, 'description', fatal=False) - return { - '_type': 'url_transparent', - 'ie_key': 'NDREmbedBase', - 'url': 'ndr:%s' % video_id, - 'display_id': display_id, - 'description': description, - } - - -class NDREmbedBaseIE(InfoExtractor): - IE_NAME = 'ndr:embed:base' - _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)' - _TESTS = [{ - 'url': 'ndr:soundcheck3366', - 'only_matching': True, - }, { - 'url': 'http://www.ndr.de/soundcheck3366-ppjson.json', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_s') - - ppjson = self._download_json( - 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id) - - playlist = ppjson['playlist'] - - formats = [] - quality_key = qualities(('xs', 's', 'm', 'l', 'xl')) - - for format_id, f in playlist.items(): - src = f.get('src') - if not src: - continue - ext = determine_ext(src, None) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, - f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) - else: - quality = f.get('quality') - ff = { - 'url': src, - 'format_id': quality or format_id, - 'quality': quality_key(quality), - } - type_ = f.get('type') - if type_ and 
type_.split('/')[0] == 'audio': - ff['vcodec'] = 'none' - ff['ext'] = ext or 'mp3' - formats.append(ff) - self._sort_formats(formats) - - config = playlist['config'] - - live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive'] - title = config['title'] - if live: - title = self._live_title(title) - uploader = ppjson.get('config', {}).get('branding') - upload_date = ppjson.get('config', {}).get('publicationDate') - duration = int_or_none(config.get('duration')) - - thumbnails = [] - poster = try_get(config, lambda x: x['poster'], dict) or {} - for thumbnail_id, thumbnail in poster.items(): - thumbnail_url = urljoin(url, thumbnail.get('src')) - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail.get('quality') or thumbnail_id, - 'url': thumbnail_url, - 'preference': quality_key(thumbnail.get('quality')), - }) - - subtitles = {} - tracks = config.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_url = urljoin(url, track.get('src')) - if not track_url: - continue - subtitles.setdefault(track.get('srclang') or 'de', []).append({ - 'url': track_url, - 'ext': 'ttml', - }) - - return { - 'id': video_id, - 'title': title, - 'is_live': live, - 'uploader': uploader if uploader != '-' else None, - 'upload_date': upload_date[0:8] if upload_date else None, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, - } - - -class NDREmbedIE(NDREmbedBaseIE): - IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' - _TESTS = [{ - 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', - 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', - 'info_dict': { - 'id': 'ndraktuell28488', - 'ext': 'mp4', - 'title': 'Norddeutschland begrüßt Flüchtlinge', - 'is_live': False, - 'uploader': 'ndrtv', - 'upload_date': '20150907', - 'duration': 132, - }, - }, { - 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', - 'md5': '002085c44bae38802d94ae5802a36e78', - 'info_dict': { - 'id': 'soundcheck3366', - 'ext': 'mp4', - 'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen', - 'is_live': False, - 'uploader': 'ndr2', - 'upload_date': '20150912', - 'duration': 3554, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.ndr.de/info/audio51535-player.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'info_dict': { - 'id': 'audio51535', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'is_live': False, - 'uploader': 'ndrinfo', - 'upload_date': '20140729', - 'duration': 884, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html', - 'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c', - 'info_dict': { - 'id': 'visite11010', - 'ext': 'mp4', - 'title': 'Visite - die ganze Sendung', - 'is_live': False, - 'uploader': 'ndrtv', - 'upload_date': '20150902', - 'duration': 3525, - }, - 'params': { - 'skip_download': True, - }, - }, { - # httpVideoLive - 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', - 'info_dict': { - 'id': 'livestream217', - 'ext': 'flv', - 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': True, - 'upload_date': '20150910', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 
'http://www.ndr.de/ndrkultur/audio255020-player.html', - 'only_matching': True, - }, { - 'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html', - 'only_matching': True, - }, { - 'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html', - 'only_matching': True, - }, { - 'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html', - 'only_matching': True, - }, { - 'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html', - 'only_matching': True, - }, { - 'url': 'http://www.ndr.de/fernsehen/doku952-player.html', - 'only_matching': True, - }] - - -class NJoyEmbedIE(NDREmbedBaseIE): - IE_NAME = 'njoy:embed' - _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' - _TESTS = [{ - # httpVideo - 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', - 'md5': '8483cbfe2320bd4d28a349d62d88bd74', - 'info_dict': { - 'id': 'doku948', - 'ext': 'mp4', - 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', - 'is_live': False, - 'upload_date': '20150807', - 'duration': 1011, - }, - }, { - # httpAudio - 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', - 'md5': 'd989f80f28ac954430f7b8a48197188a', - 'info_dict': { - 'id': 'stefanrichter100', - 'ext': 'mp3', - 'title': 'Interview mit einem Augenzeugen', - 'is_live': False, - 'uploader': 'njoy', - 'upload_date': '20150909', - 'duration': 140, - }, - 'params': { - 'skip_download': True, - }, - }, { - # httpAudioLive, no explicit ext - 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', - 'info_dict': { - 'id': 'webradioweltweit100', - 'ext': 'mp3', - 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': True, - 'uploader': 'njoy', - 'upload_date': '20150810', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html', - 'only_matching': True, - }, { - 'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html', - 'only_matching': True, - }, { - 'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py deleted file mode 100644 index 978a05841..000000000 --- a/youtube_dl/extractor/neteasemusic.py +++ /dev/null @@ -1,485 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from hashlib import md5 -from base64 import b64encode -from datetime import datetime -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_str, - compat_itertools_count, -) -from ..utils import ( - sanitized_Request, - float_or_none, -) - - -class NetEaseMusicBaseIE(InfoExtractor): - _FORMATS = ['bMusic', 'mMusic', 'hMusic'] - _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' - _API_BASE = 'http://music.163.com/api/' - - @classmethod - def _encrypt(cls, dfsid): - salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) - string_bytes = bytearray(compat_str(dfsid).encode('ascii')) - salt_len = len(salt_bytes) - for i in range(len(string_bytes)): - 
string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len]
-        m = md5()
-        m.update(bytes(string_bytes))
-        result = b64encode(m.digest()).decode('ascii')
-        return result.replace('/', '_').replace('+', '-')
-
-    def extract_formats(self, info):
-        formats = []
-        for song_format in self._FORMATS:
-            details = info.get(song_format)
-            if not details:
-                continue
-            song_file_path = '/%s/%s.%s' % (
-                self._encrypt(details['dfsId']), details['dfsId'], details['extension'])
-
-            # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc. are part of a reverse proxy-like feature
-            # from NetEase's CDN provider that can be used if m5.music.126.net does not
-            # work, especially for users outside of Mainland China
-            # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880
-            for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net',
-                         'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'):
-                song_url = host + song_file_path
-                if self._is_valid_url(song_url, info['id'], 'song'):
-                    formats.append({
-                        'url': song_url,
-                        'ext': details.get('extension'),
-                        'abr': float_or_none(details.get('bitrate'), scale=1000),
-                        'format_id': song_format,
-                        'filesize': details.get('size'),
-                        'asr': details.get('sr')
-                    })
-                    break
-        return formats
-
-    @classmethod
-    def convert_milliseconds(cls, ms):
-        return int(round(ms / 1000.0))
-
-    def query_api(self, endpoint, video_id, note):
-        req = sanitized_Request('%s%s' % (self._API_BASE, endpoint))
-        req.add_header('Referer', self._API_BASE)
-        return self._download_json(req, video_id, note)
-
-
-class NetEaseMusicIE(NetEaseMusicBaseIE):
-    IE_NAME = 'netease:song'
-    IE_DESC = '网易云音乐'
-    _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'http://music.163.com/#/song?id=32102397',
-        'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
-        'info_dict': {
-            'id': '32102397',
-            'ext': 'mp3',
-            'title': 'Bad Blood (feat. 
Kendrick Lamar)', - 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150517', - 'timestamp': 1431878400, - 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', - }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'No lyrics translation.', - 'url': 'http://music.163.com/#/song?id=29822014', - 'info_dict': { - 'id': '29822014', - 'ext': 'mp3', - 'title': '听见下雨的声音', - 'creator': '周杰伦', - 'upload_date': '20141225', - 'timestamp': 1419523200, - 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', - }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'No lyrics.', - 'url': 'http://music.163.com/song?id=17241424', - 'info_dict': { - 'id': '17241424', - 'ext': 'mp3', - 'title': 'Opus 28', - 'creator': 'Dustin O\'Halloran', - 'upload_date': '20080211', - 'timestamp': 1202745600, - }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'Has translated name.', - 'url': 'http://music.163.com/#/song?id=22735043', - 'info_dict': { - 'id': '22735043', - 'ext': 'mp3', - 'title': '소원을 말해봐 (Genie)', - 'creator': '少女时代', - 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', - 'upload_date': '20100127', - 'timestamp': 1264608000, - 'alt_title': '说出愿望吧(Genie)', - }, - 'skip': 'Blocked outside Mainland China', - }] - - def _process_lyrics(self, lyrics_info): - original = lyrics_info.get('lrc', {}).get('lyric') - translated = lyrics_info.get('tlyric', {}).get('lyric') - - if not translated: - return original - - lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' - original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = dict( - (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) - ) - lyrics = '\n'.join([ - '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) - for time_stamp, text in original_ts_texts - ]) - return lyrics - - def _real_extract(self, url): - song_id = self._match_id(url) - - params = { - 'id': song_id, - 'ids': '[%s]' % song_id - } - info = self.query_api( - 'song/detail?' 
+ compat_urllib_parse_urlencode(params), - song_id, 'Downloading song info')['songs'][0] - - formats = self.extract_formats(info) - self._sort_formats(formats) - - lyrics_info = self.query_api( - 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, - song_id, 'Downloading lyrics data') - lyrics = self._process_lyrics(lyrics_info) - - alt_title = None - if info.get('transNames'): - alt_title = '/'.join(info.get('transNames')) - - return { - 'id': song_id, - 'title': info['name'], - 'alt_title': alt_title, - 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), - 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), - 'thumbnail': info.get('album', {}).get('picUrl'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), - 'description': lyrics, - 'formats': formats, - } - - -class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): - IE_NAME = 'netease:album' - IE_DESC = '网易云音乐 - 专辑' - _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://music.163.com/#/album?id=220780', - 'info_dict': { - 'id': '220780', - 'title': 'B\'day', - }, - 'playlist_count': 23, - 'skip': 'Blocked outside Mainland China', - } - - def _real_extract(self, url): - album_id = self._match_id(url) - - info = self.query_api( - 'album/%s?id=%s' % (album_id, album_id), - album_id, 'Downloading album data')['album'] - - name = info['name'] - desc = info.get('description') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['songs'] - ] - return self.playlist_result(entries, album_id, name, desc) - - -class NetEaseMusicSingerIE(NetEaseMusicBaseIE): - IE_NAME = 'netease:singer' - IE_DESC = '网易云音乐 - 歌手' - _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)' - _TESTS = [{ - 'note': 'Singer has aliases.', - 'url': 'http://music.163.com/#/artist?id=10559', - 'info_dict': { - 'id': '10559', - 'title': '张惠妹 - aMEI;阿密特', - }, - 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'Singer has translated name.', - 'url': 'http://music.163.com/#/artist?id=124098', - 'info_dict': { - 'id': '124098', - 'title': '李昇基 - 이승기', - }, - 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', - }] - - def _real_extract(self, url): - singer_id = self._match_id(url) - - info = self.query_api( - 'artist/%s?id=%s' % (singer_id, singer_id), - singer_id, 'Downloading singer data') - - name = info['artist']['name'] - if info['artist']['trans']: - name = '%s - %s' % (name, info['artist']['trans']) - if info['artist']['alias']: - name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) - - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['hotSongs'] - ] - return self.playlist_result(entries, singer_id, name) - - -class NetEaseMusicListIE(NetEaseMusicBaseIE): - IE_NAME = 'netease:playlist' - IE_DESC = '网易云音乐 - 歌单' - _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://music.163.com/#/playlist?id=79177352', - 'info_dict': { - 'id': '79177352', - 'title': 'Billboard 2007 Top 100', - 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' - }, - 'playlist_count': 99, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'Toplist/Charts sample', - 'url': 'http://music.163.com/#/discover/toplist?id=3733003', - 'info_dict': { - 'id': '3733003', - 'title': 're:韩国Melon排行榜周榜 
[0-9]{4}-[0-9]{2}-[0-9]{2}', - 'description': 'md5:73ec782a612711cadc7872d9c1e134fc', - }, - 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - - info = self.query_api( - 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, - list_id, 'Downloading playlist data')['result'] - - name = info['name'] - desc = info.get('description') - - if info.get('specialType') == 10: # is a chart/toplist - datestamp = datetime.fromtimestamp( - self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') - name = '%s %s' % (name, datestamp) - - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['tracks'] - ] - return self.playlist_result(entries, list_id, name, desc) - - -class NetEaseMusicMvIE(NetEaseMusicBaseIE): - IE_NAME = 'netease:mv' - IE_DESC = '网易云音乐 - MV' - _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://music.163.com/#/mv?id=415350', - 'info_dict': { - 'id': '415350', - 'ext': 'mp4', - 'title': '이럴거면 그러지말지', - 'description': '白雅言自作曲唱甜蜜爱情', - 'creator': '白雅言', - 'upload_date': '20150520', - }, - 'skip': 'Blocked outside Mainland China', - } - - def _real_extract(self, url): - mv_id = self._match_id(url) - - info = self.query_api( - 'mv/detail?id=%s&type=mp4' % mv_id, - mv_id, 'Downloading mv info')['data'] - - formats = [ - {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} - for brs, mv_url in info['brs'].items() - ] - self._sort_formats(formats) - - return { - 'id': mv_id, - 'title': info['name'], - 'description': info.get('desc') or info.get('briefDesc'), - 'creator': info['artistName'], - 'upload_date': info['publishTime'].replace('-', ''), - 'formats': formats, - 'thumbnail': info.get('cover'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), - } - - -class NetEaseMusicProgramIE(NetEaseMusicBaseIE): - IE_NAME = 'netease:program' - IE_DESC = '网易云音乐 - 电台节目' - _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://music.163.com/#/program?id=10109055', - 'info_dict': { - 'id': '10109055', - 'ext': 'mp3', - 'title': '不丹足球背后的故事', - 'description': '喜马拉雅人的足球梦 ...', - 'creator': '大话西藏', - 'timestamp': 1434179342, - 'upload_date': '20150613', - 'duration': 900, - }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'This program has accompanying songs.', - 'url': 'http://music.163.com/#/program?id=10141022', - 'info_dict': { - 'id': '10141022', - 'title': '25岁,你是自在如风的少年<27°C>', - 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - }, - 'playlist_count': 4, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'This program has accompanying songs.', - 'url': 'http://music.163.com/#/program?id=10141022', - 'info_dict': { - 'id': '10141022', - 'ext': 'mp3', - 'title': '25岁,你是自在如风的少年<27°C>', - 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'timestamp': 1434450841, - 'upload_date': '20150616', - }, - 'params': { - 'noplaylist': True - }, - 'skip': 'Blocked outside Mainland China', - }] - - def _real_extract(self, url): - program_id = self._match_id(url) - - info = self.query_api( - 'dj/program/detail?id=%s' % program_id, - program_id, 'Downloading program info')['program'] - - name = info['name'] - description = info['description'] - - if not info['songs'] or self._downloader.params.get('noplaylist'): - if info['songs']: - self.to_screen( - 'Downloading just the main audio %s 
because of --no-playlist' - % info['mainSong']['id']) - - formats = self.extract_formats(info['mainSong']) - self._sort_formats(formats) - - return { - 'id': program_id, - 'title': name, - 'description': description, - 'creator': info['dj']['brand'], - 'timestamp': self.convert_milliseconds(info['createTime']), - 'thumbnail': info['coverUrl'], - 'duration': self.convert_milliseconds(info.get('duration', 0)), - 'formats': formats, - } - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download the main audio %s' - % (program_id, info['mainSong']['id'])) - - song_ids = [info['mainSong']['id']] - song_ids.extend([song['id'] for song in info['songs']]) - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song_id, - 'NetEaseMusic', song_id) - for song_id in song_ids - ] - return self.playlist_result(entries, program_id, name, description) - - -class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): - IE_NAME = 'netease:djradio' - IE_DESC = '网易云音乐 - 电台' - _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://music.163.com/#/djradio?id=42', - 'info_dict': { - 'id': '42', - 'title': '声音蔓延', - 'description': 'md5:766220985cbd16fdd552f64c578a6b15' - }, - 'playlist_mincount': 40, - 'skip': 'Blocked outside Mainland China', - } - _PAGE_SIZE = 1000 - - def _real_extract(self, url): - dj_id = self._match_id(url) - - name = None - desc = None - entries = [] - for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE): - info = self.query_api( - 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' - % (self._PAGE_SIZE, dj_id, offset), - dj_id, 'Downloading dj programs - %d' % offset) - - entries.extend([ - self.url_result( - 'http://music.163.com/#/program?id=%s' % program['id'], - 'NetEaseMusicProgram', program['id']) - for program in info['programs'] - ]) - - if name is None: - radio = info['programs'][0]['radio'] - name = radio['name'] - desc = radio['desc'] - - if not info['more']: - break - - return self.playlist_result(entries, dj_id, name, desc) diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py deleted file mode 100644 index aec3026b1..000000000 --- a/youtube_dl/extractor/netzkino.py +++ /dev/null @@ -1,89 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, - js_to_json, - parse_iso8601, -) - - -class NetzkinoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)' - - _TEST = { - 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond', - 'md5': '92a3f8b76f8d7220acce5377ea5d4873', - 'info_dict': { - 'id': 'rakete-zum-mond', - 'ext': 'mp4', - 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)', - 'comments': 'mincount:3', - 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', - 'upload_date': '20120813', - 'thumbnail': r're:https?://.*\.jpg$', - 'timestamp': 1344858571, - 'age_limit': 12, - }, - 'params': { - 'skip_download': 'Download only works from Germany', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - category_id = mobj.group('category') - video_id = mobj.group('id') - - api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id - api_info = self._download_json(api_url, video_id) - info = next( - p for p in api_info['posts'] if p['slug'] == video_id) - custom_fields = info['custom_fields'] - - production_js = 
self._download_webpage( - 'http://www.netzkino.de/beta/dist/production.min.js', video_id, - note='Downloading player code') - avo_js = self._search_regex( - r'var urlTemplate=(\{.*?"\})', - production_js, 'URL templates') - templates = self._parse_json( - avo_js, video_id, transform_source=js_to_json) - - suffix = { - 'hds': '.mp4/manifest.f4m', - 'hls': '.mp4/master.m3u8', - 'pmd': '.mp4', - } - film_fn = custom_fields['Streaming'][0] - formats = [{ - 'format_id': key, - 'ext': 'mp4', - 'url': tpl.replace('{}', film_fn) + suffix[key], - } for key, tpl in templates.items()] - self._sort_formats(formats) - - comments = [{ - 'timestamp': parse_iso8601(c.get('date'), delimiter=' '), - 'id': c['id'], - 'author': c['name'], - 'html': c['content'], - 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'], - } for c in info.get('comments', [])] - - return { - 'id': video_id, - 'formats': formats, - 'comments': comments, - 'title': info['title'], - 'age_limit': int_or_none(custom_fields.get('FSK')[0]), - 'timestamp': parse_iso8601(info.get('date'), delimiter=' '), - 'description': clean_html(info.get('content')), - 'thumbnail': info.get('thumbnail'), - 'playlist_title': api_info.get('title'), - 'playlist_id': category_id, - } diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py deleted file mode 100644 index 82e7cf522..000000000 --- a/youtube_dl/extractor/newgrounds.py +++ /dev/null @@ -1,168 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - int_or_none, - parse_duration, - parse_filesize, - unified_timestamp, -) - - -class NewgroundsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'https://www.newgrounds.com/audio/listen/549479', - 'md5': 'fe6033d297591288fa1c1f780386f07a', - 'info_dict': { - 'id': '549479', - 'ext': 'mp3', - 'title': 'B7 - BusMode', - 'uploader': 'Burn7', - 'timestamp': 1378878540, - 'upload_date': '20130911', - 'duration': 143, - }, - }, { - 'url': 'https://www.newgrounds.com/portal/view/673111', - 'md5': '3394735822aab2478c31b1004fe5e5bc', - 'info_dict': { - 'id': '673111', - 'ext': 'mp4', - 'title': 'Dancin', - 'uploader': 'Squirrelman82', - 'timestamp': 1460256780, - 'upload_date': '20160410', - }, - }, { - # source format unavailable, additional mp4 formats - 'url': 'http://www.newgrounds.com/portal/view/689400', - 'info_dict': { - 'id': '689400', - 'ext': 'mp4', - 'title': 'ZTV News Episode 8', - 'uploader': 'BennettTheSage', - 'timestamp': 1487965140, - 'upload_date': '20170224', - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - media_id = self._match_id(url) - - webpage = self._download_webpage(url, media_id) - - title = self._html_search_regex( - r'<title>([^>]+)</title>', webpage, 'title') - - media_url = self._parse_json(self._search_regex( - r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id) - - formats = [{ - 'url': media_url, - 'format_id': 'source', - 'quality': 1, - }] - - max_resolution = int_or_none(self._search_regex( - r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', - default=None)) - if max_resolution: - url_base = media_url.rpartition('.')[0] - for resolution in (360, 720, 1080): - if resolution > max_resolution: - break - formats.append({ - 'url': '%s.%dp.mp4' % (url_base, resolution), - 'format_id': '%dp' % resolution, - 'height': resolution, - }) - - self._check_formats(formats, 
media_id) - self._sort_formats(formats) - - uploader = self._html_search_regex( - (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>', - r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader', - fatal=False) - - timestamp = unified_timestamp(self._html_search_regex( - (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', - r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', - default=None)) - duration = parse_duration(self._search_regex( - r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage, - 'duration', default=None)) - - filesize_approx = parse_filesize(self._html_search_regex( - r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize', - default=None)) - if len(formats) == 1: - formats[0]['filesize_approx'] = filesize_approx - - if '<dd>Song' in webpage: - formats[0]['vcodec'] = 'none' - - return { - 'id': media_id, - 'title': title, - 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } - - -class NewgroundsPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.newgrounds.com/collection/cats', - 'info_dict': { - 'id': 'cats', - 'title': 'Cats', - }, - 'playlist_mincount': 46, - }, { - 'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA', - 'info_dict': { - 'id': 'ZONE-SAMA', - 'title': 'Portal Search: ZONE-SAMA', - }, - 'playlist_mincount': 47, - }, { - 'url': 'http://www.newgrounds.com/audio/search/title/cats', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - title = self._search_regex( - r'<title>([^>]+)</title>', webpage, 'title', default=None) - - # cut left menu - webpage = self._search_regex( - r'(?s)<div[^>]+\bclass=["\']column wide(.+)', - webpage, 'wide column', default=webpage) - - entries = [] - for a, path, media_id in re.findall( - r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)', - webpage): - a_class = extract_attributes(a).get('class') - if a_class not in ('item-portalsubmission', 'item-audiosubmission'): - continue - entries.append( - self.url_result( - 'https://www.newgrounds.com/%s' % path, - ie=NewgroundsIE.ie_key(), video_id=media_id)) - - return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py deleted file mode 100644 index 586c1b7eb..000000000 --- a/youtube_dl/extractor/nexx.py +++ /dev/null @@ -1,453 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import random -import re -import time - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - parse_duration, - try_get, - urlencode_postdata, -) - - -class NexxIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/| - nexx:(?:(?P<domain_id_s>\d+):)?| - https?://arc\.nexx\.cloud/api/video/ - ) - (?P<id>\d+) - ''' - _TESTS = [{ - # movie - 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', - 'md5': '31899fd683de49ad46f4ee67e53e83fe', - 'info_dict': { - 'id': '128907', - 'ext': 'mp4', - 'title': 'Stiftung Warentest', - 'alt_title': 'Wie ein Test abläuft', - 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2509, - 'timestamp': 1384264416, 
- 'upload_date': '20131112', - }, - }, { - # episode - 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', - 'info_dict': { - 'id': '247858', - 'ext': 'mp4', - 'title': 'Return of the Golden Child (OV)', - 'description': 'md5:5d969537509a92b733de21bae249dc63', - 'release_year': 2017, - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1397, - 'timestamp': 1495033267, - 'upload_date': '20170517', - 'episode_number': 2, - 'season_number': 2, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - # does not work via arc - 'url': 'nexx:741:1269984', - 'md5': 'c714b5b238b2958dc8d5642addba6886', - 'info_dict': { - 'id': '1269984', - 'ext': 'mp4', - 'title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 607, - 'timestamp': 1518614955, - 'upload_date': '20180214', - }, - }, { - # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html - 'url': 'nexx:747:1533779', - 'md5': '6bf6883912b82b7069fb86c2297e9893', - 'info_dict': { - 'id': '1533779', - 'ext': 'mp4', - 'title': 'Aufregung um ausgebrochene Raubtiere', - 'alt_title': 'Eifel-Zoo', - 'description': 'md5:f21375c91c74ad741dcb164c427999d2', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 111, - 'timestamp': 1527874460, - 'upload_date': '20180601', - }, - }, { - 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', - 'only_matching': True, - }, { - 'url': 'nexx:748:128907', - 'only_matching': True, - }, { - 'url': 'nexx:128907', - 'only_matching': True, - }, { - 'url': 'https://arc.nexx.cloud/api/video/128907.json', - 'only_matching': True, - }] - - @staticmethod - def _extract_domain_id(webpage): - mobj = re.search( - r'<script\b[^>]+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P<id>\d+)', - webpage) - return mobj.group('id') if mobj else None - - @staticmethod - def _extract_urls(webpage): - # Reference: - # 1. 
https://nx-s.akamaized.net/files/201510/44.pdf - - entries = [] - - # JavaScript Integration - domain_id = NexxIE._extract_domain_id(webpage) - if domain_id: - for video_id in re.findall( - r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)', - webpage): - entries.append( - 'https://api.nexx.cloud/v3/%s/videos/byid/%s' - % (domain_id, video_id)) - - # TODO: support more embed formats - - return entries - - @staticmethod - def _extract_url(webpage): - return NexxIE._extract_urls(webpage)[0] - - def _handle_error(self, response): - status = int_or_none(try_get( - response, lambda x: x['metadata']['status']) or 200) - if 200 <= status < 300: - return - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']), - expected=True) - - def _call_api(self, domain_id, path, video_id, data=None, headers={}): - headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' - result = self._download_json( - 'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id, - 'Downloading %s JSON' % path, data=urlencode_postdata(data), - headers=headers) - self._handle_error(result) - return result['result'] - - def _extract_free_formats(self, video, video_id): - stream_data = video['streamdata'] - cdn = stream_data['cdnType'] - assert cdn == 'free' - - hash = video['general']['hash'] - - ps = compat_str(stream_data['originalDomain']) - if stream_data['applyFolderHierarchy'] == 1: - s = ('%04d' % int(video_id))[::-1] - ps += '/%s/%s' % (s[0:2], s[2:4]) - ps += '/%s/%s_' % (video_id, hash) - - t = 'http://%s' + ps - fd = stream_data['azureFileDistribution'].split(',') - cdn_provider = stream_data['cdnProvider'] - - def p0(p): - return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' - - formats = [] - if cdn_provider == 'ak': - t += ',' - for i in fd: - p = i.split(':') - t += p[1] + p0(int(p[0])) + ',' - t += '.mp4.csmil/master.%s' - elif cdn_provider == 'ce': - k = t.split('/') - h = k.pop() - http_base = t = '/'.join(k) - http_base = http_base % stream_data['cdnPathHTTP'] - t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' - for i in fd: - p = i.split(':') - tbr = int(p[0]) - filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) - f = { - 'url': http_base + '/' + filename, - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = p[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) - a = filename + ':%s' % (tbr * 1000) - t += a + ',' - t = t[:-1] + '&audiostream=' + a.split(':')[0] - else: - assert False - - if cdn_provider == 'ce': - formats.extend(self._extract_mpd_formats( - t % (stream_data['cdnPathDASH'], 'mpd'), video_id, - mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_m3u8_formats( - t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) - - return formats - - def _extract_azure_formats(self, video, video_id): - stream_data = video['streamdata'] - cdn = stream_data['cdnType'] - assert cdn == 'azure' - - azure_locator = stream_data['azureLocator'] - - def get_cdn_shield_base(shield_type='', static=False): - for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) - if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) - else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if 
static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) - - language = video['general'].get('language_raw') or '' - - azure_stream_base = get_cdn_shield_base() - is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( - azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' - - protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) - if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token - - formats = self._extract_m3u8_formats( - azure_manifest_url % '(format=m3u8-aapl)', - video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) - formats.extend(self._extract_mpd_formats( - azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) - formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) - - azure_progressive_base = get_cdn_shield_base('Prog', True) - azure_file_distribution = stream_data.get('azureFileDistribution') - if azure_file_distribution: - fds = azure_file_distribution.split(',') - if fds: - for fd in fds: - ss = fd.split(':') - if len(ss) == 2: - tbr = int_or_none(ss[0]) - if tbr: - f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), - 'tbr': tbr, - } - width_height = ss[1].split('x') - if len(width_height) == 2: - f.update({ - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - }) - formats.append(f) - - return formats - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') - video_id = mobj.group('id') - - video = None - - def find_video(result): - if isinstance(result, dict): - return result - elif isinstance(result, list): - vid = int(video_id) - for v in result: - if try_get(v, lambda x: x['general']['ID'], int) == vid: - return v - return None - - response = self._download_json( - 'https://arc.nexx.cloud/api/video/%s.json' % video_id, - video_id, fatal=False) - if response and isinstance(response, dict): - result = response.get('result') - if result: - video = find_video(result) - - # not all videos work via arc, e.g. nexx:741:1269984 - if not video: - # Reverse engineered from JS code (see getDeviceID function) - device_id = '%d:%d:%d%d' % ( - random.randint(1, 4), int(time.time()), - random.randint(1e4, 99999), random.randint(1, 9)) - - result = self._call_api(domain_id, 'session/init', video_id, data={ - 'nxp_devh': device_id, - 'nxp_userh': '', - 'precid': '0', - 'playlicense': '0', - 'screenx': '1920', - 'screeny': '1080', - 'playerversion': '6.0.00', - 'gateway': 'html5', - 'adGateway': '', - 'explicitlanguage': 'en-US', - 'addTextTemplates': '1', - 'addDomainData': '1', - 'addAdModel': '1', - }, headers={ - 'X-Request-Enable-Auth-Fallback': '1', - }) - - cid = result['general']['cid'] - - # As described in [1] X-Request-Token generation algorithm is - # as follows: - # md5( operation + domain_id + domain_secret ) - # where domain_secret is a static value that will be given by nexx.tv - # as per [1]. Here is how this "secret" is generated (reversed - # from _play.api.init function, search for clienttoken). So it's - # actually not static and not that much of a secret. - # 1. 
https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf - secret = result['device']['clienttoken'][int(device_id[0]):] - secret = secret[0:len(secret) - int(device_id[-1])] - - op = 'byid' - - # Reversed from JS code for _play.api.call function (search for - # X-Request-Token) - request_token = hashlib.md5( - ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() - - result = self._call_api( - domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ - 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', - 'addInteractionOptions': '1', - 'addStatusDetails': '1', - 'addStreamDetails': '1', - 'addCaptions': '1', - 'addScenes': '1', - 'addHotSpots': '1', - 'addBumpers': '1', - 'captionFormat': 'data', - }, headers={ - 'X-Request-CID': cid, - 'X-Request-Token': request_token, - }) - video = find_video(result) - - general = video['general'] - title = general['title'] - - cdn = video['streamdata']['cdnType'] - - if cdn == 'azure': - formats = self._extract_azure_formats(video, video_id) - elif cdn == 'free': - formats = self._extract_free_formats(video, video_id) - else: - # TODO: reverse more cdns - assert False - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'alt_title': general.get('subtitle'), - 'description': general.get('description'), - 'release_year': int_or_none(general.get('year')), - 'creator': general.get('studio') or general.get('studio_adref'), - 'thumbnail': try_get( - video, lambda x: x['imagedata']['thumb'], compat_str), - 'duration': parse_duration(general.get('runtime')), - 'timestamp': int_or_none(general.get('uploaded')), - 'episode_number': int_or_none(try_get( - video, lambda x: x['episodedata']['episode'])), - 'season_number': int_or_none(try_get( - video, lambda x: x['episodedata']['season'])), - 'formats': formats, - } - - -class NexxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', - 'md5': '16746bfc28c42049492385c989b26c4a', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - # Reference: - # 1. 
https://nx-s.akamaized.net/files/201510/44.pdf - - # iFrame Embed Integration - return [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1', - webpage)] - - def _real_extract(self, url): - embed_id = self._match_id(url) - - webpage = self._download_webpage(url, embed_id) - - return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py deleted file mode 100644 index 8a9331a79..000000000 --- a/youtube_dl/extractor/nhk.py +++ /dev/null @@ -1,178 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import urljoin - - -class NhkBaseIE(InfoExtractor): - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' - _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' - _TYPE_REGEX = r'/(?P<type>video|audio)/' - - def _call_api(self, m_id, lang, is_video, is_episode, is_clip): - return self._download_json( - self._API_URL_TEMPLATE % ( - 'v' if is_video else 'r', - 'clip' if is_clip else 'esd', - 'episode' if is_episode else 'program', - m_id, lang, '/all' if is_video else ''), - m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] - - def _extract_episode_info(self, url, episode=None): - fetch_episode = episode is None - lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() - if episode_id.isdigit(): - episode_id = episode_id[:4] + '-' + episode_id[4:] - - is_video = m_type == 'video' - if fetch_episode: - episode = self._call_api( - episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] - title = episode.get('sub_title_clean') or episode['sub_title'] - - def get_clean_field(key): - return episode.get(key + '_clean') or episode.get(key) - - series = get_clean_field('title') - - thumbnails = [] - for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: - img_path = episode.get('image' + s) - if not img_path: - continue - thumbnails.append({ - 'id': '%dp' % h, - 'height': h, - 'width': w, - 'url': 'https://www3.nhk.or.jp' + img_path, - }) - - info = { - 'id': episode_id + '-' + lang, - 'title': '%s - %s' % (series, title) if series and title else title, - 'description': get_clean_field('description'), - 'thumbnails': thumbnails, - 'series': series, - 'episode': title, - } - if is_video: - vod_id = episode['vod_id'] - info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, - 'id': vod_id, - }) - else: - if fetch_episode: - audio_path = episode['audio']['audio'] - info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in info['formats']: - f['language'] = lang - else: - info.update({ - '_type': 'url_transparent', - 'ie_key': NhkVodIE.ie_key(), - 'url': url, - }) - return info - - -class NhkVodIE(NhkBaseIE): - _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) - # Content available only for a limited period of time. Visit - # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
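
The five-way template expansion performed by `NhkBaseIE._call_api` above is easier to follow spelled out. A minimal standalone sketch, not part of the diff, using the same argument order as the deleted code; the id '9999-011' is simply the hyphenated form of the 9999011 clip id that appears in the tests below:

    # Sketch of how NhkBaseIE builds its ondemand API URL (assumption: argument
    # order matches _call_api above; only the template itself is from the source).
    API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'

    def build_nhk_api_url(m_id, lang, is_video, is_episode, is_clip):
        return API_URL_TEMPLATE % (
            'v' if is_video else 'r',            # video ('vod...') vs. radio ('rod...') catalogue
            'clip' if is_clip else 'esd',        # clips live in a separate listing
            'episode' if is_episode else 'program',
            m_id, lang,
            '/all' if is_video else '')          # video listings take an extra '/all' segment

    # build_nhk_api_url('9999-011', 'en', True, True, True)
    # -> 'https://api.nhk.or.jp/nhkworld/vodcliplist/v7a/episode/9999-011/en/all/all.json'
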
- _TESTS = [{ - # video clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', - 'info_dict': { - 'id': 'a95j5iza', - 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", - 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', - }, - }, { - # audio clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', - 'info_dict': { - 'id': 'r_inventions-20201104-1-en', - 'ext': 'm4a', - 'title': "Japan's Top Inventions - Miniature Video Cameras", - 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', - 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/', - 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', - 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self._extract_episode_info(url) - - -class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) - _TESTS = [{ - # video program episodes - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', - 'info_dict': { - 'id': 'japanrailway', - 'title': 'Japan Railway Journal', - }, - 'playlist_mincount': 1, - }, { - # video program clips - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', - 'info_dict': { - 'id': 'japanrailway', - 'title': 'Japan Railway Journal', - }, - 'playlist_mincount': 5, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', - 'only_matching': True, - }, { - # audio program - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', - 'only_matching': True, - }] - - def _real_extract(self, url): - lang, m_type, program_id, episode_type = re.match(self._VALID_URL, url).groups() - - episodes = self._call_api( - program_id, lang, m_type == 'video', False, episode_type == 'clip') - - entries = [] - for episode in episodes: - episode_path = episode.get('url') - if not episode_path: - continue - entries.append(self._extract_episode_info( - urljoin(url, episode_path), episode)) - - program_title = None - if entries: - program_title = entries[0].get('series') - - return self.playlist_result(entries, program_id, program_title) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py deleted file mode 100644 index eddfe1f37..000000000 --- a/youtube_dl/extractor/nhl.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - int_or_none, - parse_iso8601, - parse_duration, -) - - -class NHLBaseIE(InfoExtractor): - def _real_extract(self, url): - site, tmp_id = re.match(self._VALID_URL, url).groups() - video_data = self._download_json( - 'https://%s/%s/%sid/v1/%s/details/web-v1.json' - % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) - if video_data.get('type') != 'video': - video_data = video_data['media'] - 
video = video_data.get('video') - if video: - video_data = video - else: - videos = video_data.get('videos') - if videos: - video_data = videos[0] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), - }) - self._sort_formats(formats) - - thumbnails = [] - cuts = video_data.get('image', {}).get('cuts') or [] - if isinstance(cuts, dict): - cuts = cuts.values() - for thumbnail_data in cuts: - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } - - -class NHLIE(NHLBaseIE): - IE_NAME = 'nhl.com' - _VALID_URL = r'https?://(?:www\.)?(?P<site>nhl|wch2016)\.com/(?:[^/]+/)*c-(?P<id>\d+)' - _CONTENT_DOMAIN = 'nhl.bamcontent.com' - _TESTS = [{ - # type=video - 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', - 'md5': '0f7b9a8f986fb4b4eeeece9a56416eaf', - 'info_dict': { - 'id': '43663503', - 'ext': 'mp4', - 'title': 'Anisimov cleans up mess', - 'description': 'md5:a02354acdfe900e940ce40706939ca63', - 'timestamp': 1461288600, - 'upload_date': '20160422', - }, - }, { - # type=article - 'url': 'https://www.nhl.com/news/dennis-wideman-suspended/c-278258934', - 'md5': '1f39f4ea74c1394dea110699a25b366c', - 'info_dict': { - 'id': '40784403', - 'ext': 'mp4', - 'title': 'Wideman suspended by NHL', - 'description': 'Flames defenseman Dennis Wideman was banned 20 games for violation of Rule 40 (Physical Abuse of Officials)', - 'upload_date': '20160204', - 'timestamp': 1454544904, - }, - }, { - # Some m3u8 URLs are invalid (https://github.com/ytdl-org/youtube-dl/issues/10713) - 'url': 'https://www.nhl.com/predators/video/poile-laviolette-on-subban-trade/t-277437416/c-44315003', - 'md5': '50b2bb47f405121484dda3ccbea25459', - 'info_dict': { - 'id': '44315003', - 'ext': 'mp4', - 'title': 'Poile, Laviolette on Subban trade', - 'description': 'General manager David Poile and head coach Peter Laviolette share their thoughts on acquiring P.K. 
Subban from Montreal (06/29/16)', - 'timestamp': 1467242866, - 'upload_date': '20160629', - }, - }, { - 'url': 'https://www.wch2016.com/video/caneur-best-of-game-2-micd-up/t-281230378/c-44983703', - 'only_matching': True, - }, { - 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py deleted file mode 100644 index 2e8b302ac..000000000 --- a/youtube_dl/extractor/nick.py +++ /dev/null @@ -1,249 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .mtv import MTVServicesInfoExtractor -from ..utils import update_url_query - - -class NickIE(MTVServicesInfoExtractor): - # None of videos on the website are still alive? - IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?P<domain>(?:(?:www|beta)\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' - _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' - _GEO_COUNTRIES = ['US'] - _TESTS = [{ - 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', - 'playlist': [ - { - 'md5': '6e5adc1e28253bbb1b28ab05403dd4d4', - 'info_dict': { - 'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30', - 'ext': 'mp4', - 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1', - 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', - - } - }, - { - 'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce', - 'info_dict': { - 'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30', - 'ext': 'mp4', - 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2', - 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', - - } - }, - { - 'md5': 'efffe1728a234b2b0d2f2b343dd1946f', - 'info_dict': { - 'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30', - 'ext': 'mp4', - 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3', - 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', - } - }, - { - 'md5': '1ec6690733ab9f41709e274a1d5c7556', - 'info_dict': { - 'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30', - 'ext': 'mp4', - 'title': 'ALVINNN!!! 
and The Chipmunks: "Mojo Missing/Who\'s The Animal" S4', - 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', - } - }, - ], - }, { - 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', - 'only_matching': True, - }, { - 'url': 'http://beta.nick.com/nicky-ricky-dicky-and-dawn/videos/nicky-ricky-dicky-dawn-301-full-episode/', - 'only_matching': True, - }] - - def _get_feed_query(self, uri): - return { - 'feed': 'nick_arc_player_prime', - 'mgid': uri, - } - - def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - video_data = self._download_json( - 'http://%s/data/video.endLevel.json' % domain, - display_id, query={ - 'urlKey': display_id, - }) - return self._get_videos_info(video_data['player'] + video_data['id']) - - -class NickBrIE(MTVServicesInfoExtractor): - IE_NAME = 'nickelodeon:br' - _VALID_URL = r'''(?x) - https?:// - (?: - (?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.[a-z]{2}| - (?:www\.)?nickelodeonjunior\.fr - ) - /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?\#.]+) - ''' - _TESTS = [{ - 'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', - 'only_matching': True, - }, { - 'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', - 'only_matching': True, - }, { - 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', - 'only_matching': True, - }, { - 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeonjunior.fr/paw-patrol-la-pat-patrouille/videos/episode-401-entier-paw-patrol/', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - uri = self._search_regex( - r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid') - video_id = self._id_from_uri(uri) - config = self._download_json( - 'http://media.mtvnservices.com/pmt/e1/access/index.html', - video_id, query={ - 'uri': uri, - 'configtype': 'edge', - }, headers={ - 'Referer': url, - }) - info_url = self._remove_template_parameter(config['feedWithQueryParams']) - if info_url == 'None': - if domain.startswith('www.'): - domain = domain[4:] - content_domain = { - 'mundonick.uol': 'mundonick.com.br', - 'nickjr': 'br.nickelodeonjunior.tv', - }[domain] - query = { - 'mgid': uri, - 'imageEp': content_domain, - 'arcEp': content_domain, - } - if domain == 'nickjr.com.br': - query['ep'] = 'c4b16088' - info_url = update_url_query( - 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed', query) - return self._get_videos_info_from_url(info_url, video_id) - - -class NickDeIE(MTVServicesInfoExtractor): - IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl|ch)|nickelodeon\.(?:nl|be|at|dk|no|se))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', - 'only_matching': True, - }, { - 'url': 'http://www.nick.de/shows/342-icarly', - 'only_matching': True, - }, { - 'url': 
'http://www.nickelodeon.nl/shows/474-spongebob/videos/17403-een-kijkje-in-de-keuken-met-sandy-van-binnenuit', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht', - 'only_matching': True, - }, { - 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.no/program/2626-bulderhuset/videoer/90947-femteklasse-veronica-vs-vanzilla', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.dk/serier/2626-hojs-hus/videoer/761-tissepause', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.se/serier/2626-lugn-i-stormen/videos/998-', - 'only_matching': True, - }, { - 'url': 'http://www.nick.ch/shows/2304-adventure-time-abenteuerzeit-mit-finn-und-jake', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.be/afspeellijst/4530-top-videos/videos/episode/73917-inval-broodschapper-lariekoek-arie', - 'only_matching': True, - }] - - def _extract_mrss_url(self, webpage, host): - return update_url_query(self._search_regex( - r'data-mrss=(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url'), - {'siteKey': host}) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._extract_mrss_url(webpage, host) - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class NickNightIE(NickDeIE): - IE_NAME = 'nicknight' - _VALID_URL = r'https?://(?:www\.)(?P<host>nicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.nicknight.at/shows/977-awkward/videos/85987-nimmer-beste-freunde', - 'only_matching': True, - }, { - 'url': 'http://www.nicknight.at/shows/977-awkward', - 'only_matching': True, - }, { - 'url': 'http://www.nicknight.at/shows/1900-faking-it', - 'only_matching': True, - }] - - def _extract_mrss_url(self, webpage, *args): - return self._search_regex( - r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, - 'mrss url', group='url') - - -class NickRuIE(MTVServicesInfoExtractor): - IE_NAME = 'nickelodeonru' - _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.fr/programmes/bob-l-eponge/videos/le-marathon-de-booh-kini-bottom-mardi-31-octobre/nfn7z0', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.es/videos/nickelodeon-consejos-tortitas/f7w7xy', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.pt/series/spongebob-squarepants/videos/a-bolha-de-tinta-gigante/xutq1b', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.ro/emisiuni/shimmer-si-shine/video/nahal-din-bomboane/uw5u2k', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.hu/musorok/spongyabob-kockanadrag/videok/episodes/buborekfujas-az-elszakadt-nadrag/q57iob#playlist/k6te4y', - 'only_matching': True, - }, { - 'url': 'http://www.nickelodeon.com.tr/programlar/sunger-bob/videolar/kayip-yatak/mgqbjy', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = 
self._match_id(url) - webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) - return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py deleted file mode 100644 index a85fc3d5c..000000000 --- a/youtube_dl/extractor/niconico.py +++ /dev/null @@ -1,515 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import datetime -import functools -import json -import math - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - determine_ext, - dict_get, - ExtractorError, - float_or_none, - InAdvancePagedList, - int_or_none, - parse_duration, - parse_iso8601, - remove_start, - try_get, - unified_timestamp, - urlencode_postdata, - xpath_text, -) - - -class NiconicoIE(InfoExtractor): - IE_NAME = 'niconico' - IE_DESC = 'ニコニコ動画' - - _TESTS = [{ - 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', - 'info_dict': { - 'id': 'sm22312215', - 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'thumbnail': r're:https?://.*', - 'uploader': 'takuya0301', - 'uploader_id': '2698420', - 'upload_date': '20131123', - 'timestamp': int, # timestamp is unstable - 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', - 'duration': 33, - 'view_count': int, - 'comment_count': int, - }, - 'skip': 'Requires an account', - }, { - # File downloaded with and without credentials are different, so omit - # the md5 field - 'url': 'http://www.nicovideo.jp/watch/nm14296458', - 'info_dict': { - 'id': 'nm14296458', - 'ext': 'swf', - 'title': '【鏡音リン】Dance on media【オリジナル】take2!', - 'description': 'md5:689f066d74610b3b22e0f1739add0f58', - 'thumbnail': r're:https?://.*', - 'uploader': 'りょうた', - 'uploader_id': '18822557', - 'upload_date': '20110429', - 'timestamp': 1304065916, - 'duration': 209, - }, - 'skip': 'Requires an account', - }, { - # 'video exists but is marked as "deleted" - # md5 is unstable - 'url': 'http://www.nicovideo.jp/watch/sm10000', - 'info_dict': { - 'id': 'sm10000', - 'ext': 'unknown_video', - 'description': 'deleted', - 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', - 'thumbnail': r're:https?://.*', - 'upload_date': '20071224', - 'timestamp': int, # timestamp field has different value if logged in - 'duration': 304, - 'view_count': int, - }, - 'skip': 'Requires an account', - }, { - 'url': 'http://www.nicovideo.jp/watch/so22543406', - 'info_dict': { - 'id': '1388129933', - 'ext': 'mp4', - 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~', - 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1', - 'thumbnail': r're:https?://.*', - 'timestamp': 1388851200, - 'upload_date': '20140104', - 'uploader': 'アニメロチャンネル', - 'uploader_id': '312', - }, - 'skip': 'The viewing period of the video you were searching for has expired.', - }, { - # video not available via `getflv`; "old" HTML5 video - 'url': 'http://www.nicovideo.jp/watch/sm1151009', - 'md5': '8fa81c364eb619d4085354eab075598a', - 'info_dict': { - 'id': 'sm1151009', - 'ext': 'mp4', - 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', - 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', - 'thumbnail': r're:https?://.*', - 'duration': 184, - 'timestamp': 1190868283, - 'upload_date': '20070927', - 'uploader': 'denden2', - 'uploader_id': '1392194', - 'view_count': int, - 'comment_count': int, - }, - 'skip': 'Requires an account', - }, { - # "New" HTML5 video - # md5 is unstable - 'url': 
'http://www.nicovideo.jp/watch/sm31464864', - 'info_dict': { - 'id': 'sm31464864', - 'ext': 'mp4', - 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', - 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', - 'timestamp': 1498514060, - 'upload_date': '20170626', - 'uploader': 'ゲスト', - 'uploader_id': '40826363', - 'thumbnail': r're:https?://.*', - 'duration': 198, - 'view_count': int, - 'comment_count': int, - }, - 'skip': 'Requires an account', - }, { - # Video without owner - 'url': 'http://www.nicovideo.jp/watch/sm18238488', - 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', - 'info_dict': { - 'id': 'sm18238488', - 'ext': 'mp4', - 'title': '【実写版】ミュータントタートルズ', - 'description': 'md5:15df8988e47a86f9e978af2064bf6d8e', - 'timestamp': 1341160408, - 'upload_date': '20120701', - 'uploader': None, - 'uploader_id': None, - 'thumbnail': r're:https?://.*', - 'duration': 5271, - 'view_count': int, - 'comment_count': int, - }, - 'skip': 'Requires an account', - }, { - 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', - 'only_matching': True, - }] - - _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' - _NETRC_MACHINE = 'niconico' - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - # No authentication to be performed - if not username: - return True - - # Log in - login_ok = True - login_form_strs = { - 'mail_tel': username, - 'password': password, - } - urlh = self._request_webpage( - 'https://account.nicovideo.jp/api/v1/login', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(login_form_strs)) - if urlh is False: - login_ok = False - else: - parts = compat_urllib_parse_urlparse(urlh.geturl()) - if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': - login_ok = False - if not login_ok: - self._downloader.report_warning('unable to log in: bad username or password') - return login_ok - - def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): - def yesno(boolean): - return 'yes' if boolean else 'no' - - session_api_data = api_data['video']['dmcInfo']['session_api'] - session_api_endpoint = session_api_data['urls'][0] - - format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) - - session_response = self._download_json( - session_api_endpoint['url'], video_id, - query={'_format': 'json'}, - headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for %s' % format_id, - data=json.dumps({ - 'session': { - 'client_info': { - 'player_id': session_api_data['player_id'], - }, - 'content_auth': { - 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], - 'content_key_timeout': session_api_data['content_key_timeout'], - 'service_id': 'nicovideo', - 'service_user_id': session_api_data['service_user_id'] - }, - 'content_id': session_api_data['content_id'], - 'content_src_id_sets': [{ - 'content_src_ids': [{ - 'src_id_to_mux': { - 'audio_src_ids': [audio_quality['id']], - 'video_src_ids': [video_quality['id']], - } - }] - }], - 'content_type': 'movie', - 'content_uri': '', - 'keep_method': { - 'heartbeat': { - 'lifetime': session_api_data['heartbeat_lifetime'] - } - }, - 'priority': session_api_data['priority'], - 'protocol': { - 'name': 'http', - 'parameters': { - 'http_parameters': { - 'parameters': { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_endpoint['is_ssl']), - 'use_well_known_port': 
yesno(session_api_endpoint['is_well_known_port']), - } - } - } - } - }, - 'recipe_id': session_api_data['recipe_id'], - 'session_operation_auth': { - 'session_operation_auth_by_signature': { - 'signature': session_api_data['signature'], - 'token': session_api_data['token'], - } - }, - 'timing_constraint': 'unlimited' - } - }).encode()) - - resolution = video_quality.get('resolution', {}) - - return { - 'url': session_response['data']['session']['content_uri'], - 'format_id': format_id, - 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'abr': float_or_none(audio_quality.get('bitrate'), 1000), - 'vbr': float_or_none(video_quality.get('bitrate'), 1000), - 'height': resolution.get('height'), - 'width': resolution.get('width'), - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Get video webpage. We are not actually interested in it for normal - # cases, but need the cookies in order to be able to download the - # info webpage - webpage, handle = self._download_webpage_handle( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) - if video_id.startswith('so'): - video_id = self._match_id(handle.geturl()) - - api_data = self._parse_json(self._html_search_regex( - 'data-api-data="([^"]+)"', webpage, - 'API data', default='{}'), video_id) - - def _format_id_from_url(video_url): - return 'economy' if video_real_url.endswith('low') else 'normal' - - try: - video_real_url = api_data['video']['smileInfo']['url'] - except KeyError: # Flash videos - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') - - flv_info = compat_parse_qs(flv_info_webpage) - if 'url' not in flv_info: - if 'deleted' in flv_info: - raise ExtractorError('The video has been deleted.', - expected=True) - elif 'closed' in flv_info: - raise ExtractorError('Niconico videos now require logging in', - expected=True) - elif 'error' in flv_info: - raise ExtractorError('%s reports error: %s' % ( - self.IE_NAME, flv_info['error'][0]), expected=True) - else: - raise ExtractorError('Unable to find video URL') - - video_info_xml = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, - video_id, note='Downloading video info page') - - def get_video_info(items): - if not isinstance(items, list): - items = [items] - for item in items: - ret = xpath_text(video_info_xml, './/' + item) - if ret: - return ret - - video_real_url = flv_info['url'][0] - - extension = get_video_info('movie_type') - if not extension: - extension = determine_ext(video_real_url) - - formats = [{ - 'url': video_real_url, - 'ext': extension, - 'format_id': _format_id_from_url(video_real_url), - }] - else: - formats = [] - - dmc_info = api_data['video'].get('dmcInfo') - if dmc_info: # "New" HTML5 videos - quality_info = dmc_info['quality'] - for audio_quality in quality_info['audios']: - for video_quality in quality_info['videos']: - if not audio_quality['available'] or not video_quality['available']: - continue - formats.append(self._extract_format_for_quality( - api_data, video_id, audio_quality, video_quality)) - - self._sort_formats(formats) - else: # "Old" HTML5 videos - formats = [{ - 'url': video_real_url, - 'ext': 'mp4', - 'format_id': _format_id_from_url(video_real_url), - }] - - def get_video_info(items): - return dict_get(api_data['video'], items) - - # Start extracting information - title = get_video_info('title') - if not title: - title = self._og_search_title(webpage, 
default=None) - if not title: - title = self._html_search_regex( - r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', - webpage, 'video title') - - watch_api_data_string = self._html_search_regex( - r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', - webpage, 'watch api data', default=None) - watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} - video_detail = watch_api_data.get('videoDetail', {}) - - thumbnail = ( - get_video_info(['thumbnail_url', 'thumbnailURL']) - or self._html_search_meta('image', webpage, 'thumbnail', default=None) - or video_detail.get('thumbnail')) - - description = get_video_info('description') - - timestamp = (parse_iso8601(get_video_info('first_retrieve')) - or unified_timestamp(get_video_info('postedDateTime'))) - if not timestamp: - match = self._html_search_meta('datePublished', webpage, 'date published', default=None) - if match: - timestamp = parse_iso8601(match.replace('+', ':00+')) - if not timestamp and video_detail.get('postedAt'): - timestamp = parse_iso8601( - video_detail['postedAt'].replace('/', '-'), - delimiter=' ', timezone=datetime.timedelta(hours=9)) - - view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) - if not view_count: - match = self._html_search_regex( - r'>Views: <strong[^>]*>([^<]+)</strong>', - webpage, 'view count', default=None) - if match: - view_count = int_or_none(match.replace(',', '')) - view_count = view_count or video_detail.get('viewCount') - - comment_count = (int_or_none(get_video_info('comment_num')) - or video_detail.get('commentCount') - or try_get(api_data, lambda x: x['thread']['commentCount'])) - if not comment_count: - match = self._html_search_regex( - r'>Comments: <strong[^>]*>([^<]+)</strong>', - webpage, 'comment count', default=None) - if match: - comment_count = int_or_none(match.replace(',', '')) - - duration = (parse_duration( - get_video_info('length') - or self._html_search_meta( - 'video:duration', webpage, 'video duration', default=None)) - or video_detail.get('length') - or get_video_info('duration')) - - webpage_url = get_video_info('watch_url') or url - - # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" - # in the JSON, which will cause None to be returned instead of {}. 
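
The pitfall that comment describes deserves one concrete line. A minimal illustration with hypothetical data (`try_get` is the helper from `youtube_dl.utils`):

    # "owner": null in the API JSON deserializes to None, so a .get() default
    # never applies -- the default only covers a *missing* key.
    from youtube_dl.utils import try_get

    api_data = {'owner': None}                                 # hypothetical response
    api_data.get('owner', {})                                  # -> None, not {}
    try_get(api_data, lambda x: x.get('owner'), dict) or {}    # -> {} for null, missing, or non-dict
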
- owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} - uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') - uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'uploader': uploader, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'comment_count': comment_count, - 'duration': duration, - 'webpage_url': webpage_url, - } - - -class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.nicovideo.jp/mylist/27411728', - 'info_dict': { - 'id': '27411728', - 'title': 'AKB48のオールナイトニッポン', - 'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08', - 'uploader': 'のっく', - 'uploader_id': '805442', - }, - 'playlist_mincount': 225, - }, { - 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', - 'only_matching': True, - }] - _PAGE_SIZE = 100 - - def _call_api(self, list_id, resource, query): - return self._download_json( - 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, - 'Downloading %s JSON metatdata' % resource, query=query, - headers={'X-Frontend-Id': 6})['data']['mylist'] - - def _parse_owner(self, item): - owner = item.get('owner') or {} - if owner: - return { - 'uploader': owner.get('name'), - 'uploader_id': owner.get('id'), - } - return {} - - def _fetch_page(self, list_id, page): - page += 1 - items = self._call_api(list_id, 'page %d' % page, { - 'page': page, - 'pageSize': self._PAGE_SIZE, - })['items'] - for item in items: - video = item.get('video') or {} - video_id = video.get('id') - if not video_id: - continue - count = video.get('count') or {} - get_count = lambda x: int_or_none(count.get(x)) - info = { - '_type': 'url', - 'id': video_id, - 'title': video.get('title'), - 'url': 'https://www.nicovideo.jp/watch/' + video_id, - 'description': video.get('shortDescription'), - 'duration': int_or_none(video.get('duration')), - 'view_count': get_count('view'), - 'comment_count': get_count('comment'), - 'ie_key': NiconicoIE.ie_key(), - } - info.update(self._parse_owner(video)) - yield info - - def _real_extract(self, url): - list_id = self._match_id(url) - mylist = self._call_api(list_id, 'list', { - 'pageSize': 1, - }) - entries = InAdvancePagedList( - functools.partial(self._fetch_page, list_id), - math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), - self._PAGE_SIZE) - result = self.playlist_result( - entries, list_id, mylist.get('name'), mylist.get('description')) - result.update(self._parse_owner(mylist)) - return result diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py deleted file mode 100644 index cfc220314..000000000 --- a/youtube_dl/extractor/ninecninemedia.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, - try_get, -) - - -class NineCNineMediaIE(InfoExtractor): - IE_NAME = '9c9media' - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)' - _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' - - def _real_extract(self, url): - destination_code, content_id = re.match(self._VALID_URL, url).groups() - api_base_url = self._API_BASE_TEMPLATE % 
(destination_code, content_id) - content = self._download_json(api_base_url, content_id, query={ - '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]', - }) - title = content['Name'] - content_package = content['ContentPackages'][0] - package_id = content_package['Id'] - content_package_url = api_base_url + 'contentpackages/%s/' % package_id - content_package = self._download_json( - content_package_url, content_id, query={ - '$include': '[HasClosedCaptions]', - }) - - if try_get(content_package, lambda x: x['Constraints']['Security']['Type']): - raise ExtractorError('This video is DRM protected.', expected=True) - - manifest_base_url = content_package_url + 'manifest.' - formats = [] - formats.extend(self._extract_m3u8_formats( - manifest_base_url + 'm3u8', content_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - manifest_base_url + 'f4m', content_id, - f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - manifest_base_url + 'mpd', content_id, - mpd_id='dash', fatal=False)) - self._sort_formats(formats) - - thumbnails = [] - for image in (content.get('Images') or []): - image_url = image.get('Url') - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - 'width': int_or_none(image.get('Width')), - 'height': int_or_none(image.get('Height')), - }) - - tags, categories = [], [] - for source_name, container in (('Tags', tags), ('Genres', categories)): - for e in content.get(source_name, []): - e_name = e.get('Name') - if not e_name: - continue - container.append(e_name) - - season = content.get('Season') or {} - - info = { - 'id': content_id, - 'title': title, - 'description': content.get('Desc') or content.get('ShortDesc'), - 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), - 'episode_number': int_or_none(content.get('Episode')), - 'season': season.get('Name'), - 'season_number': int_or_none(season.get('Number')), - 'season_id': season.get('Id'), - 'series': try_get(content, lambda x: x['Media']['Name']), - 'tags': tags, - 'categories': categories, - 'duration': float_or_none(content_package.get('Duration')), - 'formats': formats, - 'thumbnails': thumbnails, - } - - if content_package.get('HasClosedCaptions'): - info['subtitles'] = { - 'en': [{ - 'url': manifest_base_url + 'vtt', - 'ext': 'vtt', - }, { - 'url': manifest_base_url + 'srt', - 'ext': 'srt', - }] - } - - return info diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py deleted file mode 100644 index 6157dc7c1..000000000 --- a/youtube_dl/extractor/ninenow.py +++ /dev/null @@ -1,93 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - float_or_none, - smuggle_url, -) - - -class NineNowIE(InfoExtractor): - IE_NAME = '9now.com.au' - _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)' - _GEO_COUNTRIES = ['AU'] - _TESTS = [{ - # clip - 'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc', - 'md5': '17cf47d63ec9323e562c9957a968b565', - 'info_dict': { - 'id': '16801', - 'ext': 'mp4', - 'title': 'St. 
Kilda\'s Joey Montagna on the potential for a player\'s strike', - 'description': 'Is a boycott of the NAB Cup "on the table"?', - 'uploader_id': '4460760524001', - 'upload_date': '20160713', - 'timestamp': 1468421266, - }, - 'skip': 'Only available in Australia', - }, { - # episode - 'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19', - 'only_matching': True, - }, { - # DRM protected - 'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - page_data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.*?});', webpage, - 'page data', default='{}'), display_id, fatal=False) - if not page_data: - page_data = self._parse_json(self._parse_json(self._search_regex( - r'window\.__data\s*=\s*JSON\.parse\s*\(\s*(".+?")\s*\)\s*;', - webpage, 'page data'), display_id), display_id) - - for kind in ('episode', 'clip'): - current_key = page_data.get(kind, {}).get( - 'current%sKey' % kind.capitalize()) - if not current_key: - continue - cache = page_data.get(kind, {}).get('%sCache' % kind, {}) - if not cache: - continue - common_data = (cache.get(current_key) or list(cache.values())[0])[kind] - break - else: - raise ExtractorError('Unable to find video data') - - video_data = common_data['video'] - - if video_data.get('drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - - brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId'] - video_id = compat_str(video_data.get('id') or brightcove_id) - title = common_data['name'] - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_id[1:]) - } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()] - - return { - '_type': 'url_transparent', - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': self._GEO_COUNTRIES}), - 'id': video_id, - 'title': title, - 'description': common_data.get('description'), - 'duration': float_or_none(video_data.get('duration'), 1000), - 'thumbnails': thumbnails, - 'ie_key': 'BrightcoveNew', - } diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py deleted file mode 100644 index 47b9748f0..000000000 --- a/youtube_dl/extractor/nova.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - int_or_none, - js_to_json, - qualities, - unified_strdate, - url_or_none, -) - - -class NovaEmbedIE(InfoExtractor): - _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'ee009bafcc794541570edd44b71cbea3', - 'info_dict': { - 'id': '8o0n0r', - 'ext': 'mp4', - 'title': '2180. 
díl', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2578, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - duration = None - formats = [] - - player = self._parse_json( - self._search_regex( - r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', - webpage, 'player', default='{}'), video_id, fatal=False) - if player: - for format_id, format_list in player['tracks'].items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_dict in format_list: - if not isinstance(format_dict, dict): - continue - format_url = url_or_none(format_dict.get('src')) - format_type = format_dict.get('type') - ext = determine_ext(format_url) - if (format_type == 'application/x-mpegURL' - or format_id == 'HLS' or ext == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - elif (format_type == 'application/dash+xml' - or format_id == 'DASH' or ext == 'mpd'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - }) - duration = int_or_none(player.get('duration')) - else: - # Old path, not actual as of 08.04.2020 - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { - 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), - }) - break - f['format_id'] = f_id - formats.append(f) - - self._sort_formats(formats) - - title = self._og_search_title( - webpage, default=None) or self._search_regex( - (r'<value>(?P<title>[^<]+)', - r'videoTitle\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'title', group='value') - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._search_regex( - r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='value') - duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', - default=duration)) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } - - -class NovaIE(InfoExtractor): - IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' - _TESTS = [{ - 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', - 'md5': '249baab7d0104e186e78b0899c7d5f28', - 'info_dict': { - 'id': '1757139', - 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', - 'ext': 'mp4', - 'title': 'Podzemní nemocnice v pražské Krči', - 
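            # A note on the fixture format, assuming the stock youtube-dl test
            # harness: expected values prefixed with "md5:" are not compared
            # literally; the harness hashes the actual field value and compares
            # hex digests, roughly hashlib.md5(got.encode('utf-8')).hexdigest().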
'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', - 'thumbnail': r're:^https?://.*\.(?:jpg)', - } - }, { - 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', - 'info_dict': { - 'id': '1753621', - 'ext': 'mp4', - 'title': 'Zaklínač 3: Divoký hon', - 'description': 're:.*Pokud se stejně jako my nemůžete.*', - 'thumbnail': r're:https?://.*\.jpg(\?.*)?', - 'upload_date': '20150521', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'gone', - }, { - # media.cms.nova.cz embed - 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', - 'info_dict': { - 'id': '8o0n0r', - 'ext': 'mp4', - 'title': '2180. díl', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2578, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [NovaEmbedIE.ie_key()], - 'skip': 'CHYBA 404: STRÁNKA NENALEZENA', - }, { - 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', - 'only_matching': True, - }, { - 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', - 'only_matching': True, - }, { - 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', - 'only_matching': True, - }, { - 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', - 'only_matching': True, - }, { - 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - site = mobj.group('site') - - webpage = self._download_webpage(url, display_id) - - description = clean_html(self._og_search_description(webpage, default=None)) - if site == 'novaplus': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) - elif site == 'fanda': - upload_date = unified_strdate(self._search_regex( - r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) - else: - upload_date = None - - # novaplus - embed_id = self._search_regex( - r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', - webpage, 'embed url', default=None) - if embed_id: - return { - '_type': 'url_transparent', - 'url': 'https://media.cms.nova.cz/embed/%s' % embed_id, - 'ie_key': NovaEmbedIE.ie_key(), - 'id': embed_id, - 'description': description, - 'upload_date': upload_date - } - - video_id = self._search_regex( - [r"(?:media|video_id)\s*:\s*'(\d+)'", - r'media=(\d+)', - r'id="article_video_(\d+)"', - r'id="player_(\d+)"'], - webpage, 'video id') - - config_url = self._search_regex( - r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', - webpage, 'config url', default=None) - config_params = {} - - if not config_url: - player = self._parse_json( - self._search_regex( - r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage, - 'player', default='{}'), - video_id, transform_source=js_to_json, fatal=False) - if player: - config_url = url_or_none(player.get('configUrl')) - params = player.get('configParams') - if isinstance(params, dict): - config_params = params - - if not config_url: - DEFAULT_SITE_ID = '23000' - SITES = { - 'tvnoviny': DEFAULT_SITE_ID, - 'novaplus': DEFAULT_SITE_ID, - 'vymena': DEFAULT_SITE_ID, - 'krasna': 
DEFAULT_SITE_ID, - 'fanda': '30', - 'tn': '30', - 'doma': '30', - } - - site_id = self._search_regex( - r'site=(\d+)', webpage, 'site id', default=None) or SITES.get( - site, DEFAULT_SITE_ID) - - config_url = 'https://api.nova.cz/bin/player/videojs/config.php' - config_params = { - 'site': site_id, - 'media': video_id, - 'quality': 3, - 'version': 1, - } - - config = self._download_json( - config_url, display_id, - 'Downloading config JSON', query=config_params, - transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) - - mediafile = config['mediafile'] - video_url = mediafile['src'] - - m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) - if m: - formats = [{ - 'url': m.group('url'), - 'app': m.group('app'), - 'play_path': m.group('playpath'), - 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', - 'ext': 'flv', - }] - else: - formats = [{ - 'url': video_url, - }] - self._sort_formats(formats) - - title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) - thumbnail = config.get('poster') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py deleted file mode 100644 index f26dafb8f..000000000 --- a/youtube_dl/extractor/nowness.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class NownessBaseIE(InfoExtractor): - def _extract_url_result(self, post): - if post['type'] == 'video': - for media in post['media']: - if media['type'] == 'video': - video_id = media['content'] - source = media['source'] - if source == 'brightcove': - player_code = self._download_webpage( - 'http://www.nowness.com/iframe?id=%s' % video_id, video_id, - note='Downloading player JavaScript', - errnote='Unable to download player JavaScript') - bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) - if bc_url: - return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) - bc_url = BrightcoveNewIE._extract_url(self, player_code) - if bc_url: - return self.url_result(bc_url, BrightcoveNewIE.ie_key()) - raise ExtractorError('Could not find player definition') - elif source == 'vimeo': - return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') - elif source == 'youtube': - return self.url_result(video_id, 'Youtube') - elif source == 'cinematique': - # youtube-dl currently doesn't support cinematique - # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique') - pass - - def _api_request(self, url, request_path): - display_id = self._match_id(url) - request = sanitized_Request( - 'http://api.nowness.com/api/' + request_path % display_id, - headers={ - 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us', - }) - return display_id, self._download_json(request, display_id) - - -class NownessIE(NownessBaseIE): - IE_NAME = 'nowness' - _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P<id>[^/]+?)(?:$|[?#])' - _TESTS = [{ - 'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation', - 'md5': '068bc0202558c2e391924cb8cc470676', - 'info_dict': { - 'id': 
'2520295746001', - 'ext': 'mp4', - 'title': 'Candor: The Art of Gesticulation', - 'description': 'Candor: The Art of Gesticulation', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1446745676, - 'upload_date': '20151105', - 'uploader_id': '2385340575001', - }, - 'add_ie': ['BrightcoveNew'], - }, { - 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', - 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', - 'info_dict': { - 'id': '3716354522001', - 'ext': 'mp4', - 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', - 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1407315371, - 'upload_date': '20140806', - 'uploader_id': '2385340575001', - }, - 'add_ie': ['BrightcoveNew'], - }, { - # vimeo - 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', - 'md5': '9a5a6a8edf806407e411296ab6bc2a49', - 'info_dict': { - 'id': '130020913', - 'ext': 'mp4', - 'title': 'Bleu, Blanc, Rouge - A Godard Supercut', - 'description': 'md5:f0ea5f1857dffca02dbd37875d742cec', - 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20150607', - 'uploader': 'Cinema Sem Lei', - 'uploader_id': 'cinemasemlei', - }, - 'add_ie': ['Vimeo'], - }] - - def _real_extract(self, url): - _, post = self._api_request(url, 'post/getBySlug/%s') - return self._extract_url_result(post) - - -class NownessPlaylistIE(NownessBaseIE): - IE_NAME = 'nowness:playlist' - _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P<id>\d+)' - _TEST = { - 'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues', - 'info_dict': { - 'id': '3286', - }, - 'playlist_mincount': 8, - } - - def _real_extract(self, url): - playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s') - entries = [self._extract_url_result(item) for item in playlist['items']] - return self.playlist_result(entries, playlist_id) - - -class NownessSeriesIE(NownessBaseIE): - IE_NAME = 'nowness:series' - _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P<id>[^/]+?)(?:$|[?#])' - _TEST = { - 'url': 'https://www.nowness.com/series/60-seconds', - 'info_dict': { - 'id': '60', - 'title': '60 Seconds', - 'description': 'One-minute wisdom in a new NOWNESS series', - }, - 'playlist_mincount': 4, - } - - def _real_extract(self, url): - display_id, series = self._api_request(url, 'series/getBySlug/%s') - entries = [self._extract_url_result(post) for post in series['posts']] - series_title = None - series_description = None - translations = series.get('translations', []) - if translations: - series_title = translations[0].get('title') or translations[0]['seoTitle'] - series_description = translations[0].get('seoDescription') - return self.playlist_result( - entries, compat_str(series['id']), series_title, series_description) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py deleted file mode 100644 index e525ad928..000000000 --- a/youtube_dl/extractor/npo.py +++ /dev/null @@ -1,767 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - determine_ext, - ExtractorError, - fix_xml_ampersands, - int_or_none, - merge_dicts, - orderedSet, - parse_duration, - qualities, - str_or_none, - strip_jsonp, - unified_strdate, - unified_timestamp, - url_or_none, - urlencode_postdata, -) - - -class NPOBaseIE(InfoExtractor): - def _get_token(self, video_id): - return 
self._download_json( - 'http://ida.omroep.nl/app.php/auth', video_id, - note='Downloading token')['token'] - - -class NPOIE(NPOBaseIE): - IE_NAME = 'npo' - IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl' - _VALID_URL = r'''(?x) - (?: - npo:| - https?:// - (?:www\.)? - (?: - npo\.nl/(?:[^/]+/)*| - (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| - omroepwnl\.nl/video/fragment/[^/]+__| - (?:zapp|npo3)\.nl/(?:[^/]+/){2,} - ) - ) - (?P<id>[^/?#]+) - ''' - - _TESTS = [{ - 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', - 'md5': '4b3f9c429157ec4775f2c9cb7b911016', - 'info_dict': { - 'id': 'VPWON_1220719', - 'ext': 'm4v', - 'title': 'Nieuwsuur', - 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', - 'upload_date': '20140622', - }, - }, { - 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', - 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', - 'info_dict': { - 'id': 'VARA_101191800', - 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show: The best of.', - 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', - 'upload_date': '20090227', - 'duration': 2400, - }, - }, { - 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', - 'info_dict': { - 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', - 'upload_date': '20130225', - 'duration': 3000, - }, - }, { - 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', - 'info_dict': { - 'id': 'WO_VPRO_043706', - 'ext': 'm4v', - 'title': 'De nieuwe mens - Deel 1', - 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', - 'duration': 4680, - }, - 'params': { - 'skip_download': True, - } - }, { - # non asf in streams - 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', - 'info_dict': { - 'id': 'WO_NOS_762771', - 'ext': 'mp4', - 'title': 'Hoe gaat Europa verder na Parijs?', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', - 'info_dict': { - 'id': 'VPWON_1233944', - 'ext': 'm4v', - 'title': 'Aap, poot, pies', - 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', - 'upload_date': '20150508', - 'duration': 599, - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', - 'info_dict': { - 'id': 'POW_00996502', - 'ext': 'm4v', - 'title': '''"Dit is wel een 'landslide'..."''', - 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', - 'upload_date': '20150508', - 'duration': 462, - }, - 'params': { - 'skip_download': True, - } - }, { - # audio - 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', - 'info_dict': { - 'id': 'RBX_FUNX_6683215', - 'ext': 'mp3', - 'title': 'Jouw Stad Rotterdam', - 'description': 'md5:db251505244f097717ec59fabc372d9f', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', - 'only_matching': True, - }, { - 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', - 'only_matching': True, - }, { - 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', - 'only_matching': True, - }, { - 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870', - 'only_matching': True, - }, { - # live stream - 'url': 'npo:LI_NL1_4188102', - 
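        # The bare "npo:<id>" form exercises the first _VALID_URL branch; the
        # other extractors in this file (NPOLiveIE, SchoolTVIE, HetKlokhuisIE)
        # resolve their pages to the same scheme and hand off to NPOIE via
        # url_result('npo:%s' % media_id).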
'only_matching': True, - }, { - 'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373', - 'only_matching': True, - }, { - 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', - 'only_matching': True, - }, { - 'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', - 'only_matching': True, - }, { - 'url': 'https://npo.nl/KN_1698996', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if any(ie.suitable(url) - for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) - else super(NPOIE, cls).suitable(url)) - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._get_info(url, video_id) or self._get_old_info(video_id) - - def _get_info(self, url, video_id): - token = self._download_json( - 'https://www.npostart.nl/api/token', video_id, - 'Downloading token', headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - })['token'] - - player = self._download_json( - 'https://www.npostart.nl/player/%s' % video_id, video_id, - 'Downloading player JSON', data=urlencode_postdata({ - 'autoplay': 0, - 'share': 1, - 'pageUrl': url, - 'hasAdConsent': 0, - '_token': token, - })) - - player_token = player['token'] - - drm = False - format_urls = set() - formats = [] - for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): - streams = self._download_json( - 'https://start-player.npo.nl/video/%s/streams' % video_id, - video_id, 'Downloading %s profile JSON' % profile, fatal=False, - query={ - 'profile': profile, - 'quality': 'npo', - 'tokenId': player_token, - 'streamType': 'broadcast', - }) - if not streams: - continue - stream = streams.get('stream') - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('src')) - if not stream_url or stream_url in format_urls: - continue - format_urls.add(stream_url) - if stream.get('protection') is not None or stream.get('keySystemOptions') is not None: - drm = True - continue - stream_type = stream.get('type') - stream_ext = determine_ext(stream_url) - if stream_type == 'application/dash+xml' or stream_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - stream_url, video_id, mpd_id='dash', fatal=False)) - elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - elif re.search(r'\.isml?/Manifest', stream_url): - formats.extend(self._extract_ism_formats( - stream_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': stream_url, - }) - - if not formats: - if drm: - raise ExtractorError('This video is DRM protected.', expected=True) - return - - self._sort_formats(formats) - - info = { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - embed_url = url_or_none(player.get('embedUrl')) - if embed_url: - webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed page', fatal=False) - if webpage: - video = self._parse_json( - self._search_regex( - r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video', - default='{}'), video_id) - if video: - title = video.get('episodeTitle') - subtitles = {} - subtitles_list = video.get('subtitles') - if isinstance(subtitles_list, list): - for cc in subtitles_list: - cc_url = url_or_none(cc.get('src')) - if not cc_url: - continue - lang = str_or_none(cc.get('language')) or 'nl' - subtitles.setdefault(lang, []).append({ - 'url': cc_url, - 
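                                # setdefault keeps one list per language code,
                                # so several caption tracks may share e.g. 'nl'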
}) - return merge_dicts({ - 'title': title, - 'description': video.get('description'), - 'thumbnail': url_or_none( - video.get('still_image_url') or video.get('orig_image_url')), - 'duration': int_or_none(video.get('duration')), - 'timestamp': unified_timestamp(video.get('broadcastDate')), - 'creator': video.get('channel'), - 'series': video.get('title'), - 'episode': title, - 'episode_number': int_or_none(video.get('episodeNumber')), - 'subtitles': subtitles, - }, info) - - return info - - def _get_old_info(self, video_id): - metadata = self._download_json( - 'http://e.omroep.nl/metadata/%s' % video_id, - video_id, - # We have to remove the javascript callback - transform_source=strip_jsonp, - ) - - error = metadata.get('error') - if error: - raise ExtractorError(error, expected=True) - - # For some videos actual video id (prid) is different (e.g. for - # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 - # video id is POMS_WNL_853698 but prid is POW_00996502) - video_id = metadata.get('prid') or video_id - - # titel is too generic in some cases so utilize aflevering_titel as well - # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) - title = metadata['titel'] - sub_title = metadata.get('aflevering_titel') - if sub_title and sub_title != title: - title += ': %s' % sub_title - - token = self._get_token(video_id) - - formats = [] - urls = set() - - def is_legal_url(format_url): - return format_url and format_url not in urls and re.match( - r'^(?:https?:)?//', format_url) - - QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog') - QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std') - - quality_from_label = qualities(QUALITY_LABELS) - quality_from_format_id = qualities(QUALITY_FORMATS) - items = self._download_json( - 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, - 'Downloading formats JSON', query={ - 'adaptive': 'yes', - 'token': token, - })['items'][0] - for num, item in enumerate(items): - item_url = item.get('url') - if not is_legal_url(item_url): - continue - urls.add(item_url) - format_id = self._search_regex( - r'video/ida/([^/]+)', item_url, 'format id', - default=None) - - item_label = item.get('label') - - def add_format_url(format_url): - width = int_or_none(self._search_regex( - r'(\d+)[xX]\d+', format_url, 'width', default=None)) - height = int_or_none(self._search_regex( - r'\d+[xX](\d+)', format_url, 'height', default=None)) - if item_label in QUALITY_LABELS: - quality = quality_from_label(item_label) - f_id = item_label - elif item_label in QUALITY_FORMATS: - quality = quality_from_format_id(format_id) - f_id = format_id - else: - quality, f_id = [None] * 2 - formats.append({ - 'url': format_url, - 'format_id': f_id, - 'width': width, - 'height': height, - 'quality': quality, - }) - - # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 - if item.get('contentType') in ('url', 'audio'): - add_format_url(item_url) - continue - - try: - stream_info = self._download_json( - item_url + '&type=json', video_id, - 'Downloading %s stream JSON' - % item_label or item.get('format') or format_id or num) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error = (self._parse_json( - ee.cause.read().decode(), video_id, - fatal=False) or {}).get('errorstring') - if error: - raise ExtractorError(error, expected=True) - raise - # Stream URL instead of JSON, example: npo:LI_NL1_4188102 - if 
isinstance(stream_info, compat_str): - if not stream_info.startswith('http'): - continue - video_url = stream_info - # JSON - else: - video_url = stream_info.get('url') - if not video_url or 'vodnotavailable.' in video_url or video_url in urls: - continue - urls.add(video_url) - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - else: - add_format_url(video_url) - - is_live = metadata.get('medium') == 'live' - - if not is_live: - for num, stream in enumerate(metadata.get('streams', [])): - stream_url = stream.get('url') - if not is_legal_url(stream_url): - continue - urls.add(stream_url) - # smooth streaming is not supported - stream_type = stream.get('type', '').lower() - if stream_type in ['ss', 'ms']: - continue - if stream_type == 'hds': - f4m_formats = self._extract_f4m_formats( - stream_url, video_id, fatal=False) - # f4m downloader downloads only piece of live stream - for f4m_format in f4m_formats: - f4m_format['preference'] = -1 - formats.extend(f4m_formats) - elif stream_type == 'hls': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', fatal=False)) - # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 - elif '.asf' in stream_url: - asx = self._download_xml( - stream_url, video_id, - 'Downloading stream %d ASX playlist' % num, - transform_source=fix_xml_ampersands, fatal=False) - if not asx: - continue - ref = asx.find('./ENTRY/Ref') - if ref is None: - continue - video_url = ref.get('href') - if not video_url or video_url in urls: - continue - urls.add(video_url) - formats.append({ - 'url': video_url, - 'ext': stream.get('formaat', 'asf'), - 'quality': stream.get('kwaliteit'), - 'preference': -10, - }) - else: - formats.append({ - 'url': stream_url, - 'quality': stream.get('kwaliteit'), - }) - - self._sort_formats(formats) - - subtitles = {} - if metadata.get('tt888') == 'ja': - subtitles['nl'] = [{ - 'ext': 'vtt', - 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, - }] - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': metadata.get('info'), - 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], - 'upload_date': unified_strdate(metadata.get('gidsdatum')), - 'duration': parse_duration(metadata.get('tijdsduur')), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - } - - -class NPOLiveIE(NPOBaseIE): - IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?' 
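    # The channel id is optional; a sketch of the matching behaviour:
    #   http://www.npo.nl/live              -> display_id defaults to 'npo-1'
    #   https://www.npostart.nl/live/npo-1  -> display_id = 'npo-1'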
- - _TESTS = [{ - 'url': 'http://www.npo.nl/live/npo-1', - 'info_dict': { - 'id': 'LI_NL1_4188102', - 'display_id': 'npo-1', - 'ext': 'mp4', - 'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.npo.nl/live', - 'only_matching': True, - }, { - 'url': 'https://www.npostart.nl/live/npo-1', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) or 'npo-1' - - webpage = self._download_webpage(url, display_id) - - live_id = self._search_regex( - [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id') - - return { - '_type': 'url_transparent', - 'url': 'npo:%s' % live_id, - 'ie_key': NPOIE.ie_key(), - 'id': live_id, - 'display_id': display_id, - } - - -class NPORadioIE(InfoExtractor): - IE_NAME = 'npo.nl:radio' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)' - - _TEST = { - 'url': 'http://www.npo.nl/radio/radio-1', - 'info_dict': { - 'id': 'radio-1', - 'ext': 'mp3', - 'title': 're:^NPO Radio 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - } - } - - @classmethod - def suitable(cls, url): - return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) - - @staticmethod - def _html_get_attribute_regex(attribute): - return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - self._html_get_attribute_regex('data-channel'), webpage, 'title') - - stream = self._parse_json( - self._html_search_regex(self._html_get_attribute_regex('data-streams'), webpage, 'data-streams'), - video_id) - - codec = stream.get('codec') - - return { - 'id': video_id, - 'url': stream['url'], - 'title': self._live_title(title), - 'acodec': codec, - 'ext': codec, - 'is_live': True, - } - - -class NPORadioFragmentIE(InfoExtractor): - IE_NAME = 'npo.nl:radio:fragment' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/[^/]+/fragment/(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.npo.nl/radio/radio-5/fragment/174356', - 'md5': 'dd8cc470dad764d0fdc70a9a1e2d18c2', - 'info_dict': { - 'id': '174356', - 'ext': 'mp3', - 'title': 'Jubileumconcert Willeke Alberti', - }, - } - - def _real_extract(self, url): - audio_id = self._match_id(url) - - webpage = self._download_webpage(url, audio_id) - - title = self._html_search_regex( - r'href="/radio/[^/]+/fragment/%s" title="([^"]+)"' % audio_id, - webpage, 'title') - - audio_url = self._search_regex( - r"data-streams='([^']+)'", webpage, 'audio url') - - return { - 'id': audio_id, - 'url': audio_url, - 'title': title, - } - - -class NPODataMidEmbedIE(InfoExtractor): - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video_id', group='id') - return { - '_type': 'url_transparent', - 'ie_key': 'NPO', - 'url': 'npo:%s' % video_id, - 'display_id': display_id - } - - -class SchoolTVIE(NPODataMidEmbedIE): - IE_NAME = 'schooltv' - _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)' - - _TEST = { - 'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', - 'info_dict': { - 'id': 'WO_NTR_429477', - 'display_id': 
'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', - 'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?', - 'ext': 'mp4', - 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' - }, - 'params': { - # Skip because of m3u8 download - 'skip_download': True - } - } - - -class HetKlokhuisIE(NPODataMidEmbedIE): - IE_NAME = 'hetklokhuis' - _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/[^/]+/\d+/(?P<id>[^/?#&]+)' - - _TEST = { - 'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven', - 'info_dict': { - 'id': 'VPWON_1260528', - 'display_id': 'Zwaartekrachtsgolven', - 'ext': 'm4v', - 'title': 'Het Klokhuis: Zwaartekrachtsgolven', - 'description': 'md5:c94f31fb930d76c2efa4a4a71651dd48', - 'upload_date': '20170223', - }, - 'params': { - 'skip_download': True - } - } - - -class NPOPlaylistBaseIE(NPOIE): - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) - for video_id in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, webpage)) - ] - - playlist_title = self._html_search_regex( - self._PLAYLIST_TITLE_RE, webpage, 'playlist title', - default=None) or self._og_search_title(webpage) - - return self.playlist_result(entries, playlist_id, playlist_title) - - -class VPROIE(NPOPlaylistBaseIE): - IE_NAME = 'vpro' - _VALID_URL = r'https?://(?:www\.)?(?:(?:tegenlicht\.)?vpro|2doc)\.nl/(?:[^/]+/)*(?P<id>[^/]+)\.html' - _PLAYLIST_TITLE_RE = (r'<h1[^>]+class=["\'].*?\bmedia-platform-title\b.*?["\'][^>]*>([^<]+)', - r'<h5[^>]+class=["\'].*?\bmedia-platform-subtitle\b.*?["\'][^>]*>([^<]+)') - _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"' - - _TESTS = [ - { - 'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', - 'info_dict': { - 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', - 'upload_date': '20130225', - }, - 'skip': 'Video gone', - }, - { - 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', - 'info_dict': { - 'id': 'sergio-herman', - 'title': 'sergio herman: fucking perfect', - }, - 'playlist_count': 2, - }, - { - # playlist with youtube embed - 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', - 'info_dict': { - 'id': 'education-education', - 'title': 'education education', - }, - 'playlist_count': 2, - }, - { - 'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html', - 'info_dict': { - 'id': 'de-tegenprestatie', - 'title': 'De Tegenprestatie', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.2doc.nl/speel~VARA_101375237~mh17-het-verdriet-van-nederland~.html', - 'info_dict': { - 'id': 'VARA_101375237', - 'ext': 'm4v', - 'title': 'MH17: Het verdriet van Nederland', - 'description': 'md5:09e1a37c1fdb144621e22479691a9f18', - 'upload_date': '20150716', - }, - 'params': { - # Skip because of m3u8 download - 'skip_download': True - }, - } - ] - - -class WNLIE(NPOPlaylistBaseIE): - IE_NAME = 'wnl' - _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+' - _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>' - _PLAYLIST_ENTRY_RE = r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+' - - _TESTS = [{ - 'url': 
'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', - 'info_dict': { - 'id': 'vandaag-de-dag-6-mei', - 'title': 'Vandaag de Dag 6 mei', - }, - 'playlist_count': 4, - }] - - -class AndereTijdenIE(NPOPlaylistBaseIE): - IE_NAME = 'anderetijden' - _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/programma/(?:[^/]+/)+(?P<id>[^/?#&]+)' - _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class=["\'].*?\bpage-title\b.*?["\'][^>]*>(.+?)</h1>' - _PLAYLIST_ENTRY_RE = r'<figure[^>]+class=["\']episode-container episode-page["\'][^>]+data-prid=["\'](.+?)["\']' - - _TESTS = [{ - 'url': 'http://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem', - 'info_dict': { - 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', - 'title': 'Duitse soldaten over de Slag bij Arnhem', - }, - 'playlist_count': 3, - }] diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py deleted file mode 100644 index 6d01a25c3..000000000 --- a/youtube_dl/extractor/nrk.py +++ /dev/null @@ -1,873 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import random -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_duration, - str_or_none, - try_get, - urljoin, - url_or_none, -) - - -class NRKBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['NO'] - _CDN_REPL_REGEX = r'''(?x):// - (?: - nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| - nrk-od-no\.telenorcdn\.net| - minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no - )/''' - - def _extract_nrk_formats(self, asset_url, video_id): - if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): - return self._extract_akamai_formats(asset_url, video_id) - asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) - formats = self._extract_m3u8_formats( - asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) - if not formats and re.search(self._CDN_REPL_REGEX, asset_url): - formats = self._extract_m3u8_formats( - re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), - video_id, 'mp4', 'm3u8_native', fatal=False) - return formats - - def _raise_error(self, data): - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - - def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): - return self._download_json( - urljoin('https://psapi.nrk.no/', path), - video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query, - headers={'Accept-Encoding': 'gzip, deflate, br'}) - - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P<id>[^?\#&]+) - ''' - - _TESTS = [{ - 
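        # The cases below cover each _VALID_URL branch in turn: public
        # nrk.no/video pages, the internal "nrk:" scheme (clip, podcast,
        # program and channel ids) and v8-psapi.nrk.no mediaelement URLs.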
# video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'mp4', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', - 'only_matching': True, - }, { - # podcast - 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', - 'only_matching': True, - }, { - 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', - 'only_matching': True, - }, { - # clip - 'url': 'nrk:150533', - 'only_matching': True, - }, { - 'url': 'nrk:clip/150533', - 'only_matching': True, - }, { - # program - 'url': 'nrk:MDDP12000117', - 'only_matching': True, - }, { - 'url': 'nrk:program/ENRK10100318', - 'only_matching': True, - }, { - # direkte - 'url': 'nrk:nrk1', - 'only_matching': True, - }, { - 'url': 'nrk:channel/nrk1', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url).split('/')[-1] - - path_templ = 'playback/%s/' + video_id - - def call_playback_api(item, query=None): - return self._call_api(path_templ % item, video_id, item, query=query) - # known values for preferredCdn: akamai, iponly, minicdn and telenor - manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) - - video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id - - if manifest.get('playability') == 'nonPlayable': - self._raise_error(manifest['nonPlayable']) - - playable = manifest['playable'] - - formats = [] - for asset in playable['assets']: - if not isinstance(asset, dict): - continue - if asset.get('encrypted'): - continue - format_url = url_or_none(asset.get('url')) - if not format_url: - continue - asset_format = (asset.get('format') or '').lower() - if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_nrk_formats(format_url, video_id)) - elif asset_format == 'mp3': - formats.append({ - 'url': format_url, - 'format_id': asset_format, - 'vcodec': 'none', - }) - self._sort_formats(formats) - - data = call_playback_api('metadata') - - preplay = data['preplay'] - titles = preplay['titles'] - title = titles['title'] - alt_title = titles.get('subtitle') - - description = preplay.get('description') - duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) - - thumbnails = [] - for image in try_get( - preplay, lambda x: x['poster']['images'], list) or []: - if not isinstance(image, dict): - continue - image_url = url_or_none(image.get('url')) - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - 'width': int_or_none(image.get('pixelWidth')), - 'height': int_or_none(image.get('pixelHeight')), - }) - - subtitles = {} - for sub in try_get(playable, 
lambda x: x['subtitles'], list) or []: - if not isinstance(sub, dict): - continue - sub_url = url_or_none(sub.get('webVtt')) - if not sub_url: - continue - sub_key = str_or_none(sub.get('language')) or 'nb' - sub_type = str_or_none(sub.get('type')) - if sub_type: - sub_key += '-%s' % sub_type - subtitles.setdefault(sub_key, []).append({ - 'url': sub_url, - }) - - legal_age = try_get( - data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) - # https://en.wikipedia.org/wiki/Norwegian_Media_Authority - age_limit = None - if legal_age: - if legal_age == 'A': - age_limit = 0 - elif legal_age.isdigit(): - age_limit = int_or_none(legal_age) - - is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' - - info = { - 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'age_limit': age_limit, - 'formats': formats, - 'subtitles': subtitles, - } - - if is_series: - series = season_id = season_number = episode = episode_number = None - programs = self._call_api( - 'programs/%s' % video_id, video_id, 'programs', fatal=False) - if programs and isinstance(programs, dict): - series = str_or_none(programs.get('seriesTitle')) - season_id = str_or_none(programs.get('seasonId')) - season_number = int_or_none(programs.get('seasonNumber')) - episode = str_or_none(programs.get('episodeTitle')) - episode_number = int_or_none(programs.get('episodeNumber')) - if not series: - series = title - if alt_title: - title += ' - %s' % alt_title - if not season_number: - season_number = int_or_none(self._search_regex( - r'Sesong\s+(\d+)', description or '', 'season number', - default=None)) - if not episode: - episode = alt_title if is_series else None - if not episode_number: - episode_number = int_or_none(self._search_regex( - r'^(\d+)\.', episode or '', 'episode number', - default=None)) - if not episode_number: - episode_number = int_or_none(self._search_regex( - r'\((\d+)\s*:\s*\d+\)', description or '', - 'episode number', default=None)) - info.update({ - 'title': title, - 'series': series, - 'season_id': season_id, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - }) - - return info - - -class NRKTVIE(InfoExtractor): - IE_DESC = 'NRK TV and NRK Radio' - _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', - 'info_dict': { - 'id': 'MDDP12000117', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223.44, - 'age_limit': 6, - 'subtitles': { - 'nb-nor': [{ - 'ext': 'vtt', - }], - 'nb-ttv': [{ - 'ext': 'vtt', - }] - }, - }, - }, { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '8d40dab61cea8ab0114e090b029a0565', - 'info_dict': { - 'id': 'MUHH48000314', - 'ext': 'mp4', - 'title': '20 spørsmål - 23. mai 2014', - 'alt_title': '23. mai 2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23. 
mai 2014', - 'age_limit': 0, - }, - }, { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'MDFP15000514', - 'ext': 'mp4', - 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605.08, - 'series': 'Kunnskapskanalen', - 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - }, { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'info_dict': { - 'id': 'MSPO40010515', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'skip': 'particular part is not supported currently', - }, { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'info_dict': { - 'id': 'MSPO40010515', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', - 'age_limit': 0, - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'skip': 'Ikke tilgjengelig utenfor Norge', - }, { - 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', - 'info_dict': { - 'id': 'KMTE50001317', - 'ext': 'mp4', - 'title': 'Anno - 13. episode', - 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', - 'duration': 2340, - 'series': 'Anno', - 'episode': '13. episode', - 'season_number': 3, - 'episode_number': 13, - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', - 'info_dict': { - 'id': 'MUHH46000317', - 'ext': 'mp4', - 'title': 'Nytt på Nytt 27.01.2017', - 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', - 'duration': 1796, - 'series': 'Nytt på nytt', - 'episode': '27.01.2017', - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'ProgramRightsHasExpired', - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) - - -class NRKTVEpisodeIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/(?P<season_number>\d+)/episode/(?P<episode_number>\d+))' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', - 'info_dict': { - 'id': 'MUHH36005220', - 'ext': 'mp4', - 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', - 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', - 'duration': 1563.92, - 'series': 'Hellums kro', - 'season_number': 1, - 'episode_number': 2, - 'episode': '2. Kro, krig og kjærlighet', - 'age_limit': 6, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', - 'info_dict': { - 'id': 'MSUI14000816', - 'ext': 'mp4', - 'title': 'Backstage - 8. 
episode', - 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', - 'duration': 1320, - 'series': 'Backstage', - 'season_number': 1, - 'episode_number': 8, - 'episode': '8. episode', - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'ProgramRightsHasExpired', - }] - - def _real_extract(self, url): - display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, display_id) - - info = self._search_json_ld(webpage, display_id, default={}) - nrk_id = info.get('@id') or self._html_search_meta( - 'nrk:program-id', webpage, default=None) or self._search_regex( - r'data-program-id=["\'](%s)' % NRKTVIE._EPISODE_RE, webpage, - 'nrk id') - assert re.match(NRKTVIE._EPISODE_RE, nrk_id) - - info.update({ - '_type': 'url', - 'id': nrk_id, - 'url': 'nrk:%s' % nrk_id, - 'ie_key': NRKIE.ie_key(), - 'season_number': int(season_number), - 'episode_number': int(episode_number), - }) - return info - - -class NRKTVSerieBaseIE(NRKBaseIE): - def _extract_entries(self, entry_list): - if not isinstance(entry_list, list): - return [] - entries = [] - for episode in entry_list: - nrk_id = episode.get('prfId') or episode.get('episodeId') - if not nrk_id or not isinstance(nrk_id, compat_str): - continue - entries.append(self.url_result( - 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) - return entries - - _ASSETS_KEYS = ('episodes', 'instalments',) - - def _extract_assets_key(self, embedded): - for asset_key in self._ASSETS_KEYS: - if embedded.get(asset_key): - return asset_key - - @staticmethod - def _catalog_name(serie_kind): - return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' - - def _entries(self, data, display_id): - for page_num in itertools.count(1): - embedded = data.get('_embedded') or data - if not isinstance(embedded, dict): - break - assets_key = self._extract_assets_key(embedded) - if not assets_key: - break - # Extract entries - entries = try_get( - embedded, - (lambda x: x[assets_key]['_embedded'][assets_key], - lambda x: x[assets_key]), - list) - for e in self._extract_entries(entries): - yield e - # Find next URL - next_url_path = try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][assets_key]['_links']['next']['href']), - compat_str) - if not next_url_path: - break - data = self._call_api( - next_url_path, display_id, - note='Downloading %s JSON page %d' % (assets_key, page_num), - fatal=False) - if not data: - break - - -class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?P<domain>tv|radio)\.nrk\.no/ - (?P<serie_kind>serie|pod[ck]ast)/ - (?P<serie>[^/]+)/ - (?: - (?:sesong/)?(?P<id>\d+)| - sesong/(?P<id_2>[^/?#&]+) - ) - ''' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', - 'info_dict': { - 'id': 'backstage/1', - 'title': 'Sesong 1', - }, - 'playlist_mincount': 30, - }, { - # no /sesong/ in path - 'url': 'https://tv.nrk.no/serie/lindmo/2016', - 'info_dict': { - 'id': 'lindmo/2016', - 'title': '2016', - }, - 'playlist_mincount': 29, - }, { - # weird nested _embedded in catalog JSON response - 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', - 'info_dict': { - 'id': 'dickie-dick-dickens/1', - 'title': 'Sesong 1', - }, - 'playlist_mincount': 11, - }, { - # 841 entries, multi page - 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', - 'info_dict': { - 'id': 'dagsnytt/201509', - 'title': 'September 2015', - }, - 'playlist_mincount': 841, - }, { - # 180 entries, single page - 
'url': 'https://tv.nrk.no/serie/spangas/sesong/1', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', - 'info_dict': { - 'id': 'hele_historien/diagnose-kverulant', - 'title': 'Diagnose kverulant', - }, - 'playlist_mincount': 3, - }, { - 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) - else super(NRKTVSeasonIE, cls).suitable(url)) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - domain = mobj.group('domain') - serie_kind = mobj.group('serie_kind') - serie = mobj.group('serie') - season_id = mobj.group('id') or mobj.group('id_2') - display_id = '%s/%s' % (serie, season_id) - - data = self._call_api( - '%s/catalog/%s/%s/seasons/%s' - % (domain, self._catalog_name(serie_kind), serie, season_id), - display_id, 'season', query={'pageSize': 50}) - - title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id - return self.playlist_result( - self._entries(data, display_id), - display_id, title) - - -class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?P<serie_kind>serie|pod[ck]ast)/(?P<id>[^/]+)' - _TESTS = [{ - # new layout, instalments - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 90, - }, { - # new layout, instalments, more entries - 'url': 'https://tv.nrk.no/serie/lindmo', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/blank', - 'info_dict': { - 'id': 'blank', - 'title': 'Blank', - 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', - }, - 'playlist_mincount': 30, - }, { - # new layout, seasons - 'url': 'https://tv.nrk.no/serie/backstage', - 'info_dict': { - 'id': 'backstage', - 'title': 'Backstage', - 'description': 'md5:63692ceb96813d9a207e9910483d948b', - }, - 'playlist_mincount': 60, - }, { - # old layout - 'url': 'https://tv.nrksuper.no/serie/labyrint', - 'info_dict': { - 'id': 'labyrint', - 'title': 'Labyrint', - 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', - }, - 'playlist_mincount': 3, - }, { - 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', - 'info_dict': { - 'id': 'dickie-dick-dickens', - 'title': 'Dickie Dick Dickens', - 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', - }, - 'playlist_mincount': 8, - }, { - 'url': 'https://nrksuper.no/serie/labyrint', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', - 'info_dict': { - 'id': 'ulrikkes_univers', - }, - 'playlist_mincount': 10, - }, { - 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return ( - False if any(ie.suitable(url) - for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) - else super(NRKTVSeriesIE, cls).suitable(url)) - - def _real_extract(self, url): 
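        # Outline: pick the radio or tv catalog from the host, fetch the
        # series JSON, then walk seasons; linked season pages are preferred
        # over the embedded ones when the API exposes more of them, and any
        # extraMaterial entries are appended last.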
- site, serie_kind, series_id = re.match(self._VALID_URL, url).groups() - is_radio = site == 'radio.nrk' - domain = 'radio' if is_radio else 'tv' - - size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' - series = self._call_api( - '%s/catalog/%s/%s' - % (domain, self._catalog_name(serie_kind), series_id), - series_id, 'serie', query={size_prefix + 'ageSize': 50}) - titles = try_get(series, [ - lambda x: x['titles'], - lambda x: x[x['type']]['titles'], - lambda x: x[x['seriesType']]['titles'], - ]) or {} - - entries = [] - entries.extend(self._entries(series, series_id)) - embedded = series.get('_embedded') or {} - linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] - embedded_seasons = embedded.get('seasons') or [] - if len(linked_seasons) > len(embedded_seasons): - for season in linked_seasons: - season_url = urljoin(url, season.get('href')) - if not season_url: - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): - season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) - if season_url: - entries.append(self.url_result( - season_url, ie=NRKTVSeasonIE.ie_key(), - video_title=season.get('title'))) - else: - for season in embedded_seasons: - entries.extend(self._entries(season, series_id)) - entries.extend(self._entries( - embedded.get('extraMaterial') or {}, series_id)) - - return self.playlist_result( - entries, series_id, titles.get('title'), titles.get('subtitle')) - - -class NRKTVDirekteIE(NRKTVIE): - IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' - _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://tv.nrk.no/direkte/nrk1', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/direkte/p1_oslo_akershus', - 'only_matching': True, - }] - - -class NRKRadioPodkastIE(InfoExtractor): - _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?P<id>l_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - - _TESTS = [{ - 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', - 'md5': '8d40dab61cea8ab0114e090b029a0565', - 'info_dict': { - 'id': 'MUHH48000314AA', - 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23.05.2014', - }, - }, { - 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', - 'only_matching': True, - }, { - 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) - - -class NRKPlaylistBaseIE(InfoExtractor): - def _extract_description(self, webpage): - pass - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [ - self.url_result('nrk:%s' % video_id, NRKIE.ie_key()) - for video_id in re.findall(self._ITEM_RE, webpage) - ] - - playlist_title = self. 
_extract_title(webpage) - playlist_description = self._extract_description(webpage) - - return self.playlist_result( - entries, playlist_id, playlist_title, playlist_description) - - -class NRKPlaylistIE(NRKPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' - _ITEM_RE = r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"' - _TESTS = [{ - 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', - 'info_dict': { - 'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763', - 'title': 'Gjenopplev den historiske solformørkelsen', - 'description': 'md5:c2df8ea3bac5654a26fc2834a542feed', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449', - 'info_dict': { - 'id': 'rivertonprisen-til-karin-fossum-1.12266449', - 'title': 'Rivertonprisen til Karin Fossum', - 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', - }, - 'playlist_count': 2, - }] - - def _extract_title(self, webpage): - return self._og_search_title(webpage, fatal=False) - - def _extract_description(self, webpage): - return self._og_search_description(webpage) - - -class NRKTVEpisodesIE(NRKPlaylistBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/program/[Ee]pisodes/[^/]+/(?P<id>\d+)' - _ITEM_RE = r'data-episode=["\']%s' % NRKTVIE._EPISODE_RE - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/episodes/nytt-paa-nytt/69031', - 'info_dict': { - 'id': '69031', - 'title': 'Nytt på nytt, sesong: 201210', - }, - 'playlist_count': 4, - }] - - def _extract_title(self, webpage): - return self._html_search_regex( - r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) - - -class NRKSkoleIE(InfoExtractor): - IE_DESC = 'NRK Skole' - _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' - - _TESTS = [{ - 'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099', - 'md5': '18c12c3d071953c3bf8d54ef6b2587b7', - 'info_dict': { - 'id': '6021', - 'ext': 'mp4', - 'title': 'Genetikk og eneggede tvillinger', - 'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d', - 'duration': 399, - }, - }, { - 'url': 'https://www.nrk.no/skole/?page=objectives&subject=naturfag&objective=K15114&mediaId=19355', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - nrk_id = self._download_json( - 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, - video_id)['psId'] - - return self.url_result('nrk:%s' % nrk_id) diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py deleted file mode 100644 index 101a5374c..000000000 --- a/youtube_dl/extractor/ntvde.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - js_to_json, - parse_duration, -) - - -class NTVDeIE(InfoExtractor): - IE_NAME = 'n-tv.de' - _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/videos/[^/?#]+/[^/?#]+-article(?P<id>.+)\.html' - - _TESTS = [{ - 'url': 'http://www.n-tv.de/mediathek/videos/panorama/Schnee-und-Glaette-fuehren-zu-zahlreichen-Unfaellen-und-Staus-article14438086.html', - 'md5': '6ef2514d4b1e8e03ca24b49e2f167153', - 'info_dict': { - 'id': '14438086', - 'ext': 'mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'title': 'Schnee und Glätte führen zu zahlreichen Unfällen und Staus', - 'alt_title': 'Winterchaos auf deutschen Straßen', - 'description': 'Schnee und 
Glätte sorgen deutschlandweit für einen chaotischen Start in die Woche: Auf den Straßen kommt es zu kilometerlangen Staus und Dutzenden Glätteunfällen. In Düsseldorf und München wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende Flüge landen zu spät, einige fallen ganz aus.', - 'duration': 4020, - 'timestamp': 1422892797, - 'upload_date': '20150202', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - info = self._parse_json(self._search_regex( - r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'), - video_id, transform_source=js_to_json) - timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) - vdata = self._parse_json(self._search_regex( - r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', - webpage, 'player data'), video_id, - transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) - duration = parse_duration(vdata.get('duration')) - - formats = [] - if vdata.get('video'): - formats.append({ - 'format_id': 'flash', - 'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'], - }) - if vdata.get('videoMp4'): - formats.append({ - 'format_id': 'mobile', - 'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']), - 'tbr': 400, # estimation - }) - if vdata.get('videoM3u8'): - m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8']) - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - preference=0, m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info['headline'], - 'description': info.get('intro'), - 'alt_title': info.get('kicker'), - 'timestamp': timestamp, - 'thumbnail': vdata.get('html5VideoPoster'), - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py deleted file mode 100644 index ab6bfcd7f..000000000 --- a/youtube_dl/extractor/nuvid.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, -) - - -class NuvidIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://m.nuvid.com/video/1310741/', - 'md5': 'eab207b7ac4fccfb4e23c86201f11277', - 'info_dict': { - 'id': '1310741', - 'ext': 'mp4', - 'title': 'Horny babes show their awesome bodeis and', - 'duration': 129, - 'age_limit': 18, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - page_url = 'http://m.nuvid.com/video/%s' % video_id - webpage = self._download_webpage( - page_url, video_id, 'Downloading video page') - # When dwnld_speed exists and has a value larger than the MP4 file's - # bitrate, Nuvid returns the MP4 URL - # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm - self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') - mp4_webpage = self._download_webpage( - page_url, video_id, 'Downloading video page for MP4 format') - - html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', - video_url = self._html_search_regex(html5_video_re, webpage, video_id) - mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) - formats = [{ - 'url': video_url, - }] - if mp4_video_url != video_url: - formats.append({ - 'url': mp4_video_url, - }) - - title = self._html_search_regex( - [r'<span 
title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', - r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() - thumbnails = [ - { - 'url': thumb_url, - } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage) - ] - thumbnail = thumbnails[0]['url'] if thumbnails else None - duration = parse_duration(self._html_search_regex( - [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', - r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) - - return { - 'id': video_id, - 'title': title, - 'thumbnails': thumbnails, - 'thumbnail': thumbnail, - 'duration': duration, - 'age_limit': 18, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py deleted file mode 100644 index 976b1c694..000000000 --- a/youtube_dl/extractor/nytimes.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hmac -import hashlib -import base64 - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - js_to_json, - mimetype2ext, - parse_iso8601, - remove_start, -) - - -class NYTimesBaseIE(InfoExtractor): - _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v' - - def _extract_video_from_id(self, video_id): - # Authorization generation algorithm is reverse engineered from `signer` in - # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js - path = '/svc/video/api/v3/video/' + video_id - hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest() - video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={ - 'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(), - 'X-NYTV': 'vhs', - }, fatal=False) - if not video_data: - video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id, - video_id, 'Downloading video JSON') - - title = video_data['headline'] - - def get_file_size(file_size): - if isinstance(file_size, int): - return file_size - elif isinstance(file_size, dict): - return int(file_size.get('value', 0)) - else: - return None - - urls = [] - formats = [] - for video in video_data.get('renditions', []): - video_url = video.get('url') - format_id = video.get('type') - if not video_url or format_id == 'thumbs' or video_url in urls: - continue - urls.append(video_url) - ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id or 'hls', fatal=False)) - elif ext == 'mpd': - continue - # formats.extend(self._extract_mpd_formats( - # video_url, video_id, format_id or 'dash', fatal=False)) - else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'vcodec': video.get('videoencoding') or video.get('video_codec'), - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), - 'tbr': int_or_none(video.get('bitrate'), 1000) or None, - 'ext': ext, - }) - self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id')) - - thumbnails = [] - for image in video_data.get('images', []): - image_url = image.get('url') - if not image_url: - continue - thumbnails.append({ - 'url': 'http://www.nytimes.com/' + image_url, - 'width': int_or_none(image.get('width')), - 'height': 
int_or_none(image.get('height')), - }) - - publication_date = video_data.get('publication_date') - timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('summary'), - 'timestamp': timestamp, - 'uploader': video_data.get('byline'), - 'duration': float_or_none(video_data.get('duration'), 1000), - 'formats': formats, - 'thumbnails': thumbnails, - } - - -class NYTimesIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', - 'md5': 'd665342765db043f7e225cff19df0f2d', - 'info_dict': { - 'id': '100000002847155', - 'ext': 'mov', - 'title': 'Verbatim: What Is a Photocopier?', - 'description': 'md5:93603dada88ddbda9395632fdc5da260', - 'timestamp': 1398631707, - 'upload_date': '20140427', - 'uploader': 'Brett Weiner', - 'duration': 419, - } - }, { - 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - return self._extract_video_from_id(video_id) - - -class NYTimesArticleIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?' - _TESTS = [{ - 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', - 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', - 'info_dict': { - 'id': '100000003628438', - 'ext': 'mov', - 'title': 'New Minimum Wage: $70,000 a Year', - 'description': 'Dan Price, C.E.O. 
of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.', - 'timestamp': 1429033037, - 'upload_date': '20150414', - 'uploader': 'Matthew Williams', - } - }, { - 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', - 'md5': 'e0d52040cafb07662acf3c9132db3575', - 'info_dict': { - 'id': '100000004709062', - 'title': 'The Run-Up: ‘He Was Like an Octopus’', - 'ext': 'mp3', - 'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4', - 'series': 'The Run-Up', - 'episode': '‘He Was Like an Octopus’', - 'episode_number': 20, - 'duration': 2130, - } - }, { - 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', - 'info_dict': { - 'id': '100000004709479', - 'title': 'The Rise of Hitler', - 'ext': 'mp3', - 'description': 'md5:bce877fd9e3444990cb141875fab0028', - 'creator': 'Pamela Paul', - 'duration': 3475, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', - 'only_matching': True, - }] - - def _extract_podcast_from_json(self, json, page_id, webpage): - podcast_audio = self._parse_json( - json, page_id, transform_source=js_to_json) - - audio_data = podcast_audio['data'] - track = audio_data['track'] - - episode_title = track['title'] - video_url = track['source'] - - description = track.get('description') or self._html_search_meta( - ['og:description', 'twitter:description'], webpage) - - podcast_title = audio_data.get('podcast', {}).get('title') - title = ('%s: %s' % (podcast_title, episode_title) - if podcast_title else episode_title) - - episode = audio_data.get('podcast', {}).get('episode') or '' - episode_number = int_or_none(self._search_regex( - r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None)) - - return { - 'id': remove_start(podcast_audio.get('target'), 'FT') or page_id, - 'url': video_url, - 'title': title, - 'description': description, - 'creator': track.get('credit'), - 'series': podcast_title, - 'episode': episode_title, - 'episode_number': episode_number, - 'duration': int_or_none(track.get('duration')), - } - - def _real_extract(self, url): - page_id = self._match_id(url) - - webpage = self._download_webpage(url, page_id) - - video_id = self._search_regex( - r'data-videoid=["\'](\d+)', webpage, 'video id', - default=None, fatal=False) - if video_id is not None: - return self._extract_video_from_id(video_id) - - podcast_data = self._search_regex( - (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script', - r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), - webpage, 'podcast data') - return self._extract_podcast_from_json(podcast_data, page_id, webpage) - - -class NYTimesCookingIE(NYTimesBaseIE): - _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', - 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', - 'info_dict': { - 'id': '100000004756089', - 'ext': 'mov', - 'timestamp': 1479383008, - 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', - 'title': 'Cranberry Tart', - 'upload_date': '20161117', - 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', - }, - }, { - 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', - 
'md5': '4b2e8c70530a89b8d905a2b572316eb8', - 'info_dict': { - 'id': '100000003951728', - 'ext': 'mov', - 'timestamp': 1445509539, - 'description': 'Turkey guide', - 'upload_date': '20151022', - 'title': 'Turkey', - } - }] - - def _real_extract(self, url): - page_id = self._match_id(url) - - webpage = self._download_webpage(url, page_id) - - video_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'video id') - - return self._extract_video_from_id(video_id) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py deleted file mode 100644 index 7ed9fac55..000000000 --- a/youtube_dl/extractor/odnoklassniki.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - unified_strdate, - int_or_none, - qualities, - unescapeHTML, - urlencode_postdata, -) - - -class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|m|mobile)\.)? - (?:odnoklassniki|ok)\.ru/ - (?: - video(?:embed)?/| - web-api/video/moviePlayer/| - live/| - dk\?.*?st\.mvId= - ) - (?P<id>[\d-]+) - ''' - _TESTS = [{ - # metadata in JSON - 'url': 'http://ok.ru/video/20079905452', - 'md5': '0b62089b479e06681abaaca9d204f152', - 'info_dict': { - 'id': '20079905452', - 'ext': 'mp4', - 'title': 'Культура меняет нас (прекрасный ролик!))', - 'duration': 100, - 'upload_date': '20141207', - 'uploader_id': '330537914540', - 'uploader': 'Виталий Добровольский', - 'like_count': int, - 'age_limit': 0, - }, - }, { - # metadataUrl - 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc', - 'info_dict': { - 'id': '63567059965189-0', - 'ext': 'mp4', - 'title': 'Девушка без комплексов ...', - 'duration': 191, - 'upload_date': '20150518', - 'uploader_id': '534380003155', - 'uploader': '☭ Андрей Мещанинов ☭', - 'like_count': int, - 'age_limit': 0, - 'start_time': 5, - }, - }, { - # YouTube embed (metadataUrl, provider == USER_YOUTUBE) - 'url': 'http://ok.ru/video/64211978996595-1', - 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', - 'info_dict': { - 'id': 'V_VztHT5BzY', - 'ext': 'mp4', - 'title': 'Космическая среда от 26 августа 2015', - 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', - 'duration': 440, - 'upload_date': '20150826', - 'uploader_id': 'tvroscosmos', - 'uploader': 'Телестудия Роскосмоса', - 'age_limit': 0, - }, - }, { - # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field) - 'url': 'http://ok.ru/video/62036049272859-0', - 'info_dict': { - 'id': '62036049272859-0', - 'ext': 'mp4', - 'title': 'МУЗЫКА ДОЖДЯ .', - 'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0', - 'upload_date': '20120106', - 'uploader_id': '473534735899', - 'uploader': 'МARINA D', - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Video has not been found', - }, { - 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', - 'only_matching': True, - }, { - 'url': 'http://www.ok.ru/video/20648036891', - 'only_matching': True, - }, { - 'url': 'http://www.ok.ru/videoembed/20648036891', - 'only_matching': True, - }, { - 'url': 'http://m.ok.ru/video/20079905452', - 'only_matching': True, - }, { - 'url': 'http://mobile.ok.ru/video/20079905452', - 'only_matching': True, - }, { - 'url': 'https://www.ok.ru/live/484531969818', - 
'only_matching': True, - }, { - 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', - 'only_matching': True, - }, { - # Paid video - 'url': 'https://ok.ru/video/954886983203', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - start_time = int_or_none(compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) - - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://ok.ru/video/%s' % video_id, video_id) - - error = self._search_regex( - r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', - webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - - player = self._parse_json( - unescapeHTML(self._search_regex( - r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id, - webpage, 'player', group='player')), - video_id) - - flashvars = player['flashvars'] - - metadata = flashvars.get('metadata') - if metadata: - metadata = self._parse_json(metadata, video_id) - else: - data = {} - st_location = flashvars.get('location') - if st_location: - data['st.location'] = st_location - metadata = self._download_json( - compat_urllib_parse_unquote(flashvars['metadataUrl']), - video_id, 'Downloading metadata JSON', - data=urlencode_postdata(data)) - - movie = metadata['movie'] - - # Some embedded videos may not contain title in movie dict (e.g. - # http://ok.ru/video/62036049272859-0) thus we allow missing title - # here and it's going to be extracted later by an extractor that - # will process the actual embed. 
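# Editorial sketch, not from the original file: the title rule implemented
# just below, in isolation. Only ok.ru's own uploads are guaranteed a title;
# for embeds (e.g. provider == 'USER_YOUTUBE') it may be absent and is filled
# in later by the extractor that processes the actual embed.
#
#   def _pick_title(movie, provider):        # hypothetical helper
#       if provider == 'UPLOADED_ODKL':      # native upload: title mandatory
#           return movie['title']
#       return movie.get('title')            # embed: may be None for now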
- provider = metadata.get('provider') - title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title') - - thumbnail = movie.get('poster') - duration = int_or_none(movie.get('duration')) - - author = metadata.get('author', {}) - uploader_id = author.get('id') - uploader = author.get('name') - - upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', default=None)) - - age_limit = None - adult = self._html_search_meta( - 'ya:ovs:adult', webpage, 'age limit', default=None) - if adult: - age_limit = 18 if adult == 'true' else 0 - - like_count = int_or_none(metadata.get('likeCount')) - - info = { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'upload_date': upload_date, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'like_count': like_count, - 'age_limit': age_limit, - 'start_time': start_time, - } - - if provider == 'USER_YOUTUBE': - info.update({ - '_type': 'url_transparent', - 'url': movie['contentId'], - }) - return info - - assert title - if provider == 'LIVE_TV_APP': - info['title'] = self._live_title(title) - - quality = qualities(('4', '0', '1', '2', '3', '5')) - - formats = [{ - 'url': f['url'], - 'ext': 'mp4', - 'format_id': f['name'], - } for f in metadata['videos']] - - m3u8_url = metadata.get('hlsManifestUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - dash_manifest = metadata.get('metadataEmbedded') - if dash_manifest: - formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(dash_manifest), 'mpd')) - - for fmt in formats: - fmt_type = self._search_regex( - r'\btype[/=](\d)', fmt['url'], - 'format type', default=None) - if fmt_type: - fmt['quality'] = quality(fmt_type) - - # Live formats - m3u8_url = metadata.get('hlsMasterPlaylistUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8', - m3u8_id='hls', fatal=False)) - rtmp_url = metadata.get('rtmpUrl') - if rtmp_url: - formats.append({ - 'url': rtmp_url, - 'format_id': 'rtmp', - 'ext': 'flv', - }) - - if not formats: - payment_info = metadata.get('paymentInfo') - if payment_info: - raise ExtractorError('This video is paid, subscribe to download it', expected=True) - - self._sort_formats(formats) - - info['formats'] = formats - return info diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py deleted file mode 100644 index df1ce3c1d..000000000 --- a/youtube_dl/extractor/ondemandkorea.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - js_to_json, -) - - -class OnDemandKoreaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' - _GEO_COUNTRIES = ['US', 'CA'] - _TEST = { - 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', - 'info_dict': { - 'id': 'ask-us-anything-e43', - 'ext': 'mp4', - 'title': 'Ask Us Anything : E43', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': 'm3u8 download' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, fatal=False) - - if not webpage: - # Page sometimes returns captcha page with HTTP 403 - raise ExtractorError( - 'Unable to access page. 
You may have been blocked.', - expected=True) - - if 'msg_block_01.png' in webpage: - self.raise_geo_restricted( - msg='This content is not available in your region', - countries=self._GEO_COUNTRIES) - - if 'This video is only available to ODK PLUS members.' in webpage: - raise ExtractorError( - 'This video is only available to ODK PLUS members.', - expected=True) - - title = self._og_search_title(webpage) - - jw_config = self._parse_json( - self._search_regex( - r'(?s)jwplayer\(([\'"])(?:(?!\1).)+\1\)\.setup\s*\((?P<options>.+?)\);', - webpage, 'jw config', group='options'), - video_id, transform_source=js_to_json) - info = self._parse_jwplayer_data( - jw_config, video_id, require_title=False, m3u8_id='hls', - base_url=url) - - info.update({ - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - }) - return info diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py deleted file mode 100644 index e55b2ac89..000000000 --- a/youtube_dl/extractor/onet.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - get_element_by_class, - int_or_none, - js_to_json, - NO_DEFAULT, - parse_iso8601, - remove_start, - strip_or_none, - url_basename, -) - - -class OnetBaseIE(InfoExtractor): - _URL_BASE_RE = r'https?://(?:(?:www\.)?onet\.tv|onet100\.vod\.pl)/[a-z]/' - - def _search_mvp_id(self, webpage): - return self._search_regex( - r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') - - def _extract_from_id(self, video_id, webpage=None): - response = self._download_json( - 'http://qi.ckm.onetapi.pl/', video_id, - query={ - 'body[id]': video_id, - 'body[jsonrpc]': '2.0', - 'body[method]': 'get_asset_detail', - 'body[params][ID_Publikacji]': video_id, - 'body[params][Service]': 'www.onet.pl', - 'content-type': 'application/jsonp', - 'x-onet-app': 'player.front.onetapi.pl', - }) - - error = response.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error['message']), expected=True) - - video = response['result'].get('0') - - formats = [] - for format_type, formats_dict in video['formats'].items(): - if not isinstance(formats_dict, dict): - continue - for format_id, format_list in formats_dict.items(): - if not isinstance(format_list, list): - continue - for f in format_list: - video_url = f.get('url') - if not video_url: - continue - ext = determine_ext(video_url) - if format_id.startswith('ism'): - formats.extend(self._extract_ism_formats( - video_url, video_id, 'mss', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - elif format_id.startswith('hls'): - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - http_f = { - 'url': video_url, - 'format_id': format_id, - 'abr': float_or_none(f.get('audio_bitrate')), - } - if format_type == 'audio': - http_f['vcodec'] = 'none' - else: - http_f.update({ - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) - formats.append(http_f) - self._sort_formats(formats) - - meta = video.get('meta', {}) - - title = (self._og_search_title( - webpage, default=None) if webpage else None) or meta['title'] - description = (self._og_search_description( - webpage, default=None) if 
webpage else None) or meta.get('description') - duration = meta.get('length') or meta.get('lenght') - timestamp = parse_iso8601(meta.get('addDate'), ' ') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } - - -class OnetMVPIE(OnetBaseIE): - _VALID_URL = r'onetmvp:(?P<id>\d+\.\d+)' - - _TEST = { - 'url': 'onetmvp:381027.1509591944', - 'only_matching': True, - } - - def _real_extract(self, url): - return self._extract_from_id(self._match_id(url)) - - -class OnetIE(OnetBaseIE): - _VALID_URL = OnetBaseIE._URL_BASE_RE + r'[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)' - IE_NAME = 'onet.tv' - - _TESTS = [{ - 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', - 'md5': '436102770fb095c75b8bb0392d3da9ff', - 'info_dict': { - 'id': 'qbpyqc', - 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', - 'ext': 'mp4', - 'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd', - 'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...', - 'upload_date': '20160705', - 'timestamp': 1467721580, - }, - }, { - 'url': 'https://onet100.vod.pl/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id, video_id = mobj.group('display_id', 'id') - - webpage = self._download_webpage(url, display_id) - - mvp_id = self._search_mvp_id(webpage) - - info_dict = self._extract_from_id(mvp_id, webpage) - info_dict.update({ - 'id': video_id, - 'display_id': display_id, - }) - - return info_dict - - -class OnetChannelIE(OnetBaseIE): - _VALID_URL = OnetBaseIE._URL_BASE_RE + r'(?P<id>[a-z]+)(?:[?#]|$)' - IE_NAME = 'onet.tv:channel' - - _TESTS = [{ - 'url': 'http://onet.tv/k/openerfestival', - 'info_dict': { - 'id': 'openerfestival', - 'title': "Open'er Festival", - 'description': "Tak było na Open'er Festival 2016! 
Oglądaj nasze reportaże i wywiady z artystami.", - }, - 'playlist_mincount': 35, - }, { - 'url': 'https://onet100.vod.pl/k/openerfestival', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - - webpage = self._download_webpage(url, channel_id) - - current_clip_info = self._parse_json(self._search_regex( - r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id, - transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s))) - video_id = remove_start(current_clip_info['ckmId'], 'mvp:') - video_name = url_basename(current_clip_info['url']) - - if self._downloader.params.get('noplaylist'): - self.to_screen( - 'Downloading just video %s because of --no-playlist' % video_name) - return self._extract_from_id(video_id, webpage) - - self.to_screen( - 'Downloading channel %s - add --no-playlist to just download video %s' % ( - channel_id, video_name)) - matches = re.findall( - r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE, - webpage) - entries = [ - self.url_result(video_link, OnetIE.ie_key()) - for video_link in matches] - - channel_title = strip_or_none(get_element_by_class('o_channelName', webpage)) - channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) - return self.playlist_result(entries, channel_id, channel_title, channel_description) - - -class OnetPlIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?(?:onet|businessinsider\.com|plejada)\.pl/(?:[^/]+/)+(?P<id>[0-9a-z]+)' - IE_NAME = 'onet.pl' - - _TESTS = [{ - 'url': 'http://eurosport.onet.pl/zimowe/skoki-narciarskie/ziobro-wygral-kwalifikacje-w-pjongczangu/9ckrly', - 'md5': 'b94021eb56214c3969380388b6e73cb0', - 'info_dict': { - 'id': '1561707.1685479', - 'ext': 'mp4', - 'title': 'Ziobro wygrał kwalifikacje w Pjongczangu', - 'description': 'md5:61fb0740084d2d702ea96512a03585b4', - 'upload_date': '20170214', - 'timestamp': 1487078046, - }, - }, { - # embedded via pulsembed - 'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0', - 'info_dict': { - 'id': '501235.965429946', - 'ext': 'mp4', - 'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu', - 'upload_date': '20170622', - 'timestamp': 1498159955, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3', - 'only_matching': True, - }, { - 'url': 'http://moto.onet.pl/jak-wybierane-sa-miejsca-na-fotoradary/6rs04e', - 'only_matching': True, - }, { - 'url': 'http://businessinsider.com.pl/wideo/scenariusz-na-koniec-swiata-wedlug-nasa/dwnqptk', - 'only_matching': True, - }, { - 'url': 'http://plejada.pl/weronika-rosati-o-swoim-domniemanym-slubie/n2bq89', - 'only_matching': True, - }] - - def _search_mvp_id(self, webpage, default=NO_DEFAULT): - return self._search_regex( - r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage, 'mvp id', - default=default) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mvp_id = self._search_mvp_id(webpage, default=None) - - if not mvp_id: - pulsembed_url = self._search_regex( - r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1', - webpage, 'pulsembed url', group='url') - webpage = self._download_webpage( - pulsembed_url, video_id, 'Downloading pulsembed webpage') - mvp_id = self._search_mvp_id(webpage) - - return self.url_result( - 'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) diff --git 
a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
deleted file mode 100644
index eb957b8fe..000000000
--- a/youtube_dl/extractor/ooyala.py
+++ /dev/null
@@ -1,210 +0,0 @@
-from __future__ import unicode_literals
-
-import base64
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_b64decode,
-    compat_str,
-)
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-    float_or_none,
-    int_or_none,
-    try_get,
-    unsmuggle_url,
-)
-
-
-class OoyalaBaseIE(InfoExtractor):
-    _PLAYER_BASE = 'http://player.ooyala.com/'
-    _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
-    _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s'
-
-    def _extract(self, content_tree_url, video_id, domain=None, supportedformats=None, embed_token=None):
-        content_tree = self._download_json(content_tree_url, video_id)['content_tree']
-        metadata = content_tree[list(content_tree)[0]]
-        embed_code = metadata['embed_code']
-        pcode = metadata.get('asset_pcode') or embed_code
-        title = metadata['title']
-
-        auth_data = self._download_json(
-            self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code),
-            video_id, headers=self.geo_verification_headers(), query={
-                'domain': domain or 'player.ooyala.com',
-                'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth',
-                'embedToken': embed_token,
-            })['authorization_data'][embed_code]
-
-        urls = []
-        formats = []
-        streams = auth_data.get('streams') or [{
-            'delivery_type': 'hls',
-            'url': {
-                'data': base64.b64encode(('http://player.ooyala.com/hls/player/all/%s.m3u8' % embed_code).encode()).decode(),
-            }
-        }]
-        for stream in streams:
-            url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
-            if not url_data:
-                continue
-            s_url = compat_b64decode(url_data).decode('utf-8')
-            if not s_url or s_url in urls:
-                continue
-            urls.append(s_url)
-            ext = determine_ext(s_url, None)
-            delivery_type = stream.get('delivery_type')
-            if delivery_type == 'hls' or ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            elif delivery_type == 'hds' or ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
-            elif delivery_type == 'dash' or ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    s_url, embed_code, mpd_id='dash', fatal=False))
-            elif delivery_type == 'smooth':
-                formats.extend(self._extract_ism_formats(
-                    s_url, embed_code, ism_id='mss', fatal=False))
-            elif ext == 'smil':
-                formats.extend(self._extract_smil_formats(
-                    s_url, embed_code, fatal=False))
-            else:
-                formats.append({
-                    'url': s_url,
-                    'ext': ext or delivery_type,
-                    'vcodec': stream.get('video_codec'),
-                    'format_id': delivery_type,
-                    'width': int_or_none(stream.get('width')),
-                    'height': int_or_none(stream.get('height')),
-                    'abr': int_or_none(stream.get('audio_bitrate')),
-                    'vbr': int_or_none(stream.get('video_bitrate')),
-                    'fps': float_or_none(stream.get('framerate')),
-                })
-        if not formats and not auth_data.get('authorized'):
-            raise ExtractorError('%s said: %s' % (
-                self.IE_NAME, auth_data['message']), expected=True)
-        self._sort_formats(formats)
-
-        subtitles = {}
-        for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
-            sub_url = sub.get('url')
-            if not sub_url:
-                continue
-            subtitles[lang] = [{
-                'url': sub_url,
-            }]
-
-        return {
-            'id': embed_code,
-            'title': title,
-            'description': 
metadata.get('description'), - 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), - 'duration': float_or_none(metadata.get('duration'), 1000), - 'subtitles': subtitles, - 'formats': formats, - } - - -class OoyalaIE(OoyalaBaseIE): - _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' - - _TESTS = [ - { - # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video - 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'info_dict': { - 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'ext': 'mp4', - 'title': 'Explaining Data Recovery from Hard Drives and SSDs', - 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', - 'duration': 853.386, - }, - # The video in the original webpage now uses PlayWire - 'skip': 'Ooyala said: movie expired', - }, { - # Only available for ipad - 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', - 'info_dict': { - 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', - 'ext': 'mp4', - 'title': 'Simulation Overview - Levels of Simulation', - 'duration': 194.948, - }, - }, - { - # Information available only through SAS api - # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 - 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', - 'md5': 'a84001441b35ea492bc03736e59e7935', - 'info_dict': { - 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', - 'ext': 'mp4', - 'title': 'Divide Tool Path.mp4', - 'duration': 204.405, - } - }, - { - # empty stream['url']['data'] - 'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', - 'only_matching': True, - } - ] - - @staticmethod - def _url_for_embed_code(embed_code): - return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - - @classmethod - def _build_url_result(cls, embed_code): - return cls.url_result(cls._url_for_embed_code(embed_code), - ie=cls.ie_key()) - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - embed_code = self._match_id(url) - domain = smuggled_data.get('domain') - supportedformats = smuggled_data.get('supportedformats') - embed_token = smuggled_data.get('embed_token') - content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code) - return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token) - - -class OoyalaExternalIE(OoyalaBaseIE): - _VALID_URL = r'''(?x) - (?: - ooyalaexternal:| - https?://.+?\.ooyala\.com/.*?\bexternalId= - ) - (?P<partner_id>[^:]+) - : - (?P<id>.+) - (?: - :| - .*?&pcode= - ) - (?P<pcode>.+?) 
- (?:&|$) - ''' - - _TEST = { - 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always', - 'info_dict': { - 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', - 'ext': 'mp4', - 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'duration': 1302.0, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups() - content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id) - return self._extract(content_tree_url, video_id) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py deleted file mode 100644 index 0c20d0177..000000000 --- a/youtube_dl/extractor/openload.py +++ /dev/null @@ -1,238 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import os -import subprocess -import tempfile - -from ..compat import ( - compat_urlparse, - compat_kwargs, -) -from ..utils import ( - check_executable, - encodeArgument, - ExtractorError, - get_exe_version, - is_outdated_version, - std_headers, -) - - -def cookie_to_dict(cookie): - cookie_dict = { - 'name': cookie.name, - 'value': cookie.value, - } - if cookie.port_specified: - cookie_dict['port'] = cookie.port - if cookie.domain_specified: - cookie_dict['domain'] = cookie.domain - if cookie.path_specified: - cookie_dict['path'] = cookie.path - if cookie.expires is not None: - cookie_dict['expires'] = cookie.expires - if cookie.secure is not None: - cookie_dict['secure'] = cookie.secure - if cookie.discard is not None: - cookie_dict['discard'] = cookie.discard - try: - if (cookie.has_nonstandard_attr('httpOnly') - or cookie.has_nonstandard_attr('httponly') - or cookie.has_nonstandard_attr('HttpOnly')): - cookie_dict['httponly'] = True - except TypeError: - pass - return cookie_dict - - -def cookie_jar_to_list(cookie_jar): - return [cookie_to_dict(cookie) for cookie in cookie_jar] - - -class PhantomJSwrapper(object): - """PhantomJS wrapper class - - This class is experimental. - """ - - _TEMPLATE = r''' - phantom.onError = function(msg, trace) {{ - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) {{ - msgStack.push('TRACE:'); - trace.forEach(function(t) {{ - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? 
' (in function ' + t.function +')' : '')); - }}); - }} - console.error(msgStack.join('\n')); - phantom.exit(1); - }}; - var page = require('webpage').create(); - var fs = require('fs'); - var read = {{ mode: 'r', charset: 'utf-8' }}; - var write = {{ mode: 'w', charset: 'utf-8' }}; - JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ - phantom.addCookie(x); - }}); - page.settings.resourceTimeout = {timeout}; - page.settings.userAgent = "{ua}"; - page.onLoadStarted = function() {{ - page.evaluate(function() {{ - delete window._phantom; - delete window.callPhantom; - }}); - }}; - var saveAndExit = function() {{ - fs.write("{html}", page.content, write); - fs.write("{cookies}", JSON.stringify(phantom.cookies), write); - phantom.exit(); - }}; - page.onLoadFinished = function(status) {{ - if(page.url === "") {{ - page.setContent(fs.read("{html}", read), "{url}"); - }} - else {{ - {jscode} - }} - }}; - page.open(""); - ''' - - _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - - @staticmethod - def _version(): - return get_exe_version('phantomjs', version_re=r'([0-9.]+)') - - def __init__(self, extractor, required_version=None, timeout=10000): - self._TMP_FILES = {} - - self.exe = check_executable('phantomjs', ['-v']) - if not self.exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) - - self.extractor = extractor - - if required_version: - version = self._version() - if is_outdated_version(version, required_version): - self.extractor._downloader.report_warning( - 'Your copy of PhantomJS is outdated, update it to version ' - '%s or newer if you encounter any errors.' % required_version) - - self.options = { - 'timeout': timeout, - } - for name in self._TMP_FILE_NAMES: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.close() - self._TMP_FILES[name] = tmp - - def __del__(self): - for name in self._TMP_FILE_NAMES: - try: - os.remove(self._TMP_FILES[name].name) - except (IOError, OSError, KeyError): - pass - - def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) - for cookie in cookies: - if 'path' not in cookie: - cookie['path'] = '/' - if 'domain' not in cookie: - cookie['domain'] = compat_urlparse.urlparse(url).netloc - with open(self._TMP_FILES['cookies'].name, 'wb') as f: - f.write(json.dumps(cookies).encode('utf-8')) - - def _load_cookies(self): - with open(self._TMP_FILES['cookies'].name, 'rb') as f: - cookies = json.loads(f.read().decode('utf-8')) - for cookie in cookies: - if cookie['httponly'] is True: - cookie['rest'] = {'httpOnly': None} - if 'expiry' in cookie: - cookie['expire_time'] = cookie['expiry'] - self.extractor._set_cookie(**compat_kwargs(cookie)) - - def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): - """ - Downloads webpage (if needed) and executes JS - - Params: - url: website url - html: optional, html code of website - video_id: video id - note: optional, displayed when downloading webpage - note2: optional, displayed when executing JS - headers: custom http headers - jscode: code to be executed when page is loaded - - Returns tuple with: - * downloaded website (after JS execution) - * anything you print with `console.log` (but not inside `page.execute`!) - - In most cases you don't need to add any `jscode`. - It is executed in `page.onLoadFinished`. 
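        A minimal call, for illustration only (`extractor` stands for the
        calling InfoExtractor instance; version and timeout are arbitrary):

            phantom = PhantomJSwrapper(extractor, required_version='2.0')
            webpage, logs = phantom.get(url, video_id=video_id)
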
- `saveAndExit();` is mandatory, use it instead of `phantom.exit()` - It is possible to wait for some element on the webpage, for example: - var check = function() { - var elementFound = page.evaluate(function() { - return document.querySelector('#b.done') !== null; - }); - if(elementFound) - saveAndExit(); - else - window.setTimeout(check, 500); - } - - page.evaluate(function(){ - document.querySelector('#a').click(); - }); - check(); - """ - if 'saveAndExit();' not in jscode: - raise ExtractorError('`saveAndExit();` not found in `jscode`') - if not html: - html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) - with open(self._TMP_FILES['html'].name, 'wb') as f: - f.write(html.encode('utf-8')) - - self._save_cookies(url) - - replaces = self.options - replaces['url'] = url - user_agent = headers.get('User-Agent') or std_headers['User-Agent'] - replaces['ua'] = user_agent.replace('"', '\\"') - replaces['jscode'] = jscode - - for x in self._TMP_FILE_NAMES: - replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') - - with open(self._TMP_FILES['script'].name, 'wb') as f: - f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) - - if video_id is None: - self.extractor.to_screen('%s' % (note2,)) - else: - self.extractor.to_screen('%s: %s' % (video_id, note2)) - - p = subprocess.Popen([ - self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate() - if p.returncode != 0: - raise ExtractorError( - 'Executing JS failed\n:' + encodeArgument(err)) - with open(self._TMP_FILES['html'].name, 'rb') as f: - html = f.read().decode('utf-8') - - self._load_cookies() - - return (html, encodeArgument(out)) diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py deleted file mode 100644 index 1d42be39b..000000000 --- a/youtube_dl/extractor/ora.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - get_element_by_attribute, - qualities, - unescapeHTML, -) - - -class OraTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)' - _TESTS = [{ - 'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq', - 'md5': 'fa33717591c631ec93b04b0e330df786', - 'info_dict': { - 'id': '50178', - 'ext': 'mp4', - 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!', - 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1', - } - }, { - 'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_data = self._search_regex( - r'"(?:video|current)"\s*:\s*({[^}]+?})', webpage, 'current video') - m3u8_url = self._search_regex( - r'hls_stream"?\s*:\s*"([^"]+)', video_data, 'm3u8 url', None) - if m3u8_url: - formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - # similar to GameSpotIE - m3u8_path = compat_urlparse.urlparse(m3u8_url).path - QUALITIES_RE = r'((,[a-z]+\d+)+,?)' - available_qualities = self._search_regex( - QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',') - http_path = 
m3u8_path[1:].split('/', 1)[1]
-            http_template = re.sub(QUALITIES_RE, r'%s', http_path)
-            http_template = http_template.replace('.csmil/master.m3u8', '')
-            http_template = compat_urlparse.urljoin(
-                'http://videocdn-pmd.ora.tv/', http_template)
-            preference = qualities(
-                ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080'])
-            for q in available_qualities:
-                formats.append({
-                    'url': http_template % q,
-                    'format_id': q,
-                    'preference': preference(q),
-                })
-            self._sort_formats(formats)
-        else:
-            return self.url_result(self._search_regex(
-                r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube')
-
-        return {
-            'id': self._search_regex(
-                r'"id"\s*:\s*(\d+)', video_data, 'video id', default=display_id),
-            'display_id': display_id,
-            'title': unescapeHTML(self._og_search_title(webpage)),
-            'description': get_element_by_attribute(
-                'class', 'video_txt_decription', webpage),
-            'thumbnail': self._proto_relative_url(self._search_regex(
-                r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)),
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
deleted file mode 100644
index 8d537d7ae..000000000
--- a/youtube_dl/extractor/orf.py
+++ /dev/null
@@ -1,592 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    clean_html,
-    determine_ext,
-    float_or_none,
-    HEADRequest,
-    int_or_none,
-    orderedSet,
-    remove_end,
-    str_or_none,
-    strip_jsonp,
-    unescapeHTML,
-    unified_strdate,
-    url_or_none,
-)
-
-
-class ORFTVthekIE(InfoExtractor):
-    IE_NAME = 'orf:tvthek'
-    IE_DESC = 'ORF TVthek'
-    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
-
-    _TESTS = [{
-        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
-        'playlist': [{
-            'md5': '2942210346ed779588f428a92db88712',
-            'info_dict': {
-                'id': '8896777',
-                'ext': 'mp4',
-                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
-                'description': 'md5:c1272f0245537812d4e36419c207b67d',
-                'duration': 2668,
-                'upload_date': '20141208',
-            },
-        }],
-        'skip': 'Blocked outside of Austria / Germany',
-    }, {
-        'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
-        'info_dict': {
-            'id': '7982259',
-            'ext': 'mp4',
-            'title': 'Best of Ingrid Thurnher',
-            'upload_date': '20140527',
-            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
-        },
-        'params': {
-            'skip_download': True,  # rtsp downloads
-        },
-        'skip': 'Blocked outside of Austria / Germany',
-    }, {
-        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
-        'only_matching': True,
-    }, {
-        'url': 'http://tvthek.orf.at/profile/Universum/35429',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
-
-        data_jsb = self._parse_json(
-            self._search_regex(
-                r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
-                webpage, 'playlist', group='json'),
-            playlist_id, transform_source=unescapeHTML)['playlist']['videos']
-
-        entries = []
-        for sd in data_jsb:
-            video_id, title = sd.get('id'), sd.get('title')
-            if not video_id or not title:
-                continue
-            video_id = compat_str(video_id)
-            formats = []
-            for fd in sd['sources']:
-                src = url_or_none(fd.get('src'))
-                if not src:
-                    continue
-                format_id_list = []
-                for key in ('delivery', 'quality', 'quality_string'):
-                    value = fd.get(key)
-                    if value:
-                        format_id_list.append(value)
-                format_id = '-'.join(format_id_list)
-                ext = determine_ext(src)
-                if ext == 'm3u8':
-                    m3u8_formats = self._extract_m3u8_formats(
-                        src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
-                    if any('/geoprotection' in f['url'] for f in m3u8_formats):
-                        self.raise_geo_restricted()
-                    formats.extend(m3u8_formats)
-                elif ext == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
-                        src, video_id, f4m_id=format_id, fatal=False))
-                elif ext == 'mpd':
-                    formats.extend(self._extract_mpd_formats(
-                        src, video_id, mpd_id=format_id, fatal=False))
-                else:
-                    formats.append({
-                        'format_id': format_id,
-                        'url': src,
-                        'protocol': fd.get('protocol'),
-                    })
-
-            # Check for geoblocking.
-            # There is a property is_geoprotection, but that's always false
-            geo_str = sd.get('geoprotection_string')
-            if geo_str:
-                try:
-                    http_url = next(
-                        f['url']
-                        for f in formats
-                        if re.match(r'^https?://.*\.mp4$', f['url']))
-                except StopIteration:
-                    pass
-                else:
-                    req = HEADRequest(http_url)
-                    self._request_webpage(
-                        req, video_id,
-                        note='Testing for geoblocking',
-                        errnote=((
-                            'This video seems to be blocked outside of %s. '
-                            'You may want to try the streaming-* formats.')
-                            % geo_str),
-                        fatal=False)
-
-            self._check_formats(formats, video_id)
-            self._sort_formats(formats)
-
-            subtitles = {}
-            for sub in sd.get('subtitles', []):
-                sub_src = sub.get('src')
-                if not sub_src:
-                    continue
-                subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
-                    'url': sub_src,
-                })
-
-            upload_date = unified_strdate(sd.get('created_date'))
-
-            thumbnails = []
-            preview = sd.get('preview_image_url')
-            if preview:
-                thumbnails.append({
-                    'id': 'preview',
-                    'url': preview,
-                    'preference': 0,
-                })
-            image = sd.get('image_full_url')
-            if not image and len(data_jsb) == 1:
-                image = self._og_search_thumbnail(webpage)
-            if image:
-                thumbnails.append({
-                    'id': 'full',
-                    'url': image,
-                    'preference': 1,
-                })
-
-            entries.append({
-                '_type': 'video',
-                'id': video_id,
-                'title': title,
-                'formats': formats,
-                'subtitles': subtitles,
-                'description': sd.get('description'),
-                'duration': int_or_none(sd.get('duration_in_seconds')),
-                'upload_date': upload_date,
-                'thumbnails': thumbnails,
-            })
-
-        return {
-            '_type': 'playlist',
-            'entries': entries,
-            'id': playlist_id,
-        }
-
-
-class ORFRadioIE(InfoExtractor):
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        show_date = mobj.group('date')
-        show_id = mobj.group('show')
-
-        data = self._download_json(
-            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
-            % (self._API_STATION, show_id, show_date), show_id)
-
-        entries = []
-        for info in data['streams']:
-            loop_stream_id = str_or_none(info.get('loopStreamId'))
-            if not loop_stream_id:
-                continue
-            title = str_or_none(data.get('title'))
-            if not title:
-                continue
-            start = int_or_none(info.get('start'), scale=1000)
-            end = int_or_none(info.get('end'), scale=1000)
-            duration = end - start if end and start else None
-            entries.append({
-                'id': loop_stream_id.replace('.mp3', ''),
-                'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
-                'title': title,
-                'description': clean_html(data.get('subtitle')),
-                'duration': duration,
-                'timestamp': start,
-                'ext': 'mp3',
-                'series': data.get('programTitle'),
-            })
-
-        return {
-            '_type': 'playlist',
-            'id': show_id,
-            'title': data.get('title'),
-            'description': clean_html(data.get('subtitle')),
-            'entries': entries,
-        }
-
-
-class ORFFM4IE(ORFRadioIE):
-    IE_NAME = 'orf:fm4'
-    IE_DESC = 'radio FM4'
-    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'
-    _API_STATION = 'fm4'
-    _LOOP_STATION = 'fm4'
-
-    _TEST = {
-        'url': 'http://fm4.orf.at/player/20170107/4CC',
-        'md5': '2b0be47375432a7ef104453432a19212',
-        'info_dict': {
-            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
-            'ext': 'mp3',
-            'title': 'Solid Steel Radioshow',
-            'description': 'Die Mixshow von Coldcut und Ninja Tune.',
-            'duration': 3599,
-            'timestamp': 1483819257,
-            'upload_date': '20170107',
-        },
-        'skip': 'Shows from ORF radios are only available for 7 days.',
-        'only_matching': True,
-    }
-
-
-class ORFNOEIE(ORFRadioIE):
-    IE_NAME = 'orf:noe'
-    IE_DESC = 'Radio Niederösterreich'
-    _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'noe'
-    _LOOP_STATION = 'oe2n'
-
-    _TEST = {
-        'url': 'https://noe.orf.at/player/20200423/NGM',
-        'only_matching': True,
-    }
-
-
-class ORFWIEIE(ORFRadioIE):
-    IE_NAME = 'orf:wien'
-    IE_DESC = 'Radio Wien'
-    _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'wie'
-    _LOOP_STATION = 'oe2w'
-
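The station extractors in this file are pure configuration: all request logic lives in ORFRadioIE._real_extract above, and each subclass such as ORFWIEIE contributes only its _VALID_URL plus the _API_STATION and _LOOP_STATION constants. A minimal standalone sketch of the two URLs that method builds from those constants (the station codes and the show id/date come from ORFWIEIE and its _TEST below; the loopStreamId is a hypothetical sample value):

    # Sketch of the URL construction in ORFRadioIE._real_extract above.
    # 'wie'/'oe2w' are ORFWIEIE's constants; 'WGUM'/'20200423' come from its
    # _TEST URL below; the loopStreamId is made up for illustration.
    api_station, loop_station = 'wie', 'oe2w'
    show_id, show_date = 'WGUM', '20200423'

    broadcast_api = (
        'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
        % (api_station, show_id, show_date))

    # Each entry in the response's 'streams' list carries a loopStreamId,
    # which becomes a playable MP3 URL on the loopstream host:
    loop_stream_id = '2020-04-23_0600_tl_54_sample_98765.mp3'
    stream_url = ('https://loopstream01.apa.at/?channel=%s&id=%s'
                  % (loop_station, loop_stream_id))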
-    _TEST = {
-        'url': 'https://wien.orf.at/player/20200423/WGUM',
-        'only_matching': True,
-    }
-
-
-class ORFBGLIE(ORFRadioIE):
-    IE_NAME = 'orf:burgenland'
-    IE_DESC = 'Radio Burgenland'
-    _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'bgl'
-    _LOOP_STATION = 'oe2b'
-
-    _TEST = {
-        'url': 'https://burgenland.orf.at/player/20200423/BGM',
-        'only_matching': True,
-    }
-
-
-class ORFOOEIE(ORFRadioIE):
-    IE_NAME = 'orf:oberoesterreich'
-    IE_DESC = 'Radio Oberösterreich'
-    _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'ooe'
-    _LOOP_STATION = 'oe2o'
-
-    _TEST = {
-        'url': 'https://ooe.orf.at/player/20200423/OGMO',
-        'only_matching': True,
-    }
-
-
-class ORFSTMIE(ORFRadioIE):
-    IE_NAME = 'orf:steiermark'
-    IE_DESC = 'Radio Steiermark'
-    _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'stm'
-    _LOOP_STATION = 'oe2st'
-
-    _TEST = {
-        'url': 'https://steiermark.orf.at/player/20200423/STGMS',
-        'only_matching': True,
-    }
-
-
-class ORFKTNIE(ORFRadioIE):
-    IE_NAME = 'orf:kaernten'
-    IE_DESC = 'Radio Kärnten'
-    _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'ktn'
-    _LOOP_STATION = 'oe2k'
-
-    _TEST = {
-        'url': 'https://kaernten.orf.at/player/20200423/KGUMO',
-        'only_matching': True,
-    }
-
-
-class ORFSBGIE(ORFRadioIE):
-    IE_NAME = 'orf:salzburg'
-    IE_DESC = 'Radio Salzburg'
-    _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'sbg'
-    _LOOP_STATION = 'oe2s'
-
-    _TEST = {
-        'url': 'https://salzburg.orf.at/player/20200423/SGUM',
-        'only_matching': True,
-    }
-
-
-class ORFTIRIE(ORFRadioIE):
-    IE_NAME = 'orf:tirol'
-    IE_DESC = 'Radio Tirol'
-    _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'tir'
-    _LOOP_STATION = 'oe2t'
-
-    _TEST = {
-        'url': 'https://tirol.orf.at/player/20200423/TGUMO',
-        'only_matching': True,
-    }
-
-
-class ORFVBGIE(ORFRadioIE):
-    IE_NAME = 'orf:vorarlberg'
-    IE_DESC = 'Radio Vorarlberg'
-    _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'vbg'
-    _LOOP_STATION = 'oe2v'
-
-    _TEST = {
-        'url': 'https://vorarlberg.orf.at/player/20200423/VGUM',
-        'only_matching': True,
-    }
-
-
-class ORFOE3IE(ORFRadioIE):
-    IE_NAME = 'orf:oe3'
-    IE_DESC = 'Radio Österreich 3'
-    _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'oe3'
-    _LOOP_STATION = 'oe3'
-
-    _TEST = {
-        'url': 'https://oe3.orf.at/player/20200424/3WEK',
-        'only_matching': True,
    }
-
-
-class ORFOE1IE(ORFRadioIE):
-    IE_NAME = 'orf:oe1'
-    IE_DESC = 'Radio Österreich 1'
-    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
-    _API_STATION = 'oe1'
-    _LOOP_STATION = 'oe1'
-
-    _TEST = {
-        'url': 'http://oe1.orf.at/player/20170108/456544',
-        'md5': '34d8a6e67ea888293741c86a099b745b',
-        'info_dict': {
-            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
-            'ext': 'mp3',
-            'title': 'Morgenjournal',
-            'duration': 609,
-            'timestamp': 1483858796,
-            'upload_date': '20170108',
-        },
-        'skip': 'Shows from ORF radios are only available for 7 days.'
- } - - -class ORFIPTVIE(InfoExtractor): - IE_NAME = 'orf:iptv' - IE_DESC = 'iptv.ORF.at' - _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' - - _TEST = { - 'url': 'http://iptv.orf.at/stories/2275236/', - 'md5': 'c8b22af4718a4b4af58342529453e3e5', - 'info_dict': { - 'id': '350612', - 'ext': 'flv', - 'title': 'Weitere Evakuierungen um Vulkan Calbuco', - 'description': 'md5:d689c959bdbcf04efeddedbf2299d633', - 'duration': 68.197, - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20150425', - }, - } - - def _real_extract(self, url): - story_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://iptv.orf.at/stories/%s' % story_id, story_id) - - video_id = self._search_regex( - r'data-video(?:id)?="(\d+)"', webpage, 'video id') - - data = self._download_json( - 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, - video_id)[0] - - duration = float_or_none(data['duration'], 1000) - - video = data['sources']['default'] - load_balancer_url = video['loadBalancerUrl'] - abr = int_or_none(video.get('audioBitrate')) - vbr = int_or_none(video.get('bitrate')) - fps = int_or_none(video.get('videoFps')) - width = int_or_none(video.get('videoWidth')) - height = int_or_none(video.get('videoHeight')) - thumbnail = video.get('preview') - - rendition = self._download_json( - load_balancer_url, video_id, transform_source=strip_jsonp) - - f = { - 'abr': abr, - 'vbr': vbr, - 'fps': fps, - 'width': width, - 'height': height, - } - - formats = [] - for format_id, format_url in rendition['redirect'].items(): - if format_id == 'rtmp': - ff = f.copy() - ff.update({ - 'url': format_url, - 'format_id': format_id, - }) - formats.append(ff) - elif determine_ext(format_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id)) - elif determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id)) - else: - continue - self._sort_formats(formats) - - title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') - description = self._og_search_description(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dc.date', webpage, 'upload date')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': formats, - } - - -class ORFFM4StoryIE(InfoExtractor): - IE_NAME = 'orf:fm4:story' - IE_DESC = 'fm4.orf.at stories' - _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' - - _TEST = { - 'url': 'http://fm4.orf.at/stories/2865738/', - 'playlist': [{ - 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', - 'info_dict': { - 'id': '547792', - 'ext': 'flv', - 'title': 'Manu Delago und Inner Tongue live', - 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', - 'duration': 1748.52, - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20170913', - }, - }, { - 'md5': 'c6dd2179731f86f4f55a7b49899d515f', - 'info_dict': { - 'id': '547798', - 'ext': 'flv', - 'title': 'Manu Delago und Inner Tongue live (2)', - 'duration': 1504.08, - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20170913', - 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. 
Hier gibt es Fotos und die gesamte Session als Video.', - }, - }], - } - - def _real_extract(self, url): - story_id = self._match_id(url) - webpage = self._download_webpage(url, story_id) - - entries = [] - all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) - for idx, video_id in enumerate(all_ids): - data = self._download_json( - 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, - video_id)[0] - - duration = float_or_none(data['duration'], 1000) - - video = data['sources']['q8c'] - load_balancer_url = video['loadBalancerUrl'] - abr = int_or_none(video.get('audioBitrate')) - vbr = int_or_none(video.get('bitrate')) - fps = int_or_none(video.get('videoFps')) - width = int_or_none(video.get('videoWidth')) - height = int_or_none(video.get('videoHeight')) - thumbnail = video.get('preview') - - rendition = self._download_json( - load_balancer_url, video_id, transform_source=strip_jsonp) - - f = { - 'abr': abr, - 'vbr': vbr, - 'fps': fps, - 'width': width, - 'height': height, - } - - formats = [] - for format_id, format_url in rendition['redirect'].items(): - if format_id == 'rtmp': - ff = f.copy() - ff.update({ - 'url': format_url, - 'format_id': format_id, - }) - formats.append(ff) - elif determine_ext(format_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id)) - elif determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id)) - else: - continue - self._sort_formats(formats) - - title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') - if idx >= 1: - # Titles are duplicates, make them unique - title += ' (' + str(idx + 1) + ')' - description = self._og_search_description(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dc.date', webpage, 'upload date')) - - entries.append({ - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': formats, - }) - - return self.playlist_result(entries) diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py deleted file mode 100644 index 11ad3b3b8..000000000 --- a/youtube_dl/extractor/packtpub.py +++ /dev/null @@ -1,164 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import ( - # compat_str, - compat_HTTPError, -) -from ..utils import ( - clean_html, - ExtractorError, - # remove_end, - str_or_none, - strip_or_none, - unified_timestamp, - # urljoin, -) - - -class PacktPubBaseIE(InfoExtractor): - # _PACKT_BASE = 'https://www.packtpub.com' - _STATIC_PRODUCTS_BASE = 'https://static.packt-cdn.com/products/' - - -class PacktPubIE(PacktPubBaseIE): - _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>[^/]+)/(?P<id>[^/]+)(?:/(?P<display_id>[^/?&#]+))?' 
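PacktPubIE._real_extract (below) relies on four named groups from the _VALID_URL pattern above. A quick standalone check of what each group captures, applying the pattern verbatim to the first URL from _TESTS below:

    import re

    # _VALID_URL from PacktPubIE above, applied to the first test URL below.
    pattern = (
        r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)'
        r'/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>[^/]+)'
        r'/(?P<id>[^/]+)(?:/(?P<display_id>[^/?&#]+))?')
    m = re.match(pattern, 'https://www.packtpub.com/mapt/video/web-development'
                          '/9781787122215/20528/20530/Project+Intro')
    assert m.group('course_id', 'chapter_id', 'id', 'display_id') == (
        '9781787122215', '20528', '20530', 'Project+Intro')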
- - _TESTS = [{ - 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', - 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', - 'info_dict': { - 'id': '20530', - 'ext': 'mp4', - 'title': 'Project Intro', - 'thumbnail': r're:(?i)^https?://.*\.jpg', - 'timestamp': 1490918400, - 'upload_date': '20170331', - }, - }, { - 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro', - 'only_matching': True, - }, { - 'url': 'https://subscription.packtpub.com/video/programming/9781838988906/p1/video1_1/business-card-project', - 'only_matching': True, - }] - _NETRC_MACHINE = 'packtpub' - _TOKEN = None - - def _real_initialize(self): - username, password = self._get_login_info() - if username is None: - return - try: - self._TOKEN = self._download_json( - 'https://services.packtpub.com/auth-v1/users/tokens', None, - 'Downloading Authorization Token', data=json.dumps({ - 'username': username, - 'password': password, - }).encode())['data']['access'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404): - message = self._parse_json(e.cause.read().decode(), None)['message'] - raise ExtractorError(message, expected=True) - raise - - def _real_extract(self, url): - course_id, chapter_id, video_id, display_id = re.match(self._VALID_URL, url).groups() - - headers = {} - if self._TOKEN: - headers['Authorization'] = 'Bearer ' + self._TOKEN - try: - video_url = self._download_json( - 'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id, - 'Downloading JSON video', headers=headers)['data'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - self.raise_login_required('This video is locked') - raise - - # TODO: find a better way to avoid duplicating course requests - # metadata = self._download_json( - # '%s/products/%s/chapters/%s/sections/%s/metadata' - # % (self._MAPT_REST, course_id, chapter_id, video_id), - # video_id)['data'] - - # title = metadata['pageTitle'] - # course_title = metadata.get('title') - # if course_title: - # title = remove_end(title, ' - %s' % course_title) - # timestamp = unified_timestamp(metadata.get('publicationDate')) - # thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) - - return { - 'id': video_id, - 'url': video_url, - 'title': display_id or video_id, # title, - # 'thumbnail': thumbnail, - # 'timestamp': timestamp, - } - - -class PacktPubCourseIE(PacktPubBaseIE): - _VALID_URL = r'(?P<url>https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<id>\d+))' - _TESTS = [{ - 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', - 'info_dict': { - 'id': '9781787122215', - 'title': 'Learn Nodejs by building 12 projects [Video]', - 'description': 'md5:489da8d953f416e51927b60a1c7db0aa', - }, - 'playlist_count': 90, - }, { - 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if PacktPubIE.suitable(url) else super( - PacktPubCourseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, course_id = mobj.group('url', 'id') - - course = self._download_json( - self._STATIC_PRODUCTS_BASE + '%s/toc' % course_id, course_id) - metadata = self._download_json( - self._STATIC_PRODUCTS_BASE + '%s/summary' % course_id, - course_id, 
fatal=False) or {} - - entries = [] - for chapter_num, chapter in enumerate(course['chapters'], 1): - chapter_id = str_or_none(chapter.get('id')) - sections = chapter.get('sections') - if not chapter_id or not isinstance(sections, list): - continue - chapter_info = { - 'chapter': chapter.get('title'), - 'chapter_number': chapter_num, - 'chapter_id': chapter_id, - } - for section in sections: - section_id = str_or_none(section.get('id')) - if not section_id or section.get('contentType') != 'video': - continue - entry = { - '_type': 'url_transparent', - 'url': '/'.join([url, chapter_id, section_id]), - 'title': strip_or_none(section.get('title')), - 'description': clean_html(section.get('summary')), - 'thumbnail': metadata.get('coverImage'), - 'timestamp': unified_timestamp(metadata.get('publicationDate')), - 'ie_key': PacktPubIE.ie_key(), - } - entry.update(chapter_info) - entries.append(entry) - - return self.playlist_result( - entries, course_id, metadata.get('title'), - clean_html(metadata.get('about'))) diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py deleted file mode 100644 index fb29d83f9..000000000 --- a/youtube_dl/extractor/palcomp3.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - str_or_none, - try_get, -) - - -class PalcoMP3BaseIE(InfoExtractor): - _GQL_QUERY_TMPL = '''{ - artist(slug: "%s") { - %s - } -}''' - _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { - %s - }''' - _MUSIC_FIELDS = '''duration - hls - mp3File - musicID - plays - title''' - - def _call_api(self, artist_slug, artist_fields): - return self._download_json( - 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ - 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), - })['data'] - - def _parse_music(self, music): - music_id = compat_str(music['musicID']) - title = music['title'] - - formats = [] - hls_url = music.get('hls') - if hls_url: - formats.append({ - 'url': hls_url, - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - mp3_file = music.get('mp3File') - if mp3_file: - formats.append({ - 'url': mp3_file, - }) - - return { - 'id': music_id, - 'title': title, - 'formats': formats, - 'duration': int_or_none(music.get('duration')), - 'view_count': int_or_none(music.get('plays')), - } - - def _real_initialize(self): - self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS - - def _real_extract(self, url): - artist_slug, music_slug = re.match(self._VALID_URL, url).groups() - artist_fields = self._ARTIST_FIELDS_TMPL % music_slug - music = self._call_api(artist_slug, artist_fields)['artist']['music'] - return self._parse_music(music) - - -class PalcoMP3IE(PalcoMP3BaseIE): - IE_NAME = 'PalcoMP3:song' - _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)' - _TESTS = [{ - 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', - 'md5': '99fd6405b2d8fd589670f6db1ba3b358', - 'info_dict': { - 'id': '3162927', - 'ext': 'mp3', - 'title': 'Nossas Composições - CUIDA BEM DELA', - 'duration': 210, - 'view_count': int, - } - }] - - @classmethod - def suitable(cls, url): - return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) - - -class PalcoMP3ArtistIE(PalcoMP3BaseIE): - IE_NAME = 'PalcoMP3:artist' - _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<id>[^/?&#]+)' - _TESTS = [{ - 
'url': 'https://www.palcomp3.com.br/condedoforro/', - 'info_dict': { - 'id': '358396', - 'title': 'Conde do Forró', - }, - 'playlist_mincount': 188, - }] - _ARTIST_FIELDS_TMPL = '''artistID - musics { - nodes { - %s - } - } - name''' - - @ classmethod - def suitable(cls, url): - return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) - - def _real_extract(self, url): - artist_slug = self._match_id(url) - artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] - - def entries(): - for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): - yield self._parse_music(music) - - return self.playlist_result( - entries(), str_or_none(artist.get('artistID')), artist.get('name')) - - -class PalcoMP3VideoIE(PalcoMP3BaseIE): - IE_NAME = 'PalcoMP3:video' - _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)/?#clipe' - _TESTS = [{ - 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': '_pD1nR2qqPg', - 'ext': 'mp4', - 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', - 'description': 'md5:7043342c09a224598e93546e98e49282', - 'upload_date': '20161107', - 'uploader_id': 'maiaramaraisaoficial', - 'uploader': 'Maiara e Maraisa', - } - }] - _MUSIC_FIELDS = 'youtubeID' - - def _parse_music(self, music): - youtube_id = music['youtubeID'] - return self.url_result(youtube_id, 'Youtube', youtube_id) diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py deleted file mode 100644 index 538738c09..000000000 --- a/youtube_dl/extractor/pandoratv.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - float_or_none, - parse_duration, - str_to_int, - urlencode_postdata, -) - - -class PandoraTVIE(InfoExtractor): - IE_NAME = 'pandora.tv' - IE_DESC = '판도라TV' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format - (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format - m\.pandora\.tv/?\? 
# mobile - ) - ''' - _TESTS = [{ - 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', - 'info_dict': { - 'id': '53294230', - 'ext': 'flv', - 'title': '頭を撫でてくれる?', - 'description': '頭を撫でてくれる?', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 39, - 'upload_date': '20151218', - 'uploader': 'カワイイ動物まとめ', - 'uploader_id': 'mikakim', - 'view_count': int, - 'like_count': int, - } - }, { - 'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744', - 'info_dict': { - 'id': '54721744', - 'ext': 'flv', - 'title': '[HD] JAPAN COUNTDOWN 170423', - 'description': '[HD] JAPAN COUNTDOWN 170423', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1704.9, - 'upload_date': '20170423', - 'uploader': 'GOGO_UCC', - 'uploader_id': 'gogoucc', - 'view_count': int, - 'like_count': int, - }, - 'params': { - # Test metadata only - 'skip_download': True, - }, - }, { - 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', - 'only_matching': True, - }, { - 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user_id') - video_id = mobj.group('id') - - if not user_id or not video_id: - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = qs.get('prgid', [None])[0] - user_id = qs.get('ch_userid', [None])[0] - if any(not f for f in (video_id, user_id,)): - raise ExtractorError('Invalid URL', expected=True) - - data = self._download_json( - 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' - % (user_id, video_id), video_id) - - info = data['data']['rows']['vod_play_info']['result'] - - formats = [] - for format_id, format_url in info.items(): - if not format_url: - continue - height = self._search_regex( - r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) - if not height: - continue - - play_url = self._download_json( - 'http://m.pandora.tv/?c=api&m=play_url', video_id, - data=urlencode_postdata({ - 'prgid': video_id, - 'runtime': info.get('runtime'), - 'vod_url': format_url, - }), - headers={ - 'Origin': url, - 'Content-Type': 'application/x-www-form-urlencoded', - }) - format_url = play_url.get('url') - if not format_url: - continue - - formats.append({ - 'format_id': '%sp' % height, - 'url': format_url, - 'height': int(height), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info['subject'], - 'description': info.get('body'), - 'thumbnail': info.get('thumbnail') or info.get('poster'), - 'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), - 'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None, - 'uploader': info.get('nickname'), - 'uploader_id': info.get('upload_userid'), - 'view_count': str_to_int(info.get('hit')), - 'like_count': str_to_int(info.get('likecnt')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py deleted file mode 100644 index bdd5ff565..000000000 --- a/youtube_dl/extractor/parliamentliveuk.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class ParliamentLiveUKIE(InfoExtractor): - IE_NAME = 'parliamentlive.tv' - IE_DESC = 'UK parliament videos' - _VALID_URL = 
r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - - _TESTS = [{ - 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', - 'info_dict': { - 'id': '1_af9nv9ym', - 'ext': 'mp4', - 'title': 'Home Affairs Committee', - 'uploader_id': 'FFMPEG-01', - 'timestamp': 1422696664, - 'upload_date': '20150131', - }, - }, { - 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) - widget_config = self._parse_json(self._search_regex( - r'(?s)kWidgetConfig\s*=\s*({.+});', - webpage, 'kaltura widget config'), video_id) - kaltura_url = 'kaltura:%s:%s' % ( - widget_config['wid'][1:], widget_config['entry_id']) - event_title = self._download_json( - 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] - return { - '_type': 'url_transparent', - 'title': event_title, - 'description': '', - 'url': kaltura_url, - 'ie_key': 'Kaltura', - } diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py deleted file mode 100644 index 761a4b1de..000000000 --- a/youtube_dl/extractor/patreon.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - int_or_none, - KNOWN_EXTENSIONS, - mimetype2ext, - parse_iso8601, - str_or_none, - try_get, -) - - -class PatreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.patreon.com/creation?hid=743933', - 'md5': 'e25505eec1053a6e6813b8ed369875cc', - 'info_dict': { - 'id': '743933', - 'ext': 'mp3', - 'title': 'Episode 166: David Smalley of Dogma Debate', - 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - 'timestamp': 1406473987, - 'upload_date': '20140727', - 'uploader_id': '87145', - }, - }, { - 'url': 'http://www.patreon.com/creation?hid=754133', - 'md5': '3eb09345bf44bf60451b8b0b81759d0a', - 'info_dict': { - 'id': '754133', - 'ext': 'mp3', - 'title': 'CD 167 Extra', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, - 'skip': 'Patron-only content', - }, { - 'url': 'https://www.patreon.com/creation?hid=1682498', - 'info_dict': { - 'id': 'SU4fj_aEMVw', - 'ext': 'mp4', - 'title': 'I\'m on Patreon!', - 'uploader': 'TraciJHines', - 'thumbnail': 're:^https?://.*$', - 'upload_date': '20150211', - 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', - 'uploader_id': 'TraciJHines', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } - }, { - 'url': 'https://www.patreon.com/posts/episode-166-of-743933', - 'only_matching': True, - }, { - 'url': 'https://www.patreon.com/posts/743933', - 'only_matching': True, - }] - - # Currently Patreon exposes download URL via hidden CSS, so login is not - # needed. Keeping this commented for when this inevitably changes. 
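Since no login is required, _real_extract below queries the public posts endpoint directly, and that endpoint speaks JSON:API: the media and user records requested through the 'include' parameter arrive in a flat 'included' array rather than nested inside the post object. A trimmed sketch of walking such a response, mirroring the loop in _real_extract (the response values are made-up samples shaped after the test data above):

    # Hypothetical, trimmed JSON:API response of the shape _real_extract walks.
    post = {
        'data': {'attributes': {'title': 'Episode 166: David Smalley of Dogma Debate'}},
        'included': [
            {'type': 'media', 'attributes': {
                'download_url': 'https://example.com/episode166.mp3',  # made up
                'mimetype': 'audio/mpeg',
                'size_bytes': 123456}},
            {'type': 'user', 'id': '87145', 'attributes': {
                'full_name': 'Cognitive Dissonance Podcast'}},
        ],
    }

    info = {'title': post['data']['attributes']['title']}
    for item in post['included']:
        if item['type'] == 'media':
            info['url'] = item['attributes']['download_url']
        elif item['type'] == 'user':
            info['uploader'] = item['attributes']['full_name']
            info['uploader_id'] = item['id']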
- ''' - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_form = { - 'redirectUrl': 'http://www.patreon.com/', - 'email': username, - 'password': password, - } - - request = sanitized_Request( - 'https://www.patreon.com/processLogin', - compat_urllib_parse_urlencode(login_form).encode('utf-8') - ) - login_page = self._download_webpage(request, None, note='Logging in') - - if re.search(r'onLoginFailed', login_page): - raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) - - def _real_initialize(self): - self._login() - ''' - - def _real_extract(self, url): - video_id = self._match_id(url) - post = self._download_json( - 'https://www.patreon.com/api/posts/' + video_id, video_id, query={ - 'fields[media]': 'download_url,mimetype,size_bytes', - 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title', - 'fields[user]': 'full_name,url', - 'json-api-use-default-includes': 'false', - 'include': 'media,user', - }) - attributes = post['data']['attributes'] - title = attributes['title'].strip() - image = attributes.get('image') or {} - info = { - 'id': video_id, - 'title': title, - 'description': clean_html(attributes.get('content')), - 'thumbnail': image.get('large_url') or image.get('url'), - 'timestamp': parse_iso8601(attributes.get('published_at')), - 'like_count': int_or_none(attributes.get('like_count')), - 'comment_count': int_or_none(attributes.get('comment_count')), - } - - for i in post.get('included', []): - i_type = i.get('type') - if i_type == 'media': - media_attributes = i.get('attributes') or {} - download_url = media_attributes.get('download_url') - ext = mimetype2ext(media_attributes.get('mimetype')) - if download_url and ext in KNOWN_EXTENSIONS: - info.update({ - 'ext': ext, - 'filesize': int_or_none(media_attributes.get('size_bytes')), - 'url': download_url, - }) - elif i_type == 'user': - user_attributes = i.get('attributes') - if user_attributes: - info.update({ - 'uploader': user_attributes.get('full_name'), - 'uploader_id': str_or_none(i.get('id')), - 'uploader_url': user_attributes.get('url'), - }) - - if not info.get('url'): - embed_url = try_get(attributes, lambda x: x['embed']['url']) - if embed_url: - info.update({ - '_type': 'url', - 'url': embed_url, - }) - - if not info.get('url'): - post_file = attributes['post_file'] - ext = determine_ext(post_file.get('name')) - if ext in KNOWN_EXTENSIONS: - info.update({ - 'ext': ext, - 'url': post_file['url'], - }) - - return info diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py deleted file mode 100644 index d4baa16ee..000000000 --- a/youtube_dl/extractor/pbs.py +++ /dev/null @@ -1,710 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - determine_ext, - int_or_none, - float_or_none, - js_to_json, - orderedSet, - strip_jsonp, - strip_or_none, - unified_strdate, - url_or_none, - US_RATINGS, -) - - -class PBSIE(InfoExtractor): - _STATIONS = ( - (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ - (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ - (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ - (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org - (r'video\.wnpt\.org', 'Nashville Public 
Television (WNPT)'), # http://www.wnpt.org - (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/ - (r'video\.wsre\.org', 'WSRE (WSRE)'), # http://www.wsre.org - (r'video\.wtcitv\.org', 'WTCI (WTCI)'), # http://www.wtcitv.org - (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/ - (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm - # (r'kuac\.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/ - # (r'ktoo\.org', '360 North (KTOO)'), # http://www.ktoo.org/ - # (r'azpm\.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/ - (r'video\.azpbs\.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org - (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/ - (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/ - (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/ - (r'video\.ket\.org', 'KET (WKLE)'), # http://www.ket.org/ - (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/ - (r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ - (r'videos\.oeta\.tv', 'OETA (KETA)'), # http://www.oeta.tv - (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ - (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ - (r'video\.keet\.org', 'KEET TV (KEET)'), # http://www.keet.org - (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ - (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ - (r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org - (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org - (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ - (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ - (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org - (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ - (r'video\.soptv\.org', 'SOPTV (KSYS)'), # http://www.soptv.org - # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org - # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org - # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org - (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org - (r'video\.kenw\.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org - (r'video\.kued\.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org - (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org - (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ - (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ - (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org - (r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org - (r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org - (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ - # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ - (r'watch\.wliw\.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ - (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org - (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org - (r'video\.whyy\.org', 'WHYY (WHYY)'), # http://www.whyy.org - (r'video\.wlvt\.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ - (r'video\.wvpt\.net', 'WVPT - Your 
Source for PBS and More! (WVPT)'), # http://www.wvpt.net - (r'video\.whut\.org', 'Howard University Television (WHUT)'), # http://www.whut.org - (r'video\.wedu\.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org - (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ - # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org - (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org - (r'video\.wucftv\.org', 'WUCF TV (WUCF)'), # http://wucftv.org - (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org - (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ - (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ - (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ - (r'video\.scetv\.org', 'ETV (WRLK)'), # http://www.scetv.org - (r'video\.unctv\.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ - # (r'pbsguam\.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ - (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ - (r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org - (r'video\.ksps\.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ - (r'watch\.opb\.org', 'OPB (KOPB)'), # http://www.opb.org - (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org - (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ - (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv - (r'video\.wttw\.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ - # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ - (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ - (r'video\.ninenet\.org', 'Nine Network (KETC)'), # http://www.ninenet.org - (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ - (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org - (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org - (r'video\.wnin\.org', 'WNIN (WNIN)'), # http://www.wnin.org/ - (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ - (r'video\.wpt\.org', 'WPT (WPNE)'), # http://www.wpt.org/ - (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ - (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net - (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org - (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org - # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ - (r'video\.wipb\.org', 'WIPB-TV (WIPB)'), # http://wipb.org - (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ - (r'watch\.cetconnect\.org', 'CET (WCET)'), # http://www.cetconnect.org - (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org - (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'), # http://wbgu.org - (r'video\.wgvu\.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ - (r'video\.netnebraska\.org', 'NET1 (KUON)'), # http://netnebraska.org - (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org - (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org - (r'video\.tpt\.org', 'TPT (KTCA)'), # http://www.tpt.org - (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ - 
(r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/ - (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org - # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org - # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ - # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ - (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org - (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org - (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ - (r'video\.wosu\.org', 'WOSU TV (WOSU)'), # http://wosu.org/ - (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 - (r'video\.wvpublic\.org', 'WVPB (WVPB)'), # http://wvpublic.org/ - (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org - # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org - (r'video\.kera\.org', 'KERA 13 (KERA)'), # http://www.kera.org/ - (r'video\.mpbn\.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ - (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ - (r'video\.nhptv\.org', 'NHPTV (WENH)'), # http://nhptv.org/ - (r'video\.vpt\.org', 'Vermont PBS (WETK)'), # http://www.vpt.org - (r'video\.witf\.org', 'witf (WITF)'), # http://www.witf.org - (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ - (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ - (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org - (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ - (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org - (r'video\.wkar\.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ - (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu - (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ - (r'video\.wgte\.org', 'WGTE TV (WGTE)'), # http://www.wgte.org - (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org - # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ - (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ - (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org - (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org - (r'video\.kacvtv\.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ - (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org - (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org - (r'video\.wned\.org', 'WNED (WNED)'), # http://www.wned.org/ - (r'watch\.wpbstv\.org', 'WPBS (WPBS)'), # http://www.wpbstv.org - (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'), # http://wskg.org - (r'video\.wxxi\.org', 'WXXI (WXXI)'), # http://wxxi.org - (r'video\.wpsu\.org', 'WPSU (WPSU)'), # http://www.wpsu.org - # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org - (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ - (r'video\.wtvi\.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ - # (r'whro\.org', 'WHRO (WHRO)'), # http://whro.org - (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ - (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ - (r'video\.kcts9\.org', 
'KCTS 9 (KCTS)'), # http://kcts9.org/ - (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org - (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ - # (r'tamu\.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu - # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org - (r'video\.klrn\.org', 'KLRN (KLRN)'), # http://www.klrn.org - (r'video\.klru\.tv', 'KLRU (KLRU)'), # http://www.klru.org - # (r'kmbh\.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org - # (r'knct\.org', 'KNCT (KNCT)'), # http://www.knct.org - # (r'ktxt\.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org - (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ - (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ - (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org - ) - - IE_NAME = 'pbs' - IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1]) - - _VALID_URL = r'''(?x)https?:// - (?: - # Direct video URL - (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) | - # Article with embedded player (or direct video) - (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | - # Player - (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ - ) - ''' % '|'.join(list(zip(*_STATIONS))[0]) - - _GEO_COUNTRIES = ['US'] - - _TESTS = [ - { - 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': '173dc391afd361fa72eab5d3d918968d', - 'info_dict': { - 'id': '2365006249', - 'ext': 'mp4', - 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:31b664af3c65fd07fa460d306b837d00', - 'duration': 3190, - }, - }, - { - 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', - 'md5': '6f722cb3c3982186d34b0f13374499c7', - 'info_dict': { - 'id': '2365297690', - 'ext': 'mp4', - 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:5979a4d069b157f622d02bff62fbe654', - 'duration': 5050, - }, - }, - { - 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', - 'md5': 'b19856d7f5351b17a5ab1dc6a64be633', - 'info_dict': { - 'id': '2201174722', - 'ext': 'mp4', - 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', - 'description': 'md5:86ab9a3d04458b876147b355788b8781', - 'duration': 801, - }, - }, - { - 'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/', - 'md5': 'c62859342be2a0358d6c9eb306595978', - 'info_dict': { - 'id': '2365297708', - 'ext': 'mp4', - 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', - 'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b', - 'duration': 6559, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', - 'md5': '908f3e5473a693b266b84e25e1cf9703', - 'info_dict': { - 'id': '2365160389', - 'display_id': 'killer-typhoon', - 'ext': 'mp4', - 'description': 'md5:c741d14e979fc53228c575894094f157', - 'title': 'NOVA - Killer Typhoon', - 'duration': 3172, - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140122', - 'age_limit': 10, - }, - }, - { - 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', - 'info_dict': { - 'id': 'united-states-of-secrets', - }, - 'playlist_count': 2, - }, - { - 'url': 
'http://www.pbs.org/wgbh/americanexperience/films/great-war/', - 'info_dict': { - 'id': 'great-war', - }, - 'playlist_count': 3, - }, - { - 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', - 'info_dict': { - 'id': '2276541483', - 'display_id': 'player', - 'ext': 'mp4', - 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', - 'duration': 682, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, - }, - { - 'url': 'http://www.pbs.org/video/2365245528/', - 'md5': '115223d41bd55cda8ae5cd5ed4e11497', - 'info_dict': { - 'id': '2365245528', - 'display_id': '2365245528', - 'ext': 'mp4', - 'title': 'FRONTLINE - United States of Secrets (Part One)', - 'description': 'md5:55756bd5c551519cc4b7703e373e217e', - 'duration': 6851, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - # Video embedded in iframe containing angle brackets as attribute's value (e.g. - # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see - # https://github.com/ytdl-org/youtube-dl/issues/7059) - 'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/', - 'md5': '59b0ef5009f9ac8a319cc5efebcd865e', - 'info_dict': { - 'id': '2365546844', - 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business', - 'ext': 'mp4', - 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business", - 'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5', - 'duration': 1480, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - # Frontline video embedded via flp2012.js - 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists', - 'info_dict': { - 'id': '2070868960', - 'display_id': 'the-atomic-artists', - 'ext': 'mp4', - 'title': 'FRONTLINE - The Atomic Artists', - 'description': 'md5:f677e4520cfacb4a5ce1471e31b57800', - 'duration': 723, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, - }, - { - # Serves hd only via wigget/partnerplayer page - 'url': 'http://www.pbs.org/video/2365641075/', - 'md5': 'fdf907851eab57211dd589cf12006666', - 'info_dict': { - 'id': '2365641075', - 'ext': 'mp4', - 'title': 'FRONTLINE - Netanyahu at War', - 'duration': 6852, - 'thumbnail': r're:^https?://.*\.jpg$', - 'formats': 'mincount:8', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/13801 - 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/', - 'info_dict': { - 'id': '3003333873', - 'ext': 'mp4', - 'title': 'PBS NewsHour - full episode July 31, 2017', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 3265, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', - 'info_dict': { - 'id': '2365936247', - 'ext': 'mp4', - 'title': 'Antiques Roadshow - Indianapolis, Hour 2', - 'description': 'md5:524b32249db55663e7231b6b8d1671a2', - 'duration': 3180, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, - { - 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', - 'info_dict': { - 'id': '3007193718', - 'ext': 'mp4', - 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", - 'description': 'md5:37efbac85e0c09b009586523ec143652', - 'duration': 6292, - 'thumbnail': 
r're:^https?://.*\.(?:jpg|JPG)$', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, - { - 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', - 'info_dict': { - 'id': '3011407934', - 'ext': 'mp4', - 'title': 'Stories from the Stage - Road Trip', - 'duration': 1619, - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, - { - 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', - 'only_matching': True, - }, - { - 'url': 'http://watch.knpb.org/video/2365616055/', - 'only_matching': True, - }, - { - 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=', - 'only_matching': True, - } - ] - _ERRORS = { - 101: 'We\'re sorry, but this video is not yet available.', - 403: 'We\'re sorry, but this video is not available in your region due to right restrictions.', - 404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.', - 410: 'This video has expired and is no longer available for online streaming.', - } - - def _real_initialize(self): - cookie = (self._download_json( - 'http://localization.services.pbs.org/localize/auto/cookie/', - None, headers=self.geo_verification_headers(), fatal=False) or {}).get('cookie') - if cookie: - station = self._search_regex(r'#?s=\["([^"]+)"', cookie, 'station') - if station: - self._set_cookie('.pbs.org', 'pbsol.station', station) - - def _extract_webpage(self, url): - mobj = re.match(self._VALID_URL, url) - - description = None - - presumptive_id = mobj.group('presumptive_id') - display_id = presumptive_id - if presumptive_id: - webpage = self._download_webpage(url, display_id) - - description = strip_or_none(self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, default=None)) - upload_date = unified_strdate(self._search_regex( - r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', - webpage, 'upload date', default=None)) - - # tabbed frontline videos - MULTI_PART_REGEXES = ( - r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', - r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', - ) - for p in MULTI_PART_REGEXES: - tabbed_videos = orderedSet(re.findall(p, webpage)) - if tabbed_videos: - return tabbed_videos, presumptive_id, upload_date, description - - MEDIA_ID_REGEXES = [ - r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed - r'class="coveplayerid">([^<]+)<', # coveplayer - r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ - r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer - r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", - r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ - r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ - ] - - media_id = self._search_regex( - MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) - if media_id: - return media_id, presumptive_id, upload_date, description - - # Frontline video embedded via flp - video_id = self._search_regex( - r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) - if 
video_id: - # pkg_id calculation is reverse engineered from - # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js - prg_id = self._search_regex( - r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:] - if 'q' in prg_id: - prg_id = prg_id.split('q')[1] - prg_id = int(prg_id, 16) - getdir = self._download_json( - 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, - presumptive_id, 'Downloading getdir JSON', - transform_source=strip_jsonp) - return getdir['mid'], presumptive_id, upload_date, description - - for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): - url = self._search_regex( - r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, - 'player URL', default=None, group='url') - if url: - break - - if not url: - url = self._og_search_url(webpage) - - mobj = re.match( - self._VALID_URL, self._proto_relative_url(url.strip())) - - player_id = mobj.group('player_id') - if not display_id: - display_id = player_id - if player_id: - player_page = self._download_webpage( - url, display_id, note='Downloading player page', - errnote='Could not download player page') - video_id = self._search_regex( - r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', - default=None) - if not video_id: - video_info = self._extract_video_data( - player_page, 'video data', display_id) - video_id = compat_str( - video_info.get('id') or video_info['contentID']) - else: - video_id = mobj.group('id') - display_id = video_id - - return video_id, display_id, None, description - - def _extract_video_data(self, string, name, video_id, fatal=True): - return self._parse_json( - self._search_regex( - [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', - r'window\.videoBridge\s*=\s*({.+?});'], - string, name, default='{}'), - video_id, transform_source=js_to_json, fatal=fatal) - - def _real_extract(self, url): - video_id, display_id, upload_date, description = self._extract_webpage(url) - - if isinstance(video_id, list): - entries = [self.url_result( - 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) - for vid_id in video_id] - return self.playlist_result(entries, display_id) - - info = None - redirects = [] - redirect_urls = set() - - def extract_redirect_urls(info): - for encoding_name in ('recommended_encoding', 'alternate_encoding'): - redirect = info.get(encoding_name) - if not redirect: - continue - redirect_url = redirect.get('url') - if redirect_url and redirect_url not in redirect_urls: - redirects.append(redirect) - redirect_urls.add(redirect_url) - encodings = info.get('encodings') - if isinstance(encodings, list): - for encoding in encodings: - encoding_url = url_or_none(encoding) - if encoding_url and encoding_url not in redirect_urls: - redirects.append({'url': encoding_url}) - redirect_urls.add(encoding_url) - - chapters = [] - # Player pages may also serve different qualities - for page in ('widget/partnerplayer', 'portalplayer'): - player = self._download_webpage( - 'http://player.pbs.org/%s/%s' % (page, video_id), - display_id, 'Downloading %s page' % page, fatal=False) - if player: - video_info = self._extract_video_data( - player, '%s video data' % page, display_id, fatal=False) - if video_info: - extract_redirect_urls(video_info) - if not info: - info = video_info - if not chapters: - raw_chapters = video_info.get('chapters') or [] - if not raw_chapters: - for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): - chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) - if not chapter: - continue - 
raw_chapters.append(chapter) - for chapter in raw_chapters: - start_time = float_or_none(chapter.get('start_time'), 1000) - duration = float_or_none(chapter.get('duration'), 1000) - if start_time is None or duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + duration, - 'title': chapter.get('title'), - }) - - formats = [] - http_url = None - for num, redirect in enumerate(redirects): - redirect_id = redirect.get('eeid') - - redirect_info = self._download_json( - '%s?format=json' % redirect['url'], display_id, - 'Downloading %s video url info' % (redirect_id or num), - headers=self.geo_verification_headers()) - - if redirect_info['status'] == 'error': - message = self._ERRORS.get( - redirect_info['http_code'], redirect_info['message']) - if redirect_info['http_code'] == 403: - self.raise_geo_restricted( - msg=message, countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message), expected=True) - - format_url = redirect_info.get('url') - if not format_url: - continue - - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': redirect_id, - }) - if re.search(r'^https?://.*(?:\d+k|baseline)', format_url): - http_url = format_url - self._remove_duplicate_formats(formats) - m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', - formats)) - if http_url: - for m3u8_format in m3u8_formats: - bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) - # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), - # we won't try extracting them. - # Since summer 2016 higher quality formats (4500k and 6500k) are also available - # albeit they are not documented in [2]. - # 1. https://github.com/ytdl-org/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 - # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications - if not bitrate or int(bitrate) < 400: - continue - f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) - # This may produce invalid links sometimes (e.g. - # http://www.pbs.org/wgbh/frontline/film/suicide-plan) - if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): - continue - f = m3u8_format.copy() - f.update({ - 'url': f_url, - 'format_id': m3u8_format['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - self._sort_formats(formats) - - rating_str = info.get('rating') - if rating_str is not None: - rating_str = rating_str.rpartition('-')[2] - age_limit = US_RATINGS.get(rating_str) - - subtitles = {} - closed_captions_url = info.get('closed_captions_url') - if closed_captions_url: - subtitles['en'] = [{ - 'ext': 'ttml', - 'url': closed_captions_url, - }] - mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url) - if mobj: - ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1) - ttml_caption_id = int(ttml_caption_id) - subtitles['en'].extend([{ - 'url': closed_captions_url.replace( - ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)), - 'ext': 'srt', - }, { - 'url': closed_captions_url.replace( - ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)), - 'ext': 'vtt', - }]) - - # info['title'] is often incomplete (e.g. 
'Full Episode', 'Episode 5', etc) - # Try turning it to 'program - title' naming scheme if possible - alt_title = info.get('program', {}).get('title') - if alt_title: - info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title']) - - description = info.get('description') or info.get( - 'program', {}).get('description') or description - - return { - 'id': video_id, - 'display_id': display_id, - 'title': info['title'], - 'description': description, - 'thumbnail': info.get('image_url'), - 'duration': int_or_none(info.get('duration')), - 'age_limit': age_limit, - 'upload_date': upload_date, - 'formats': formats, - 'subtitles': subtitles, - 'chapters': chapters, - } diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py deleted file mode 100644 index 3af533925..000000000 --- a/youtube_dl/extractor/peertube.py +++ /dev/null @@ -1,628 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_resolution, - str_or_none, - try_get, - unified_timestamp, - url_or_none, - urljoin, -) - - -class PeerTubeIE(InfoExtractor): - _INSTANCES_RE = r'''(?: - # Taken from https://instances.joinpeertube.org/instances - peertube\.rainbowswingers\.net| - tube\.stanisic\.nl| - peer\.suiri\.us| - medias\.libox\.fr| - videomensoif\.ynh\.fr| - peertube\.travelpandas\.eu| - peertube\.rachetjay\.fr| - peertube\.montecsys\.fr| - tube\.eskuero\.me| - peer\.tube| - peertube\.umeahackerspace\.se| - tube\.nx-pod\.de| - video\.monsieurbidouille\.fr| - tube\.openalgeria\.org| - vid\.lelux\.fi| - video\.anormallostpod\.ovh| - tube\.crapaud-fou\.org| - peertube\.stemy\.me| - lostpod\.space| - exode\.me| - peertube\.snargol\.com| - vis\.ion\.ovh| - videosdulib\.re| - v\.mbius\.io| - videos\.judrey\.eu| - peertube\.osureplayviewer\.xyz| - peertube\.mathieufamily\.ovh| - www\.videos-libr\.es| - fightforinfo\.com| - peertube\.fediverse\.ru| - peertube\.oiseauroch\.fr| - video\.nesven\.eu| - v\.bearvideo\.win| - video\.qoto\.org| - justporn\.cc| - video\.vny\.fr| - peervideo\.club| - tube\.taker\.fr| - peertube\.chantierlibre\.org| - tube\.ipfixe\.info| - tube\.kicou\.info| - tube\.dodsorf\.as| - videobit\.cc| - video\.yukari\.moe| - videos\.elbinario\.net| - hkvideo\.live| - pt\.tux\.tf| - www\.hkvideo\.live| - FIGHTFORINFO\.com| - pt\.765racing\.com| - peertube\.gnumeria\.eu\.org| - nordenmedia\.com| - peertube\.co\.uk| - tube\.darfweb\.eu| - tube\.kalah-france\.org| - 0ch\.in| - vod\.mochi\.academy| - film\.node9\.org| - peertube\.hatthieves\.es| - video\.fitchfamily\.org| - peertube\.ddns\.net| - video\.ifuncle\.kr| - video\.fdlibre\.eu| - tube\.22decembre\.eu| - peertube\.harmoniescreatives\.com| - tube\.fabrigli\.fr| - video\.thedwyers\.co| - video\.bruitbruit\.com| - peertube\.foxfam\.club| - peer\.philoxweb\.be| - videos\.bugs\.social| - peertube\.malbert\.xyz| - peertube\.bilange\.ca| - libretube\.net| - diytelevision\.com| - peertube\.fedilab\.app| - libre\.video| - video\.mstddntfdn\.online| - us\.tv| - peertube\.sl-network\.fr| - peertube\.dynlinux\.io| - peertube\.david\.durieux\.family| - peertube\.linuxrocks\.online| - peerwatch\.xyz| - v\.kretschmann\.social| - tube\.otter\.sh| - yt\.is\.nota\.live| - tube\.dragonpsi\.xyz| - peertube\.boneheadmedia\.com| - videos\.funkwhale\.audio| - watch\.44con\.com| - peertube\.gcaillaut\.fr| - peertube\.icu| - pony\.tube| - spacepub\.space| - tube\.stbr\.io| - v\.mom-gay\.faith| - 
tube\.port0\.xyz| - peertube\.simounet\.net| - play\.jergefelt\.se| - peertube\.zeteo\.me| - tube\.danq\.me| - peertube\.kerenon\.com| - tube\.fab-l3\.org| - tube\.calculate\.social| - peertube\.mckillop\.org| - tube\.netzspielplatz\.de| - vod\.ksite\.de| - peertube\.laas\.fr| - tube\.govital\.net| - peertube\.stephenson\.cc| - bistule\.nohost\.me| - peertube\.kajalinifi\.de| - video\.ploud\.jp| - video\.omniatv\.com| - peertube\.ffs2play\.fr| - peertube\.leboulaire\.ovh| - peertube\.tronic-studio\.com| - peertube\.public\.cat| - peertube\.metalbanana\.net| - video\.1000i100\.fr| - peertube\.alter-nativ-voll\.de| - tube\.pasa\.tf| - tube\.worldofhauru\.xyz| - pt\.kamp\.site| - peertube\.teleassist\.fr| - videos\.mleduc\.xyz| - conf\.tube| - media\.privacyinternational\.org| - pt\.forty-two\.nl| - video\.halle-leaks\.de| - video\.grosskopfgames\.de| - peertube\.schaeferit\.de| - peertube\.jackbot\.fr| - tube\.extinctionrebellion\.fr| - peertube\.f-si\.org| - video\.subak\.ovh| - videos\.koweb\.fr| - peertube\.zergy\.net| - peertube\.roflcopter\.fr| - peertube\.floss-marketing-school\.com| - vloggers\.social| - peertube\.iriseden\.eu| - videos\.ubuntu-paris\.org| - peertube\.mastodon\.host| - armstube\.com| - peertube\.s2s\.video| - peertube\.lol| - tube\.open-plug\.eu| - open\.tube| - peertube\.ch| - peertube\.normandie-libre\.fr| - peertube\.slat\.org| - video\.lacaveatonton\.ovh| - peertube\.uno| - peertube\.servebeer\.com| - peertube\.fedi\.quebec| - tube\.h3z\.jp| - tube\.plus200\.com| - peertube\.eric\.ovh| - tube\.metadocs\.cc| - tube\.unmondemeilleur\.eu| - gouttedeau\.space| - video\.antirep\.net| - nrop\.cant\.at| - tube\.ksl-bmx\.de| - tube\.plaf\.fr| - tube\.tchncs\.de| - video\.devinberg\.com| - hitchtube\.fr| - peertube\.kosebamse\.com| - yunopeertube\.myddns\.me| - peertube\.varney\.fr| - peertube\.anon-kenkai\.com| - tube\.maiti\.info| - tubee\.fr| - videos\.dinofly\.com| - toobnix\.org| - videotape\.me| - voca\.tube| - video\.heromuster\.com| - video\.lemediatv\.fr| - video\.up\.edu\.ph| - balafon\.video| - video\.ivel\.fr| - thickrips\.cloud| - pt\.laurentkruger\.fr| - video\.monarch-pass\.net| - peertube\.artica\.center| - video\.alternanet\.fr| - indymotion\.fr| - fanvid\.stopthatimp\.net| - video\.farci\.org| - v\.lesterpig\.com| - video\.okaris\.de| - tube\.pawelko\.net| - peertube\.mablr\.org| - tube\.fede\.re| - pytu\.be| - evertron\.tv| - devtube\.dev-wiki\.de| - raptube\.antipub\.org| - video\.selea\.se| - peertube\.mygaia\.org| - video\.oh14\.de| - peertube\.livingutopia\.org| - peertube\.the-penguin\.de| - tube\.thechangebook\.org| - tube\.anjara\.eu| - pt\.pube\.tk| - video\.samedi\.pm| - mplayer\.demouliere\.eu| - widemus\.de| - peertube\.me| - peertube\.zapashcanon\.fr| - video\.latavernedejohnjohn\.fr| - peertube\.pcservice46\.fr| - peertube\.mazzonetto\.eu| - video\.irem\.univ-paris-diderot\.fr| - video\.livecchi\.cloud| - alttube\.fr| - video\.coop\.tools| - video\.cabane-libre\.org| - peertube\.openstreetmap\.fr| - videos\.alolise\.org| - irrsinn\.video| - video\.antopie\.org| - scitech\.video| - tube2\.nemsia\.org| - video\.amic37\.fr| - peertube\.freeforge\.eu| - video\.arbitrarion\.com| - video\.datsemultimedia\.com| - stoptrackingus\.tv| - peertube\.ricostrongxxx\.com| - docker\.videos\.lecygnenoir\.info| - peertube\.togart\.de| - tube\.postblue\.info| - videos\.domainepublic\.net| - peertube\.cyber-tribal\.com| - video\.gresille\.org| - peertube\.dsmouse\.net| - cinema\.yunohost\.support| - tube\.theocevaer\.fr| - repro\.video| - tube\.4aem\.com| - 
quaziinc\.com| - peertube\.metawurst\.space| - videos\.wakapo\.com| - video\.ploud\.fr| - video\.freeradical\.zone| - tube\.valinor\.fr| - refuznik\.video| - pt\.kircheneuenburg\.de| - peertube\.asrun\.eu| - peertube\.lagob\.fr| - videos\.side-ways\.net| - 91video\.online| - video\.valme\.io| - video\.taboulisme\.com| - videos-libr\.es| - tv\.mooh\.fr| - nuage\.acostey\.fr| - video\.monsieur-a\.fr| - peertube\.librelois\.fr| - videos\.pair2jeux\.tube| - videos\.pueseso\.club| - peer\.mathdacloud\.ovh| - media\.assassinate-you\.net| - vidcommons\.org| - ptube\.rousset\.nom\.fr| - tube\.cyano\.at| - videos\.squat\.net| - video\.iphodase\.fr| - peertube\.makotoworkshop\.org| - peertube\.serveur\.slv-valbonne\.fr| - vault\.mle\.party| - hostyour\.tv| - videos\.hack2g2\.fr| - libre\.tube| - pire\.artisanlogiciel\.net| - videos\.numerique-en-commun\.fr| - video\.netsyms\.com| - video\.die-partei\.social| - video\.writeas\.org| - peertube\.swarm\.solvingmaz\.es| - tube\.pericoloso\.ovh| - watching\.cypherpunk\.observer| - videos\.adhocmusic\.com| - tube\.rfc1149\.net| - peertube\.librelabucm\.org| - videos\.numericoop\.fr| - peertube\.koehn\.com| - peertube\.anarchmusicall\.net| - tube\.kampftoast\.de| - vid\.y-y\.li| - peertube\.xtenz\.xyz| - diode\.zone| - tube\.egf\.mn| - peertube\.nomagic\.uk| - visionon\.tv| - videos\.koumoul\.com| - video\.rastapuls\.com| - video\.mantlepro\.com| - video\.deadsuperhero\.com| - peertube\.musicstudio\.pro| - peertube\.we-keys\.fr| - artitube\.artifaille\.fr| - peertube\.ethernia\.net| - tube\.midov\.pl| - peertube\.fr| - watch\.snoot\.tube| - peertube\.donnadieu\.fr| - argos\.aquilenet\.fr| - tube\.nemsia\.org| - tube\.bruniau\.net| - videos\.darckoune\.moe| - tube\.traydent\.info| - dev\.videos\.lecygnenoir\.info| - peertube\.nayya\.org| - peertube\.live| - peertube\.mofgao\.space| - video\.lequerrec\.eu| - peertube\.amicale\.net| - aperi\.tube| - tube\.ac-lyon\.fr| - video\.lw1\.at| - www\.yiny\.org| - videos\.pofilo\.fr| - tube\.lou\.lt| - choob\.h\.etbus\.ch| - tube\.hoga\.fr| - peertube\.heberge\.fr| - video\.obermui\.de| - videos\.cloudfrancois\.fr| - betamax\.video| - video\.typica\.us| - tube\.piweb\.be| - video\.blender\.org| - peertube\.cat| - tube\.kdy\.ch| - pe\.ertu\.be| - peertube\.social| - videos\.lescommuns\.org| - tv\.datamol\.org| - videonaute\.fr| - dialup\.express| - peertube\.nogafa\.org| - megatube\.lilomoino\.fr| - peertube\.tamanoir\.foucry\.net| - peertube\.devosi\.org| - peertube\.1312\.media| - tube\.bootlicker\.party| - skeptikon\.fr| - video\.blueline\.mg| - tube\.homecomputing\.fr| - tube\.ouahpiti\.info| - video\.tedomum\.net| - video\.g3l\.org| - fontube\.fr| - peertube\.gaialabs\.ch| - tube\.kher\.nl| - peertube\.qtg\.fr| - video\.migennes\.net| - tube\.p2p\.legal| - troll\.tv| - videos\.iut-orsay\.fr| - peertube\.solidev\.net| - videos\.cemea\.org| - video\.passageenseine\.fr| - videos\.festivalparminous\.org| - peertube\.touhoppai\.moe| - sikke\.fi| - peer\.hostux\.social| - share\.tube| - peertube\.walkingmountains\.fr| - videos\.benpro\.fr| - peertube\.parleur\.net| - peertube\.heraut\.eu| - tube\.aquilenet\.fr| - peertube\.gegeweb\.eu| - framatube\.org| - thinkerview\.video| - tube\.conferences-gesticulees\.net| - peertube\.datagueule\.tv| - video\.lqdn\.fr| - tube\.mochi\.academy| - media\.zat\.im| - video\.colibris-outilslibres\.org| - tube\.svnet\.fr| - peertube\.video| - peertube3\.cpy\.re| - peertube2\.cpy\.re| - videos\.tcit\.fr| - peertube\.cpy\.re| - canard\.tube - )''' - _UUID_RE = 
r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' - _API_BASE = 'https://%s/api/v1/videos/%s/%s' - _VALID_URL = r'''(?x) - (?: - peertube:(?P<host>[^:]+):| - https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ - ) - (?P<id>%s) - ''' % (_INSTANCES_RE, _UUID_RE) - _TESTS = [{ - 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', - 'md5': '9bed8c0137913e17b86334e5885aacff', - 'info_dict': { - 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', - 'ext': 'mp4', - 'title': 'What is PeerTube?', - 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', - 'thumbnail': r're:https?://.*\.(?:jpg|png)', - 'timestamp': 1538391166, - 'upload_date': '20181001', - 'uploader': 'Framasoft', - 'uploader_id': '3', - 'uploader_url': 'https://framatube.org/accounts/framasoft', - 'channel': 'Les vidéos de Framasoft', - 'channel_id': '2', - 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', - 'language': 'en', - 'license': 'Attribution - Share Alike', - 'duration': 113, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'tags': ['framasoft', 'peertube'], - 'categories': ['Science & Technology'], - } - }, { - # Issue #26002 - 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', - 'info_dict': { - 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc', - 'ext': 'mp4', - 'title': 'Dot matrix printer shell demo', - 'uploader_id': '3', - 'timestamp': 1587401293, - 'upload_date': '20200420', - 'uploader': 'Drew DeVault', - } - }, { - 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', - 'only_matching': True, - }, { - # nsfw - 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', - 'only_matching': True, - }, { - 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', - 'only_matching': True, - }, { - 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', - 'only_matching': True, - }, { - 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', - 'only_matching': True, - }] - - @staticmethod - def _extract_peertube_url(webpage, source_url): - mobj = re.match( - r'https?://(?P<host>[^/]+)/videos/(?:watch|embed)/(?P<id>%s)' - % PeerTubeIE._UUID_RE, source_url) - if mobj and any(p in webpage for p in ( - '<title>PeerTube<', - 'There will be other non JS-based clients to access PeerTube', - '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): - return 'peertube:%s:%s' % mobj.group('host', 'id') - - @staticmethod - def _extract_urls(webpage, source_url): - entries = re.findall( - r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' - % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) - if not entries: - peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) - if peertube_url: - entries = [peertube_url] - return entries - - def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): - return self._download_json( - self._API_BASE % (host, video_id, path), video_id, - note=note, errnote=errnote, fatal=fatal) - - def _get_subtitles(self, host, video_id): - captions = self._call_api( - host, video_id, 'captions', note='Downloading captions JSON', - fatal=False) - if not isinstance(captions, dict): - return - data = captions.get('data') - if not isinstance(data, list): - return - subtitles = {} - for e in data: - language_id = try_get(e, lambda x: 
x['language']['id'], compat_str)
-            caption_url = urljoin('https://%s' % host, e.get('captionPath'))
-            if not caption_url:
-                continue
-            subtitles.setdefault(language_id or 'en', []).append({
-                'url': caption_url,
-            })
-        return subtitles
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        host = mobj.group('host') or mobj.group('host_2')
-        video_id = mobj.group('id')
-
-        video = self._call_api(
-            host, video_id, '', note='Downloading video JSON')
-
-        title = video['name']
-
-        formats = []
-        files = video.get('files') or []
-        for playlist in (video.get('streamingPlaylists') or []):
-            if not isinstance(playlist, dict):
-                continue
-            playlist_files = playlist.get('files')
-            if not (playlist_files and isinstance(playlist_files, list)):
-                continue
-            files.extend(playlist_files)
-        for file_ in files:
-            if not isinstance(file_, dict):
-                continue
-            file_url = url_or_none(file_.get('fileUrl'))
-            if not file_url:
-                continue
-            file_size = int_or_none(file_.get('size'))
-            format_id = try_get(
-                file_, lambda x: x['resolution']['label'], compat_str)
-            f = parse_resolution(format_id)
-            f.update({
-                'url': file_url,
-                'format_id': format_id,
-                'filesize': file_size,
-            })
-            if format_id == '0p':
-                f['vcodec'] = 'none'
-            else:
-                f['fps'] = int_or_none(file_.get('fps'))
-            formats.append(f)
-        self._sort_formats(formats)
-
-        description = video.get('description')
-        # Guard against a missing description: len(None) would raise TypeError.
-        if description and len(description) >= 250:
-            # description is shortened
-            full_description = self._call_api(
-                host, video_id, 'description', note='Downloading description JSON',
-                fatal=False)
-
-            if isinstance(full_description, dict):
-                description = str_or_none(full_description.get('description')) or description
-
-        subtitles = self.extract_subtitles(host, video_id)
-
-        def data(section, field, type_):
-            return try_get(video, lambda x: x[section][field], type_)
-
-        def account_data(field, type_):
-            return data('account', field, type_)
-
-        def channel_data(field, type_):
-            return data('channel', field, type_)
-
-        category = data('category', 'label', compat_str)
-        categories = [category] if category else None
-
-        nsfw = video.get('nsfw')
-        # `nsfw is bool` compared the value against the type object itself and
-        # was always False; isinstance() is what is intended here.
-        if isinstance(nsfw, bool):
-            age_limit = 18 if nsfw else 0
-        else:
-            age_limit = None
-
-        webpage_url = 'https://%s/videos/watch/%s' % (host, video_id)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')),
-            'timestamp': unified_timestamp(video.get('publishedAt')),
-            'uploader': account_data('displayName', compat_str),
-            'uploader_id': str_or_none(account_data('id', int)),
-            'uploader_url': url_or_none(account_data('url', compat_str)),
-            'channel': channel_data('displayName', compat_str),
-            'channel_id': str_or_none(channel_data('id', int)),
-            'channel_url': url_or_none(channel_data('url', compat_str)),
-            'language': data('language', 'id', compat_str),
-            'license': data('licence', 'label', compat_str),
-            'duration': int_or_none(video.get('duration')),
-            'view_count': int_or_none(video.get('views')),
-            'like_count': int_or_none(video.get('likes')),
-            'dislike_count': int_or_none(video.get('dislikes')),
-            'age_limit': age_limit,
-            'tags': try_get(video, lambda x: x['tags'], list),
-            'categories': categories,
-            'formats': formats,
-            'subtitles': subtitles,
-            'webpage_url': webpage_url,
-        }
diff --git a/youtube_dl/extractor/performgroup.py b/youtube_dl/extractor/performgroup.py
deleted file mode 100644
index 26942bfb3..000000000
--- a/youtube_dl/extractor/performgroup.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from 
__future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class PerformGroupIE(InfoExtractor): - _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})' - _TESTS = [{ - # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html - 'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', - 'md5': '259cb03d142e2e52471e8837ecacb29f', - 'info_dict': { - 'id': 'xgrwobuzumes1lwjxtcdpwgxd', - 'ext': 'mp4', - 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', - 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', - 'timestamp': 1511533477, - 'upload_date': '20171124', - } - }] - - def _call_api(self, service, auth_token, content_id, referer_url): - return self._download_json( - 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), - content_id, headers={ - 'Referer': referer_url, - 'Origin': 'http://player.performgroup.com', - }, query={ - '_fmt': 'json', - }) - - def _real_extract(self, url): - player_id, auth_token = re.search(self._VALID_URL, url).groups() - bootstrap = self._call_api('bootstrap', auth_token, player_id, url) - video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0] - video_id = video['uuid'] - vod = self._call_api('vod', auth_token, video_id, url) - media = vod['videos']['video'][0]['media'] - - formats = [] - hls_url = media.get('hls', {}).get('url') - if hls_url: - formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - - hds_url = media.get('hds', {}).get('url') - if hds_url: - formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)) - - for c in media.get('content', []): - c_url = c.get('url') - if not c_url: - continue - tbr = int_or_none(c.get('bitrate'), 1000) - format_id = 'http' - if tbr: - format_id += '-%d' % tbr - formats.append({ - 'format_id': format_id, - 'url': c_url, - 'tbr': tbr, - 'width': int_or_none(c.get('width')), - 'height': int_or_none(c.get('height')), - 'filesize': int_or_none(c.get('fileSize')), - 'vcodec': c.get('type'), - 'fps': int_or_none(c.get('videoFrameRate')), - 'vbr': int_or_none(c.get('videoRate'), 1000), - 'abr': int_or_none(c.get('audioRate'), 1000), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video['title'], - 'description': video.get('description'), - 'thumbnail': video.get('poster'), - 'duration': int_or_none(video.get('duration')), - 'timestamp': int_or_none(video.get('publishedTime'), 1000), - 'formats': formats, - } diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py deleted file mode 100644 index 03da64b11..000000000 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - try_get, - urljoin, -) - - -class PhilharmonieDeParisIE(InfoExtractor): - IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'''(?x) - https?:// - (?: - live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| - pad\.philharmoniedeparis\.fr/doc/CIMU/ - ) - (?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', - 'md5': 
'a0a4b195f544645073631cbec166a2c2',
-        'info_dict': {
-            'id': '1086697',
-            'ext': 'mp4',
-            'title': 'Jazz à la Villette : Knower',
-        },
-    }, {
-        'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html',
-        'info_dict': {
-            'id': '1032066',
-            'title': 'md5:0a031b81807b3593cffa3c9a87a167a0',
-        },
-        'playlist_mincount': 2,
-    }, {
-        'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html',
-        'only_matching': True,
-    }, {
-        'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr',
-        'only_matching': True,
-    }, {
-        'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR',
-        'only_matching': True,
-    }, {
-        'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR',
-        'only_matching': True,
-    }]
-    _LIVE_URL = 'https://live.philharmoniedeparis.fr'
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        config = self._download_json(
-            '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={
-                'id': video_id,
-                'lang': 'fr-FR',
-            })
-
-        def extract_entry(source):
-            if not isinstance(source, dict):
-                return
-            title = source.get('title')
-            if not title:
-                return
-            files = source.get('files')
-            if not isinstance(files, dict):
-                return
-            format_urls = set()
-            formats = []
-            for format_id in ('mobile', 'desktop'):
-                format_url = try_get(
-                    files, lambda x: x[format_id]['file'], compat_str)
-                if not format_url or format_url in format_urls:
-                    continue
-                format_urls.add(format_url)
-                m3u8_url = urljoin(self._LIVE_URL, format_url)
-                formats.extend(self._extract_m3u8_formats(
-                    m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
-            if not formats:
-                return
-            self._sort_formats(formats)
-            return {
-                'title': title,
-                'formats': formats,
-            }
-
-        thumbnail = urljoin(self._LIVE_URL, config.get('image'))
-
-        info = extract_entry(config)
-        if info:
-            info.update({
-                'id': video_id,
-                'thumbnail': thumbnail,
-            })
-            return info
-
-        entries = []
-        for num, chapter in enumerate(config['chapters'], start=1):
-            entry = extract_entry(chapter)
-            # extract_entry() returns None for chapters without a title or
-            # playable files; skip those instead of crashing on entry['id'].
-            if not entry:
-                continue
-            entry['id'] = '%s-%d' % (video_id, num)
-            entries.append(entry)
-
-        return self.playlist_result(entries, video_id, config.get('title'))
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
deleted file mode 100644
index 6c8bbe1d9..000000000
--- a/youtube_dl/extractor/photobucket.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
-
-
-class PhotobucketIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
-    _TEST = {
-        'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
-        'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
-        'info_dict': {
-            'id': 'zpsc0c3b9fa',
-            'ext': 'mp4',
-            'timestamp': 1367669341,
-            'upload_date': '20130504',
-            'uploader': 'rachaneronas',
-            'title': 'Tired of Link Building? 
Try BacklinkMyDomain.com!', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - video_extension = mobj.group('ext') - - webpage = self._download_webpage(url, video_id) - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', - webpage, 'info json') - info = json.loads(info_json) - url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) - return { - 'id': video_id, - 'url': url, - 'uploader': info['username'], - 'timestamp': info['creationDate'], - 'title': info['title'], - 'ext': video_extension, - 'thumbnail': info['thumbUrl'], - } diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py deleted file mode 100644 index ecf56ff8f..000000000 --- a/youtube_dl/extractor/piksel.py +++ /dev/null @@ -1,187 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - dict_get, - ExtractorError, - int_or_none, - parse_iso8601, - try_get, - unescapeHTML, -) - - -class PikselIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?: - (?: - player\. - (?: - olympusattelecom| - vibebyvista - )| - (?:api|player)\.multicastmedia| - (?:api-ovp|player)\.piksel - )\.com| - (?: - mz-edge\.stream\.co| - movie-s\.nhk\.or - )\.jp| - vidego\.baltimorecity\.gov - )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' - _TESTS = [ - { - 'url': 'http://player.piksel.com/v/ums2867l', - 'md5': '34e34c8d89dc2559976a6079db531e85', - 'info_dict': { - 'id': 'ums2867l', - 'ext': 'mp4', - 'title': 'GX-005 with Caption', - 'timestamp': 1481335659, - 'upload_date': '20161210' - } - }, - { - # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al - 'url': 'https://player.piksel.com/v/v80kqp41', - 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', - 'info_dict': { - 'id': 'v80kqp41', - 'ext': 'mp4', - 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', - 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. 
Robart presiding.', - 'timestamp': 1486171129, - 'upload_date': '20170204' - } - }, - { - # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ - 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', - 'only_matching': True, - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', - webpage) - if mobj: - return mobj.group('url') - - def _call_api(self, app_token, resource, display_id, query, fatal=True): - response = (self._download_json( - 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), - display_id, query=query, fatal=fatal) or {}).get('response') - failure = try_get(response, lambda x: x['failure']['reason']) - if failure: - if fatal: - raise ExtractorError(failure, expected=True) - self.report_warning(failure) - return response - - def _real_extract(self, url): - ref_id, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - app_token = self._search_regex([ - r'clientAPI\s*:\s*"([^"]+)"', - r'data-de-api-key\s*=\s*"([^"]+)"' - ], webpage, 'app token') - query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} - program = self._call_api( - app_token, 'program', display_id, query)['WsProgramResponse']['program'] - video_id = program['uuid'] - video_data = program['asset'] - title = video_data['title'] - asset_type = dict_get(video_data, ['assetType', 'asset_type']) - - formats = [] - - def process_asset_file(asset_file): - if not asset_file: - return - # TODO: extract rtmp formats - http_url = asset_file.get('http_url') - if not http_url: - return - tbr = None - vbr = int_or_none(asset_file.get('videoBitrate'), 1024) - abr = int_or_none(asset_file.get('audioBitrate'), 1024) - if asset_type == 'video': - tbr = vbr + abr - elif asset_type == 'audio': - tbr = abr - - format_id = ['http'] - if tbr: - format_id.append(compat_str(tbr)) - - formats.append({ - 'format_id': '-'.join(format_id), - 'url': unescapeHTML(http_url), - 'vbr': vbr, - 'abr': abr, - 'width': int_or_none(asset_file.get('videoWidth')), - 'height': int_or_none(asset_file.get('videoHeight')), - 'filesize': int_or_none(asset_file.get('filesize')), - 'tbr': tbr, - }) - - def process_asset_files(asset_files): - for asset_file in (asset_files or []): - process_asset_file(asset_file) - - process_asset_files(video_data.get('assetFiles')) - process_asset_file(video_data.get('referenceFile')) - if not formats: - asset_id = video_data.get('assetid') or program.get('assetid') - if asset_id: - process_asset_files(try_get(self._call_api( - app_token, 'asset_file', display_id, { - 'assetid': asset_id, - }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) - - m3u8_url = dict_get(video_data, [ - 'm3u8iPadURL', - 'ipadM3u8Url', - 'm3u8AndroidURL', - 'm3u8iPhoneURL', - 'iphoneM3u8Url']) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) - if smil_url: - transform_source = None - if ref_id == 'nhkworld': - # TODO: figure out if this is something to be fixed in urljoin, - # _parse_smil_formats or keep it here - transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') - formats.extend(self._extract_smil_formats( - re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, - 
transform_source=transform_source, fatal=False)) - - self._sort_formats(formats) - - subtitles = {} - for caption in video_data.get('captions', []): - caption_url = caption.get('url') - if caption_url: - subtitles.setdefault(caption.get('locale', 'en'), []).append({ - 'url': caption_url}) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnailUrl'), - 'timestamp': parse_iso8601(video_data.get('dateadd')), - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py deleted file mode 100644 index 42528d746..000000000 --- a/youtube_dl/extractor/pinterest.py +++ /dev/null @@ -1,203 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - try_get, - unified_timestamp, - url_or_none, -) - - -class PinterestBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' - - def _call_api(self, resource, video_id, options): - return self._download_json( - 'https://www.pinterest.com/resource/%sResource/get/' % resource, - video_id, 'Download %s JSON metadata' % resource, query={ - 'data': json.dumps({'options': options}) - })['resource_response'] - - def _extract_video(self, data, extract_formats=True): - video_id = data['id'] - - title = (data.get('title') or data.get('grid_title') or video_id).strip() - - urls = [] - formats = [] - duration = None - if extract_formats: - for format_id, format_dict in data['videos']['video_list'].items(): - if not isinstance(format_dict, dict): - continue - format_url = url_or_none(format_dict.get('url')) - if not format_url or format_url in urls: - continue - urls.append(format_url) - duration = float_or_none(format_dict.get('duration'), scale=1000) - ext = determine_ext(format_url) - if 'hls' in format_id.lower() or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'width': int_or_none(format_dict.get('width')), - 'height': int_or_none(format_dict.get('height')), - 'duration': duration, - }) - self._sort_formats( - formats, field_preference=('height', 'width', 'tbr', 'format_id')) - - description = data.get('description') or data.get('description_html') or data.get('seo_description') - timestamp = unified_timestamp(data.get('created_at')) - - def _u(field): - return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) - - uploader = _u('full_name') - uploader_id = _u('id') - - repost_count = int_or_none(data.get('repin_count')) - comment_count = int_or_none(data.get('comment_count')) - categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) - tags = data.get('hashtags') - - thumbnails = [] - images = data.get('images') - if isinstance(images, dict): - for thumbnail_id, thumbnail in images.items(): - if not isinstance(thumbnail, dict): - continue - thumbnail_url = url_or_none(thumbnail.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': 
int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'repost_count': repost_count, - 'comment_count': comment_count, - 'categories': categories, - 'tags': tags, - 'formats': formats, - 'extractor_key': PinterestIE.ie_key(), - } - - -class PinterestIE(PinterestBaseIE): - _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.pinterest.com/pin/664281013778109217/', - 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', - 'info_dict': { - 'id': '664281013778109217', - 'ext': 'mp4', - 'title': 'Origami', - 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', - 'duration': 57.7, - 'timestamp': 1593073622, - 'upload_date': '20200625', - 'uploader': 'Love origami -I am Dafei', - 'uploader_id': '586523688879454212', - 'repost_count': 50, - 'comment_count': 0, - 'categories': list, - 'tags': list, - }, - }, { - 'url': 'https://co.pinterest.com/pin/824721750502199491/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._call_api( - 'Pin', video_id, { - 'field_set_key': 'unauth_react_main_pin', - 'id': video_id, - })['data'] - return self._extract_video(data) - - -class PinterestCollectionIE(PinterestBaseIE): - _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', - 'info_dict': { - 'id': '585890301462791043', - 'title': 'cool diys', - }, - 'playlist_count': 8, - }, { - 'url': 'https://www.pinterest.ca/fudohub/videos/', - 'info_dict': { - 'id': '682858430939307450', - 'title': 'VIDEOS', - }, - 'playlist_mincount': 365, - 'skip': 'Test with extract_formats=False', - }] - - @classmethod - def suitable(cls, url): - return False if PinterestIE.suitable(url) else super( - PinterestCollectionIE, cls).suitable(url) - - def _real_extract(self, url): - username, slug = re.match(self._VALID_URL, url).groups() - board = self._call_api( - 'Board', slug, { - 'slug': slug, - 'username': username - })['data'] - board_id = board['id'] - options = { - 'board_id': board_id, - 'page_size': 250, - } - bookmark = None - entries = [] - while True: - if bookmark: - options['bookmarks'] = [bookmark] - board_feed = self._call_api('BoardFeed', board_id, options) - for item in (board_feed.get('data') or []): - if not isinstance(item, dict) or item.get('type') != 'pin': - continue - video_id = item.get('id') - if video_id: - # Some pins may not be available anonymously via pin URL - # video = self._extract_video(item, extract_formats=False) - # video.update({ - # '_type': 'url_transparent', - # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, - # }) - # entries.append(video) - entries.append(self._extract_video(item)) - bookmark = board_feed.get('bookmark') - if not bookmark: - break - return self.playlist_result( - entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py deleted file mode 100644 index e86c65396..000000000 --- a/youtube_dl/extractor/pladform.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - determine_ext, - 
ExtractorError, - int_or_none, - xpath_text, - qualities, -) - - -class PladformIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - out\.pladform\.ru/player| - static\.pladform\.ru/player\.swf - ) - \?.*\bvideoid=| - video\.pladform\.ru/catalog/video/videoid/ - ) - (?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', - 'md5': '53362fac3a27352da20fa2803cc5cd6f', - 'info_dict': { - 'id': '3777899', - 'ext': 'mp4', - 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', - 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 3190, - }, - }, { - 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', - 'only_matching': True, - }, { - 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - pl = qs.get('pl', ['1'])[0] - - video = self._download_xml( - 'http://out.pladform.ru/getVideo', video_id, query={ - 'pl': pl, - 'videoid': video_id, - }) - - def fail(text): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, text), - expected=True) - - if video.tag == 'error': - fail(video.text) - - quality = qualities(('ld', 'sd', 'hd')) - - formats = [] - for src in video.findall('./src'): - if src is None: - continue - format_url = src.text - if not format_url: - continue - if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': src.text, - 'format_id': src.get('quality'), - 'quality': quality(src.get('quality')), - }) - - if not formats: - error = xpath_text(video, './cap', 'error', default=None) - if error: - fail(error) - - self._sort_formats(formats) - - webpage = self._download_webpage( - 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, - video_id) - - title = self._og_search_title(webpage, fatal=False) or xpath_text( - video, './/title', 'title', fatal=True) - description = self._search_regex( - r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) or xpath_text( - video, './/cover', 'cover') - - duration = int_or_none(xpath_text(video, './/time', 'duration')) - age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py deleted file mode 100644 index e766ccca3..000000000 --- a/youtube_dl/extractor/playfm.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, -) - - -class PlayFMIE(InfoExtractor): - IE_NAME = 'play.fm' - _VALID_URL = 
r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' - - _TEST = { - 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', - 'md5': 'c505f8307825a245d0c7ad1850001f22', - 'info_dict': { - 'id': '71276', - 'ext': 'mp3', - 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', - 'description': '', - 'duration': 5627, - 'timestamp': 1406033781, - 'upload_date': '20140722', - 'uploader': 'Dan Drastic', - 'uploader_id': '71170', - 'view_count': int, - 'comment_count': int, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - slug = mobj.group('slug') - - recordings = self._download_json( - 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) - - error = recordings.get('error') - if isinstance(error, dict): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('message')), - expected=True) - - audio_url = recordings['audio'] - video_id = compat_str(recordings.get('id') or video_id) - title = recordings['title'] - description = recordings.get('description') - duration = int_or_none(recordings.get('recordingDuration')) - timestamp = parse_iso8601(recordings.get('created_at')) - uploader = recordings.get('page', {}).get('title') - uploader_id = compat_str(recordings.get('page', {}).get('id')) - view_count = int_or_none(recordings.get('playCount')) - comment_count = int_or_none(recordings.get('commentCount')) - categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] - - return { - 'id': video_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - } diff --git a/youtube_dl/extractor/playplustv.py b/youtube_dl/extractor/playplustv.py deleted file mode 100644 index 1e30ab23a..000000000 --- a/youtube_dl/extractor/playplustv.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - PUTRequest, -) - - -class PlayPlusTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' - _TEST = { - 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', - 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', - 'info_dict': { - 'id': 'db8d274a5163424e967f35a30ddafb8e', - 'ext': 'mp4', - 'title': 'Capítulo 179 - Final', - 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', - 'timestamp': 1529992740, - 'upload_date': '20180626', - }, - 'skip': 'Requires account credential', - } - _NETRC_MACHINE = 'playplustv' - _GEO_COUNTRIES = ['BR'] - _token = None - _profile_id = None - - def _call_api(self, resource, video_id=None, query=None): - return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ - 'Authorization': 'Bearer ' + self._token, - }, query=query) - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - self.raise_login_required() - - req = PUTRequest( - 'https://api.playplus.tv/api/web/login', json.dumps({ - 'email': email, - 'password': password, - }).encode(), { - 'Content-Type': 'application/json; 
charset=utf-8', - }) - - try: - self._token = self._download_json(req, None)['token'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - raise ExtractorError(self._parse_json( - e.cause.read(), None)['errorMessage'], expected=True) - raise - - self._profile = self._call_api('Profiles')['list'][0]['_id'] - - def _real_extract(self, url): - project_id, media_id = re.match(self._VALID_URL, url).groups() - media = self._call_api( - 'Media', media_id, { - 'profileId': self._profile, - 'projectId': project_id, - 'mediaId': media_id, - })['obj'] - title = media['title'] - - formats = [] - for f in media.get('files', []): - f_url = f.get('url') - if not f_url: - continue - file_info = f.get('fileInfo') or {} - formats.append({ - 'url': f_url, - 'width': int_or_none(file_info.get('width')), - 'height': int_or_none(file_info.get('height')), - }) - self._sort_formats(formats) - - thumbnails = [] - for thumb in media.get('thumbs', []): - thumb_url = thumb.get('url') - if not thumb_url: - continue - thumbnails.append({ - 'url': thumb_url, - 'width': int_or_none(thumb.get('width')), - 'height': int_or_none(thumb.get('height')), - }) - - return { - 'id': media_id, - 'title': title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': clean_html(media.get('description')) or media.get('shortDescription'), - 'timestamp': int_or_none(media.get('publishDate'), 1000), - 'view_count': int_or_none(media.get('numberOfViews')), - 'comment_count': int_or_none(media.get('numberOfComments')), - 'tags': media.get('tags'), - } diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py deleted file mode 100644 index 4c5f57919..000000000 --- a/youtube_dl/extractor/playtvak.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, - qualities, -) - - -class PlaytvakIE(InfoExtractor): - IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' - _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' - _TESTS = [{ - 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', - 'md5': '4525ae312c324b4be2f4603cc78ceb4a', - 'info_dict': { - 'id': 'A150730_150323_hodinovy-manzel_kuko', - 'ext': 'mp4', - 'title': 'Vyžeňte vosy a sršně ze zahrady', - 'description': 'md5:4436e61b7df227a093778efb7e373571', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', - 'duration': 279, - 'timestamp': 1438732860, - 'upload_date': '20150805', - 'is_live': False, - } - }, { # live video test - 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', - 'info_dict': { - 'id': 'A150624_164934_planespotting_cat', - 'ext': 'flv', - 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', - 'is_live': True, - }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, - }, { # another live stream, this one without Misc.videoFLV - 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', - 'info_dict': { - 'id': 'A151218_145728_hlavni-nadrazi_plap', - 'ext': 'flv', - 'title': 're:^Hlavní nádraží 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, - }, { # idnes.cz - 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', - 'md5': '819832ba33cd7016e58a6658577fe289', - 'info_dict': { - 'id': 'A150809_104116_domaci_pku', - 'ext': 'mp4', - 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', - 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', - 'duration': 39, - 'timestamp': 1438969140, - 'upload_date': '20150807', - 'is_live': False, - } - }, { # lidovky.cz - 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', - 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', - 'info_dict': { - 'id': 'A150808_214044_ln-video_ELE', - 'ext': 'mp4', - 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', - 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', - 'timestamp': 1439052180, - 'upload_date': '20150808', - 'is_live': False, - } - }, { # metro.cz - 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', - 'md5': '84fc1deedcac37b7d4a6ccae7c716668', - 'info_dict': { - 'id': 'A141111_173251_metro-extra_row', - 'ext': 'mp4', - 'title': 'Recesisté udělali z billboardu kolotoč', - 'description': 'md5:7369926049588c3989a66c9c1a043c4c', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', - 'timestamp': 1415725500, - 'upload_date': '20141111', - 'is_live': False, - } - }, { - 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - info_url = self._html_search_regex( - r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') - - parsed_url = compat_urlparse.urlparse(info_url) - - qs = compat_urlparse.parse_qs(parsed_url.query) - qs.update({ - 'reklama': ['0'], - 'type': ['js'], - }) - - info_url = compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - json_info = self._download_json( - info_url, video_id, - transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) - - item = None - for i in json_info['items']: - if i.get('type') == 'video' or i.get('type') == 'stream': - item = i - break - if not item: - raise ExtractorError('No suitable stream found') - - quality = qualities(('low', 'middle', 'high')) - - formats = [] - for fmt in item['video']: - video_url = fmt.get('file') - if not video_url: - continue - - format_ = fmt['format'] - format_id = '%s_%s' % (format_, fmt['quality']) - preference = None - - if format_ in ('mp4', 'webm'): - ext = format_ - elif format_ == 'rtmp': - ext = 'flv' - elif format_ == 'apple': - ext = 'mp4' - # Some streams have mp3 audio which does not play - # well with ffmpeg filter aac_adtstoasc - preference = -1 - elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests - continue - else: # Other formats not supported yet - continue - - formats.append({ - 'url': video_url, - 'ext': ext, - 'format_id': format_id, - 'quality': quality(fmt.get('quality')), - 'preference': preference, - }) - self._sort_formats(formats) - - title = item['title'] - is_live = item['type'] == 'stream' - if 
is_live: - title = self._live_title(title) - description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description', default=None) - timestamp = None - duration = None - if not is_live: - duration = int_or_none(item.get('length')) - timestamp = item.get('published') - if timestamp: - timestamp = parse_iso8601(timestamp[:-5]) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': item.get('image'), - 'duration': duration, - 'timestamp': timestamp, - 'is_live': is_live, - 'formats': formats, - } diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py deleted file mode 100644 index 4d96a10a7..000000000 --- a/youtube_dl/extractor/playwire.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - dict_get, - float_or_none, -) - - -class PlaywireIE(InfoExtractor): - _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', - 'md5': 'e6398701e3595888125729eaa2329ed9', - 'info_dict': { - 'id': '3353705', - 'ext': 'mp4', - 'title': 'S04_RM_UCL_Rus', - 'thumbnail': r're:^https?://.*\.png$', - 'duration': 145.94, - }, - }, { - # m3u8 in f4m - 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', - 'info_dict': { - 'id': '4840492', - 'ext': 'mp4', - 'title': 'ITV EL SHOW FULL', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # Multiple resolutions while bitrates missing - 'url': 'http://cdn.playwire.com/11625/embed/85228.html', - 'only_matching': True, - }, { - 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', - 'only_matching': True, - }, { - 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') - - player = self._download_json( - 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), - video_id) - - title = player['settings']['title'] - duration = float_or_none(player.get('duration'), 1000) - - content = player['content'] - thumbnail = content.get('poster') - src = content['media']['f4m'] - - formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') - for a_format in formats: - if not dict_get(a_format, ['tbr', 'width', 'height']): - a_format['quality'] = 1 if '-hd.' 
in a_format['url'] else 0 - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py deleted file mode 100644 index 2d63855df..000000000 --- a/youtube_dl/extractor/pluralsight.py +++ /dev/null @@ -1,501 +0,0 @@ -from __future__ import unicode_literals - -import collections -import json -import os -import random -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - dict_get, - ExtractorError, - float_or_none, - int_or_none, - parse_duration, - qualities, - srt_subtitles_timecode, - try_get, - update_url_query, - urlencode_postdata, -) - - -class PluralsightBaseIE(InfoExtractor): - _API_BASE = 'https://app.pluralsight.com' - - _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE - _GRAPHQL_HEADERS = { - 'Content-Type': 'application/json;charset=UTF-8', - } - _GRAPHQL_COURSE_TMPL = ''' -query BootstrapPlayer { - rpc { - bootstrapPlayer { - profile { - firstName - lastName - email - username - userHandle - authed - isAuthed - plan - } - course(courseId: "%s") { - name - title - courseHasCaptions - translationLanguages { - code - name - } - supportsWideScreenVideoFormats - timestamp - modules { - name - title - duration - formattedDuration - author - authorized - clips { - authorized - clipId - duration - formattedDuration - id - index - moduleIndex - moduleTitle - name - title - watched - } - } - } - } - } -}''' - - def _download_course(self, course_id, url, display_id): - try: - return self._download_course_rpc(course_id, url, display_id) - except ExtractorError: - # Old API fallback - return self._download_json( - 'https://app.pluralsight.com/player/user/api/v1/player/payload', - display_id, data=urlencode_postdata({'courseId': course_id}), - headers={'Referer': url}) - - def _download_course_rpc(self, course_id, url, display_id): - response = self._download_json( - self._GRAPHQL_EP, display_id, data=json.dumps({ - 'query': self._GRAPHQL_COURSE_TMPL % course_id, - 'variables': {} - }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) - - course = try_get( - response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], - dict) - if course: - return course - - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, response['error']['message']), - expected=True) - - -class PluralsightIE(PluralsightBaseIE): - IE_NAME = 'pluralsight' - _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' 
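For reference, the GraphQL round trip that the removed `_download_course_rpc` above performs reduces to a plain JSON POST against the player endpoint. A minimal stand-alone sketch — `requests` is an assumption for illustration (the extractor itself went through youtube-dl's `_download_json` helper); the endpoint, headers and payload shape are taken from the code above:

```python
import json
import requests

GRAPHQL_EP = 'https://app.pluralsight.com/player/api/graphql'

def download_course_rpc(course_id, query_tmpl):
    # POST the BootstrapPlayer query with the course id substituted in,
    # exactly as _download_course_rpc does via InfoExtractor helpers
    response = requests.post(
        GRAPHQL_EP,
        data=json.dumps({'query': query_tmpl % course_id, 'variables': {}}),
        headers={'Content-Type': 'application/json;charset=UTF-8'},
    ).json()
    course = ((response.get('data') or {}).get('rpc') or {}) \
        .get('bootstrapPlayer', {}).get('course')
    if course:
        return course
    raise RuntimeError('pluralsight said: %s' % response['error']['message'])
```

The older `/player/user/api/v1/player/payload` endpoint was kept in `_download_course` only as a fallback for when this call raises.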
- _LOGIN_URL = 'https://app.pluralsight.com/id/' - - _NETRC_MACHINE = 'pluralsight' - - _TESTS = [{ - 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', - 'md5': '4d458cf5cf4c593788672419a8dd4cf8', - 'info_dict': { - 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', - 'ext': 'mp4', - 'title': 'Demo Monitoring', - 'duration': 338, - }, - 'skip': 'Requires pluralsight account credentials', - }, { - 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', - 'only_matching': True, - }, { - # available without pluralsight account - 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', - 'only_matching': True, - }, { - 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', - 'only_matching': True, - }] - - GRAPHQL_VIEWCLIP_TMPL = ''' -query viewClip { - viewClip(input: { - author: "%(author)s", - clipIndex: %(clipIndex)d, - courseName: "%(courseName)s", - includeCaptions: %(includeCaptions)s, - locale: "%(locale)s", - mediaType: "%(mediaType)s", - moduleName: "%(moduleName)s", - quality: "%(quality)s" - }) { - urls { - url - cdn - rank - source - }, - status - } -}''' - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'Username': username, - 'Password': password, - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - error = self._search_regex( - r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - if all(not re.search(p, response) for p in ( - r'__INITIAL_STATE__', r'["\']currentUser["\']', - # new layout? - r'>\s*Sign out\s*<')): - BLOCKED = 'Your account has been blocked due to suspicious activity' - if BLOCKED in response: - raise ExtractorError( - 'Unable to login: %s' % BLOCKED, expected=True) - MUST_AGREE = 'To continue using Pluralsight, you must agree to' - if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): - raise ExtractorError( - 'Unable to login: %s some documents. Go to pluralsight.com, ' - 'log in and agree with what Pluralsight requires.' 
- % MUST_AGREE, expected=True) - - raise ExtractorError('Unable to log in') - - def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): - captions = None - if clip_id: - captions = self._download_json( - '%s/transcript/api/v1/caption/json/%s/%s' - % (self._API_BASE, clip_id, lang), video_id, - 'Downloading captions JSON', 'Unable to download captions JSON', - fatal=False) - if not captions: - captions_post = { - 'a': author, - 'cn': int(clip_idx), - 'lc': lang, - 'm': name, - } - captions = self._download_json( - '%s/player/retrieve-captions' % self._API_BASE, video_id, - 'Downloading captions JSON', 'Unable to download captions JSON', - fatal=False, data=json.dumps(captions_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) - if captions: - return { - lang: [{ - 'ext': 'json', - 'data': json.dumps(captions), - }, { - 'ext': 'srt', - 'data': self._convert_subtitles(duration, captions), - }] - } - - @staticmethod - def _convert_subtitles(duration, subs): - srt = '' - TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') - TEXT_KEYS = ('text', 'Text') - for num, current in enumerate(subs): - current = subs[num] - start, text = ( - float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), - dict_get(current, TEXT_KEYS)) - if start is None or text is None: - continue - end = duration if num == len(subs) - 1 else float_or_none( - dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) - if end is None: - continue - srt += os.linesep.join( - ( - '%d' % num, - '%s --> %s' % ( - srt_subtitles_timecode(start), - srt_subtitles_timecode(end)), - text, - os.linesep, - )) - return srt - - def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - - author = qs.get('author', [None])[0] - name = qs.get('name', [None])[0] - clip_idx = qs.get('clip', [None])[0] - course_name = qs.get('course', [None])[0] - - if any(not f for f in (author, name, clip_idx, course_name,)): - raise ExtractorError('Invalid URL', expected=True) - - display_id = '%s-%s' % (name, clip_idx) - - course = self._download_course(course_name, url, display_id) - - collection = course['modules'] - - clip = None - - for module_ in collection: - if name in (module_.get('moduleName'), module_.get('name')): - for clip_ in module_.get('clips', []): - clip_index = clip_.get('clipIndex') - if clip_index is None: - clip_index = clip_.get('index') - if clip_index is None: - continue - if compat_str(clip_index) == clip_idx: - clip = clip_ - break - - if not clip: - raise ExtractorError('Unable to resolve clip') - - title = clip['title'] - clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] - - QUALITIES = { - 'low': {'width': 640, 'height': 480}, - 'medium': {'width': 848, 'height': 640}, - 'high': {'width': 1024, 'height': 768}, - 'high-widescreen': {'width': 1280, 'height': 720}, - } - - QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) - quality_key = qualities(QUALITIES_PREFERENCE) - - AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) - - ALLOWED_QUALITIES = ( - AllowedQuality('webm', ['high', ]), - AllowedQuality('mp4', ['low', 'medium', 'high', ]), - ) - - # Some courses also offer widescreen resolution for high quality (see - # https://github.com/ytdl-org/youtube-dl/issues/7766) - widescreen = course.get('supportsWideScreenVideoFormats') is True - best_quality = 'high-widescreen' if widescreen else 'high' - if widescreen: - for 
allowed_quality in ALLOWED_QUALITIES: - allowed_quality.qualities.append(best_quality) - - # In order to minimize the number of calls to ViewClip API and reduce - # the probability of being throttled or banned by Pluralsight we will request - # only single format until formats listing was explicitly requested. - if self._downloader.params.get('listformats', False): - allowed_qualities = ALLOWED_QUALITIES - else: - def guess_allowed_qualities(): - req_format = self._downloader.params.get('format') or 'best' - req_format_split = req_format.split('-', 1) - if len(req_format_split) > 1: - req_ext, req_quality = req_format_split - req_quality = '-'.join(req_quality.split('-')[:2]) - for allowed_quality in ALLOWED_QUALITIES: - if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: - return (AllowedQuality(req_ext, (req_quality, )), ) - req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' - return (AllowedQuality(req_ext, (best_quality, )), ) - allowed_qualities = guess_allowed_qualities() - - formats = [] - for ext, qualities_ in allowed_qualities: - for quality in qualities_: - f = QUALITIES[quality].copy() - clip_post = { - 'author': author, - 'includeCaptions': 'false', - 'clipIndex': int(clip_idx), - 'courseName': course_name, - 'locale': 'en', - 'moduleName': name, - 'mediaType': ext, - 'quality': '%dx%d' % (f['width'], f['height']), - } - format_id = '%s-%s' % (ext, quality) - - try: - viewclip = self._download_json( - self._GRAPHQL_EP, display_id, - 'Downloading %s viewclip graphql' % format_id, - data=json.dumps({ - 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, - 'variables': {} - }).encode('utf-8'), - headers=self._GRAPHQL_HEADERS)['data']['viewClip'] - except ExtractorError: - # Still works but most likely will go soon - viewclip = self._download_json( - '%s/video/clips/viewclip' % self._API_BASE, display_id, - 'Downloading %s viewclip JSON' % format_id, fatal=False, - data=json.dumps(clip_post).encode('utf-8'), - headers={'Content-Type': 'application/json;charset=utf-8'}) - - # Pluralsight tracks multiple sequential calls to ViewClip API and start - # to return 429 HTTP errors after some time (see - # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead - # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). - # To somewhat reduce the probability of these consequences - # we will sleep random amount of time before each call to ViewClip. - self._sleep( - random.randint(5, 10), display_id, - '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') - - if not viewclip: - continue - - clip_urls = viewclip.get('urls') - if not isinstance(clip_urls, list): - continue - - for clip_url_data in clip_urls: - clip_url = clip_url_data.get('url') - if not clip_url: - continue - cdn = clip_url_data.get('cdn') - clip_f = f.copy() - clip_f.update({ - 'url': clip_url, - 'ext': ext, - 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, - 'quality': quality_key(quality), - 'source_preference': int_or_none(clip_url_data.get('rank')), - }) - formats.append(clip_f) - - self._sort_formats(formats) - - duration = int_or_none( - clip.get('duration')) or parse_duration(clip.get('formattedDuration')) - - # TODO: other languages? 
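The caption handling invoked just below (`_get_subtitles` / `_convert_subtitles` above) boils down to pairing each cue's `displayTimeOffset` with the next cue's offset as its end time, falling back to the clip duration for the last cue. A minimal stand-alone sketch, assuming captions arrive as a list of `{'displayTimeOffset', 'text'}` dicts (the capitalised key variants and `os.linesep` handling of the original are omitted for brevity):

```python
def srt_timecode(seconds):
    # simplified stand-in for youtube_dl.utils.srt_subtitles_timecode
    return '%02d:%02d:%02d,%03d' % (
        seconds / 3600, (seconds % 3600) / 60,
        seconds % 60, (seconds % 1) * 1000)

def convert_subtitles(duration, subs):
    # subs: list of {'displayTimeOffset': float, 'text': str} dicts,
    # as returned by the caption JSON endpoints used above
    srt = []
    for num, cur in enumerate(subs):
        start, text = cur.get('displayTimeOffset'), cur.get('text')
        if start is None or text is None:
            continue
        # end of this cue = start of the next one; the last cue
        # runs to the end of the clip
        end = (duration if num == len(subs) - 1
               else subs[num + 1].get('displayTimeOffset'))
        if end is None:
            continue
        srt.append('%d\n%s --> %s\n%s\n' % (
            num, srt_timecode(start), srt_timecode(end), text))
    return '\n'.join(srt)
```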
- subtitles = self.extract_subtitles( - author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) - - return { - 'id': clip_id, - 'title': title, - 'duration': duration, - 'creator': author, - 'formats': formats, - 'subtitles': subtitles, - } - - -class PluralsightCourseIE(PluralsightBaseIE): - IE_NAME = 'pluralsight:course' - _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' - _TESTS = [{ - # Free course from Pluralsight Starter Subscription for Microsoft TechNet - # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz - 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', - 'info_dict': { - 'id': 'hosting-sql-server-windows-azure-iaas', - 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', - 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', - }, - 'playlist_count': 31, - }, { - # available without pluralsight account - 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', - 'only_matching': True, - }, { - 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', - 'only_matching': True, - }] - - def _real_extract(self, url): - course_id = self._match_id(url) - - # TODO: PSM cookie - - course = self._download_course(course_id, url, course_id) - - title = course['title'] - course_name = course['name'] - course_data = course['modules'] - description = course.get('description') or course.get('shortDescription') - - entries = [] - for num, module in enumerate(course_data, 1): - author = module.get('author') - module_name = module.get('name') - if not author or not module_name: - continue - for clip in module.get('clips', []): - clip_index = int_or_none(clip.get('index')) - if clip_index is None: - continue - clip_url = update_url_query( - '%s/player' % self._API_BASE, query={ - 'mode': 'live', - 'course': course_name, - 'author': author, - 'name': module_name, - 'clip': clip_index, - }) - entries.append({ - '_type': 'url_transparent', - 'url': clip_url, - 'ie_key': PluralsightIE.ie_key(), - 'chapter': module.get('title'), - 'chapter_number': num, - 'chapter_id': module.get('moduleRef'), - }) - - return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py deleted file mode 100644 index e782e3f1f..000000000 --- a/youtube_dl/extractor/podomatic.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class PodomaticIE(InfoExtractor): - IE_NAME = 'podomatic' - _VALID_URL = r'''(?x) - (?P<proto>https?):// - (?: - (?P<channel>[^.]+)\.podomatic\.com/entry| - (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes - )/ - (?P<id>[^/?#&]+) - ''' - - _TESTS = [{ - 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', - 'md5': '84bb855fcf3429e6bf72460e1eed782d', - 'info_dict': { - 'id': '2009-01-02T16_03_35-08_00', - 'ext': 'mp3', - 'uploader': 'Science Teaching Tips', - 'uploader_id': 'scienceteachingtips', - 'title': '64. 
When the Moon Hits Your Eye', - 'duration': 446, - } - }, { - 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', - 'md5': 'd2cf443931b6148e27638650e2638297', - 'info_dict': { - 'id': '2013-11-15T16_31_21-08_00', - 'ext': 'mp3', - 'uploader': 'Ostbahnhof / Techno Mix', - 'uploader_id': 'ostbahnhof', - 'title': 'Einunddreizig', - 'duration': 3799, - } - }, { - 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - channel = mobj.group('channel') or mobj.group('channel_2') - - json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' - + '?permalink=true&rtmp=0') % - (mobj.group('proto'), channel, video_id)) - data_json = self._download_webpage( - json_url, video_id, 'Downloading video info') - data = json.loads(data_json) - - video_url = data['downloadLink'] - if not video_url: - video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) - uploader = data['podcast'] - title = data['title'] - thumbnail = data['imageLocation'] - duration = int_or_none(data.get('length'), 1000) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'uploader': uploader, - 'uploader_id': channel, - 'thumbnail': thumbnail, - 'duration': duration, - } diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py deleted file mode 100644 index 80222d428..000000000 --- a/youtube_dl/extractor/pokemon.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - int_or_none, -) - - -class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' - _TESTS = [{ - 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', - 'md5': '2fe8eaec69768b25ef898cda9c43062e', - 'info_dict': { - 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', - 'ext': 'mp4', - 'title': 'The Ol’ Raise and Switch!', - 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - }, - 'add_id': ['LimelightMedia'], - }, { - # no data-video-title - 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', - 'info_dict': { - 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', - 'ext': 'mp4', - 'title': "Pokémon : L'ascension de Darkrai", - 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', - }, - 'add_id': ['LimelightMedia'], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id or display_id) - video_data = extract_attributes(self._search_regex( - r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), - webpage, 'video data element')) - video_id = video_data['data-video-id'] - title = video_data.get('data-video-title') or self._html_search_meta( - 'pkm-title', webpage, ' title', default=None) or 
self._search_regex( - r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'limelight:media:%s' % video_id, - 'title': title, - 'description': video_data.get('data-video-summary'), - 'thumbnail': video_data.get('data-video-poster'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('data-video-season')), - 'episode': title, - 'episode_number': int_or_none(video_data.get('data-video-episode')), - 'ie_key': 'LimelightMedia', - } diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py deleted file mode 100644 index 978d6f813..000000000 --- a/youtube_dl/extractor/polskieradio.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, - compat_urlparse -) -from ..utils import ( - extract_attributes, - int_or_none, - strip_or_none, - unified_timestamp, -) - - -class PolskieRadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', - 'info_dict': { - 'id': '1587943', - 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', - 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', - }, - 'playlist': [{ - 'md5': '2984ee6ce9046d91fc233bc1a864a09a', - 'info_dict': { - 'id': '1540576', - 'ext': 'mp3', - 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', - 'timestamp': 1456594200, - 'upload_date': '20160227', - 'duration': 2364, - 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' - }, - }], - }, { - 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', - 'info_dict': { - 'id': '1635803', - 'title': 'Euro 2016: nie ma miejsca na błąd. 
Polacy grają ze Szwajcarią o ćwierćfinał', - 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', - }, - 'playlist_mincount': 12, - }, { - 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', - 'only_matching': True, - }, { - 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', - 'only_matching': True, - }, { - # with mp4 video - 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - content = self._search_regex( - r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', - webpage, 'content') - - timestamp = unified_timestamp(self._html_search_regex( - r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', - webpage, 'timestamp', fatal=False)) - - thumbnail_url = self._og_search_thumbnail(webpage) - - entries = [] - - media_urls = set() - - for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): - media = self._parse_json(data_media, playlist_id, fatal=False) - if not media.get('file') or not media.get('desc'): - continue - media_url = self._proto_relative_url(media['file'], 'http:') - if media_url in media_urls: - continue - media_urls.add(media_url) - entries.append({ - 'id': compat_str(media['id']), - 'url': media_url, - 'title': compat_urllib_parse_unquote(media['desc']), - 'duration': int_or_none(media.get('length')), - 'vcodec': 'none' if media.get('provider') == 'audio' else None, - 'timestamp': timestamp, - 'thumbnail': thumbnail_url - }) - - title = self._og_search_title(webpage).strip() - description = strip_or_none(self._og_search_description(webpage)) - - return self.playlist_result(entries, playlist_id, title, description) - - -class PolskieRadioCategoryIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', - 'info_dict': { - 'id': '5102', - 'title': 'HISTORIA ŻYWA', - }, - 'playlist_mincount': 38, - }, { - 'url': 'http://www.polskieradio.pl/7/4807', - 'info_dict': { - 'id': '4807', - 'title': 'Vademecum 1050. 
rocznicy Chrztu Polski' - }, - 'playlist_mincount': 5 - }, { - 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', - 'only_matching': True - }, { - 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', - 'info_dict': { - 'id': '4143', - 'title': 'Kierunek Kraków', - }, - 'playlist_mincount': 61 - }, { - 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', - 'info_dict': { - 'id': '214', - 'title': 'Muzyka', - }, - 'playlist_mincount': 61 - }, { - 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', - 'only_matching': True, - }, { - 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) - - def _entries(self, url, page, category_id): - content = page - for page_num in itertools.count(2): - for a_entry, entry_id in re.findall( - r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', - content): - entry = extract_attributes(a_entry) - href = entry.get('href') - if not href: - continue - yield self.url_result( - compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), - entry_id, entry.get('title')) - mobj = re.search( - r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', - content) - if not mobj: - break - next_url = compat_urlparse.urljoin(url, mobj.group('url')) - content = self._download_webpage( - next_url, category_id, 'Downloading page %s' % page_num) - - def _real_extract(self, url): - category_id = self._match_id(url) - webpage = self._download_webpage(url, category_id) - title = self._html_search_regex( - r'<title>([^<]+) - [^<]+ - [^<]+</title>', - webpage, 'title', fatal=False) - return self.playlist_result( - self._entries(url, webpage, category_id), - category_id, title) diff --git a/youtube_dl/extractor/popcorntimes.py b/youtube_dl/extractor/popcorntimes.py deleted file mode 100644 index 7bf7f9858..000000000 --- a/youtube_dl/extractor/popcorntimes.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_chr, -) -from ..utils import int_or_none - - -class PopcorntimesIE(InfoExtractor): - _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' - _TEST = { - 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', - 'md5': '93f210991ad94ba8c3485950a2453257', - 'info_dict': { - 'id': 'A1XCFvz', - 'display_id': 'haensel-und-gretel-opera-fantasy', - 'ext': 'mp4', - 'title': 'Hänsel und Gretel', - 'description': 'md5:1b8146791726342e7b22ce8125cf6945', - 'thumbnail': r're:^https?://.*\.jpg$', - 'creator': 'John Paul', - 'release_date': '19541009', - 'duration': 4260, - 'tbr': 5380, - 'width': 720, - 'height': 540, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') - - webpage = self._download_webpage(url, display_id) - - title = self._search_regex( - r'<h1>([^<]+)', webpage, 'title', - default=None) or self._html_search_meta( - 'ya:ovs:original_name', webpage, 'title', fatal=True) - - loc = self._search_regex( - r'PCTMLOC\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'loc', - group='value') - - loc_b64 = '' - for c in loc: - c_ord = ord(c) - if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= 
ord('Z'): - upper = ord('Z') if c_ord <= ord('Z') else ord('z') - c_ord += 13 - if upper < c_ord: - c_ord -= 26 - loc_b64 += compat_chr(c_ord) - - video_url = compat_b64decode(loc_b64).decode('utf-8') - - description = self._html_search_regex( - r'(?s)<div[^>]+class=["\']pt-movie-desc[^>]+>(.+?)</div>', webpage, - 'description', fatal=False) - - thumbnail = self._search_regex( - r'<img[^>]+class=["\']video-preview[^>]+\bsrc=(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'thumbnail', default=None, - group='value') or self._og_search_thumbnail(webpage) - - creator = self._html_search_meta( - 'video:director', webpage, 'creator', default=None) - - release_date = self._html_search_meta( - 'video:release_date', webpage, default=None) - if release_date: - release_date = release_date.replace('-', '') - - def int_meta(name): - return int_or_none(self._html_search_meta( - name, webpage, default=None)) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'creator': creator, - 'release_date': release_date, - 'duration': int_meta('video:duration'), - 'tbr': int_meta('ya:ovs:bitrate'), - 'width': int_meta('og:video:width'), - 'height': int_meta('og:video:height'), - 'http_headers': { - 'Referer': url, - }, - } diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py deleted file mode 100644 index 9f834fb6c..000000000 --- a/youtube_dl/extractor/popcorntv.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - int_or_none, - unified_timestamp, -) - - -class PopcornTVIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P<display_id>[^/]+)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', - 'md5': '47d65a48d147caf692ab8562fe630b45', - 'info_dict': { - 'id': '9183', - 'display_id': 'food-wars-battaglie-culinarie-episodio-01', - 'ext': 'mp4', - 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', - 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1497610857, - 'upload_date': '20170616', - 'duration': 1440, - 'view_count': int, - }, - }, { - 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id, video_id = mobj.group('display_id', 'id') - - webpage = self._download_webpage(url, display_id) - - m3u8_url = extract_attributes( - self._search_regex( - r'(<link[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)', - webpage, 'content' - ))['href'] - - formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - - title = self._search_regex( - r'<h1[^>]+itemprop=["\']name[^>]*>([^<]+)', webpage, - 'title', default=None) or self._og_search_title(webpage) - - description = self._html_search_regex( - r'(?s)<article[^>]+itemprop=["\']description[^>]*>(.+?)</article>', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - timestamp = unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')) - duration = int_or_none(self._html_search_meta( - 'duration', webpage), invscale=60) - view_count = int_or_none(self._html_search_meta( - 'interactionCount', webpage, 'view count')) - - return { - 'id': 
video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py deleted file mode 100644 index 5726cab3a..000000000 --- a/youtube_dl/extractor/porncom.py +++ /dev/null @@ -1,103 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - js_to_json, - parse_filesize, - str_to_int, -) - - -class PornComIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', - 'md5': '3f30ce76267533cd12ba999263156de7', - 'info_dict': { - 'id': '2603339', - 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', - 'ext': 'mp4', - 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 551, - 'view_count': int, - 'age_limit': 18, - 'categories': list, - 'tags': list, - }, - }, { - 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - config = self._parse_json( - self._search_regex( - (r'=\s*({.+?})\s*;\s*v1ar\b', - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), - webpage, 'config', default='{}'), - display_id, transform_source=js_to_json, fatal=False) - - if config: - title = config['title'] - formats = [{ - 'url': stream['url'], - 'format_id': stream.get('id'), - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) - } for stream in config['streams'] if stream.get('url')] - thumbnail = (compat_urlparse.urljoin( - config['thumbCDN'], config['poster']) - if config.get('thumbCDN') and config.get('poster') else None) - duration = int_or_none(config.get('length')) - else: - title = self._search_regex( - (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'), - webpage, 'title') - formats = [{ - 'url': compat_urlparse.urljoin(url, format_url), - 'format_id': '%sp' % height, - 'height': int(height), - 'filesize_approx': parse_filesize(filesize), - } for format_url, height, filesize in re.findall( - r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', - webpage)] - thumbnail = None - duration = None - - self._sort_formats(formats) - - view_count = str_to_int(self._search_regex( - (r'Views:\s*</span>\s*<span>\s*([\d,.]+)', - r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage, - 'view count', fatal=False)) - - def extract_list(kind): - s = self._search_regex( - (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(), - r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()), - webpage, kind, fatal=False) - return re.findall(r'<a[^>]+>([^<]+)</a>', s or '') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - 'age_limit': 18, - 'categories': extract_list('categories'), - 'tags': extract_list('tags'), - } diff --git 
a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py deleted file mode 100644 index c6052ac9f..000000000 --- a/youtube_dl/extractor/pornhd.py +++ /dev/null @@ -1,121 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - js_to_json, - merge_dicts, - urljoin, -) - - -class PornHdIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' - _TESTS = [{ - 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', - 'md5': '87f1540746c1d32ec7a2305c12b96b25', - 'info_dict': { - 'id': '9864', - 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', - 'ext': 'mp4', - 'title': 'Restroom selfie masturbation', - 'description': 'md5:3748420395e03e31ac96857a8f125b2b', - 'thumbnail': r're:^https?://.*\.jpg', - 'view_count': int, - 'like_count': int, - 'age_limit': 18, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', - 'info_dict': { - 'id': '1962', - 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'ext': 'mp4', - 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', - 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', - 'thumbnail': r're:^https?://.*\.jpg', - 'view_count': int, - 'like_count': int, - 'age_limit': 18, - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id or video_id) - - title = self._html_search_regex( - [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)', - r'<title>(.+?) 
- .*?[Pp]ornHD.*?</title>'], webpage, 'title') - - sources = self._parse_json(js_to_json(self._search_regex( - r"(?s)sources'?\s*[:=]\s*(\{.+?\})", - webpage, 'sources', default='{}')), video_id) - - info = {} - if not sources: - entries = self._parse_html5_media_entries(url, webpage, video_id) - if entries: - info = entries[0] - - if not sources and not info: - message = self._html_search_regex( - r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1', - webpage, 'error message', group='value') - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - - formats = [] - for format_id, video_url in sources.items(): - video_url = urljoin(url, video_url) - if not video_url: - continue - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) - formats.append({ - 'url': video_url, - 'ext': determine_ext(video_url, 'mp4'), - 'format_id': format_id, - 'height': height, - }) - if formats: - info['formats'] = formats - self._sort_formats(info['formats']) - - description = self._html_search_regex( - (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>', - r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1'), - webpage, 'description', fatal=False, - group='value') or self._html_search_meta( - 'description', webpage, default=None) or self._og_search_description(webpage) - view_count = int_or_none(self._html_search_regex( - r'(\d+) views\s*<', webpage, 'view count', fatal=False)) - thumbnail = self._search_regex( - r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage, - 'thumbnail', default=None, group='url') - - like_count = int_or_none(self._search_regex( - (r'(\d+)</span>\s*likes', - r'(\d+)\s*</11[^>]+>(?: |\s)*\blikes', - r'class=["\']save-count["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) - - return merge_dicts(info, { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'like_count': like_count, - 'formats': formats, - 'age_limit': 18, - }) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py deleted file mode 100644 index e2e1500ff..000000000 --- a/youtube_dl/extractor/pornhub.py +++ /dev/null @@ -1,767 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import functools -import itertools -import operator -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_request, -) -from .openload import PhantomJSwrapper -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - merge_dicts, - NO_DEFAULT, - orderedSet, - remove_quotes, - str_to_int, - update_url_query, - urlencode_postdata, - url_or_none, -) - - -class PornHubBaseIE(InfoExtractor): - _NETRC_MACHINE = 'pornhub' - _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)' - - def _download_webpage_handle(self, *args, **kwargs): - def dl(*args, **kwargs): - return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - - ret = dl(*args, **kwargs) - - if not ret: - return ret - - webpage, urlh = ret - - if any(re.search(p, webpage) for p in ( - r'<body\b[^>]+\bonload=["\']go\(\)', - r'document\.cookie\s*=\s*["\']RNKEY=', - r'document\.location\.reload\(true\)')): - url_or_request = args[0] - url = (url_or_request.get_full_url() - if isinstance(url_or_request, compat_urllib_request.Request) - else url_or_request) - phantom = PhantomJSwrapper(self, 
required_version='2.0') - phantom.get(url, html=webpage) - webpage, urlh = dl(*args, **kwargs) - - return webpage, urlh - - def _real_initialize(self): - self._logged_in = False - - def _login(self, host): - if self._logged_in: - return - - site = host.split('.')[0] - - # Both sites pornhub and pornhubpremium have separate accounts - # so there should be an option to provide credentials for both. - # At the same time some videos are available under the same video id - # on both sites so that we have to identify them as the same video. - # For that purpose we have to keep both in the same extractor - # but under different netrc machines. - username, password = self._get_login_info(netrc_machine=site) - if username is None: - return - - login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') - login_page = self._download_webpage( - login_url, None, 'Downloading %s login page' % site) - - def is_logged(webpage): - return any(re.search(p, webpage) for p in ( - r'class=["\']signOut', - r'>Sign\s+[Oo]ut\s*<')) - - if is_logged(login_page): - self._logged_in = True - return - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password, - }) - - response = self._download_json( - 'https://www.%s/front/authenticate' % host, None, - 'Logging in to %s' % site, - data=urlencode_postdata(login_form), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': login_url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - if response.get('success') == '1': - self._logged_in = True - return - - message = response.get('message') - if message is not None: - raise ExtractorError( - 'Unable to login: %s' % message, expected=True) - - raise ExtractorError('Unable to log in') - - -class PornHubIE(PornHubBaseIE): - IE_DESC = 'PornHub and Thumbzilla' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:[^/]+\.)? 
- %s - /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| - (?:www\.)?thumbzilla\.com/video/ - ) - (?P<id>[\da-z]+) - ''' % PornHubBaseIE._PORNHUB_HOST_RE - _TESTS = [{ - 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', - 'md5': 'a6391306d050e4547f62b3f485dd9ba9', - 'info_dict': { - 'id': '648719015', - 'ext': 'mp4', - 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', - 'uploader': 'Babes', - 'upload_date': '20130628', - 'timestamp': 1372447216, - 'duration': 361, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'age_limit': 18, - 'tags': list, - 'categories': list, - }, - }, { - # non-ASCII title - 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', - 'info_dict': { - 'id': '1331683002', - 'ext': 'mp4', - 'title': '重庆婷婷女王足交', - 'upload_date': '20150213', - 'timestamp': 1423804862, - 'duration': 1753, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'age_limit': 18, - 'tags': list, - 'categories': list, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', - }, { - # subtitles - 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', - 'info_dict': { - 'id': 'ph5af5fef7c2aa7', - 'ext': 'mp4', - 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', - 'uploader': 'BFFs', - 'duration': 622, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'age_limit': 18, - 'tags': list, - 'categories': list, - 'subtitles': { - 'en': [{ - "ext": 'srt' - }] - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video has been disabled', - }, { - 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', - 'only_matching': True, - }, { - # removed at the request of cam4.com - 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', - 'only_matching': True, - }, { - # removed at the request of the copyright owner - 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', - 'only_matching': True, - }, { - # removed by uploader - 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', - 'only_matching': True, - }, { - # private video - 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', - 'only_matching': True, - }, { - 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', - 'only_matching': True, - }, { - 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', - 'only_matching': True, - }, { - 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', - 'only_matching': True, - }, { - # Some videos are available with the same id on both premium - # and non-premium sites (e.g. 
this and the following test) - 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', - 'only_matching': True, - }, { - 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', - 'only_matching': True, - }, { - # geo restricted - 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', - 'only_matching': True, - }, { - 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', - webpage) - - def _extract_count(self, pattern, webpage, name): - return str_to_int(self._search_regex( - pattern, webpage, '%s count' % name, fatal=False)) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') or 'pornhub.com' - video_id = mobj.group('id') - - self._login(host) - - self._set_cookie(host, 'age_verified', '1') - - def dl_webpage(platform): - self._set_cookie(host, 'platform', platform) - return self._download_webpage( - 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), - video_id, 'Downloading %s webpage' % platform) - - webpage = dl_webpage('pc') - - error_msg = self._html_search_regex( - (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', - r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'), - webpage, 'error message', default=None, group='error') - if error_msg: - error_msg = re.sub(r'\s+', ' ', error_msg) - raise ExtractorError( - 'PornHub said: %s' % error_msg, - expected=True, video_id=video_id) - - if any(re.search(p, webpage) for p in ( - r'class=["\']geoBlocked["\']', - r'>\s*This content is unavailable in your country')): - self.raise_geo_restricted() - - # video_title from flashvars contains whitespace instead of non-ASCII (see - # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying - # on that anymore. 
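A few lines below, the removed `_real_extract` parses the page's flashvars and collects unique `(url, quality)` pairs from its `mediaDefinitions` array. A minimal sketch of that step, under the assumption that `flashvars` has already been parsed from the page's `var flashvars_<id> = {...};` assignment, as the code below does:

```python
def collect_media_definitions(flashvars):
    # flashvars: dict parsed from `var flashvars_<id> = {...};`
    video_urls, seen = [], set()
    for definition in flashvars.get('mediaDefinitions') or []:
        if not isinstance(definition, dict):
            continue
        video_url = definition.get('videoUrl')
        if not isinstance(video_url, str) or video_url in seen:
            continue
        seen.add(video_url)  # skip duplicate URLs, as the original does
        try:
            quality = int(definition.get('quality'))
        except (TypeError, ValueError):
            quality = None  # mirrors int_or_none on a missing quality
        video_urls.append((video_url, quality))
    return video_urls
```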
- title = self._html_search_meta( - 'twitter:title', webpage, default=None) or self._html_search_regex( - (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>', - r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), - webpage, 'title', group='title') - - video_urls = [] - video_urls_set = set() - subtitles = {} - - flashvars = self._parse_json( - self._search_regex( - r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), - video_id) - if flashvars: - subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) - if subtitle_url: - subtitles.setdefault('en', []).append({ - 'url': subtitle_url, - 'ext': 'srt', - }) - thumbnail = flashvars.get('image_url') - duration = int_or_none(flashvars.get('video_duration')) - media_definitions = flashvars.get('mediaDefinitions') - if isinstance(media_definitions, list): - for definition in media_definitions: - if not isinstance(definition, dict): - continue - video_url = definition.get('videoUrl') - if not video_url or not isinstance(video_url, compat_str): - continue - if video_url in video_urls_set: - continue - video_urls_set.add(video_url) - video_urls.append( - (video_url, int_or_none(definition.get('quality')))) - else: - thumbnail, duration = [None] * 2 - - def extract_js_vars(webpage, pattern, default=NO_DEFAULT): - assignments = self._search_regex( - pattern, webpage, 'encoded url', default=default) - if not assignments: - return {} - - assignments = assignments.split(';') - - js_vars = {} - - def parse_js_value(inp): - inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) - if '+' in inp: - inps = inp.split('+') - return functools.reduce( - operator.concat, map(parse_js_value, inps)) - inp = inp.strip() - if inp in js_vars: - return js_vars[inp] - return remove_quotes(inp) - - for assn in assignments: - assn = assn.strip() - if not assn: - continue - assn = re.sub(r'var\s+', '', assn) - vname, value = assn.split('=', 1) - js_vars[vname] = parse_js_value(value) - return js_vars - - def add_video_url(video_url): - v_url = url_or_none(video_url) - if not v_url: - return - if v_url in video_urls_set: - return - video_urls.append((v_url, None)) - video_urls_set.add(v_url) - - def parse_quality_items(quality_items): - q_items = self._parse_json(quality_items, video_id, fatal=False) - if not isinstance(q_items, list): - return - for item in q_items: - if isinstance(item, dict): - add_video_url(item.get('url')) - - if not video_urls: - FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') - js_vars = extract_js_vars( - webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), - default=None) - if js_vars: - for key, format_url in js_vars.items(): - if key.startswith(FORMAT_PREFIXES[-1]): - parse_quality_items(format_url) - elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): - add_video_url(format_url) - if not video_urls and re.search( - r'<[^>]+\bid=["\']lockedPlayer', webpage): - raise ExtractorError( - 'Video %s is locked' % video_id, expected=True) - - if not video_urls: - js_vars = extract_js_vars( - dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') - add_video_url(js_vars['mediastring']) - - for mobj in re.finditer( - r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage): - video_url = mobj.group('url') - if video_url not in video_urls_set: - video_urls.append((video_url, None)) - video_urls_set.add(video_url) - - upload_date = None - formats = [] - - def add_format(format_url, height=None): - ext 
= determine_ext(format_url) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - return - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - return - if not height: - height = int_or_none(self._search_regex( - r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height', - default=None)) - formats.append({ - 'url': format_url, - 'format_id': '%dp' % height if height else None, - 'height': height, - }) - - for video_url, height in video_urls: - if not upload_date: - upload_date = self._search_regex( - r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) - if upload_date: - upload_date = upload_date.replace('/', '') - if '/video/get_media' in video_url: - medias = self._download_json(video_url, video_id, fatal=False) - if isinstance(medias, list): - for media in medias: - if not isinstance(media, dict): - continue - video_url = url_or_none(media.get('videoUrl')) - if not video_url: - continue - height = int_or_none(media.get('quality')) - add_format(video_url, height) - continue - add_format(video_url) - self._sort_formats( - formats, field_preference=('height', 'width', 'fps', 'format_id')) - - video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', - webpage, 'uploader', default=None) - - def extract_vote_count(kind, name): - return self._extract_count( - (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind, - r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind), - webpage, name) - - view_count = self._extract_count( - r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view') - like_count = extract_vote_count('Up', 'like') - dislike_count = extract_vote_count('Down', 'dislike') - comment_count = self._extract_count( - r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - - def extract_list(meta_key): - div = self._search_regex( - r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' - % meta_key, webpage, meta_key, default=None) - if div: - return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div) - - info = self._search_json_ld(webpage, video_id, default={}) - # description provided in JSON-LD is irrelevant - info['description'] = None - - return merge_dicts({ - 'id': video_id, - 'uploader': video_uploader, - 'upload_date': upload_date, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'comment_count': comment_count, - 'formats': formats, - 'age_limit': 18, - 'tags': extract_list('tags'), - 'categories': extract_list('categories'), - 'subtitles': subtitles, - }, info) - - -class PornHubPlaylistBaseIE(PornHubBaseIE): - def _extract_page(self, url): - return int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) - - def _extract_entries(self, webpage, host): - # Only process container div with main playlist content skipping - # drop-down menu that uses similar pattern for videos (see - # https://github.com/ytdl-org/youtube-dl/issues/11594). 
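The playlist scraping in `_extract_entries` below first narrows the page to the main container div, so that the site-wide drop-down menu — which uses similar markup for videos — is skipped. A stand-alone sketch with the regexes taken from the removed code; the pair-based deduplication stands in for `youtube_dl.utils.orderedSet`:

```python
import re

def extract_entries(webpage, host):
    # narrow to the main container div; fall back to the whole page
    mobj = re.search(r'(?s)(<div[^>]+class=["\']container.+)', webpage)
    container = mobj.group(1) if mobj else webpage
    entries, seen = [], set()
    for video_url, title in re.findall(
            r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
            container):
        if (video_url, title) in seen:  # orderedSet-style dedup
            continue
        seen.add((video_url, title))
        entries.append(('http://www.%s/%s' % (host, video_url), title))
    return entries
```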
- container = self._search_regex( - r'(?s)(<div[^>]+class=["\']container.+)', webpage, - 'container', default=webpage) - - return [ - self.url_result( - 'http://www.%s/%s' % (host, video_url), - PornHubIE.ie_key(), video_title=title) - for video_url, title in orderedSet(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', - container)) - ] - - -class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE - _TESTS = [{ - 'url': 'https://www.pornhub.com/model/zoe_ph', - 'playlist_mincount': 118, - }, { - 'url': 'https://www.pornhub.com/pornstar/liz-vicious', - 'info_dict': { - 'id': 'liz-vicious', - }, - 'playlist_mincount': 118, - }, { - 'url': 'https://www.pornhub.com/users/russianveet69', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/channels/povd', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', - 'only_matching': True, - }, { - # Unavailable via /videos page, but available with direct pagination - # on pornstar page (see [1]), requires premium - # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 - 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', - 'only_matching': True, - }, { - # Same as before, multi page - 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', - 'only_matching': True, - }, { - 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('id') - videos_url = '%s/videos' % mobj.group('url') - page = self._extract_page(url) - if page: - videos_url = update_url_query(videos_url, {'page': page}) - return self.url_result( - videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) - - -class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): - @staticmethod - def _has_more(webpage): - return re.search( - r'''(?x) - <li[^>]+\bclass=["\']page_next| - <link[^>]+\brel=["\']next| - <button[^>]+\bid=["\']moreDataBtn - ''', webpage) is not None - - def _entries(self, url, host, item_id): - page = self._extract_page(url) - - VIDEOS = '/videos' - - def download_page(base_url, num, fallback=False): - note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') - return self._download_webpage( - base_url, item_id, note, query={'page': num}) - - def is_404(e): - return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 - - base_url = url - has_page = page is not None - first_page = page if has_page else 1 - for page_num in (first_page, ) if has_page else itertools.count(first_page): - try: - try: - webpage = download_page(base_url, page_num) - except ExtractorError as e: - # Some sources may not be available via /videos page, - # trying to fallback to main page pagination (see [1]) - # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 - if is_404(e) and page_num == first_page and VIDEOS in base_url: - base_url = base_url.replace(VIDEOS, '') - webpage = download_page(base_url, page_num, fallback=True) - else: - raise - except ExtractorError as e: - if is_404(e) and page_num != first_page: - break - raise - page_entries = self._extract_entries(webpage, host) - if not page_entries: - break - for e in page_entries: - yield e - if not self._has_more(webpage): - break - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') - - self._login(host) - - return self.playlist_result(self._entries(url, host, item_id), item_id) - - -class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE - _TESTS = [{ - 'url': 'https://www.pornhub.com/model/zoe_ph/videos', - 'only_matching': True, - }, { - 'url': 'http://www.pornhub.com/users/rushandlia/videos', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos', - 'info_dict': { - 'id': 'pornstar/jenny-blighe/videos', - }, - 'playlist_mincount': 149, - }, { - 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3', - 'info_dict': { - 'id': 'pornstar/jenny-blighe/videos', - }, - 'playlist_mincount': 40, - }, { - # default sorting as Top Rated Videos - 'url': 'https://www.pornhub.com/channels/povd/videos', - 'info_dict': { - 'id': 'channels/povd/videos', - }, - 'playlist_mincount': 293, - }, { - # Top Rated Videos - 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', - 'only_matching': True, - }, { - # Most Recent Videos - 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', - 'only_matching': True, - }, { - # Most Viewed Videos - 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', - 'only_matching': True, - }, { - 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', - 'only_matching': True, - }, { - # Most Viewed Videos - 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv', - 'only_matching': True, - }, { - # Top Rated Videos - 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr', - 'only_matching': True, - }, { - # Longest Videos - 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg', - 'only_matching': True, - }, { - # Newest Videos - 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/video', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/video?page=3', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/video/search?search=123', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/categories/teen', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/categories/teen?page=3', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/hd', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/hd?page=3', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/described-video', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/described-video?page=2', - 'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', - 
'only_matching': True, - }, { - 'url': 'https://www.pornhub.com/playlist/44121572', - 'info_dict': { - 'id': 'playlist/44121572', - }, - 'playlist_mincount': 132, - }, { - 'url': 'https://www.pornhub.com/playlist/4667351', - 'only_matching': True, - }, { - 'url': 'https://de.pornhub.com/playlist/4667351', - 'only_matching': True, - }, { - 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False - if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) - else super(PornHubPagedVideoListIE, cls).suitable(url)) - - -class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE - _TESTS = [{ - 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', - 'info_dict': { - 'id': 'jenny-blighe', - }, - 'playlist_mincount': 129, - }, { - 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', - 'only_matching': True, - }, { - 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py deleted file mode 100644 index b6b71069d..000000000 --- a/youtube_dl/extractor/pornovoisines.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - float_or_none, - unified_strdate, -) - - -class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' - - _TEST = { - 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', - 'md5': '6f8aca6a058592ab49fe701c8ba8317b', - 'info_dict': { - 'id': '919', - 'display_id': 'recherche-appartement', - 'ext': 'mp4', - 'title': 'Recherche appartement', - 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140925', - 'duration': 120, - 'view_count': int, - 'average_rating': float, - 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], - 'age_limit': 18, - 'subtitles': { - 'fr': [{ - 'ext': 'vtt', - }] - }, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - settings_url = self._download_json( - 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, - video_id, note='Getting settings URL')['video_settings_url'] - settings = self._download_json(settings_url, video_id)['data'] - - formats = [] - for kind, data in settings['variants'].items(): - if kind == 'HLS': - formats.extend(self._extract_m3u8_formats( - data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) - elif kind == 'MP4': - for item in data: - formats.append({ - 'url': item['url'], - 'height': item.get('height'), - 'bitrate': item.get('bitrate'), - }) - self._sort_formats(formats) - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - - # The webpage has a bug - there's no space between "thumb" and src= - thumbnail = self._html_search_regex( - r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', - webpage, 'thumbnail', fatal=False, group='url') - 
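
The thumbnail pattern just above deserves a second look: because the page markup omits the space between the class attribute and src=, a literal class="thumb" src= would never match, so the pattern quotes each attribute with a backreferenced group and tolerates zero or more intervening characters via [^>]*. A quick standalone check of the same pattern, using an invented sample string that reproduces the site's quirk (the URL is illustrative only):

import re

# Invented sample reproducing the site's bug: no space before src=.
SAMPLE = '<img class="thumb"src="http://example.invalid/t.jpg">'

m = re.search(
    r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2',
    SAMPLE)
assert m and m.group('url') == 'http://example.invalid/t.jpg'
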
- upload_date = unified_strdate(self._search_regex( - r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) - duration = settings.get('main', {}).get('duration') - view_count = int_or_none(self._search_regex( - r'(\d+) vues', webpage, 'view count', fatal=False)) - average_rating = self._search_regex( - r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) - if average_rating: - average_rating = float_or_none(average_rating.replace(',', '.')) - - categories = self._html_search_regex( - r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) - if categories: - categories = [category.strip() for category in categories.split(',')] - - subtitles = {'fr': [{ - 'url': subtitle, - } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} - - return { - 'id': video_id, - 'display_id': display_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'average_rating': average_rating, - 'categories': categories, - 'age_limit': 18, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py deleted file mode 100644 index 2831368b6..000000000 --- a/youtube_dl/extractor/pornoxo.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - str_to_int, -) - - -class PornoXOIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' - _TEST = { - 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', - 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', - 'info_dict': { - 'id': '7564', - 'ext': 'flv', - 'title': 'Striptease From Sexy Secretary!', - 'display_id': 'striptease-from-sexy-secretary', - 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', - 'categories': list, # NSFW - 'thumbnail': r're:https?://.*\.jpg$', - 'age_limit': 18, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.groups() - - webpage = self._download_webpage(url, video_id) - video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) - - title = self._html_search_regex( - r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') - - view_count = str_to_int(self._html_search_regex( - r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) - - categories_str = self._html_search_regex( - r'<meta name="description" content=".*featuring\s*([^"]+)"', - webpage, 'categories', fatal=False) - categories = ( - None if categories_str is None - else categories_str.split(',')) - - video_data.update({ - 'id': video_id, - 'title': title, - 'display_id': display_id, - 'description': self._html_search_meta('description', webpage), - 'categories': categories, - 'view_count': view_count, - 'age_limit': 18, - }) - - return video_data diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py deleted file mode 100644 index b5c279203..000000000 --- a/youtube_dl/extractor/presstv.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import remove_start - - -class PressTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?' 
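
A note on the _VALID_URL directly above: PressTV encodes the publication date in the URL path itself, so the named groups y/m/d recover upload_date without downloading the page. A standalone sanity check, using the test URL from the _TEST block below and the same %04d%02d%02d assembly as _real_extract further down:

import re

_VALID_URL = (r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)'
              r'/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?')

mobj = re.match(
    _VALID_URL,
    'http://www.presstv.ir/Detail/2016/04/09/459911/'
    'Australian-sewerage-treatment-facility-/')
# Assemble YYYYMMDD from the path components captured above.
upload_date = '%04d%02d%02d' % (
    int(mobj.group('y')), int(mobj.group('m')), int(mobj.group('d')))
assert upload_date == '20160409'
assert mobj.group('id') == '459911'
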
- - _TEST = { - 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', - 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', - 'info_dict': { - 'id': '459911', - 'display_id': 'Australian-sewerage-treatment-facility-', - 'ext': 'mp4', - 'title': 'Organic mattresses used to clean waste water', - 'upload_date': '20160409', - 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - # extract video URL from webpage - video_url = self._hidden_inputs(webpage)['inpPlayback'] - - # build list of available formats - # specified in http://www.presstv.ir/Scripts/playback.js - base_url = 'http://192.99.219.222:82/presstv' - _formats = [ - (180, '_low200.mp4'), - (360, '_low400.mp4'), - (720, '_low800.mp4'), - (1080, '.mp4') - ] - - formats = [{ - 'url': base_url + video_url[:-4] + extension, - 'format_id': '%dp' % height, - 'height': height, - } for height, extension in _formats] - - # extract video metadata - title = remove_start( - self._html_search_meta('title', webpage, fatal=True), 'PressTV-') - - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - - upload_date = '%04d%02d%02d' % ( - int(mobj.group('y')), - int(mobj.group('m')), - int(mobj.group('d')), - ) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'description': description - } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py deleted file mode 100644 index e47088292..000000000 --- a/youtube_dl/extractor/prosiebensat1.py +++ /dev/null @@ -1,500 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from hashlib import sha1 -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - determine_ext, - float_or_none, - int_or_none, - merge_dicts, - unified_strdate, -) - - -class ProSiebenSat1BaseIE(InfoExtractor): - _GEO_BYPASS = False - _ACCESS_ID = None - _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' - _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' - - def _extract_video_info(self, url, clip_id): - client_location = url - - video = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos', - clip_id, 'Downloading videos JSON', query={ - 'access_token': self._TOKEN, - 'client_location': client_location, - 'client_name': self._CLIENT_NAME, - 'ids': clip_id, - })[0] - - if video.get('is_protected') is True: - raise ExtractorError('This video is DRM protected.', expected=True) - - formats = [] - if self._ACCESS_ID: - raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID - protocols = self._download_json( - self._V4_BASE_URL + 'protocols', clip_id, - 'Downloading protocols JSON', - headers=self.geo_verification_headers(), query={ - 'access_id': self._ACCESS_ID, - 'client_token': sha1((raw_ct).encode()).hexdigest(), - 'video_id': clip_id, - }, fatal=False, expected_status=(403,)) or {} - error = protocols.get('error') or {} - if error.get('title') == 'Geo check failed': - self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) - server_token = protocols.get('server_token') - if server_token: - urls = (self._download_json( - 
self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ - 'access_id': self._ACCESS_ID, - 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), - 'protocols': self._SUPPORTED_PROTOCOLS, - 'server_token': server_token, - 'video_id': clip_id, - }, fatal=False) or {}).get('urls') or {} - for protocol, variant in urls.items(): - source_url = variant.get('clear', {}).get('url') - if not source_url: - continue - if protocol == 'dash': - formats.extend(self._extract_mpd_formats( - source_url, clip_id, mpd_id=protocol, fatal=False)) - elif protocol == 'hls': - formats.extend(self._extract_m3u8_formats( - source_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id=protocol, fatal=False)) - else: - formats.append({ - 'url': source_url, - 'format_id': protocol, - }) - if not formats: - source_ids = [compat_str(source['id']) for source in video['sources']] - - client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() - - sources = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, - clip_id, 'Downloading sources JSON', query={ - 'access_token': self._TOKEN, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': self._CLIENT_NAME, - }) - server_id = sources['server_id'] - - def fix_bitrate(bitrate): - bitrate = int_or_none(bitrate) - if not bitrate: - return None - return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - - for source_id in source_ids: - client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() - urls = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, - clip_id, 'Downloading urls JSON', fatal=False, query={ - 'access_token': self._TOKEN, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': self._CLIENT_NAME, - 'server_id': server_id, - 'source_ids': source_id, - }) - if not urls: - continue - if urls.get('status_code') != 0: - raise ExtractorError('This video is unavailable', expected=True) - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() - for source in urls_sources: - source_url = source.get('url') - if not source_url: - continue - protocol = source.get('protocol') - mimetype = source.get('mimetype') - if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - source_url, clip_id, f4m_id='hds', fatal=False)) - elif mimetype == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - source_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif mimetype == 'application/dash+xml': - formats.extend(self._extract_mpd_formats( - source_url, clip_id, mpd_id='dash', fatal=False)) - else: - tbr = fix_bitrate(source['bitrate']) - if protocol in ('rtmp', 'rtmpe'): - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 
'http://www.prosieben.de', - 'tbr': tbr, - 'ext': 'flv', - 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), - }) - else: - formats.append({ - 'url': source_url, - 'tbr': tbr, - 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), - }) - self._sort_formats(formats) - - return { - 'duration': float_or_none(video.get('duration')), - 'formats': formats, - } - - -class ProSiebenSat1IE(ProSiebenSat1BaseIE): - IE_NAME = 'prosiebensat1' - IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - (?:beta\.)? - (?: - prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia - )\.(?:de|at|ch)| - ran\.de|fem\.com|advopedia\.de|galileo\.tv/video - ) - /(?P<id>.+) - ''' - - _TESTS = [ - { - # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 - # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: - # - malformed f4m manifest support - # - proper handling of URLs starting with `https?://` in 2.0 manifests - # - recursive child f4m manifests extraction - 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', - 'info_dict': { - 'id': '2104602', - 'ext': 'mp4', - 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', - 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', - 'upload_date': '20131231', - 'duration': 5845.04, - 'series': 'CIRCUS HALLIGALLI', - 'season_number': 2, - 'episode': 'Episode 18 - Staffel 2', - 'episode_number': 18, - }, - }, - { - 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', - 'info_dict': { - 'id': '2570327', - 'ext': 'mp4', - 'title': 'Lady-Umstyling für Audrina', - 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', - 'upload_date': '20131014', - 'duration': 606.76, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Seems to be broken', - }, - { - 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', - 'info_dict': { - 'id': '2429369', - 'ext': 'mp4', - 'title': 'Countdown für die Autowerkstatt', - 'description': 'md5:809fc051a457b5d8666013bc40698817', - 'upload_date': '20140223', - 'duration': 2595.04, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'This video is unavailable', - }, - { - 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', - 'info_dict': { - 'id': '2904997', - 'ext': 'mp4', - 'title': 'Sexy laufen in Ugg Boots', - 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', - 'upload_date': '20140122', - 'duration': 245.32, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'This video is unavailable', - }, - { - 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', - 'info_dict': { - 'id': '2906572', - 'ext': 'mp4', - 'title': 'Im Interview: Kai Wiesinger', - 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', - 'upload_date': '20140203', - 'duration': 522.56, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'This video is unavailable', - }, - { - 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', - 'info_dict': { - 'id': '2992323', - 'ext': 'mp4', - 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', - 'description': 'md5:2669cde3febe9bce13904f701e774eb6', - 'upload_date': 
'20141014', - 'duration': 2410.44, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'This video is unavailable', - }, - { - 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', - 'info_dict': { - 'id': '3004256', - 'ext': 'mp4', - 'title': 'Schalke: Tönnies möchte Raul zurück', - 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', - 'upload_date': '20140226', - 'duration': 228.96, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'This video is unavailable', - }, - { - 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', - 'info_dict': { - 'id': '2572814', - 'ext': 'mp4', - 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', - 'description': 'md5:6ddb02b0781c6adf778afea606652e38', - 'timestamp': 1382041620, - 'upload_date': '20131017', - 'duration': 469.88, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', - 'info_dict': { - 'id': '2156342', - 'ext': 'mp4', - 'title': 'Kurztrips zum Valentinstag', - 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', - 'duration': 307.24, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', - 'info_dict': { - 'id': '439664', - 'title': 'Episode 8 - Ganze Folge - Playlist', - 'description': 'md5:63b8963e71f481782aeea877658dec84', - }, - 'playlist_count': 2, - 'skip': 'This video is unavailable', - }, - { - # title in <h2 class="subtitle"> - 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', - 'info_dict': { - 'id': '4895826', - 'ext': 'mp4', - 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', - 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', - 'upload_date': '20170302', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'geo restricted to Germany', - }, - { - # geo restricted to Germany - 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', - 'only_matching': True, - }, - { - # geo restricted to Germany - 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', - 'only_matching': True, - }, - { - # geo restricted to Germany - 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', - 'only_matching': True, - }, - { - 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', - 'only_matching': True, - }, - { - 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', - 'only_matching': True, - }, - ] - - _TOKEN = 'prosieben' - _SALT = '01!8d8F_)r9]4s[qeuXfP%' - _CLIENT_NAME = 'kolibri-2.0.19-splec4' - - _ACCESS_ID = 'x_prosiebenmaxx-de' - _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' - _IV = 'Aeluchoc6aevechuipiexeeboowedaok' - - _CLIPID_REGEXES = [ - r'"clip_id"\s*:\s+"(\d+)"', - r'clipid: "(\d+)"', - r'clip[iI]d=(\d+)', - r'clip[iI][dD]\s*=\s*["\'](\d+)', - r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", - r'proMamsId"\s*:\s*"(\d+)', - r'proMamsId"\s*:\s*"(\d+)', - ] - _TITLE_REGEXES = [ - r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', - r'<header class="clearfix">\s*<h3>(.+?)</h3>', - r'<!-- 
start video -->\s*<h1>(.+?)</h1>', - r'<h1 class="att-name">\s*(.+?)</h1>', - r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', - r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', - r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', - r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', - ] - _DESCRIPTION_REGEXES = [ - r'<p itemprop="description">\s*(.+?)</p>', - r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', - r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', - r'<p class="att-description">\s*(.+?)\s*</p>', - r'<p class="video-description" itemprop="description">\s*(.+?)</p>', - r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', - ] - _UPLOAD_DATE_REGEXES = [ - r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', - r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', - r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', - r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', - ] - _PAGE_TYPE_REGEXES = [ - r'<meta name="page_type" content="([^"]+)">', - r"'itemType'\s*:\s*'([^']*)'", - ] - _PLAYLIST_ID_REGEXES = [ - r'content[iI]d=(\d+)', - r"'itemId'\s*:\s*'([^']*)'", - ] - _PLAYLIST_CLIP_REGEXES = [ - r'(?s)data-qvt=.+?<a href="([^"]+)"', - ] - - def _extract_clip(self, url, webpage): - clip_id = self._html_search_regex( - self._CLIPID_REGEXES, webpage, 'clip id') - title = self._html_search_regex( - self._TITLE_REGEXES, webpage, 'title', - default=None) or self._og_search_title(webpage) - info = self._extract_video_info(url, clip_id) - description = self._html_search_regex( - self._DESCRIPTION_REGEXES, webpage, 'description', default=None) - if description is None: - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate( - self._html_search_meta('og:published_time', webpage, - 'upload date', default=None) - or self._html_search_regex(self._UPLOAD_DATE_REGEXES, - webpage, 'upload date', default=None)) - - json_ld = self._search_json_ld(webpage, clip_id, default={}) - - return merge_dicts(info, { - 'id': clip_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - }, json_ld) - - def _extract_playlist(self, url, webpage): - playlist_id = self._html_search_regex( - self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') - playlist = self._parse_json( - self._search_regex( - r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', - webpage, 'playlist'), - playlist_id) - entries = [] - for item in playlist: - clip_id = item.get('id') or item.get('upc') - if not clip_id: - continue - info = self._extract_video_info(url, clip_id) - info.update({ - 'id': clip_id, - 'title': item.get('title') or item.get('teaser', {}).get('headline'), - 'description': item.get('teaser', {}).get('description'), - 'thumbnail': item.get('poster'), - 'duration': float_or_none(item.get('duration')), - 'series': item.get('tvShowTitle'), - 'uploader': item.get('broadcastPublisher'), - }) - entries.append(info) - return self.playlist_result(entries, playlist_id) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - page_type = self._search_regex( - self._PAGE_TYPE_REGEXES, webpage, - 'page type', default='clip').lower() - if page_type == 'clip': - return self._extract_clip(url, webpage) - elif page_type == 'playlist': - return self._extract_playlist(url, webpage) - else: - 
raise ExtractorError( - 'Unsupported page type %s' % page_type, expected=True) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py deleted file mode 100644 index b8ac93a62..000000000 --- a/youtube_dl/extractor/pyvideo.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none - - -class PyvideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' - - _TESTS = [{ - 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', - 'info_dict': { - 'id': 'become-a-logging-expert-in-30-minutes', - }, - 'playlist_count': 2, - }, { - 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', - 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', - 'info_dict': { - 'id': '2542', - 'ext': 'm4v', - 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - category = mobj.group('category') - video_id = mobj.group('id') - - entries = [] - - data = self._download_json( - 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' - % (category, video_id), video_id, fatal=False) - - if data: - for video in data['videos']: - video_url = video.get('url') - if video_url: - if video.get('type') == 'youtube': - entries.append(self.url_result(video_url, 'Youtube')) - else: - entries.append({ - 'id': compat_str(data.get('id') or video_id), - 'url': video_url, - 'title': data['title'], - 'description': data.get('description') or data.get('summary'), - 'thumbnail': data.get('thumbnail_url'), - 'duration': int_or_none(data.get('duration')), - }) - else: - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - media_urls = self._search_regex( - r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') - for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): - media_url = m.group('url') - if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): - entries.append(self.url_result(media_url, 'Youtube')) - else: - entries.append({ - 'id': video_id, - 'url': media_url, - 'title': title, - }) - - return self.playlist_result(entries, video_id) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py deleted file mode 100644 index 084308aeb..000000000 --- a/youtube_dl/extractor/qqmusic.py +++ /dev/null @@ -1,369 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import random -import re -import time - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - strip_jsonp, - unescapeHTML, -) - - -class QQMusicIE(InfoExtractor): - IE_NAME = 'qqmusic' - IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' - _TESTS = [{ - 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', - 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', - 'info_dict': { - 'id': '004295Et37taLD', - 'ext': 'mp3', - 'title': '可惜没如果', - 'release_date': '20141227', - 'creator': '林俊杰', - 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'note': 'There is no mp3-320 version of this song.', - 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', - 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', - 'info_dict': { - 'id': '004MsGEo3DdNxV', - 'ext': 'mp3', - 'title': '如果', - 'release_date': 
'20050626', - 'creator': '李季美', - 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'note': 'lyrics not in .lrc format', - 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', - 'info_dict': { - 'id': '001JyApY11tIp6', - 'ext': 'mp3', - 'title': 'Shadows Over Transylvania', - 'release_date': '19970225', - 'creator': 'Dark Funeral', - 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'skip_download': True, - }, - }] - - _FORMATS = { - 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, - 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, - 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} - } - - # Reference: m_r_GetRUin() in top_player.js - # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js - @staticmethod - def m_r_get_ruin(): - curMs = int(time.time() * 1000) % 1000 - return int(round(random.random() * 2147483647) * curMs % 1E10) - - def _real_extract(self, url): - mid = self._match_id(url) - - detail_info_page = self._download_webpage( - 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, - mid, note='Download song detail info', - errnote='Unable to get song detail info', encoding='gbk') - - song_name = self._html_search_regex( - r"songname:\s*'([^']+)'", detail_info_page, 'song name') - - publish_time = self._html_search_regex( - r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page, - 'publish time', default=None) - if publish_time: - publish_time = publish_time.replace('-', '') - - singer = self._html_search_regex( - r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None) - - lrc_content = self._html_search_regex( - r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>', - detail_info_page, 'LRC lyrics', default=None) - if lrc_content: - lrc_content = lrc_content.replace('\\n', '\n') - - thumbnail_url = None - albummid = self._search_regex( - [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], - detail_info_page, 'album mid', default=None) - if albummid: - thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ - % (albummid[-2:-1], albummid[-1], albummid) - - guid = self.m_r_get_ruin() - - vkey = self._download_json( - 'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid, - mid, note='Retrieve vkey', errnote='Unable to get vkey', - transform_source=strip_jsonp)['key'] - - formats = [] - for format_id, details in self._FORMATS.items(): - formats.append({ - 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' - % (details['prefix'], mid, details['ext'], vkey, guid), - 'format': format_id, - 'format_id': format_id, - 'preference': details['preference'], - 'abr': details.get('abr'), - }) - self._check_formats(formats, mid) - self._sort_formats(formats) - - actual_lrc_lyrics = ''.join( - line + '\n' for line in re.findall( - r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content)) - - info_dict = { - 'id': mid, - 'formats': formats, - 'title': song_name, - 'release_date': publish_time, - 'creator': singer, - 'description': lrc_content, - 'thumbnail': thumbnail_url - } - if actual_lrc_lyrics: - info_dict['subtitles'] = { - 'origin': [{ - 'ext': 'lrc', - 'data': actual_lrc_lyrics, - }] - } - return info_dict - - -class QQPlaylistBaseIE(InfoExtractor): - @staticmethod - def qq_static_url(category, mid): - return 
'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) - - def get_singer_all_songs(self, singmid, num): - return self._download_webpage( - r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, - query={ - 'format': 'json', - 'inCharset': 'utf8', - 'outCharset': 'utf-8', - 'platform': 'yqq', - 'needNewCode': 0, - 'singermid': singmid, - 'order': 'listen', - 'begin': 0, - 'num': num, - 'songstatus': 1, - }) - - def get_entries_from_page(self, singmid): - entries = [] - - default_num = 1 - json_text = self.get_singer_all_songs(singmid, default_num) - json_obj_all_songs = self._parse_json(json_text, singmid) - - if json_obj_all_songs['code'] == 0: - total = json_obj_all_songs['data']['total'] - json_text = self.get_singer_all_songs(singmid, total) - json_obj_all_songs = self._parse_json(json_text, singmid) - - for item in json_obj_all_songs['data']['list']: - if item['musicData'].get('songmid') is not None: - songmid = item['musicData']['songmid'] - entries.append(self.url_result( - r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) - - return entries - - -class QQMusicSingerIE(QQPlaylistBaseIE): - IE_NAME = 'qqmusic:singer' - IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html' - _TEST = { - 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', - 'info_dict': { - 'id': '001BLpXF2DyJe2', - 'title': '林俊杰', - 'description': 'md5:870ec08f7d8547c29c93010899103751', - }, - 'playlist_mincount': 12, - } - - def _real_extract(self, url): - mid = self._match_id(url) - - entries = self.get_entries_from_page(mid) - singer_page = self._download_webpage(url, mid, 'Download singer page') - singer_name = self._html_search_regex( - r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) - singer_desc = None - - if mid: - singer_desc_page = self._download_xml( - 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, - 'Donwload singer description XML', - query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, - headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) - - singer_desc = singer_desc_page.find('./data/info/desc').text - - return self.playlist_result(entries, mid, singer_name, singer_desc) - - -class QQMusicAlbumIE(QQPlaylistBaseIE): - IE_NAME = 'qqmusic:album' - IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' - - _TESTS = [{ - 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', - 'info_dict': { - 'id': '000gXCTb2AhRR1', - 'title': '我们都是这样长大的', - 'description': 'md5:179c5dce203a5931970d306aa9607ea6', - }, - 'playlist_count': 4, - }, { - 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', - 'info_dict': { - 'id': '002Y5a3b3AlCu3', - 'title': '그리고...', - 'description': 'md5:a48823755615508a95080e81b51ba729', - }, - 'playlist_count': 8, - }] - - def _real_extract(self, url): - mid = self._match_id(url) - - album = self._download_json( - 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, - mid, 'Download album page')['data'] - - entries = [ - self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] - ) for song in album['list'] - ] - album_name = album.get('name') - album_detail = album.get('desc') - if album_detail is not None: - album_detail = album_detail.strip() - - return self.playlist_result(entries, mid, album_name, album_detail) - - -class QQMusicToplistIE(QQPlaylistBaseIE): - IE_NAME 
= 'qqmusic:toplist' - IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' - - _TESTS = [{ - 'url': 'https://y.qq.com/n/yqq/toplist/123.html', - 'info_dict': { - 'id': '123', - 'title': '美国iTunes榜', - 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', - }, - 'playlist_count': 100, - }, { - 'url': 'https://y.qq.com/n/yqq/toplist/3.html', - 'info_dict': { - 'id': '3', - 'title': '巅峰榜·欧美', - 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', - }, - 'playlist_count': 100, - }, { - 'url': 'https://y.qq.com/n/yqq/toplist/106.html', - 'info_dict': { - 'id': '106', - 'title': '韩国Mnet榜', - 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', - }, - 'playlist_count': 50, - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - - toplist_json = self._download_json( - 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, - note='Download toplist page', - query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) - - entries = [self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', - song['data']['songmid']) - for song in toplist_json['songlist']] - - topinfo = toplist_json.get('topinfo', {}) - list_name = topinfo.get('ListName') - list_description = topinfo.get('info') - return self.playlist_result(entries, list_id, list_name, list_description) - - -class QQMusicPlaylistIE(QQPlaylistBaseIE): - IE_NAME = 'qqmusic:playlist' - IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' - - _TESTS = [{ - 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', - 'info_dict': { - 'id': '3462654915', - 'title': '韩国5月新歌精选下旬', - 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', - }, - 'playlist_count': 40, - 'skip': 'playlist gone', - }, { - 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', - 'info_dict': { - 'id': '1374105607', - 'title': '易入人心的华语民谣', - 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', - }, - 'playlist_count': 20, - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - - list_json = self._download_json( - 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', - list_id, 'Download list page', - query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, - transform_source=strip_jsonp) - if not len(list_json.get('cdlist', [])): - if list_json.get('code'): - raise ExtractorError( - 'QQ Music said: error %d in fetching playlist info' % list_json['code'], - expected=True) - raise ExtractorError('Unable to get playlist info') - - cdlist = list_json['cdlist'][0] - entries = [self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) - for song in cdlist['songlist']] - - list_name = cdlist.get('dissname') - list_description = clean_html(unescapeHTML(cdlist.get('desc'))) - return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py deleted file mode 100644 index a28b1a24c..000000000 --- a/youtube_dl/extractor/radiocanada.py +++ /dev/null @@ -1,171 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - unified_strdate, -) - - -class RadioCanadaIE(InfoExtractor): - IE_NAME = 'radiocanada' - _VALID_URL = 
r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' - _TESTS = [ - { - 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', - 'info_dict': { - 'id': '7184272', - 'ext': 'mp4', - 'title': 'Le parcours du tireur capté sur vidéo', - 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', - 'upload_date': '20141023', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, - { - # empty Title - 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', - 'info_dict': { - 'id': '7754998', - 'ext': 'mp4', - 'title': 'letelejournal22h', - 'description': 'INTEGRALE WEB 22H-TJ', - 'upload_date': '20170720', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - # with protectionType but not actually DRM protected - 'url': 'radiocanada:toutv:140872', - 'info_dict': { - 'id': '140872', - 'title': 'Épisode 1', - 'series': 'District 31', - }, - 'only_matching': True, - } - ] - _GEO_COUNTRIES = ['CA'] - _access_token = None - _claims = None - - def _call_api(self, path, video_id=None, app_code=None, query=None): - if not query: - query = {} - query.update({ - 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', - 'output': 'json', - }) - if video_id: - query.update({ - 'appCode': app_code, - 'idMedia': video_id, - }) - if self._access_token: - query['access_token'] = self._access_token - try: - return self._download_json( - 'https://services.radio-canada.ca/media/' + path, video_id, query=query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): - data = self._parse_json(e.cause.read().decode(), None) - error = data.get('error_description') or data['errorMessage']['text'] - raise ExtractorError(error, expected=True) - raise - - def _extract_info(self, app_code, video_id): - metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] - - def get_meta(name): - for meta in metas: - if meta.get('name') == name: - text = meta.get('text') - if text: - return text - - # protectionType does not necessarily mean the video is DRM protected (see - # https://github.com/ytdl-org/youtube-dl/pull/18609). 
- if get_meta('protectionType'): - self.report_warning('This video is probably DRM protected.') - - query = { - 'connectionType': 'hd', - 'deviceType': 'ipad', - 'multibitrate': 'true', - } - if self._claims: - query['claims'] = self._claims - v_data = self._call_api('validation/v2/', video_id, app_code, query) - v_url = v_data.get('url') - if not v_url: - error = v_data['message'] - if error == "Le contenu sélectionné n'est pas disponible dans votre pays": - raise self.raise_geo_restricted(error, self._GEO_COUNTRIES) - if error == 'Le contenu sélectionné est disponible seulement en premium': - self.raise_login_required(error) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') - self._sort_formats(formats) - - subtitles = {} - closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') - if closed_caption_url: - subtitles['fr'] = [{ - 'url': closed_caption_url, - 'ext': determine_ext(closed_caption_url, 'vtt'), - }] - - return { - 'id': video_id, - 'title': get_meta('Title') or get_meta('AV-nomEmission'), - 'description': get_meta('Description') or get_meta('ShortDescription'), - 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), - 'duration': int_or_none(get_meta('length')), - 'series': get_meta('Emission'), - 'season_number': int_or_none('SrcSaison'), - 'episode_number': int_or_none('SrcEpisode'), - 'upload_date': unified_strdate(get_meta('Date')), - 'subtitles': subtitles, - 'formats': formats, - } - - def _real_extract(self, url): - return self._extract_info(*re.match(self._VALID_URL, url).groups()) - - -class RadioCanadaAudioVideoIE(InfoExtractor): - IE_NAME = 'radiocanada:audiovideo' - _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', - 'info_dict': { - 'id': '7527184', - 'ext': 'mp4', - 'title': 'Barack Obama au Vietnam', - 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', - 'upload_date': '20160523', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py deleted file mode 100644 index a8afc0014..000000000 --- a/youtube_dl/extractor/radiofrance.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class RadioFranceIE(InfoExtractor): - _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' - IE_NAME = 'radiofrance' - - _TEST = { - 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', - 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', - 'info_dict': { - 'id': 'one-one', - 'ext': 'ogg', - 'title': 'One to one', - 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", - 'uploader': 'Thomas Hercouët', - }, - } - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - - 
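
Stepping back to the RadioCanada extractors above: RadioCanadaAudioVideoIE resolves a public page to the internal radiocanada:<app_code>:<id> spelling and hands it back via url_result, and RadioCanadaIE's _VALID_URL is written so that a single pattern accepts both the internal form and the public mediaconsole widget URL. A standalone check of that dual matching, with both URLs taken from the _TESTS above:

import re

_VALID_URL = (r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/'
              r'mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)')

for url in ('radiocanada:medianet:7527184',
            'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272'):
    mobj = re.match(_VALID_URL, url)
    # Both spellings yield the same app_code/id pair for _extract_info.
    assert mobj and mobj.group('app_code') == 'medianet'
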
webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') - description = self._html_search_regex( - r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', - webpage, 'description', fatal=False) - uploader = self._html_search_regex( - r'<div class="credit"> © (.*?)</div>', - webpage, 'uploader', fatal=False) - - formats_str = self._html_search_regex( - r'class="jp-jplayer[^"]*" data-source="([^"]+)">', - webpage, 'audio URLs') - formats = [ - { - 'format_id': fm[0], - 'url': fm[1], - 'vcodec': 'none', - 'preference': i, - } - for i, fm in - enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'uploader': uploader, - } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py deleted file mode 100644 index 67b86fc72..000000000 --- a/youtube_dl/extractor/rai.py +++ /dev/null @@ -1,487 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_str, -) -from ..utils import ( - ExtractorError, - determine_ext, - find_xpath_attr, - fix_xml_ampersands, - GeoRestrictedError, - int_or_none, - parse_duration, - remove_start, - strip_or_none, - try_get, - unified_strdate, - unified_timestamp, - update_url_query, - urljoin, - xpath_text, -) - - -class RaiBaseIE(InfoExtractor): - _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _GEO_COUNTRIES = ['IT'] - _GEO_BYPASS = False - - def _extract_relinker_info(self, relinker_url, video_id): - if not re.match(r'https?://', relinker_url): - return {'formats': [{'url': relinker_url}]} - - formats = [] - geoprotection = None - is_live = None - duration = None - - for platform in ('mon', 'flash', 'native'): - relinker = self._download_xml( - relinker_url, video_id, - note='Downloading XML metadata for platform %s' % platform, - transform_source=fix_xml_ampersands, - query={'output': 45, 'pl': platform}, - headers=self.geo_verification_headers()) - - if not geoprotection: - geoprotection = xpath_text( - relinker, './geoprotection', default=None) == 'Y' - - if not is_live: - is_live = xpath_text( - relinker, './is_live', default=None) == 'Y' - if not duration: - duration = parse_duration(xpath_text( - relinker, './duration', default=None)) - - url_elem = find_xpath_attr(relinker, './url', 'type', 'content') - if url_elem is None: - continue - - media_url = url_elem.text - - # This does not imply geo restriction (e.g. 
- # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if '/video_no_available.mp4' in media_url: - continue - - ext = determine_ext(media_url) - if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): - continue - - if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m' or platform == 'flash': - manifest_url = update_url_query( - media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), - {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) - formats.extend(self._extract_f4m_formats( - manifest_url, video_id, f4m_id='hds', fatal=False)) - else: - bitrate = int_or_none(xpath_text(relinker, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', - }) - - if not formats and geoprotection is True: - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - return dict((k, v) for k, v in { - 'is_live': is_live, - 'duration': duration, - 'formats': formats, - }.items() if v is not None) - - @staticmethod - def _extract_subtitles(url, video_data): - STL_EXT = 'stl' - SRT_EXT = 'srt' - subtitles = {} - subtitles_array = video_data.get('subtitlesArray') or [] - for k in ('subtitles', 'subtitlesUrl'): - subtitles_array.append({'url': video_data.get(k)}) - for subtitle in subtitles_array: - sub_url = subtitle.get('url') - if sub_url and isinstance(sub_url, compat_str): - sub_lang = subtitle.get('language') or 'it' - sub_url = urljoin(url, sub_url) - sub_ext = determine_ext(sub_url, SRT_EXT) - subtitles.setdefault(sub_lang, []).append({ - 'ext': sub_ext, - 'url': sub_url, - }) - if STL_EXT == sub_ext: - subtitles[sub_lang].append({ - 'ext': SRT_EXT, - 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, - }) - return subtitles - - -class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE - _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '8970abf8caf8aef4696e7b1f2adfc696', - 'info_dict': { - 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', - 'title': 'Report del 07/04/2014', - 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', - 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai Gulp', - 'duration': 6160, - 'series': 'Report', - 'season': '2013/14', - 'subtitles': { - 'it': 'count:2', - }, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', - 'only_matching': True, - }, { - # subtitles at 'subtitlesArray' key (see #27698) - 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - base, video_id = re.match(self._VALID_URL, url).groups() - - media = self._download_json( - base + '.json', video_id, 'Downloading video JSON') - - title = media['name'] - - video = media['video'] - - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) - - thumbnails = [] - for _, value in media.get('images', {}).items(): - if value: - thumbnails.append({ - 
'url': urljoin(url, value), - }) - - date_published = media.get('date_published') - time_published = media.get('time_published') - if date_published and time_published: - date_published += ' ' + time_published - - subtitles = self._extract_subtitles(url, video) - - program_info = media.get('program_info') or {} - season = media.get('season') - - info = { - 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, - 'display_id': video_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'alt_title': strip_or_none(media.get('subtitle')), - 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor') or None), - 'duration': parse_duration(video.get('duration')), - 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, - 'series': program_info.get('name'), - 'season_number': int_or_none(season), - 'season': season if (season and not season.isdigit()) else None, - 'episode': media.get('episode_title'), - 'episode_number': int_or_none(media.get('episode')), - 'subtitles': subtitles, - } - - info.update(relinker_info) - return info - - -class RaiPlayLiveIE(RaiPlayIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' - _TESTS = [{ - 'url': 'http://www.raiplay.it/dirette/rainews24', - 'info_dict': { - 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', - 'display_id': 'rainews24', - 'ext': 'mp4', - 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', - 'uploader': 'Rai News 24', - 'creator': 'Rai News 24', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - }, - }] - - -class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' - _TESTS = [{ - 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', - 'info_dict': { - 'id': 'nondirloalmiocapo', - 'title': 'Non dirlo al mio capo', - 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', - }, - 'playlist_mincount': 12, - }] - - def _real_extract(self, url): - base, playlist_id = re.match(self._VALID_URL, url).groups() - - program = self._download_json( - base + '.json', playlist_id, 'Downloading program JSON') - - entries = [] - for b in (program.get('blocks') or []): - for s in (b.get('sets') or []): - s_id = s.get('id') - if not s_id: - continue - medias = self._download_json( - '%s/%s.json' % (base, s_id), s_id, - 'Downloading content set JSON', fatal=False) - if not medias: - continue - for m in (medias.get('items') or []): - path_id = m.get('path_id') - if not path_id: - continue - video_url = urljoin(url, path_id) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) - - return self.playlist_result( - entries, playlist_id, program.get('name'), - try_get(program, lambda x: x['program_info']['description'])) - - -class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE - _TESTS = [{ - # var uniquename = "ContentItem-..." - # data-id="ContentItem-..." 
- 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'info_dict': { - 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'mp4', - 'title': 'TG PRIMO TEMPO', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1758, - 'upload_date': '20140612', - }, - 'skip': 'This content is available only in Italy', - }, { - # with ContentItem in many metas - 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', - 'info_dict': { - 'id': '1632c009-c843-4836-bb65-80c33084a64b', - 'ext': 'mp4', - 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', - 'description': 'I film in uscita questa settimana.', - 'thumbnail': r're:^https?://.*\.png$', - 'duration': 833, - 'upload_date': '20161103', - } - }, { - # with ContentItem in og:url - 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', - 'info_dict': { - 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', - 'ext': 'mp4', - 'title': 'TG1 ore 20:00 del 03/11/2016', - 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2214, - 'upload_date': '20161103', - } - }, { - # initEdizione('ContentItem-...' - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'info_dict': { - 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', - 'ext': 'mp4', - 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', - 'duration': 2274, - 'upload_date': '20170401', - }, - 'skip': 'Changes daily', - }, { - # HLS live stream with ContentItem in og:url - 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - 'info_dict': { - 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', - 'ext': 'mp4', - 'title': 'La diretta di Rainews24', - }, - 'params': { - 'skip_download': True, - }, - }, { - # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key - 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', - 'info_dict': { - 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', - 'ext': 'mp4', - 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', - 'description': 'md5:d291b03407ec505f95f27970c0b025f4', - 'upload_date': '20150913', - 'subtitles': { - 'it': 'count:2', - }, - }, - 'params': { - 'skip_download': True, - }, - }, { - # Direct MMS URL - 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', - 'only_matching': True, - }, { - 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', - 'only_matching': True, - }] - - def _extract_from_content_id(self, content_id, url): - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, - content_id, 'Downloading video JSON') - - title = media['name'].strip() - - media_type = media['type'] - if 'Audio' in media_type: - relinker_info = { - 'formats': [{ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }] - } - elif 'Video' in media_type: - relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) - else: - 
raise ExtractorError('not a media file') - - self._sort_formats(relinker_info['formats']) - - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(url, thumbnail_url), - }) - - subtitles = self._extract_subtitles(url, media) - - info = { - 'id': content_id, - 'title': title, - 'description': strip_or_none(media.get('desc')), - 'thumbnails': thumbnails, - 'uploader': media.get('author'), - 'upload_date': unified_strdate(media.get('date')), - 'duration': parse_duration(media.get('length')), - 'subtitles': subtitles, - } - - info.update(relinker_info) - - return info - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - content_item_id = None - - content_item_url = self._html_search_meta( - ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', - 'twitter:player', 'jsonlink'), webpage, default=None) - if content_item_url: - content_item_id = self._search_regex( - r'ContentItem-(%s)' % self._UUID_RE, content_item_url, - 'content item id', default=None) - - if not content_item_id: - content_item_id = self._search_regex( - r'''(?x) - (?: - (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)=| - <iframe[^>]+\bsrc= - ) - (["\']) - (?:(?!\1).)*\bContentItem-(?P<id>%s) - ''' % self._UUID_RE, - webpage, 'content item id', default=None, group='id') - - content_item_ids = set() - if content_item_id: - content_item_ids.add(content_item_id) - if video_id not in content_item_ids: - content_item_ids.add(video_id) - - for content_item_id in content_item_ids: - try: - return self._extract_from_content_id(content_item_id, url) - except GeoRestrictedError: - raise - except ExtractorError: - pass - - relinker_url = self._proto_relative_url(self._search_regex( - r'''(?x) - (?: - var\s+videoURL| - mediaInfo\.mediaUri - )\s*=\s* - ([\'"]) - (?P<url> - (?:https?:)? - //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
- (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 - ''', - webpage, 'relinker URL', group='url')) - - relinker_info = self._extract_relinker_info( - urljoin(url, relinker_url), video_id) - self._sort_formats(relinker_info['formats']) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', - default=None) or self._og_search_title(webpage) - - info = { - 'id': video_id, - 'title': title, - } - - info.update(relinker_info) - - return info diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py deleted file mode 100644 index 5411ece21..000000000 --- a/youtube_dl/extractor/raywenderlich.py +++ /dev/null @@ -1,179 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .vimeo import VimeoIE -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - merge_dicts, - try_get, - unescapeHTML, - unified_timestamp, - urljoin, -) - - -class RayWenderlichIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - videos\.raywenderlich\.com/courses| - (?:www\.)?raywenderlich\.com - )/ - (?P<course_id>[^/]+)/lessons/(?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', - 'info_dict': { - 'id': '248377018', - 'ext': 'mp4', - 'title': 'Introduction', - 'description': 'md5:804d031b3efa9fcb49777d512d74f722', - 'timestamp': 1513906277, - 'upload_date': '20171222', - 'duration': 133, - 'uploader': 'Ray Wenderlich', - 'uploader_id': 'user3304672', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - 'add_ie': [VimeoIE.ie_key()], - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', - 'only_matching': True, - }] - - @staticmethod - def _extract_video_id(data, lesson_id): - if not data: - return - groups = try_get(data, lambda x: x['groups'], list) or [] - if not groups: - return - for group in groups: - if not isinstance(group, dict): - continue - contents = try_get(data, lambda x: x['contents'], list) or [] - for content in contents: - if not isinstance(content, dict): - continue - ordinal = int_or_none(content.get('ordinal')) - if ordinal != lesson_id: - continue - video_id = content.get('identifier') - if video_id: - return compat_str(video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_id, lesson_id = mobj.group('course_id', 'id') - display_id = '%s/%s' % (course_id, lesson_id) - - webpage = self._download_webpage(url, display_id) - - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'twitter:image', webpage, 'thumbnail') - - if '>Subscribe to unlock' in webpage: - raise ExtractorError( - 'This content is only available for subscribers', - expected=True) - - info = { - 'thumbnail': thumbnail, - } - - vimeo_id = self._search_regex( - r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) - - if not vimeo_id: - data = self._parse_json( - self._search_regex( - r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, - 'data collection', default='{}', group='data'), - display_id, transform_source=unescapeHTML, fatal=False) - video_id = self._extract_video_id( - data, lesson_id) or self._search_regex( - r'/videos/(\d+)/', thumbnail, 'video id') - headers = { - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - } - csrf_token = self._html_search_meta( - 'csrf-token', webpage, 'csrf token', 
default=None) - if csrf_token: - headers['X-CSRF-Token'] = csrf_token - video = self._download_json( - 'https://videos.raywenderlich.com/api/v1/videos/%s.json' - % video_id, display_id, headers=headers)['video'] - vimeo_id = video['clips'][0]['provider_id'] - info.update({ - '_type': 'url_transparent', - 'title': video.get('name'), - 'description': video.get('description') or video.get( - 'meta_description'), - 'duration': int_or_none(video.get('duration')), - 'timestamp': unified_timestamp(video.get('created_at')), - }) - - return merge_dicts(info, self.url_result( - VimeoIE._smuggle_referrer( - 'https://player.vimeo.com/video/%s' % vimeo_id, url), - ie=VimeoIE.ie_key(), video_id=vimeo_id)) - - -class RayWenderlichCourseIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - videos\.raywenderlich\.com/courses| - (?:www\.)?raywenderlich\.com - )/ - (?P<id>[^/]+) - ''' - - _TEST = { - 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', - 'info_dict': { - 'title': 'Testing in iOS', - 'id': '3530-testing-in-ios', - }, - 'params': { - 'noplaylist': False, - }, - 'playlist_count': 29, - } - - @classmethod - def suitable(cls, url): - return False if RayWenderlichIE.suitable(url) else super( - RayWenderlichCourseIE, cls).suitable(url) - - def _real_extract(self, url): - course_id = self._match_id(url) - - webpage = self._download_webpage(url, course_id) - - entries = [] - lesson_urls = set() - for lesson_url in re.findall( - r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): - if lesson_url in lesson_urls: - continue - lesson_urls.add(lesson_url) - entries.append(self.url_result( - urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) - - title = self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) - - return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py deleted file mode 100644 index ae7413fb5..000000000 --- a/youtube_dl/extractor/rbmaradio.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - clean_html, - int_or_none, - unified_timestamp, - update_url_query, -) - - -class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', - 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', - 'info_dict': { - 'id': 'ford-lopatin-live-at-primavera-sound-2011', - 'ext': 'mp3', - 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2452, - 'timestamp': 1307103164, - 'upload_date': '20110603', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_id = mobj.group('show_id') - episode_id = mobj.group('id') - - webpage = self._download_webpage(url, episode_id) - - episode = self._parse_json( - self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>', - webpage, 'json data'), - episode_id)['episodes'][show_id][episode_id] - - title = episode['title'] - - show_title = episode.get('showTitle') - if show_title: - title = '%s - %s' % (show_title, title) - - formats = [{ - 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), - 
'format_id': compat_str(abr), - 'abr': abr, - 'vcodec': 'none', - } for abr in (96, 128, 192, 256)] - self._check_formats(formats, episode_id) - - description = clean_html(episode.get('longTeaser')) - thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) - duration = int_or_none(episode.get('duration')) - timestamp = unified_timestamp(episode.get('publishedAt')) - - return { - 'id': episode_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py deleted file mode 100644 index 6d000b372..000000000 --- a/youtube_dl/extractor/redbulltv.py +++ /dev/null @@ -1,231 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - float_or_none, - ExtractorError, -) - - -class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live|(?:film|episode)s)/(?P<id>AP-\w+)' - _TESTS = [{ - # film - 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', - 'md5': 'fb0445b98aa4394e504b413d98031d1f', - 'info_dict': { - 'id': 'AP-1Q6XCDTAN1W11', - 'ext': 'mp4', - 'title': 'ABC of... WRC - ABC of... S1E6', - 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', - 'duration': 1582.04, - }, - }, { - # episode - 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', - 'info_dict': { - 'id': 'AP-1PMHKJFCW1W11', - 'ext': 'mp4', - 'title': 'Grime - Hashtags S2E4', - 'description': 'md5:5546aa612958c08a98faaad4abce484d', - 'duration': 904, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/films/AP-1ZSMAW8FH2111', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/episodes/AP-1TQWK7XE11W11', - 'only_matching': True, - }] - - def extract_info(self, video_id): - session = self._download_json( - 'https://api.redbull.tv/v3/session', video_id, - note='Downloading access token', query={ - 'category': 'personal_computer', - 'os_family': 'http', - }) - if session.get('code') == 'error': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, session['message'])) - token = session['token'] - - try: - video = self._download_json( - 'https://api.redbull.tv/v3/products/' + video_id, - video_id, note='Downloading video information', - headers={'Authorization': token} - ) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - error_message = self._parse_json( - e.cause.read().decode(), video_id)['error'] - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, error_message), expected=True) - raise - - title = video['title'].strip() - - formats = self._extract_m3u8_formats( - 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) - - subtitles = {} - for resource in video.get('resources', []): - if 
resource.startswith('closed_caption_'): - splitted_resource = resource.split('_') - if splitted_resource[2]: - subtitles.setdefault('en', []).append({ - 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), - 'ext': splitted_resource[2], - }) - - subheading = video.get('subheading') - if subheading: - title += ' - %s' % subheading - - return { - 'id': video_id, - 'title': title, - 'description': video.get('long_description') or video.get( - 'short_description'), - 'duration': float_or_none(video.get('duration'), scale=1000), - 'formats': formats, - 'subtitles': subtitles, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.extract_info(video_id) - - -class RedBullEmbedIE(RedBullTVIE): - _VALID_URL = r'https?://(?:www\.)?redbull\.com/embed/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}:[a-z]{2}-[A-Z]{2,3})' - _TESTS = [{ - # HLS manifest accessible only using assetId - 'url': 'https://www.redbull.com/embed/rrn:content:episode-videos:f3021f4f-3ed4-51ac-915a-11987126e405:en-INT', - 'only_matching': True, - }] - _VIDEO_ESSENSE_TMPL = '''... on %s { - videoEssence { - attributes - } - }''' - - def _real_extract(self, url): - rrn_id = self._match_id(url) - asset_id = self._download_json( - 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', - rrn_id, headers={ - 'Accept': 'application/json', - 'API-KEY': 'e90a1ff11335423998b100c929ecc866', - }, query={ - 'query': '''{ - resource(id: "%s", enforceGeoBlocking: false) { - %s - %s - } -}''' % (rrn_id, self._VIDEO_ESSENSE_TMPL % 'LiveVideo', self._VIDEO_ESSENSE_TMPL % 'VideoResource'), - })['data']['resource']['videoEssence']['attributes']['assetId'] - return self.extract_info(asset_id) - - -class RedBullTVRrnContentIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/tv/(?:video|live|film)/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/tv/film/rrn:content:films:d1f4d00e-4c04-5d19-b510-a805ffa2ab83/follow-me', - 'only_matching': True, - }] - - def _real_extract(self, url): - region, lang, rrn_id = re.search(self._VALID_URL, url).groups() - rrn_id += ':%s-%s' % (lang, region.upper()) - return self.url_result( - 'https://www.redbull.com/embed/' + rrn_id, - RedBullEmbedIE.ie_key(), rrn_id) - - -class RedBullIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/(?P<type>(?:episode|film|(?:(?:recap|trailer)-)?video)s|live)/(?!AP-|rrn:content:)(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.redbull.com/int-en/episodes/grime-hashtags-s02-e04', - 'md5': 'db8271a7200d40053a1809ed0dd574ff', - 'info_dict': { - 'id': 'AA-1MT8DQWA91W14', - 'ext': 'mp4', - 'title': 'Grime - Hashtags S2E4', - 'description': 'md5:5546aa612958c08a98faaad4abce484d', - }, - }, { - 'url': 'https://www.redbull.com/int-en/films/kilimanjaro-mountain-of-greatness', - 'only_matching': True, - }, { - 'url': 
'https://www.redbull.com/int-en/recap-videos/uci-mountain-bike-world-cup-2017-mens-xco-finals-from-vallnord', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/trailer-videos/kings-of-content', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/videos/tnts-style-red-bull-dance-your-style-s1-e12', - 'only_matching': True, - }, { - 'url': 'https://www.redbull.com/int-en/live/mens-dh-finals-fort-william', - 'only_matching': True, - }, { - # only available on the int-en website so a fallback is needed for the API - # https://www.redbull.com/v3/api/graphql/v1/v3/query/en-GB>en-INT?filter[uriSlug]=fia-wrc-saturday-recap-estonia&rb3Schema=v1:hero - 'url': 'https://www.redbull.com/gb-en/live/fia-wrc-saturday-recap-estonia', - 'only_matching': True, - }] - _INT_FALLBACK_LIST = ['de', 'en', 'es', 'fr'] - _LAT_FALLBACK_MAP = ['ar', 'bo', 'car', 'cl', 'co', 'mx', 'pe'] - - def _real_extract(self, url): - region, lang, filter_type, display_id = re.search(self._VALID_URL, url).groups() - if filter_type == 'episodes': - filter_type = 'episode-videos' - elif filter_type == 'live': - filter_type = 'live-videos' - - regions = [region.upper()] - if region != 'int': - if region in self._LAT_FALLBACK_MAP: - regions.append('LAT') - if lang in self._INT_FALLBACK_LIST: - regions.append('INT') - locale = '>'.join(['%s-%s' % (lang, reg) for reg in regions]) - - rrn_id = self._download_json( - 'https://www.redbull.com/v3/api/graphql/v1/v3/query/' + locale, - display_id, query={ - 'filter[type]': filter_type, - 'filter[uriSlug]': display_id, - 'rb3Schema': 'v1:hero', - })['data']['id'] - - return self.url_result( - 'https://www.redbull.com/embed/' + rrn_id, - RedBullEmbedIE.ie_key(), rrn_id) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py deleted file mode 100644 index 222fa0172..000000000 --- a/youtube_dl/extractor/reddit.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - float_or_none, - try_get, - unescapeHTML, - url_or_none, -) - - -class RedditIE(InfoExtractor): - _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' - _TEST = { - # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ - 'url': 'https://v.redd.it/zv89llsvexdz', - 'md5': '0a070c53eba7ec4534d95a5a1259e253', - 'info_dict': { - 'id': 'zv89llsvexdz', - 'ext': 'mp4', - 'title': 'zv89llsvexdz', - }, - 'params': { - 'format': 'bestvideo', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - formats = self._extract_m3u8_formats( - 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, - 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - - formats.extend(self._extract_mpd_formats( - 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, - mpd_id='dash', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - -class RedditRIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' - _TESTS = [{ - 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', - 'info_dict': { - 'id': 'zv89llsvexdz', - 'ext': 'mp4', - 'title': 'That small heart attack.', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - 'thumbnails': 'count:4', - 'timestamp': 1501941939, - 'upload_date': '20170805', - 'uploader': 'Antw87', - 'duration': 12, -
'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'age_limit': 0, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', - 'only_matching': True, - }, { - # imgur - 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', - 'only_matching': True, - }, { - # imgur @ old reddit - 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', - 'only_matching': True, - }, { - # streamable - 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', - 'only_matching': True, - }, { - # youtube - 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', - 'only_matching': True, - }, { - # reddit video @ nm reddit - 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') - - video_id = self._match_id(url) - - data = self._download_json( - url + '/.json', video_id)[0]['data']['children'][0]['data'] - - video_url = data['url'] - - # Avoid recursing into the same reddit URL - if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: - raise ExtractorError('No media found', expected=True) - - over_18 = data.get('over_18') - if over_18 is True: - age_limit = 18 - elif over_18 is False: - age_limit = 0 - else: - age_limit = None - - thumbnails = [] - - def add_thumbnail(src): - if not isinstance(src, dict): - return - thumbnail_url = url_or_none(src.get('url')) - if not thumbnail_url: - return - thumbnails.append({ - 'url': unescapeHTML(thumbnail_url), - 'width': int_or_none(src.get('width')), - 'height': int_or_none(src.get('height')), - }) - - for image in try_get(data, lambda x: x['preview']['images']) or []: - if not isinstance(image, dict): - continue - add_thumbnail(image.get('source')) - resolutions = image.get('resolutions') - if isinstance(resolutions, list): - for resolution in resolutions: - add_thumbnail(resolution) - - return { - '_type': 'url_transparent', - 'url': video_url, - 'title': data.get('title'), - 'thumbnails': thumbnails, - 'timestamp': float_or_none(data.get('created_utc')), - 'uploader': data.get('author'), - 'duration': int_or_none(try_get( - data, - (lambda x: x['media']['reddit_video']['duration'], - lambda x: x['secure_media']['reddit_video']['duration']))), - 'like_count': int_or_none(data.get('ups')), - 'dislike_count': int_or_none(data.get('downs')), - 'comment_count': int_or_none(data.get('num_comments')), - 'age_limit': age_limit, - } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py deleted file mode 100644 index a1ca791ca..000000000 --- a/youtube_dl/extractor/redtube.py +++ /dev/null @@ -1,136 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - merge_dicts, - str_to_int, - unified_strdate, - url_or_none, -) - - -class RedTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.redtube.com/66418', - 'md5': 'fc08071233725f26b8f014dba9590005', - 'info_dict': { - 'id': '66418', - 'ext': 'mp4', - 'title': 'Sucked on a toilet', - 'upload_date': '20110811', - 'duration': 596, - 'view_count': int, - 
'age_limit': 18, - } - }, { - 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', - 'only_matching': True, - }, { - 'url': 'http://it.redtube.com/66418', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://www.redtube.com/%s' % video_id, video_id) - - ERRORS = ( - (('video-deleted-info', '>This video has been removed'), 'has been removed'), - (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), - ) - - for patterns, message in ERRORS: - if any(p in webpage for p in patterns): - raise ExtractorError( - 'Video %s %s' % (video_id, message), expected=True) - - info = self._search_json_ld(webpage, video_id, default={}) - - if not info.get('title'): - info['title'] = self._html_search_regex( - (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', - r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), - webpage, 'title', group='title', - default=None) or self._og_search_title(webpage) - - formats = [] - sources = self._parse_json( - self._search_regex( - r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), - video_id, fatal=False) - if sources and isinstance(sources, dict): - for format_id, format_url in sources.items(): - if format_url: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) - medias = self._parse_json( - self._search_regex( - r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, - 'media definitions', default='{}'), - video_id, fatal=False) - if medias and isinstance(medias, list): - for media in medias: - format_url = url_or_none(media.get('videoUrl')) - if not format_url: - continue - if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - format_id = media.get('quality') - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) - if not formats: - video_url = self._html_search_regex( - r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') - formats.append({'url': video_url}) - self._sort_formats(formats) - - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._search_regex( - r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<', - webpage, 'upload date', default=None)) - duration = int_or_none(self._og_search_property( - 'video:duration', webpage, default=None) or self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) - view_count = str_to_int(self._search_regex( - (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', - r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)', - r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'), - webpage, 'view count', default=None)) - - # No self-labeling, but they describe themselves as - # "Home of Videos Porno" - age_limit = 18 - - return merge_dicts(info, { - 'id': video_id, - 'ext': 'mp4', - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'age_limit': age_limit, - 'formats': formats, - }) diff --git a/youtube_dl/extractor/rice.py 
b/youtube_dl/extractor/rice.py deleted file mode 100644 index f855719ac..000000000 --- a/youtube_dl/extractor/rice.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_parse_qs -from ..utils import ( - xpath_text, - xpath_element, - int_or_none, - parse_iso8601, - ExtractorError, -) - - -class RICEIE(InfoExtractor): - _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' - _TEST = { - 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', - 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', - 'info_dict': { - 'id': 'YEWIvbhb40aqdjMD1ALSqw', - 'ext': 'mp4', - 'title': 'Active Learning in Archeology', - 'upload_date': '20140616', - 'timestamp': 1402926346, - } - } - _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' - - def _real_extract(self, url): - qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) - if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): - raise ExtractorError('Invalid URL', expected=True) - - portal_id = qs['PortalID'][0] - playlist_id = qs['DestinationID'][0] - content_id = qs['ContentID'][0] - - content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ - 'portalId': portal_id, - 'playlistId': playlist_id, - 'contentId': content_id - }) - metadata = xpath_element(content_data, './/metaData', fatal=True) - title = xpath_text(metadata, 'primaryTitle', fatal=True) - encodings = xpath_element(content_data, './/encodings', fatal=True) - player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ - 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), - 'contentId': content_id, - }) - - common_fmt = {} - dimensions = xpath_text(encodings, 'dimensions') - if dimensions: - wh = dimensions.split('x') - if len(wh) == 2: - common_fmt.update({ - 'width': int_or_none(wh[0]), - 'height': int_or_none(wh[1]), - }) - - formats = [] - rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) - if rtsp_path: - fmt = { - 'url': rtsp_path, - 'format_id': 'rtsp', - } - fmt.update(common_fmt) - formats.append(fmt) - for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): - video_url = xpath_text(source, self._xpath_ns('File', self._NS)) - if not video_url: - continue - if '.m3u8' in video_url: - formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - fmt = { - 'url': video_url, - 'format_id': video_url.split(':')[0], - } - fmt.update(common_fmt) - rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - }) - formats.append(fmt) - self._sort_formats(formats) - - thumbnails = [] - for content_asset in content_data.findall('.//contentAssets'): - asset_type = xpath_text(content_asset, 'type') - if asset_type == 'image': - image_url = xpath_text(content_asset, 'httpPath') - if not image_url: - continue - thumbnails.append({ - 'id': xpath_text(content_asset, 'ID'), - 'url': image_url, - }) - - return { - 'id': content_id, - 'title': title, 
- 'description': xpath_text(metadata, 'abstract'), - 'duration': int_or_none(xpath_text(metadata, 'duration')), - 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py deleted file mode 100644 index c3623edcc..000000000 --- a/youtube_dl/extractor/rmcdecouverte.py +++ /dev/null @@ -1,55 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) -from ..utils import smuggle_url - - -class RMCDecouverteIE(InfoExtractor): - _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))' - - _TESTS = [{ - 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', - 'info_dict': { - 'id': '5983675500001', - 'ext': 'mp4', - 'title': 'CORVETTE', - 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', - 'uploader_id': '1969646226001', - 'upload_date': '20181226', - 'timestamp': 1545861635, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'only available for a week', - }, { - # live, geo restricted, bypassable - 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') or mobj.group('live_id') - webpage = self._download_webpage(url, display_id) - brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if brightcove_legacy_url: - brightcove_id = compat_parse_qs(compat_urlparse.urlparse( - brightcove_legacy_url).query)['@videoPlayer'][0] - else: - brightcove_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'brightcove id') - return self.url_result( - smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['FR']}), - 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py deleted file mode 100644 index 8883639b2..000000000 --- a/youtube_dl/extractor/roosterteeth.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - ExtractorError, - int_or_none, - str_or_none, - urlencode_postdata, -) - - -class RoosterTeethIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' - _NETRC_MACHINE = 'roosterteeth' - _TESTS = [{ - 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'md5': 'e2bd7764732d785ef797700a2489f212', - 'info_dict': { - 'id': '9156', - 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'ext': 'mp4', - 'title': 'Million Dollars, But... The Game Announcement', - 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', - 'thumbnail': r're:^https?://.*\.png$', - 'series': 'Million Dollars, But...', - 'episode': 'Million Dollars, But... 
The Game Announcement', - }, - }, { - 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', - 'only_matching': True, - }, { - 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', - 'only_matching': True, - }, { - 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', - 'only_matching': True, - }, { - 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', - 'only_matching': True, - }, { - # only available for FIRST members - 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', - 'only_matching': True, - }, { - 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'only_matching': True, - }] - _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/' - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - try: - self._download_json( - 'https://auth.roosterteeth.com/oauth/token', - None, 'Logging in', data=urlencode_postdata({ - 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', - 'grant_type': 'password', - 'username': username, - 'password': password, - })) - except ExtractorError as e: - msg = 'Unable to login' - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - resp = self._parse_json(e.cause.read().decode(), None, fatal=False) - if resp: - error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') - if error: - msg += ': ' + error - self.report_warning(msg) - - def _real_initialize(self): - if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): - return - self._login() - - def _real_extract(self, url): - display_id = self._match_id(url) - api_episode_url = self._EPISODE_BASE_URL + display_id - - try: - m3u8_url = self._download_json( - api_episode_url + '/videos', display_id, - 'Downloading video JSON metadata')['data'][0]['attributes']['url'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: - self.raise_login_required( - '%s is only available for FIRST members' % display_id) - raise - - formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) - - episode = self._download_json( - api_episode_url, display_id, - 'Downloading episode JSON metadata')['data'][0] - attributes = episode['attributes'] - title = attributes.get('title') or attributes['display_title'] - video_id = compat_str(episode['id']) - - thumbnails = [] - for image in episode.get('included', {}).get('images', []): - if image.get('type') == 'episode_image': - img_attributes = image.get('attributes') or {} - for k in ('thumb', 'small', 'medium', 'large'): - img_url = img_attributes.get(k) - if img_url: - thumbnails.append({ - 'id': k, - 'url': img_url, - }) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': attributes.get('description') or attributes.get('caption'), - 'thumbnails': thumbnails, - 'series': attributes.get('show_title'), - 'season_number': int_or_none(attributes.get('season_number')), - 'season_id': attributes.get('season_id'), - 'episode': title, - 
'episode_number': int_or_none(attributes.get('number')), - 'episode_id': str_or_none(episode.get('uuid')), - 'formats': formats, - 'channel_id': attributes.get('channel_id'), - 'duration': int_or_none(attributes.get('length')), - } diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py deleted file mode 100644 index 65284643b..000000000 --- a/youtube_dl/extractor/roxwel.py +++ /dev/null @@ -1,53 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import unified_strdate, determine_ext - - -class RoxwelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' - - _TEST = { - 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', - 'info_dict': { - 'id': 'passionpittakeawalklive', - 'ext': 'flv', - 'title': 'Take A Walk (live)', - 'uploader': 'Passion Pit', - 'uploader_id': 'passionpit', - 'upload_date': '20120928', - 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - filename = mobj.group('filename') - info_url = 'http://www.roxwel.com/api/videos/%s' % filename - info = self._download_json(info_url, filename) - - rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) - best_rate = rtmp_rates[-1] - url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) - rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') - ext = determine_ext(rtmp_url) - if ext == 'f4v': - rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) - - return { - 'id': filename, - 'title': info['title'], - 'url': rtmp_url, - 'ext': 'flv', - 'description': info['description'], - 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), - 'uploader': info['artist'], - 'uploader_id': info['artistname'], - 'upload_date': unified_strdate(info['dbdate']), - } diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py deleted file mode 100644 index 3b0f3080b..000000000 --- a/youtube_dl/extractor/rtbf.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - strip_or_none, -) - - -class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?:www\.)?rtbf\.be/ - (?: - video/[^?]+\?.*\bid=| - ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*\b(?P<live>l)?id= - )(?P<id>\d+)''' - _TESTS = [{ - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '8c876a1cceeb6cf31b476461ade72384', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'description': '(du 25/04/2014)', - 'duration': 3099.54, - 'upload_date': '20140425', - 'timestamp': 1398456300, - } - }, { - # geo restricted - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'only_matching': True, - }, { - 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', - 'only_matching': True, - }, { - 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', - 'only_matching': True, - }, { - # Live - 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', - 'only_matching': 
True, - }, { - # Audio - 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', - 'only_matching': True, - }, { - # With Subtitle - 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', - 'only_matching': True, - }] - _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' - _PROVIDERS = { - 'YOUTUBE': 'Youtube', - 'DAILYMOTION': 'Dailymotion', - 'VIMEO': 'Vimeo', - } - _QUALITIES = [ - ('mobile', 'SD'), - ('web', 'MD'), - ('high', 'HD'), - ] - - def _real_extract(self, url): - live, media_id = re.match(self._VALID_URL, url).groups() - embed_page = self._download_webpage( - 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), - media_id, query={'id': media_id}) - data = self._parse_json(self._html_search_regex( - r'data-media="([^"]+)"', embed_page, 'media data'), media_id) - - error = data.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - provider = data.get('provider') - if provider in self._PROVIDERS: - return self.url_result(data['url'], self._PROVIDERS[provider]) - - title = data['title'] - is_live = data.get('isLive') - if is_live: - title = self._live_title(title) - height_re = r'-(\d+)p\.' - formats = [] - - m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x - http_url = data.get('url') - if formats and http_url and re.search(height_re, http_url): - http_url = fix_url(http_url) - for m3u8_f in formats[:]: - height = m3u8_f.get('height') - if not height: - continue - f = m3u8_f.copy() - del f['protocol'] - f.update({ - 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), - 'url': re.sub(height_re, '-%dp.' 
% height, http_url), - }) - formats.append(f) - else: - sources = data.get('sources') or {} - for key, format_id in self._QUALITIES: - format_url = sources.get(key) - if not format_url: - continue - height = int_or_none(self._search_regex( - height_re, format_url, 'height', default=None)) - formats.append({ - 'format_id': format_id, - 'url': fix_url(format_url), - 'height': height, - }) - - mpd_url = data.get('urlDash') - if not data.get('drm') and mpd_url: - formats.extend(self._extract_mpd_formats( - mpd_url, media_id, mpd_id='dash', fatal=False)) - - audio_url = data.get('urlAudio') - if audio_url: - formats.append({ - 'format_id': 'audio', - 'url': audio_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) - - subtitles = {} - for track in (data.get('tracks') or {}).values(): - sub_url = track.get('url') - if not sub_url: - continue - subtitles.setdefault(track.get('lang') or 'fr', []).append({ - 'url': sub_url, - }) - - return { - 'id': media_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(data.get('description')), - 'thumbnail': data.get('thumbnail'), - 'duration': float_or_none(data.get('realDuration')), - 'timestamp': int_or_none(data.get('liveFrom')), - 'series': data.get('programLabel'), - 'subtitles': subtitles, - 'is_live': is_live, - } diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py deleted file mode 100644 index 70f000ca8..000000000 --- a/youtube_dl/extractor/rtl2.py +++ /dev/null @@ -1,207 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..aes import aes_cbc_decrypt -from ..compat import ( - compat_b64decode, - compat_ord, - compat_str, -) -from ..utils import ( - bytes_to_intlist, - ExtractorError, - intlist_to_bytes, - int_or_none, - strip_or_none, -) - - -class RTL2IE(InfoExtractor): - IE_NAME = 'rtl2' - _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', - 'info_dict': { - 'id': 'folge-203-0', - 'ext': 'f4v', - 'title': 'GRIP sucht den Sommerkönig', - 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], - }, { - 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', - 'info_dict': { - 'id': 'anna-erwischt-alex', - 'ext': 'mp4', - 'title': 'Anna erwischt Alex!', - 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' 
- }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], - }] - - def _real_extract(self, url): - vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups() - if not vico_id: - webpage = self._download_webpage(url, display_id) - - mobj = re.search( - r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', - webpage) - if mobj: - vico_id = mobj.group('vico_id') - vivi_id = mobj.group('vivi_id') - else: - vico_id = self._html_search_regex( - r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') - vivi_id = self._html_search_regex( - r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') - - info = self._download_json( - 'https://service.rtl2.de/api-player-vipo/video.php', - display_id, query={ - 'vico_id': vico_id, - 'vivi_id': vivi_id, - }) - video_info = info['video'] - title = video_info['titel'] - - formats = [] - - rtmp_url = video_info.get('streamurl') - if rtmp_url: - rtmp_url = rtmp_url.replace('\\', '') - stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') - rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] - - formats.append({ - 'format_id': 'rtmp', - 'url': rtmp_url, - 'play_path': stream_url, - 'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf', - 'page_url': url, - 'flash_version': 'LNX 11,2,202,429', - 'rtmp_conn': rtmp_conn, - 'no_resume': True, - 'preference': 1, - }) - - m3u8_url = video_info.get('streamurl_hls') - if m3u8_url: - formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) - - self._sort_formats(formats) - - return { - 'id': display_id, - 'title': title, - 'thumbnail': video_info.get('image'), - 'description': video_info.get('beschreibung'), - 'duration': int_or_none(video_info.get('duration')), - 'formats': formats, - } - - -class RTL2YouBaseIE(InfoExtractor): - _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' - - -class RTL2YouIE(RTL2YouBaseIE): - IE_NAME = 'rtl2:you' - _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', - 'info_dict': { - 'id': '15740', - 'ext': 'mp4', - 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', - 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', - 'age_limit': 12, - }, - }, { - 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', - 'only_matching': True, - }] - _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' 
- _GEO_COUNTRIES = ['DE'] - - def _real_extract(self, url): - video_id = self._match_id(url) - - stream_data = self._download_json( - self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) - - data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') - stream_url = intlist_to_bytes(aes_cbc_decrypt( - bytes_to_intlist(compat_b64decode(data)), - bytes_to_intlist(self._AES_KEY), - bytes_to_intlist(compat_b64decode(iv)) - )) - if b'rtl2_you_video_not_found' in stream_url: - raise ExtractorError('video not found', expected=True) - - formats = self._extract_m3u8_formats( - stream_url[:-compat_ord(stream_url[-1])].decode(), - video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - - video_data = self._download_json( - self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) - - series = video_data.get('formatTitle') - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': strip_or_none(video_data.get('description')), - 'thumbnail': video_data.get('image'), - 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), - 'series': series, - 'episode': episode, - 'age_limit': int_or_none(video_data.get('minimumAge')), - } - - -class RTL2YouSeriesIE(RTL2YouBaseIE): - IE_NAME = 'rtl2:you:series' - _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://you.rtl2.de/videos/115/dragon-ball', - 'info_dict': { - 'id': '115', - }, - 'playlist_mincount': 5, - } - - def _real_extract(self, url): - series_id = self._match_id(url) - stream_data = self._download_json( - self._BACKWERK_BASE_URL + 'videos', - series_id, query={ - 'formatId': series_id, - 'limit': 1000000000, - }) - - entries = [] - for video in stream_data.get('videos', []): - video_id = compat_str(video['videoId']) - if not video_id: - continue - entries.append(self.url_result( - 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), - 'RTL2You', video_id)) - return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py deleted file mode 100644 index 02986f442..000000000 --- a/youtube_dl/extractor/rtp.py +++ /dev/null @@ -1,66 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - js_to_json, -) - - -class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' 
- _TESTS = [{ - 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', - 'md5': 'e736ce0c665e459ddb818546220b4ef8', - 'info_dict': { - 'id': 'e174042', - 'ext': 'mp3', - 'title': 'Paixões Cruzadas', - 'description': 'As paixões musicais de António Cartaxo e António Macedo', - 'thumbnail': r're:^https?://.*\.jpg', - }, - }, { - 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._html_search_meta( - 'twitter:title', webpage, display_name='title', fatal=True) - - config = self._parse_json(self._search_regex( - r'(?s)RTPPlayer\(({.+?})\);', webpage, - 'player config'), video_id, js_to_json) - file_url = config['file'] - ext = determine_ext(file_url) - if ext == 'm3u8': - file_key = config.get('fileKey') - formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=file_key) - if file_key: - formats.append({ - 'url': 'https://cdn-ondemand.rtp.pt' + file_key, - 'preference': 1, - }) - self._sort_formats(formats) - else: - formats = [{ - 'url': file_url, - 'ext': ext, - }] - if config.get('mediaType') == 'audio': - for f in formats: - f['vcodec'] = 'none' - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': self._html_search_meta(['description', 'twitter:description'], webpage), - 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), - } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py deleted file mode 100644 index aed35f8a9..000000000 --- a/youtube_dl/extractor/rts.py +++ /dev/null @@ -1,235 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .srgssr import SRGSSRIE -from ..compat import compat_str -from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - parse_iso8601, - unescapeHTML, - urljoin, -) - - -class RTSIE(SRGSSRIE): - IE_DESC = 'RTS.ch' - _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' - - _TESTS = [ - { - 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': '753b877968ad8afaeddccc374d4256a5', - 'info_dict': { - 'id': '3449373', - 'display_id': 'les-enfants-terribles', - 'ext': 'mp4', - 'duration': 1488, - 'title': 'Les Enfants Terribles', - 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', - 'uploader': 'Divers', - 'upload_date': '19680921', - 'timestamp': -40280400, - 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, - }, - 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], - }, - { - 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', - 'info_dict': { - 'id': '5624065', - 'title': 'Passe-moi les jumelles', - }, - 'playlist_mincount': 4, - }, - { - 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', - 'info_dict': { - 'id': '5745975', - 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', - 'ext': 'mp4', - 'duration': 48, - 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', - 'description': 'Hockey - Playoff', - 'uploader': 'Hockey', - 'upload_date': '20140403', - 'timestamp': 1396556882, - 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, - }, - 
'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], - 'skip': 'Blocked outside Switzerland', - }, - { - 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '9bb06503773c07ce83d3cbd793cebb91', - 'info_dict': { - 'id': '5745356', - 'display_id': 'londres-cachee-par-un-epais-smog', - 'ext': 'mp4', - 'duration': 33, - 'title': 'Londres cachée par un épais smog', - 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', - 'uploader': 'L\'actu en vidéo', - 'upload_date': '20140403', - 'timestamp': 1396537322, - 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, - }, - 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], - }, - { - 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', - 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae', - 'info_dict': { - 'id': '5706148', - 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014', - 'ext': 'mp3', - 'duration': 123, - 'title': '"Urban Hippie", de Damien Krisl', - 'description': 'Des Hippies super glam.', - 'upload_date': '20140403', - 'timestamp': 1396551600, - }, - }, - { - # article with videos on rhs - 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', - 'info_dict': { - 'id': '6693917', - 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', - }, - 'playlist_mincount': 5, - }, - { - 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - media_id = m.group('rts_id') or m.group('id') - display_id = m.group('display_id') or media_id - - def download_json(internal_id): - return self._download_json( - 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, - display_id) - - all_info = download_json(media_id) - - # media_id extracted out of URL is not always a real id - if 'video' not in all_info and 'audio' not in all_info: - entries = [] - - for item in all_info.get('items', []): - item_url = item.get('url') - if not item_url: - continue - entries.append(self.url_result(item_url, 'RTS')) - - if not entries: - page, urlh = self._download_webpage_handle(url, display_id) - if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: - return self.url_result(urlh.geturl(), 'RTS') - - # article with videos on rhs - videos = re.findall( - r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', - page) - if not videos: - videos = re.findall( - r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', - page) - if videos: - entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] - - if entries: - return self.playlist_result(entries, media_id, all_info.get('title')) - - internal_id = self._html_search_regex( - r'<(?:video|audio) data-id="([0-9]+)"', page, - 'internal video id') - all_info = download_json(internal_id) - - media_type = 'video' if 'video' in all_info else 'audio' - - # check for errors - self._get_media_data('rts', media_type, media_id) - - info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] - - title = info['title'] - - def extract_bitrate(url): - 
return int_or_none(self._search_regex( - r'-([0-9]+)k\.', url, 'bitrate', default=None)) - - formats = [] - streams = info.get('streams', {}) - for format_id, format_url in streams.items(): - if format_id == 'hds_sd' and 'hds' in streams: - continue - if format_id == 'hls_sd' and 'hls' in streams: - continue - ext = determine_ext(format_url) - if ext in ('m3u8', 'f4m'): - format_url = self._get_tokenized_src(format_url, media_id, format_id) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - else: - formats.extend(self._extract_m3u8_formats( - format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': format_url, - 'tbr': extract_bitrate(format_url), - }) - - download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') - for media in info.get('media', []): - media_url = media.get('url') - if not media_url or re.match(r'https?://', media_url): - continue - rate = media.get('rate') - ext = media.get('ext') or determine_ext(media_url, 'mp4') - format_id = ext - if rate: - format_id += '-%dk' % rate - formats.append({ - 'format_id': format_id, - 'url': urljoin(download_base, media_url), - 'tbr': rate or extract_bitrate(media_url), - }) - - self._check_formats(formats, media_id) - self._sort_formats(formats) - - duration = info.get('duration') or info.get('cutout') or info.get('cutduration') - if isinstance(duration, compat_str): - duration = parse_duration(duration) - - return { - 'id': media_id, - 'display_id': display_id, - 'formats': formats, - 'title': title, - 'description': info.get('intro'), - 'duration': duration, - 'view_count': int_or_none(info.get('plays')), - 'uploader': info.get('programName'), - 'timestamp': parse_iso8601(info.get('broadcast_date')), - 'thumbnail': unescapeHTML(info.get('preview_image_url')), - } diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py deleted file mode 100644 index d2fb754cf..000000000 --- a/youtube_dl/extractor/rtve.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import io -import re -import sys - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_struct_unpack, -) -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - qualities, - remove_end, - remove_start, - std_headers, -) - -_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) - - -class RTVEALaCartaIE(InfoExtractor): - IE_NAME = 'rtve.es:alacarta' - IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', - 'info_dict': { - 'id': '2491869', - 'ext': 'mp4', - 'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia', - 'duration': 5024.566, - 'series': 'Balonmano', - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }, { - 'note': 'Live stream', - 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', - 'info_dict': { - 'id': '1694255', - 'ext': 'mp4', - 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - }, - 'params': { - 'skip_download': 'live stream', - }, - }, { - 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'd850f3c8731ea53952ebab489cf81cbf', - 'info_dict': { - 'id': '4236788', - 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104', - 'duration': 3222.0, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }, { - 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', - 'only_matching': True, - }, { - 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', - 'only_matching': True, - }] - - def _real_initialize(self): - user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - self._manager = self._download_json( - 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info')['manager'] - - @staticmethod - def _decrypt_url(png): - encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) - while True: - length = compat_struct_unpack('!I', encrypted_data.read(4))[0] - chunk_type = encrypted_data.read(4) - if chunk_type == b'IEND': - break - data = encrypted_data.read(length) - if chunk_type == b'tEXt': - alphabet_data, text = data.split(b'\0') - quality, url_data = text.split(b'%%') - alphabet = [] - e = 0 - d = 0 - for l in _bytes_to_chr(alphabet_data): - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in _bytes_to_chr(url_data): - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - yield quality.decode(), url - encrypted_data.read(4) # CRC - - def _extract_png_formats(self, video_id): - png = self._download_webpage( - 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), - video_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, video_url in self._decrypt_url(png): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': video_url, - }) - self._sort_formats(formats) - return formats - - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, - video_id)['page']['items'][0] - if info['state'] == 'DESPU': - raise ExtractorError('The video is no longer available', expected=True) - title = info['title'].strip() - formats = self._extract_png_formats(video_id) - - subtitles = None - sbt_file = info.get('sbtFile') - if sbt_file: - subtitles 
= self.extract_subtitles(video_id, sbt_file) - - is_live = info.get('live') is True - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'formats': formats, - 'thumbnail': info.get('image'), - 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), 1000), - 'is_live': is_live, - 'series': info.get('programTitle'), - } - - def _get_subtitles(self, video_id, sub_file): - subs = self._download_json( - sub_file + '.json', video_id, - 'Downloading subtitles info')['page']['items'] - return dict( - (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) - for s in subs) - - -class RTVEInfantilIE(RTVEALaCartaIE): - IE_NAME = 'rtve.es:infantil' - IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/' - - _TESTS = [{ - 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '5747454717aedf9f9fdf212d1bcfc48d', - 'info_dict': { - 'id': '3040283', - 'ext': 'mp4', - 'title': 'Maneras de vivir', - 'thumbnail': r're:https?://.+/1426182947956\.JPG', - 'duration': 357.958, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }] - - -class RTVELiveIE(RTVEALaCartaIE): - IE_NAME = 'rtve.es:live' - IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' - - _TESTS = [{ - 'url': 'http://www.rtve.es/directo/la-1/', - 'info_dict': { - 'id': 'la-1', - 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - 'skip_download': 'live stream', - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') - title = remove_start(title, 'Estoy viendo ') - - vidplayer_id = self._search_regex( - (r'playerId=player([0-9]+)', - r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', - r'data-id=["\'](\d+)'), - webpage, 'internal video ID') - - return { - 'id': video_id, - 'title': self._live_title(title), - 'formats': self._extract_png_formats(vidplayer_id), - 'is_live': True, - } - - -class RTVETelevisionIE(InfoExtractor): - IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' - - _TEST = { - 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', - 'info_dict': { - 'id': '3069778', - 'ext': 'mp4', - 'title': 'Documentos TV - La revolución del móvil', - 'duration': 3496.948, - }, - 'params': { - 'skip_download': True, - }, - } - - def _real_extract(self, url): - page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) - - alacarta_url = self._search_regex( - r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', - webpage, 'alacarta url', default=None) - if alacarta_url is None: - raise ExtractorError( - 'The webpage doesn\'t contain any video', expected=True) - - return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py deleted file mode 100644 index 4a0225109..000000000 --- a/youtube_dl/extractor/rumble.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - int_or_none, - 
parse_iso8601, - try_get, -) - - -class RumbleEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' - _TESTS = [{ - 'url': 'https://rumble.com/embed/v5pv5f', - 'md5': '36a18a049856720189f30977ccbb2c34', - 'info_dict': { - 'id': 'v5pv5f', - 'ext': 'mp4', - 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', - 'timestamp': 1571611968, - 'upload_date': '20191020', - } - }, { - 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video = self._download_json( - 'https://rumble.com/embedJS/', video_id, - query={'request': 'video', 'v': video_id}) - title = video['title'] - - formats = [] - for height, ua in (video.get('ua') or {}).items(): - for i in range(2): - f_url = try_get(ua, lambda x: x[i], compat_str) - if f_url: - ext = determine_ext(f_url) - f = { - 'ext': ext, - 'format_id': '%s-%sp' % (ext, height), - 'height': int_or_none(height), - 'url': f_url, - } - bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) - if bitrate: - f['tbr'] = int_or_none(bitrate) - formats.append(f) - self._sort_formats(formats) - - author = video.get('author') or {} - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': video.get('i'), - 'timestamp': parse_iso8601(video.get('pubDate')), - 'channel': author.get('name'), - 'channel_url': author.get('url'), - 'duration': int_or_none(video.get('duration')), - } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py deleted file mode 100644 index 8f54d5675..000000000 --- a/youtube_dl/extractor/rutube.py +++ /dev/null @@ -1,313 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import itertools - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - determine_ext, - bool_or_none, - int_or_none, - try_get, - unified_timestamp, - url_or_none, -) - - -class RutubeBaseIE(InfoExtractor): - def _download_api_info(self, video_id, query=None): - if not query: - query = {} - query['format'] = 'json' - return self._download_json( - 'http://rutube.ru/api/video/%s/' % video_id, - video_id, 'Downloading video JSON', - 'Unable to download video JSON', query=query) - - @staticmethod - def _extract_info(video, video_id=None, require_title=True): - title = video['title'] if require_title else video.get('title') - - age_limit = video.get('is_adult') - if age_limit is not None: - age_limit = 18 if age_limit is True else 0 - - uploader_id = try_get(video, lambda x: x['author']['id']) - category = try_get(video, lambda x: x['category']['name']) - - return { - 'id': video.get('id') or video_id if video_id else video['id'], - 'title': title, - 'description': video.get('description'), - 'thumbnail': video.get('thumbnail_url'), - 'duration': int_or_none(video.get('duration')), - 'uploader': try_get(video, lambda x: x['author']['name']), - 'uploader_id': compat_str(uploader_id) if uploader_id else None, - 'timestamp': unified_timestamp(video.get('created_ts')), - 'category': [category] if category else None, - 'age_limit': age_limit, - 'view_count': int_or_none(video.get('hits')), - 'comment_count': int_or_none(video.get('comments_count')), - 'is_live': bool_or_none(video.get('is_livestream')), - } - - def _download_and_extract_info(self, video_id, query=None): - return self._extract_info( - self._download_api_info(video_id, query=query), video_id) - - def 
_download_api_options(self, video_id, query=None): - if not query: - query = {} - query['format'] = 'json' - return self._download_json( - 'http://rutube.ru/api/play/options/%s/' % video_id, - video_id, 'Downloading options JSON', - 'Unable to download options JSON', - headers=self.geo_verification_headers(), query=query) - - def _extract_formats(self, options, video_id): - formats = [] - for format_id, format_url in options['video_balancer'].items(): - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - return formats - - def _download_and_extract_formats(self, video_id, query=None): - return self._extract_formats( - self._download_api_options(video_id, query=query), video_id) - - -class RutubeIE(RutubeBaseIE): - IE_NAME = 'rutube' - IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' - - _TESTS = [{ - 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'md5': '1d24f180fac7a02f3900712e5a5764d6', - 'info_dict': { - 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'mp4', - 'title': 'Раненный кенгуру забежал в аптеку', - 'description': 'http://www.ntdtv.ru ', - 'duration': 81, - 'uploader': 'NTDRussian', - 'uploader_id': '29790', - 'timestamp': 1381943602, - 'upload_date': '20131016', - 'age_limit': 0, - }, - }, { - 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', - 'only_matching': True, - }, { - 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', - 'only_matching': True, - }, { - 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', - 'only_matching': True, - }, { - 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) - - @staticmethod - def _extract_urls(webpage): - return [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', - webpage)] - - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_and_extract_info(video_id) - info['formats'] = self._download_and_extract_formats(video_id) - return info - - -class RutubeEmbedIE(RutubeBaseIE): - IE_NAME = 'rutube:embed' - IE_DESC = 'Rutube embedded videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' - - _TESTS = [{ - 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', - 'info_dict': { - 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'mp4', - 'timestamp': 1387830582, - 'upload_date': '20131223', - 'uploader_id': '297833', - 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', - 'uploader': 'subziro89 ILya', - 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://rutube.ru/play/embed/8083783', - 'only_matching': True, - }, { - # private video - 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', - 
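For reference, the two RutubeBaseIE helpers above reduce to a pair of JSON
endpoints: one for video metadata, one for the stream balancer. A minimal
standalone sketch of that flow (endpoint paths as in the deleted code; the
function name and the absence of error handling are illustrative):

    import json
    import urllib.request

    def fetch_rutube(video_id):
        base = 'http://rutube.ru/api'
        # Video metadata: title, author, duration, view counts, ...
        with urllib.request.urlopen('%s/video/%s/?format=json' % (base, video_id)) as f:
            info = json.load(f)
        # Play options: 'video_balancer' maps format ids such as 'm3u8'
        # or 'f4m' to manifest URLs.
        with urllib.request.urlopen('%s/play/options/%s/?format=json' % (base, video_id)) as f:
            options = json.load(f)
        return info, options.get('video_balancer', {})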
'only_matching': True, - }] - - def _real_extract(self, url): - embed_id = self._match_id(url) - # Query may contain private videos token and should be passed to API - # requests (see #19163) - query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - options = self._download_api_options(embed_id, query) - video_id = options['effective_video'] - formats = self._extract_formats(options, video_id) - info = self._download_and_extract_info(video_id, query) - info.update({ - 'extractor_key': 'Rutube', - 'formats': formats, - }) - return info - - -class RutubePlaylistBaseIE(RutubeBaseIE): - def _next_page_url(self, page_num, playlist_id, *args, **kwargs): - return self._PAGE_TEMPLATE % (playlist_id, page_num) - - def _entries(self, playlist_id, *args, **kwargs): - next_page_url = None - for pagenum in itertools.count(1): - page = self._download_json( - next_page_url or self._next_page_url( - pagenum, playlist_id, *args, **kwargs), - playlist_id, 'Downloading page %s' % pagenum) - - results = page.get('results') - if not results or not isinstance(results, list): - break - - for result in results: - video_url = url_or_none(result.get('video_url')) - if not video_url: - continue - entry = self._extract_info(result, require_title=False) - entry.update({ - '_type': 'url', - 'url': video_url, - 'ie_key': RutubeIE.ie_key(), - }) - yield entry - - next_page_url = page.get('next') - if not next_page_url or not page.get('has_next'): - break - - def _extract_playlist(self, playlist_id, *args, **kwargs): - return self.playlist_result( - self._entries(playlist_id, *args, **kwargs), - playlist_id, kwargs.get('playlist_name')) - - def _real_extract(self, url): - return self._extract_playlist(self._match_id(url)) - - -class RutubeChannelIE(RutubePlaylistBaseIE): - IE_NAME = 'rutube:channel' - IE_DESC = 'Rutube channels' - _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://rutube.ru/tags/video/1800/', - 'info_dict': { - 'id': '1800', - }, - 'playlist_mincount': 68, - }] - - _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' - - -class RutubeMovieIE(RutubePlaylistBaseIE): - IE_NAME = 'rutube:movie' - IE_DESC = 'Rutube movies' - _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' - _TESTS = [] - - _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' - _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' - - def _real_extract(self, url): - movie_id = self._match_id(url) - movie = self._download_json( - self._MOVIE_TEMPLATE % movie_id, movie_id, - 'Downloading movie JSON') - return self._extract_playlist( - movie_id, playlist_name=movie.get('name')) - - -class RutubePersonIE(RutubePlaylistBaseIE): - IE_NAME = 'rutube:person' - IE_DESC = 'Rutube person videos' - _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://rutube.ru/video/person/313878/', - 'info_dict': { - 'id': '313878', - }, - 'playlist_mincount': 37, - }] - - _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' - - -class RutubePlaylistIE(RutubePlaylistBaseIE): - IE_NAME = 'rutube:playlist' - IE_DESC = 'Rutube playlists' - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', - 'info_dict': { - 'id': '3097', - }, - 'playlist_count': 27, - }, { - 'url': 
'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', - 'only_matching': True, - }] - - _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' - - @classmethod - def suitable(cls, url): - if not super(RutubePlaylistIE, cls).suitable(url): - return False - params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) - - def _next_page_url(self, page_num, playlist_id, item_kind): - return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) - - def _real_extract(self, url): - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - playlist_kind = qs['pl_type'][0] - playlist_id = qs['pl_id'][0] - return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py deleted file mode 100644 index d2713c19a..000000000 --- a/youtube_dl/extractor/rutv.py +++ /dev/null @@ -1,211 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none -) - - -class RUTVIE(InfoExtractor): - IE_DESC = 'RUTV.RU' - _VALID_URL = r'''(?x) - https?:// - (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ - (?P<path> - flash\d+v/container\.swf\?id=| - iframe/(?P<type>swf|video|live)/id/| - index/iframe/cast_id/ - ) - (?P<id>\d+) - ''' - - _TESTS = [ - { - 'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724', - 'info_dict': { - 'id': '774471', - 'ext': 'mp4', - 'title': 'Монологи на все времена', - 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5', - 'duration': 2906, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638', - 'info_dict': { - 'id': '774016', - 'ext': 'mp4', - 'title': 'Чужой в семье Сталина', - 'description': '', - 'duration': 2539, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000', - 'info_dict': { - 'id': '766888', - 'ext': 'mp4', - 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', - 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', - 'duration': 279, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169', - 'info_dict': { - 'id': '771852', - 'ext': 'mp4', - 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет', - 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8', - 'duration': 3096, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014', - 'info_dict': { - 'id': '51499', - 'ext': 'flv', - 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. 
Мужчины ', - 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', - }, - 'skip': 'Translation has finished', - }, - { - 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/', - 'info_dict': { - 'id': '21', - 'ext': 'mp4', - 'title': 're:^Россия 24. Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', - 'only_matching': True, - }, - ] - - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) - if mobj: - return mobj.group('url') - - mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - video_path = mobj.group('path') - - if re.match(r'flash\d+v', video_path): - video_type = 'video' - elif video_path.startswith('iframe'): - video_type = mobj.group('type') - if video_type == 'swf': - video_type = 'video' - elif video_path.startswith('index/iframe/cast_id'): - video_type = 'live' - - is_live = video_type == 'live' - - json_data = self._download_json( - 'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), - video_id, 'Downloading JSON') - - if json_data['errors']: - raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True) - - playlist = json_data['data']['playlist'] - medialist = playlist['medialist'] - media = medialist[0] - - if media['errors']: - raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True) - - view_count = playlist.get('count_views') - priority_transport = playlist['priority_transport'] - - thumbnail = media['picture'] - width = int_or_none(media['width']) - height = int_or_none(media['height']) - description = media['anons'] - title = media['title'] - duration = int_or_none(media.get('duration')) - - formats = [] - - for transport, links in media['sources'].items(): - for quality, url in links.items(): - preference = -1 if priority_transport == transport else -2 - if transport == 'rtmp': - mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url) - if not mobj: - continue - fmt = { - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': 'http://player.rutv.ru', - 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', - 'rtmp_live': True, - 'ext': 'flv', - 'vbr': int(quality), - 'preference': preference, - } - elif transport == 'm3u8': - formats.extend(self._extract_m3u8_formats( - url, video_id, 'mp4', preference=preference, m3u8_id='hls')) - continue - else: - fmt = { - 'url': url - } - fmt.update({ - 'width': width, - 'height': height, - 'format_id': '%s-%s' % (transport, quality), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': description, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'duration': duration, - 'formats': formats, - 'is_live': is_live, - } diff --git a/youtube_dl/extractor/ruutu.py 
b/youtube_dl/extractor/ruutu.py deleted file mode 100644 index c50cd3ecd..000000000 --- a/youtube_dl/extractor/ruutu.py +++ /dev/null @@ -1,227 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import ( - determine_ext, - ExtractorError, - find_xpath_attr, - int_or_none, - unified_strdate, - url_or_none, - xpath_attr, - xpath_text, -) - - -class RuutuIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| - static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= - ) - (?P<id>\d+) - ''' - _TESTS = [ - { - 'url': 'http://www.ruutu.fi/video/2058907', - 'md5': 'ab2093f39be1ca8581963451b3c0234f', - 'info_dict': { - 'id': '2058907', - 'ext': 'mp4', - 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', - 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 114, - 'age_limit': 0, - }, - }, - { - 'url': 'http://www.ruutu.fi/video/2057306', - 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', - 'info_dict': { - 'id': '2057306', - 'ext': 'mp4', - 'title': 'Superpesis: katso koko kausi Ruudussa', - 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 40, - 'age_limit': 0, - }, - }, - { - 'url': 'http://www.supla.fi/supla/2231370', - 'md5': 'df14e782d49a2c0df03d3be2a54ef949', - 'info_dict': { - 'id': '2231370', - 'ext': 'mp4', - 'title': 'Osa 1: Mikael Jungner', - 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 0, - }, - }, - # Episode where <SourceFile> is "NOT-USED", but has other - # downloadable sources available. 
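        # (The recursive extract_formats() walk below skips any <File> whose
        #  text contains 'NOT_USED'/'NOT-USED', so episodes like this one
        #  still resolve through their remaining sources.)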
- { - 'url': 'http://www.ruutu.fi/video/3193728', - 'only_matching': True, - }, - { - # audio podcast - 'url': 'https://www.supla.fi/supla/3382410', - 'md5': 'b9d7155fed37b2ebf6021d74c4b8e908', - 'info_dict': { - 'id': '3382410', - 'ext': 'mp3', - 'title': 'Mikä ihmeen poltergeist?', - 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 0, - }, - 'expected_warnings': [ - 'HTTP Error 502: Bad Gateway', - 'Failed to download m3u8 information', - ], - }, - { - 'url': 'http://www.supla.fi/audio/2231370', - 'only_matching': True, - }, - { - 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', - 'only_matching': True, - }, - { - # episode - 'url': 'https://www.ruutu.fi/video/3401964', - 'info_dict': { - 'id': '3401964', - 'ext': 'mp4', - 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17', - 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2582, - 'age_limit': 12, - 'upload_date': '20190508', - 'series': 'Temptation Island Suomi', - 'season_number': 5, - 'episode_number': 17, - 'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'], - }, - 'params': { - 'skip_download': True, - }, - }, - { - # premium - 'url': 'https://www.ruutu.fi/video/3618715', - 'only_matching': True, - }, - ] - _API_BASE = 'https://gatling.nelonenmedia.fi' - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_xml = self._download_xml( - '%s/media-xml-cache' % self._API_BASE, video_id, - query={'id': video_id}) - - formats = [] - processed_urls = [] - - def extract_formats(node): - for child in node: - if child.tag.endswith('Files'): - extract_formats(child) - elif child.tag.endswith('File'): - video_url = child.text - if (not video_url or video_url in processed_urls - or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): - continue - processed_urls.append(video_url) - ext = determine_ext(video_url) - auth_video_url = url_or_none(self._download_webpage( - '%s/auth/access/v2' % self._API_BASE, video_id, - note='Downloading authenticated %s stream URL' % ext, - fatal=False, query={'stream': video_url})) - if auth_video_url: - processed_urls.append(auth_video_url) - video_url = auth_video_url - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) - elif ext == 'mpd': - # video-only and audio-only streams are of different - # duration resulting in out of sync issue - continue - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - elif ext == 'mp3' or child.tag == 'AudioMediaFile': - formats.append({ - 'format_id': 'audio', - 'url': video_url, - 'vcodec': 'none', - }) - else: - proto = compat_urllib_parse_urlparse(video_url).scheme - if not child.tag.startswith('HTTP') and proto != 'rtmp': - continue - preference = -1 if proto == 'rtmp' else 1 - label = child.get('label') - tbr = int_or_none(child.get('bitrate')) - format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto - if not self._is_valid_url(video_url, video_id, format_id): - continue - width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': width, - 'height': height, - 
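                        # 'resolution' arrives as e.g. '1280x720'; the bare
                        # 'x' fallback above keeps the two-element unpack
                        # valid when the attribute is missing.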
'tbr': tbr, - 'preference': preference, - }) - - extract_formats(video_xml.find('./Clip')) - - def pv(name): - node = find_xpath_attr( - video_xml, './Clip/PassthroughVariables/variable', 'name', name) - if node is not None: - return node.get('value') - - if not formats: - drm = xpath_text(video_xml, './Clip/DRM', default=None) - if drm: - raise ExtractorError('This video is DRM protected.', expected=True) - ns_st_cds = pv('ns_st_cds') - if ns_st_cds != 'free': - raise ExtractorError('This video is %s.' % ns_st_cds, expected=True) - - self._sort_formats(formats) - - themes = pv('themes') - - return { - 'id': video_id, - 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), - 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), - 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), - 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')), - 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), - 'upload_date': unified_strdate(pv('date_start')), - 'series': pv('series_name'), - 'season_number': int_or_none(pv('season_number')), - 'episode_number': int_or_none(pv('episode_number')), - 'categories': themes.split(',') if themes else [], - 'formats': formats, - } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py deleted file mode 100644 index 2cc665122..000000000 --- a/youtube_dl/extractor/safari.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor - -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - update_url_query, -) - - -class SafariBaseIE(InfoExtractor): - _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' - _NETRC_MACHINE = 'safari' - - _API_BASE = 'https://learning.oreilly.com/api/v1' - _API_FORMAT = 'json' - - LOGGED_IN = False - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - _, urlh = self._download_webpage_handle( - 'https://learning.oreilly.com/accounts/login-check/', None, - 'Downloading login page') - - def is_logged(urlh): - return 'learning.oreilly.com/home/' in urlh.geturl() - - if is_logged(urlh): - self.LOGGED_IN = True - return - - redirect_url = urlh.geturl() - parsed_url = compat_urlparse.urlparse(redirect_url) - qs = compat_parse_qs(parsed_url.query) - next_uri = compat_urlparse.urljoin( - 'https://api.oreilly.com', qs['next'][0]) - - auth, urlh = self._download_json_handle( - 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', - data=json.dumps({ - 'email': username, - 'password': password, - 'redirect_uri': next_uri, - }).encode(), headers={ - 'Content-Type': 'application/json', - 'Referer': redirect_url, - }, expected_status=400) - - credentials = auth.get('credentials') - if (not auth.get('logged_in') and not auth.get('redirect_uri') - and credentials): - raise ExtractorError( - 'Unable to login: %s' % credentials, expected=True) - - # oreilly serves two same instances of the following cookies - # in Set-Cookie header and expects first one to be actually set - for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): - self._apply_first_set_cookie_header(urlh, cookie) - - _, urlh = self._download_webpage_handle( - auth.get('redirect_uri') or next_uri, None, 'Completing login',) - - if 
is_logged(urlh): - self.LOGGED_IN = True - return - - raise ExtractorError('Unable to log in') - - -class SafariIE(SafariBaseIE): - IE_NAME = 'safari' - IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ - (?: - library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html| - videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+) - ) - ''' - - _TESTS = [{ - 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', - 'md5': 'dcc5a425e79f2564148652616af1f2a3', - 'info_dict': { - 'id': '0_qbqx90ic', - 'ext': 'mp4', - 'title': 'Introduction to Hadoop Fundamentals LiveLessons', - 'timestamp': 1437758058, - 'upload_date': '20150724', - 'uploader_id': 'stork', - }, - }, { - # non-digits in course id - 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', - 'only_matching': True, - }, { - 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', - 'only_matching': True, - }, { - 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', - 'only_matching': True, - }, { - 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', - 'only_matching': True, - }, { - 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', - 'only_matching': True, - }] - - _PARTNER_ID = '1926081' - _UICONF_ID = '29375172' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - reference_id = mobj.group('reference_id') - if reference_id: - video_id = reference_id - partner_id = self._PARTNER_ID - ui_id = self._UICONF_ID - else: - video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) - - webpage, urlh = self._download_webpage_handle(url, video_id) - - mobj = re.match(self._VALID_URL, urlh.geturl()) - reference_id = mobj.group('reference_id') - if not reference_id: - reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura reference id', group='id') - partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura widget id', default=self._PARTNER_ID, - group='id') - ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'kaltura uiconf id', default=self._UICONF_ID, - group='id') - - query = { - 'wid': '_%s' % partner_id, - 'uiconf_id': ui_id, - 'flashvars[referenceId]': reference_id, - } - - if self.LOGGED_IN: - kaltura_session = self._download_json( - '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), - video_id, 'Downloading kaltura session JSON', - 'Unable to download kaltura session JSON', fatal=False, - headers={'Accept': 'application/json'}) - if kaltura_session: - session = kaltura_session.get('session') - if session: - query['flashvars[ks]'] = session - - return self.url_result(update_url_query( - 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), - 'Kaltura') - - -class SafariApiIE(SafariBaseIE): - IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html' - - _TESTS = [{ - 'url': 
'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', - 'only_matching': True, - }, { - 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - part = self._download_json( - url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), - 'Downloading part JSON') - return self.url_result(part['web_url'], SafariIE.ie_key()) - - -class SafariCourseIE(SafariBaseIE): - IE_NAME = 'safari:course' - IE_DESC = 'safaribooksonline.com online courses' - - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ - (?: - library/view/[^/]+| - api/v1/book| - videos/[^/]+ - )| - techbus\.safaribooksonline\.com - ) - /(?P<id>[^/]+) - ''' - - _TESTS = [{ - 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', - 'info_dict': { - 'id': '9780133392838', - 'title': 'Hadoop Fundamentals LiveLessons', - }, - 'playlist_count': 22, - 'skip': 'Requires safaribooksonline account credentials', - }, { - 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', - 'only_matching': True, - }, { - 'url': 'http://techbus.safaribooksonline.com/9780134426365', - 'only_matching': True, - }, { - 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', - 'only_matching': True, - }, { - 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', - 'only_matching': True, - }, { - 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) - else super(SafariCourseIE, cls).suitable(url)) - - def _real_extract(self, url): - course_id = self._match_id(url) - - course_json = self._download_json( - '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), - course_id, 'Downloading course JSON') - - if 'chapters' not in course_json: - raise ExtractorError( - 'No chapters found for course %s' % course_id, expected=True) - - entries = [ - self.url_result(chapter, SafariApiIE.ie_key()) - for chapter in course_json['chapters']] - - course_title = course_json['title'] - - return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py deleted file mode 100644 index 49a9b313a..000000000 --- a/youtube_dl/extractor/sapo.py +++ /dev/null @@ -1,119 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - unified_strdate, -) - - -class SapoIE(InfoExtractor): - IE_DESC = 'SAPO Vídeos' - _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})' - - _TESTS = [ - { - 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', - 'md5': '79ee523f6ecb9233ac25075dee0eda83', - 'note': 'SD video', - 'info_dict': { - 'id': 'UBz95kOtiWYUMTA5Ghfi', - 'ext': 'mp4', - 'title': 'Benfica - Marcas na Hitória', - 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', - 'duration': 264, - 'uploader': 'tiago_1988', - 'upload_date': '20080229', - 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], - }, - }, - { - 'url': 
'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', - 'md5': '90a2f283cfb49193fe06e861613a72aa', - 'note': 'HD video', - 'info_dict': { - 'id': 'IyusNAZ791ZdoCY5H5IF', - 'ext': 'mp4', - 'title': 'Codebits VII - Report', - 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', - 'duration': 144, - 'uploader': 'codebits', - 'upload_date': '20140427', - 'categories': ['codebits', 'codebits2014'], - }, - }, - { - 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', - 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', - 'note': 'v2 video', - 'info_dict': { - 'id': 'yLqjzPtbTimsn2wWBKHz', - 'ext': 'mp4', - 'title': 'Hipnose Condicionativa 4', - 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', - 'duration': 692, - 'uploader': 'sapozen', - 'upload_date': '20090609', - 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], - }, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - item = self._download_xml( - 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') - - title = item.find('./title').text - description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text - thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') - duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) - uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text - upload_date = unified_strdate(item.find('./pubDate').text) - view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) - comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) - tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text - categories = tags.split() if tags else [] - age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 - - video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text - video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': 'sd', - 'width': int(video_size[0]), - 'height': int(video_size[1]), - }] - - if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': - formats.append({ - 'url': re.sub(r'/mov/1$', '/mov/39', video_url), - 'ext': 'mp4', - 'format_id': 'hd', - 'width': 1280, - 'height': 720, - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': uploader, - 'upload_date': upload_date, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py deleted file mode 100644 index 21e44b69a..000000000 --- a/youtube_dl/extractor/savefrom.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os.path -import re - -from .common import InfoExtractor - - -class SaveFromIE(InfoExtractor): - IE_NAME = 'savefrom.net' - _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$' - - _TEST = { - 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', - 'info_dict': { - 'id': 'UlVRAPW2WJY', - 'ext': 'mp4', - 'title': 'About Team Radical MMA | MMA Fighting', - 'upload_date': '20120816', - 'uploader': 'Howcast', - 'uploader_id': 'Howcast', - 'description': 
r're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*', - }, - 'params': { - 'skip_download': True - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = os.path.splitext(url.split('/')[-1])[0] - - return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py deleted file mode 100644 index b40b4c4af..000000000 --- a/youtube_dl/extractor/scrippsnetworks.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import hashlib -import re - -from .aws import AWSIE -from .anvato import AnvatoIE -from .common import InfoExtractor -from ..utils import ( - smuggle_url, - urlencode_postdata, - xpath_text, -) - - -class ScrippsNetworksWatchIE(AWSIE): - IE_NAME = 'scrippsnetworks:watch' - _VALID_URL = r'''(?x) - https?:// - watch\. - (?P<site>geniuskitchen)\.com/ - (?: - player\.[A-Z0-9]+\.html\#| - show/(?:[^/]+/){2}| - player/ - ) - (?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', - 'info_dict': { - 'id': '4194875', - 'ext': 'mp4', - 'title': 'Ample Hills Ice Cream Bike', - 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', - 'uploader': 'ANV', - 'upload_date': '20171011', - 'timestamp': 1507698000, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [AnvatoIE.ie_key()], - }] - - _SNI_TABLE = { - 'geniuskitchen': 'genius', - } - - _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' - _AWS_PROXY_HOST = 'web.api.video.snidigital.com' - - _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site_id, video_id = mobj.group('site', 'id') - - aws_identity_id_json = json.dumps({ - 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION - }).encode('utf-8') - token = self._download_json( - 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, - data=aws_identity_id_json, - headers={ - 'Accept': '*/*', - 'Content-Type': 'application/x-amz-json-1.1', - 'Referer': url, - 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), - 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', - 'X-Amz-User-Agent': self._AWS_USER_AGENT, - })['Token'] - - sts = self._download_xml( - 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ - 'Action': 'AssumeRoleWithWebIdentity', - 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', - 'RoleSessionName': 'web-identity', - 'Version': '2011-06-15', - 'WebIdentityToken': token, - }), headers={ - 'Referer': url, - 'X-Amz-User-Agent': self._AWS_USER_AGENT, - 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', - }) - - def get(key): - return xpath_text( - sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, - fatal=True) - - mcp_id = self._aws_execute_api({ - 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), - 'access_key': get('AccessKeyId'), - 'secret_key': get('SecretAccessKey'), - 'session_token': get('SessionToken'), - }, video_id)['results'][0]['mcpId'] - - return self.url_result( - smuggle_url( - 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, - {'geo_countries': ['US']}), - AnvatoIE.ie_key(), video_id=mcp_id) - - -class ScrippsNetworksIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?(?P<site>cookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', - 'info_dict': { - 'id': '0260338', - 'ext': 'mp4', - 'title': 'The Best of the Best', - 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', - 'timestamp': 1475678834, - 'upload_date': '20161005', - 'uploader': 'SCNI-SCND', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', - 'only_matching': True, - }, { - 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', - 'only_matching': True, - }, { - 'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', - 'only_matching': True, - }, { - 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', - 'only_matching': True, - }, { - 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', - 'only_matching': True, - }] - _ACCOUNT_MAP = { - 'cookingchanneltv': 2433005105, - 'discovery': 2706091867, - 'diynetwork': 2433004575, - 'foodnetwork': 2433005105, - 'hgtv': 2433004575, - 'travelchannel': 2433005739, - } - _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' - - def _real_extract(self, url): - site, guid = re.match(self._VALID_URL, url).groups() - return self.url_result(smuggle_url( - self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), - {'force_smil_url': True}), 'ThePlatform', guid) diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py deleted file mode 100644 index 7872dc80d..000000000 --- a/youtube_dl/extractor/seeker.py +++ /dev/null @@ -1,58 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - get_element_by_class, - strip_or_none, -) - - -class SeekerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html' - _TESTS = [{ - 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', - 'md5': '897d44bbe0d8986a2ead96de565a92db', - 'info_dict': { - 'id': 'Elrn3gnY', - 'ext': 'mp4', - 'title': 'Should Trump Be Required To Release His Tax Returns?', - 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', - 'timestamp': 1490090165, - 'upload_date': '20170321', - } - }, { - 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', - 'playlist': [ - { - 'md5': '0497b9f20495174be73ae136949707d2', - 'info_dict': { - 'id': 'FihYQ8AE', - 'ext': 'mp4', - 'title': 'The Pros & Cons Of Zoos', - 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', - 'timestamp': 1490039133, - 'upload_date': '20170320', - }, - } - ], - 'info_dict': { - 'id': '1834116536', - 'title': 'After Gorilla Killing, Changes Ahead for Zoos', - 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', - }, - }] - - def _real_extract(self, url): - display_id, article_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - entries = [] - for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): - entries.append(self.url_result( - 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) - return self.playlist_result( - entries, article_id, - 
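            # Playlist title is taken from og:title; the description prefers
            # the article's 'subtitle__text' element and falls back to
            # og:description.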
self._og_search_title(webpage), - strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py deleted file mode 100644 index db5ef8b57..000000000 --- a/youtube_dl/extractor/senateisvp.py +++ /dev/null @@ -1,153 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unsmuggle_url, -) -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) - - -class SenateISVPIE(InfoExtractor): - _COMM_MAP = [ - ['ag', '76440', 'http://ag-f.akamaihd.net'], - ['aging', '76442', 'http://aging-f.akamaihd.net'], - ['approps', '76441', 'http://approps-f.akamaihd.net'], - ['armed', '76445', 'http://armed-f.akamaihd.net'], - ['banking', '76446', 'http://banking-f.akamaihd.net'], - ['budget', '76447', 'http://budget-f.akamaihd.net'], - ['cecc', '76486', 'http://srs-f.akamaihd.net'], - ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], - ['csce', '75229', 'http://srs-f.akamaihd.net'], - ['dpc', '76590', 'http://dpc-f.akamaihd.net'], - ['energy', '76448', 'http://energy-f.akamaihd.net'], - ['epw', '76478', 'http://epw-f.akamaihd.net'], - ['ethics', '76449', 'http://ethics-f.akamaihd.net'], - ['finance', '76450', 'http://finance-f.akamaihd.net'], - ['foreign', '76451', 'http://foreign-f.akamaihd.net'], - ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], - ['help', '76452', 'http://help-f.akamaihd.net'], - ['indian', '76455', 'http://indian-f.akamaihd.net'], - ['intel', '76456', 'http://intel-f.akamaihd.net'], - ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'], - ['jccic', '85180', 'http://jccic-f.akamaihd.net'], - ['jec', '76458', 'http://jec-f.akamaihd.net'], - ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'], - ['rpc', '76591', 'http://rpc-f.akamaihd.net'], - ['rules', '76460', 'http://rules-f.akamaihd.net'], - ['saa', '76489', 'http://srs-f.akamaihd.net'], - ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], - ['srs', '75229', 'http://srs-f.akamaihd.net'], - ['uscc', '76487', 'http://srs-f.akamaihd.net'], - ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], - ['arch', '', 'http://ussenate-f.akamaihd.net/'] - ] - _IE_NAME = 'senate.gov' - _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' - _TESTS = [{ - 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', - 'info_dict': { - 'id': 'judiciary031715', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', - 'info_dict': { - 'id': 'commerce011514', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', - # checksum differs each time - 'info_dict': { - 'id': 'intel090613', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - } - }, { - # From http://www.c-span.org/video/?96791-1 - 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', - 'only_matching': True, - }] - - 
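Each _COMM_MAP row above pairs a committee with an Akamai stream number and
delivery domain, and the extractor derives both manifest flavours from that
triple. A minimal sketch of the URL construction (templates as in the code
below; the helper name is illustrative):

    def isvp_manifest_urls(domain, filename, stream_num):
        # Same stream, two containers: Akamai HDS (f4m) and HLS (m3u8).
        # Per the comments in the deleted code, the hdcore parameter must
        # also be carried into every fragment URL or the CDN returns 404.
        base = (domain, filename, stream_num)
        return {
            'f4m': '%s/z/%s_1@%s/manifest.f4m?hdcore=3.1.0' % base,
            'm3u8': '%s/i/%s_1@%s/master.m3u8' % base,
        }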
@staticmethod - def _search_iframe_url(webpage): - mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - - def _get_info_for_comm(self, committee): - for entry in self._COMM_MAP: - if entry[0] == committee: - return entry[1:] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) - if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): - raise ExtractorError('Invalid URL', expected=True) - - video_id = re.sub(r'\.mp4$', '', qs['filename'][0]) - - webpage = self._download_webpage(url, video_id) - - if smuggled_data.get('force_title'): - title = smuggled_data['force_title'] - else: - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) - poster = qs.get('poster') - thumbnail = poster[0] if poster else None - - video_type = qs['type'][0] - committee = video_type if video_type == 'arch' else qs['comm'][0] - stream_num, domain = self._get_info_for_comm(committee) - - formats = [] - if video_type == 'arch': - filename = video_id if '.' in video_id else video_id + '.mp4' - formats = [{ - # All parameters in the query string are necessary to prevent a 403 error - 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', - }] - else: - hdcore_sign = 'hdcore=3.1.0' - url_params = (domain, video_id, stream_num) - f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign - m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params - for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): - # URLs without the extra param induce a 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.append(entry) - for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): - mobj = re.search(r'(?P<tag>(?:-p|-b))\.m3u8', entry['url']) - if mobj: - entry['format_id'] += mobj.group('tag') - formats.append(entry) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py deleted file mode 100644 index 9d9652949..000000000 --- a/youtube_dl/extractor/sendtonews.py +++ /dev/null @@ -1,105 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - parse_iso8601, - update_url_query, - int_or_none, - determine_protocol, - unescapeHTML, -) - - -class SendtoNewsIE(InfoExtractor): - _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)' - - _TEST = { - # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ - 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', - 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588' - }, - 'playlist_count': 8, - # test the first video only to prevent lengthy tests - 'playlist': [{ - 'info_dict': { - 'id': '240385', - 'ext': 'mp4', - 'title': 'Indians introduce Encarnacion', - 'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland', - 'duration': 137.898, - 'thumbnail': r're:https?://.*\.jpg$', - 'upload_date': '20170105', - 'timestamp': 1483649762, - }, - }], -
'params': { - # m3u8 download - 'skip_download': True, - }, - } - - _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' - - @classmethod - def _extract_url(cls, webpage): - mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) - (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? - .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* - \1>''', webpage) - if mobj: - sc = mobj.group('SC') - return cls._URL_TEMPLATE % sc - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - data_url = update_url_query( - url.replace('embedplayer.php', 'data_read.php'), - {'cmd': 'loadInitial'}) - playlist_data = self._download_json(data_url, playlist_id) - - entries = [] - for video in playlist_data['playlistData'][0]: - info_dict = self._parse_jwplayer_data( - video['jwconfiguration'], - require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True}) - - for f in info_dict['formats']: - if f.get('tbr'): - continue - tbr = int_or_none(self._search_regex( - r'/(\d+)k/', f['url'], 'bitrate', default=None)) - if not tbr: - continue - f.update({ - 'format_id': '%s-%d' % (determine_protocol(f), tbr), - 'tbr': tbr, - }) - self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id')) - - thumbnails = [] - if video.get('thumbnailUrl'): - thumbnails.append({ - 'id': 'normal', - 'url': video['thumbnailUrl'], - }) - if video.get('smThumbnailUrl'): - thumbnails.append({ - 'id': 'small', - 'url': video['smThumbnailUrl'], - }) - info_dict.update({ - 'title': video['S_headLine'].strip(), - 'description': unescapeHTML(video.get('S_fullStory')), - 'thumbnails': thumbnails, - 'duration': float_or_none(video.get('SM_length')), - 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), - }) - entries.append(info_dict) - - return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py deleted file mode 100644 index 240afc18f..000000000 --- a/youtube_dl/extractor/sevenplus.py +++ /dev/null @@ -1,94 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .brightcove import BrightcoveNewIE -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - ExtractorError, - try_get, - update_url_query, -) - - -class SevenPlusIE(BrightcoveNewIE): - IE_NAME = '7plus' - _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))' - _TESTS = [{ - 'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003', - 'info_dict': { - 'id': 'MTYS7-003', - 'ext': 'mp4', - 'title': 'S7 E3 - Wind Surf', - 'description': 'md5:29c6a69f21accda7601278f81b46483d', - 'uploader_id': '5303576322001', - 'upload_date': '20171201', - 'timestamp': 1512106377, - 'series': 'Mighty Ships', - 'season_number': 7, - 'episode_number': 3, - 'episode': 'Wind Surf', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - } - }, { - 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', - 'only_matching': True, - }] - - def _real_extract(self, url): - path, episode_id = re.match(self._VALID_URL, url).groups() - - try: - media = self._download_json( - 'https://videoservice.swm.digital/playback', episode_id, query={ - 'appId': '7plus', - 'deviceType': 'web', - 'platformType': 'web', - 'accountId': 5303576322001, - 'referenceId': 'ref:' + episode_id, - 'deliveryId': 'csai', - 'videoType': 'vod', - })['media'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - raise ExtractorError(self._parse_json( - 
e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) - raise - - for source in media.get('sources', {}): - src = source.get('src') - if not src: - continue - source['src'] = update_url_query(src, {'rule': ''}) - - info = self._parse_brightcove_metadata(media, episode_id) - - content = self._download_json( - 'https://component-cdn.swm.digital/content/' + path, - episode_id, headers={ - 'market-id': 4, - }, fatal=False) or {} - for item in content.get('items', {}): - if item.get('componentData', {}).get('componentType') == 'infoPanel': - for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: - value = item.get(src_key) - if value: - info[dst_key] = value - info['series'] = try_get( - item, lambda x: x['seriesLogo']['name'], compat_str) - mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) - if mobj: - info.update({ - 'season_number': int(mobj.group(1)), - 'episode_number': int(mobj.group(2)), - 'episode': mobj.group(3), - }) - - return info diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py deleted file mode 100644 index 7a1c7e38b..000000000 --- a/youtube_dl/extractor/seznamzpravy.py +++ /dev/null @@ -1,169 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_urlparse, -) -from ..utils import ( - urljoin, - int_or_none, - parse_codecs, - try_get, -) - - -def _raw_id(src_url): - return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] - - -class SeznamZpravyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' - _TESTS = [{ - 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', - 'info_dict': { - 'id': '170889', - 'ext': 'mp4', - 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'duration': 241, - 'series': 'Svět bez obalu', - }, - 'params': { - 'skip_download': True, - }, - }, { - # with Location key - 'url': 
'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5§ionPrefixPostroll=%2Fzpravy%2Fvyzva®ression=false', - 'info_dict': { - 'id': '185688', - 'ext': 'mp4', - 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'series': 'Výzva', - }, - 'params': { - 'skip_download': True, - }, - }] - - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', - webpage)] - - def _extract_sdn_formats(self, sdn_url, video_id): - sdn_data = self._download_json(sdn_url, video_id) - - if sdn_data.get('Location'): - sdn_url = sdn_data['Location'] - sdn_data = self._download_json(sdn_url, video_id) - - formats = [] - mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} - for format_id, format_data in mp4_formats.items(): - relative_url = format_data.get('url') - if not relative_url: - continue - - try: - width, height = format_data.get('resolution') - except (TypeError, ValueError): - width, height = None, None - - f = { - 'url': urljoin(sdn_url, relative_url), - 'format_id': 'http-%s' % format_id, - 'tbr': int_or_none(format_data.get('bandwidth'), scale=1000), - 'width': int_or_none(width), - 'height': int_or_none(height), - } - f.update(parse_codecs(format_data.get('codec'))) - formats.append(f) - - pls = sdn_data.get('pls', {}) - - def get_url(format_id): - return try_get(pls, lambda x: x[format_id]['url'], compat_str) - - dash_rel_url = get_url('dash') - if dash_rel_url: - formats.extend(self._extract_mpd_formats( - urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', - fatal=False)) - - hls_rel_url = get_url('hls') - if hls_rel_url: - formats.extend(self._extract_m3u8_formats( - urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', - m3u8_id='hls', fatal=False)) - - self._sort_formats(formats) - return formats - - def _real_extract(self, url): - params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - - src = params['src'][0] - title = params['title'][0] - video_id = params.get('contentId', [_raw_id(src)])[0] - formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id) - - duration = int_or_none(params.get('duration', [None])[0]) - series = params.get('series', [None])[0] - 
thumbnail = params.get('poster', [None])[0] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'series': series, - 'formats': formats, - } - - -class SeznamZpravyArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P<id>\d+)' - _API_URL = 'https://apizpravy.seznam.cz/' - - _TESTS = [{ - # two videos on one page, with SDN URL - 'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', - 'info_dict': { - 'id': '35990', - 'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2', - 'description': 'md5:933f7b06fa337a814ba199d3596d27ba', - }, - 'playlist_count': 2, - }, { - # video with live stream URL - 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', - 'info_dict': { - 'id': '38489', - 'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60', - 'description': 'md5:428e7926a1a81986ec7eb23078004fb4', - }, - 'playlist_count': 1, - }] - - def _real_extract(self, url): - article_id = self._match_id(url) - - webpage = self._download_webpage(url, article_id) - - info = self._search_json_ld(webpage, article_id, default={}) - - title = info.get('title') or self._og_search_title(webpage, fatal=False) - description = info.get('description') or self._og_search_description(webpage) - - return self.playlist_result([ - self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) - for entry_url in SeznamZpravyIE._extract_urls(webpage)], - article_id, title, description) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py deleted file mode 100644 index 88b938e05..000000000 --- a/youtube_dl/extractor/shahid.py +++ /dev/null @@ -1,225 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import math -import re - -from .aws import AWSIE -from ..compat import compat_HTTPError -from ..utils import ( - clean_html, - ExtractorError, - InAdvancePagedList, - int_or_none, - parse_iso8601, - str_or_none, - urlencode_postdata, -) - - -class ShahidBaseIE(AWSIE): - _AWS_PROXY_HOST = 'api2.shahid.net' - _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' - _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/' - - def _handle_error(self, e): - fail_data = self._parse_json( - e.cause.read().decode('utf-8'), None, fatal=False) - if fail_data: - faults = fail_data.get('faults', []) - faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) - if faults_message: - raise ExtractorError(faults_message, expected=True) - - def _call_api(self, path, video_id, request=None): - query = {} - if request: - query['request'] = json.dumps(request) - try: - return self._aws_execute_api({ - 'uri': '/proxy/v2/' + path, - 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', - 'secret_key': '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', - }, video_id, query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - self._handle_error(e) - raise - - -class ShahidIE(ShahidBaseIE): - _NETRC_MACHINE = 'shahid' - _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', - 'info_dict': { - 'id': '816924', - 'ext': 'mp4', - 'title': 
'متحف الدحيح الموسم 1 كليب 1', - 'timestamp': 1602806400, - 'upload_date': '20201016', - 'description': 'برومو', - 'duration': 22, - 'categories': ['كوميديا'], - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746', - 'only_matching': True - }, { - # shahid plus subscriber only - 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', - 'only_matching': True - }, { - 'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319', - 'only_matching': True - }] - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - - try: - user_data = self._download_json( - 'https://shahid.mbc.net/wd/service/users/login', - None, 'Logging in', data=json.dumps({ - 'email': email, - 'password': password, - 'basic': 'false', - }).encode('utf-8'), headers={ - 'Content-Type': 'application/json; charset=UTF-8', - })['user'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - self._handle_error(e) - raise - - self._download_webpage( - 'https://shahid.mbc.net/populateContext', - None, 'Populate Context', data=urlencode_postdata({ - 'firstName': user_data['firstName'], - 'lastName': user_data['lastName'], - 'userName': user_data['email'], - 'csg_user_name': user_data['email'], - 'subscriberId': user_data['id'], - 'sessionId': user_data['sessionId'], - })) - - def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() - if page_type == 'clip': - page_type = 'episode' - - playout = self._call_api( - 'playout/new/url/' + video_id, video_id)['playout'] - - if playout.get('drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - - formats = self._extract_m3u8_formats(re.sub( - # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html - r'aws\.manifestfilter=[\w:;,-]+&?', - '', playout['url']), video_id, 'mp4') - self._sort_formats(formats) - - # video = self._call_api( - # 'product/id', video_id, { - # 'id': video_id, - # 'productType': 'ASSET', - # 'productSubType': page_type.upper() - # })['productModel'] - - response = self._download_json( - 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), - video_id, 'Downloading video JSON', query={ - 'apiKey': 'sh@hid0nlin3', - 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - }) - data = response.get('data', {}) - error = data.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), - expected=True) - - video = data[page_type] - title = video['title'] - categories = [ - category['name'] - for category in video.get('genres', []) if 'name' in category] - - return { - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'thumbnail': video.get('thumbnailUrl'), - 'duration': int_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('referenceDate')), - 'categories': categories, - 'series': video.get('showTitle') or video.get('showName'), - 'season': video.get('seasonTitle'), - 'season_number': int_or_none(video.get('seasonNumber')), - 'season_id': str_or_none(video.get('seasonId')), - 'episode_number': int_or_none(video.get('number')), - 'episode_id': video_id, - 'formats': formats, - } - - -class ShahidShowIE(ShahidBaseIE): - _VALID_URL = 
ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', - 'info_dict': { - 'id': '79187', - 'title': 'رامز قرش البحر', - 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', - }, - 'playlist_mincount': 32, - }, { - 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', - 'only_matching': True - }] - _PAGE_SIZE = 30 - - def _real_extract(self, url): - show_id = self._match_id(url) - - product = self._call_api( - 'playableAsset', show_id, {'showId': show_id})['productModel'] - playlist = product['playlist'] - playlist_id = playlist['id'] - show = product.get('show', {}) - - def page_func(page_num): - playlist = self._call_api( - 'product/playlist', show_id, { - 'playListId': playlist_id, - 'pageNumber': page_num, - 'pageSize': 30, - 'sorts': [{ - 'order': 'DESC', - 'type': 'SORTDATE' - }], - }) - for product in playlist.get('productList', {}).get('products', []): - product_url = (product.get('productUrl') or {}).get('url') - if not product_url: - continue - yield self.url_result( - product_url, 'Shahid', - str_or_none(product.get('id')), - product.get('title')) - - entries = InAdvancePagedList( - page_func, - math.ceil(playlist['count'] / self._PAGE_SIZE), - self._PAGE_SIZE) - - return self.playlist_result( - entries, show_id, show.get('title'), show.get('description')) diff --git a/youtube_dl/extractor/simplecast.py b/youtube_dl/extractor/simplecast.py deleted file mode 100644 index 2d0b3c06d..000000000 --- a/youtube_dl/extractor/simplecast.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_podcast_url, - int_or_none, - parse_iso8601, - strip_or_none, - try_get, - urlencode_postdata, -) - - -class SimplecastBaseIE(InfoExtractor): - _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' - _API_BASE = 'https://api.simplecast.com/' - - def _call_api(self, path_tmpl, video_id): - return self._download_json( - self._API_BASE + path_tmpl % video_id, video_id) - - def _call_search_api(self, resource, resource_id, resource_url): - return self._download_json( - 'https://api.simplecast.com/%ss/search' % resource, resource_id, - data=urlencode_postdata({'url': resource_url})) - - def _parse_episode(self, episode): - episode_id = episode['id'] - title = episode['title'].strip() - audio_file = episode.get('audio_file') or {} - audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] - - season = episode.get('season') or {} - season_href = season.get('href') - season_id = None - if season_href: - season_id = self._search_regex( - r'https?://api\.simplecast\.com/seasons/(%s)' % self._UUID_REGEX, - season_href, 'season id', default=None) - - webpage_url = episode.get('episode_url') - channel_url = None - if webpage_url: - channel_url = self._search_regex( - r'(https?://[^/]+\.simplecast\.com)', - webpage_url, 'channel url', default=None) - - return { - 'id': episode_id, - 'display_id': episode.get('slug'), - 'title': title, - 'url': clean_podcast_url(audio_file_url), - 'webpage_url': webpage_url, - 'channel_url': channel_url, - 'series': try_get(episode, lambda x: x['podcast']['title']), - 'season_number': int_or_none(season.get('number')), - 'season_id': season_id, - 'thumbnail': episode.get('image_url'), - 'episode_id': episode_id, -
'episode_number': int_or_none(episode.get('number')), - 'description': strip_or_none(episode.get('description')), - 'timestamp': parse_iso8601(episode.get('published_at')), - 'duration': int_or_none(episode.get('duration')), - 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), - } - - -class SimplecastIE(SimplecastBaseIE): - IE_NAME = 'simplecast' - _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX - _COMMON_TEST_INFO = { - 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', - 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', - 'ext': 'mp3', - 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', - 'episode_number': 1, - 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', - 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', - 'season_number': 1, - 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', - 'series': 'The RE:BIND.io Podcast', - 'duration': 5343, - 'timestamp': 1580979475, - 'upload_date': '20200206', - 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', - 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', - } - _TESTS = [{ - 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', - 'md5': '8c93be7be54251bf29ee97464eabe61c', - 'info_dict': _COMMON_TEST_INFO, - }, { - 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'''(?x)<iframe[^>]+src=["\'] - ( - https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| - player\.simplecast\.com/%s - ))''' % SimplecastBaseIE._UUID_REGEX, webpage) - - def _real_extract(self, url): - episode_id = self._match_id(url) - episode = self._call_api('episodes/%s', episode_id) - return self._parse_episode(episode) - - -class SimplecastEpisodeIE(SimplecastBaseIE): - IE_NAME = 'simplecast:episode' - _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)' - _TEST = { - 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', - 'md5': '8c93be7be54251bf29ee97464eabe61c', - 'info_dict': SimplecastIE._COMMON_TEST_INFO, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - episode = self._call_search_api( - 'episode', mobj.group(1), mobj.group(0)) - return self._parse_episode(episode) - - -class SimplecastPodcastIE(SimplecastBaseIE): - IE_NAME = 'simplecast:podcast' - _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' - _TESTS = [{ - 'url': 'https://the-re-bind-io-podcast.simplecast.com', - 'playlist_mincount': 33, - 'info_dict': { - 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', - 'title': 'The RE:BIND.io Podcast', - }, - }, { - 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - subdomain = self._match_id(url) - site = self._call_search_api('site', subdomain, url) - podcast = site['podcast'] - podcast_id = podcast['id'] - podcast_title = podcast.get('title') - - def entries(): - episodes = self._call_api('podcasts/%s/episodes', podcast_id) - for episode in (episodes.get('collection') or []): - info = self._parse_episode(episode) - info['series'] = podcast_title - yield info - - return self.playlist_result(entries(), podcast_id, 
podcast_title) diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py deleted file mode 100644 index 07b766b4a..000000000 --- a/youtube_dl/extractor/sina.py +++ /dev/null @@ -1,115 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - HEADRequest, - ExtractorError, - int_or_none, - update_url_query, - qualities, - get_element_by_attribute, - clean_html, -) - - -class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ - (?: - (?:view/|.*\#)(?P<video_id>\d+)| - .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| - # This is used by external sites like Weibo - api/sinawebApi/outplay.php/(?P<token>.+?)\.swf - ) - ''' - - _TESTS = [ - { - 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', - 'md5': 'd38433e2fc886007729735650ae4b3e9', - 'info_dict': { - 'id': '250576622', - 'ext': 'mp4', - 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', - } - }, - { - 'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html', - 'info_dict': { - 'id': '101314253', - 'ext': 'flv', - 'title': '军方提高对朝情报监视级别', - }, - 'skip': 'the page does not exist or has been deleted', - }, - { - 'url': 'http://video.sina.com.cn/view/250587748.html', - 'md5': '3d1807a25c775092aab3bc157fff49b4', - 'info_dict': { - 'id': '250587748', - 'ext': 'mp4', - 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', - }, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('video_id') - if not video_id: - if mobj.group('token') is not None: - # The video id is in the redirected url - self.to_screen('Getting video id') - request = HEADRequest(url) - _, urlh = self._download_webpage_handle(request, 'NA', False) - return self._real_extract(urlh.geturl()) - else: - pseudo_id = mobj.group('pseudo_id') - webpage = self._download_webpage(url, pseudo_id) - error = get_element_by_attribute('class', 'errtitle', webpage) - if error: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error)), expected=True) - video_id = self._search_regex( - r"video_id\s*:\s*'(\d+)'", webpage, 'video id') - - video_data = self._download_json( - 'http://s.video.sina.com.cn/video/h5play', - video_id, query={'video_id': video_id}) - if video_data['code'] != 1: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, video_data['message']), expected=True) - else: - video_data = video_data['data'] - title = video_data['title'] - description = video_data.get('description') - if description: - description = description.strip() - - preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) - formats = [] - for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): - file_api = quality.get('file_api') - file_id = quality.get('file_id') - if not file_api or not file_id: - continue - formats.append({ - 'format_id': quality_id, - 'url': update_url_query(file_api, {'vid': file_id}), - 'preference': preference(quality_id), - 'ext': 'mp4', - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': video_data.get('image'), - 'duration': int_or_none(video_data.get('length')), - 'timestamp': int_or_none(video_data.get('create_time')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py deleted file mode 100644 index 7ec66ecf3..000000000 --- a/youtube_dl/extractor/sixplay.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_urlparse, -) -from ..utils import ( - determine_ext, - int_or_none, - try_get, - qualities, -) - - -class SixPlayIE(InfoExtractor): - IE_NAME = '6play' - _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr|rtlmost\.hu)/.+?-c_)(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', - 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', - 'info_dict': { - 'id': '12041051', - 'ext': 'mp4', - 'title': 'Le but qui a marqué l\'histoire du football français !', - 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', - }, - }, { - 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', - 'only_matching': True, - }, { - 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', - 'only_matching': True, - }, { - 'url': 'https://www.rtlmost.hu/megtorve-p_14167/megtorve-6-resz-c_12397787', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, video_id = re.search(self._VALID_URL, url).groups() - service, consumer_name = { - '6play.fr': ('6play', 'm6web'), - 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), - 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), - 'rtlmost.hu': ('rtlhu_rtl_most', 'rtlhu'), - }.get(domain, ('6play', 'm6web')) - - data = self._download_json( - 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), - video_id, headers={ - 'x-customer-name': consumer_name - }, query={ - 'csa': 5, - 'with': 'clips', - }) - - clip_data = data['clips'][0] - title = clip_data['title'] - - urls = [] - quality_key = qualities(['lq', 'sd', 'hq', 'hd']) - formats = [] - subtitles = {} - assets = clip_data.get('assets') or [] - for asset in assets: - asset_url = asset.get('full_physical_path') - protocol = asset.get('protocol') - if not asset_url or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264') and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url)) or asset_url in urls: - continue - urls.append(asset_url) - container = asset.get('video_container') - ext = determine_ext(asset_url) - if protocol == 'http_subtitle' or ext == 'vtt': - subtitles.setdefault('fr', []).append({'url': asset_url}) - continue - if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp': - if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: - urlh = self._request_webpage( - asset_url, video_id, fatal=False, - headers=self.geo_verification_headers()) - if not urlh: - continue - asset_url = urlh.geturl() - asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') - for i in range(3, 0, -1): - asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) - m3u8_formats = self._extract_m3u8_formats( - asset_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - formats.extend(self._extract_mpd_formats( - asset_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - if m3u8_formats: - break - else: - formats.extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif container == 'mp4' or ext == 'mp4': - quality = asset.get('video_quality') - formats.append({ - 'url': asset_url, - 'format_id': quality, - 'quality':
quality_key(quality), - 'ext': ext, - }) - self._sort_formats(formats) - - def get(getter): - for src in (data, clip_data): - v = try_get(src, getter, compat_str) - if v: - return v - - return { - 'id': video_id, - 'title': title, - 'description': get(lambda x: x['description']), - 'duration': int_or_none(clip_data.get('duration')), - 'series': get(lambda x: x['program']['title']), - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py deleted file mode 100644 index e89ebebe7..000000000 --- a/youtube_dl/extractor/slideshare.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) -from ..utils import ( - ExtractorError, - get_element_by_id, -) - - -class SlideshareIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' - - _TEST = { - 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', - 'info_dict': { - 'id': '25665706', - 'ext': 'mp4', - 'title': 'Managing Scale and Complexity', - 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_title = mobj.group('title') - webpage = self._download_webpage(url, page_title) - slideshare_obj = self._search_regex( - r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);', - webpage, 'slideshare object') - info = json.loads(slideshare_obj) - if info['slideshow']['type'] != 'video': - raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) - - doc = info['doc'] - bucket = info['jsplayer']['video_bucket'] - ext = info['jsplayer']['video_extension'] - video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' 
+ ext) - description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( - r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, - 'description', fatal=False) - - return { - '_type': 'video', - 'id': info['slideshow']['id'], - 'title': info['slideshow']['title'], - 'ext': ext, - 'url': video_url, - 'thumbnail': info['slideshow']['pin_image_url'], - 'description': description.strip() if description else None, - } diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py deleted file mode 100644 index f77354748..000000000 --- a/youtube_dl/extractor/snotr.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_filesize, - str_to_int, -) - - -class SnotrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' - _TESTS = [{ - 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', - 'info_dict': { - 'id': '13708', - 'ext': 'mp4', - 'title': 'Drone flying through fireworks!', - 'duration': 248, - 'filesize_approx': 40700000, - 'description': 'A drone flying through Fourth of July Fireworks', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'expected_warnings': ['description'], - }, { - 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', - 'info_dict': { - 'id': '530', - 'ext': 'mp4', - 'title': 'David Letteman - George W. Bush Top 10', - 'duration': 126, - 'filesize_approx': 8500000, - 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - - description = self._og_search_description(webpage) - info_dict = self._parse_html5_media_entries( - url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] - - view_count = str_to_int(self._html_search_regex( - r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', - webpage, 'view count', fatal=False)) - - duration = parse_duration(self._html_search_regex( - r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)', - webpage, 'duration', fatal=False)) - - filesize_approx = parse_filesize(self._html_search_regex( - r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)', - webpage, 'filesize', fatal=False)) - - info_dict.update({ - 'id': video_id, - 'description': description, - 'title': title, - 'view_count': view_count, - 'duration': duration, - 'filesize_approx': filesize_approx, - }) - - return info_dict diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py deleted file mode 100644 index a62ed84f1..000000000 --- a/youtube_dl/extractor/sohu.py +++ /dev/null @@ -1,202 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - int_or_none, - try_get, -) - - -class SohuIE(InfoExtractor): - _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
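- # (?(mytv)|n) is a regex conditional: when the 'mytv' group matched (URLs like
- # my.tv.sohu.com/us/232799889/78693464.shtml) the numeric ID is bare, while
- # plain tv.sohu.com URLs need the 'n' prefix, as in /20130724/n382479172.shtml.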
- - # Sohu videos give different MD5 sums on Travis CI and my machine - _TESTS = [{ - 'note': 'This video is available only in Mainland China', - 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'info_dict': { - 'id': '382479172', - 'ext': 'mp4', - 'title': 'MV:Far East Movement《The Illest》', - }, - 'skip': 'Only available in China', - }, { - 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'info_dict': { - 'id': '409385080', - 'ext': 'mp4', - 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', - } - }, { - 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'info_dict': { - 'id': '78693464', - 'ext': 'mp4', - 'title': '【爱范品】第31期:MWC见不到的奇葩手机', - } - }, { - 'note': 'Multipart video', - 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', - 'info_dict': { - 'id': '78910339', - 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', - }, - 'playlist': [{ - 'info_dict': { - 'id': '78910339_part1', - 'ext': 'mp4', - 'duration': 294, - 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', - } - }, { - 'info_dict': { - 'id': '78910339_part2', - 'ext': 'mp4', - 'duration': 300, - 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', - } - }, { - 'info_dict': { - 'id': '78910339_part3', - 'ext': 'mp4', - 'duration': 150, - 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', - } - }] - }, { - 'note': 'Video with title containing dash', - 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', - 'info_dict': { - 'id': '78932792', - 'ext': 'mp4', - 'title': 'youtube-dl testing video', - }, - 'params': { - 'skip_download': True - } - }] - - def _real_extract(self, url): - - def _fetch_data(vid_id, mytv=False): - if mytv: - base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' - else: - base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - - return self._download_json( - base_data_url + vid_id, video_id, - 'Downloading JSON data for %s' % vid_id, - headers=self.geo_verification_headers()) - - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - mytv = mobj.group('mytv') is not None - - webpage = self._download_webpage(url, video_id) - - title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) - - vid = self._html_search_regex( - r'var vid ?= ?["\'](\d+)["\']', - webpage, 'video path') - vid_data = _fetch_data(vid, mytv) - if vid_data['play'] != 1: - if vid_data.get('status') == 12: - raise ExtractorError( - '%s said: There\'s something wrong in the video.' % self.IE_NAME, - expected=True) - else: - self.raise_geo_restricted( - '%s said: The video is only licensed to users in Mainland China.'
% self.IE_NAME) - - formats_json = {} - for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): - vid_id = vid_data['data'].get('%sVid' % format_id) - if not vid_id: - continue - vid_id = compat_str(vid_id) - formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) - - part_count = vid_data['data']['totalBlocks'] - - playlist = [] - for i in range(part_count): - formats = [] - for format_id, format_data in formats_json.items(): - allot = format_data['allot'] - - data = format_data['data'] - clips_url = data['clipsURL'] - su = data['su'] - - video_url = 'newflv.sohu.ccgslb.net' - cdnId = None - retries = 0 - - while 'newflv.sohu.ccgslb.net' in video_url: - params = { - 'prot': 9, - 'file': clips_url[i], - 'new': su[i], - 'prod': 'flash', - 'rb': 1, - } - - if cdnId is not None: - params['idc'] = cdnId - - download_note = 'Downloading %s video URL part %d of %d' % ( - format_id, i + 1, part_count) - - if retries > 0: - download_note += ' (retry #%d)' % retries - part_info = self._parse_json(self._download_webpage( - 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), - video_id, download_note), video_id) - - video_url = part_info['url'] - cdnId = part_info.get('nid') - - retries += 1 - if retries > 5: - raise ExtractorError('Failed to get video URL') - - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'filesize': int_or_none( - try_get(data, lambda x: x['clipsBytes'][i])), - 'width': int_or_none(data.get('width')), - 'height': int_or_none(data.get('height')), - 'fps': int_or_none(data.get('fps')), - }) - self._sort_formats(formats) - - playlist.append({ - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - 'duration': vid_data['data']['clipsDuration'][i], - 'formats': formats, - }) - - if len(playlist) == 1: - info = playlist[0] - info['id'] = video_id - else: - info = { - '_type': 'multi_video', - 'entries': playlist, - 'id': video_id, - 'title': title, - } - - return info diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py deleted file mode 100644 index fedfceb62..000000000 --- a/youtube_dl/extractor/sonyliv.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import time -import uuid - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, -) - - -class SonyLIVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true', - 'info_dict': { - 'title': 'Bachelors Delight - Achaari Cheese Toast', - 'id': '1000022678', - 'ext': 'mp4', - 'upload_date': '20200411', - 'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb', - 'timestamp': 1586632091, - 'duration': 185, - 'season_number': 1, - 'episode': 'Achaari Cheese Toast', - 'episode_number': 1, - 'release_year': 2016, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true', - 'only_matching': True, - }, { - 'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925', - 'only_matching': True, - }, { - 'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true', - 'only_matching': True, - }, { - 'url': 
'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true', - 'only_matching': True, - }, { - 'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779', - 'only_matching': True, - }] - _GEO_COUNTRIES = ['IN'] - _TOKEN = None - - def _call_api(self, version, path, video_id): - headers = {} - if self._TOKEN: - headers['security_token'] = self._TOKEN - try: - return self._download_json( - 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), - video_id, headers=headers)['resultObj'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - message = self._parse_json( - e.cause.read().decode(), video_id)['message'] - if message == 'Geoblocked Country': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - raise ExtractorError(message) - raise - - def _real_initialize(self): - self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None) - - def _real_extract(self, url): - video_id = self._match_id(url) - content = self._call_api( - '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id) - if content.get('isEncrypted'): - raise ExtractorError('This video is DRM protected.', expected=True) - dash_url = content['videoURL'] - headers = { - 'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000) - } - formats = self._extract_mpd_formats( - dash_url, video_id, mpd_id='dash', headers=headers, fatal=False) - formats.extend(self._extract_m3u8_formats( - dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'), - video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False)) - for f in formats: - f.setdefault('http_headers', {}).update(headers) - self._sort_formats(formats) - - metadata = self._call_api( - '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] - title = metadata['title'] - episode = metadata.get('episodeTitle') - if episode and title != episode: - title += ' - ' + episode - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': content.get('posterURL'), - 'description': metadata.get('longDescription') or metadata.get('shortDescription'), - 'timestamp': int_or_none(metadata.get('creationDate'), 1000), - 'duration': int_or_none(metadata.get('duration')), - 'season_number': int_or_none(metadata.get('season')), - 'episode': episode, - 'episode_number': int_or_none(metadata.get('episodeNumber')), - 'release_year': int_or_none(metadata.get('year')), - } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py deleted file mode 100644 index abb85e1e5..000000000 --- a/youtube_dl/extractor/soundcloud.py +++ /dev/null @@ -1,815 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import re - -from .common import ( - InfoExtractor, - SearchInfoExtractor -) -from ..compat import ( - compat_HTTPError, - compat_kwargs, - compat_str, - compat_urlparse, -) -from ..utils import ( - error_to_compat_str, - ExtractorError, - float_or_none, - HEADRequest, - int_or_none, - KNOWN_EXTENSIONS, - mimetype2ext, - str_or_none, - try_get, - unified_timestamp, - update_url_query, - url_or_none, - urlhandle_detect_ext, -) - - -class SoundcloudEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' - _TEST = { - # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ - 'url': 
'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', - 'only_matching': True, - } - - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', - webpage)] - - def _real_extract(self, url): - query = compat_urlparse.parse_qs( - compat_urlparse.urlparse(url).query) - api_url = query['url'][0] - secret_token = query.get('secret_token') - if secret_token: - api_url = update_url_query(api_url, {'secret_token': secret_token[0]}) - return self.url_result(api_url) - - -class SoundcloudIE(InfoExtractor): - """Information extractor for soundcloud.com - To access the media, the uid of the song and a stream token - must be extracted from the page source and the script must make - a request to media.soundcloud.com/crossdomain.xml. Then - the media can be grabbed by requesting from an url composed - of the stream token and uid - """ - - _VALID_URL = r'''(?x)^(?:https?://)? - (?:(?:(?:www\.|m\.)?soundcloud\.com/ - (?!stations/track) - (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) - (?P<title>[\w\d-]+)/? - (?P<token>[^?]+?)?(?:[?].*)?$) - |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) - (?:/?\?secret_token=(?P<secret_token>[^&]+))?) - ) - ''' - IE_NAME = 'soundcloud' - _TESTS = [ - { - 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', - 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', - 'info_dict': { - 'id': '62986583', - 'ext': 'mp3', - 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', - 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', - 'uploader': 'E.T. 
ExTerrestrial Music', - 'uploader_id': '1571244', - 'timestamp': 1349920598, - 'upload_date': '20121011', - 'duration': 143.216, - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - } - }, - # geo-restricted - { - 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', - 'info_dict': { - 'id': '47127627', - 'ext': 'mp3', - 'title': 'Goldrushed', - 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', - 'uploader': 'The Royal Concept', - 'uploader_id': '9615865', - 'timestamp': 1337635207, - 'upload_date': '20120521', - 'duration': 227.155, - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - # private link - { - 'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp', - 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', - 'info_dict': { - 'id': '123998367', - 'ext': 'mp3', - 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'description': 'test chars: \"\'/\\ä↭', - 'uploader': 'jaimeMF', - 'uploader_id': '69767071', - 'timestamp': 1386604920, - 'upload_date': '20131209', - 'duration': 9.927, - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - # private link (alt format) - { - 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp', - 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', - 'info_dict': { - 'id': '123998367', - 'ext': 'mp3', - 'title': 'Youtube - Dl Test Video \'\' Ä↭', - 'description': 'test chars: \"\'/\\ä↭', - 'uploader': 'jaimeMF', - 'uploader_id': '69767071', - 'timestamp': 1386604920, - 'upload_date': '20131209', - 'duration': 9.927, - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - # downloadable song - { - 'url': 'https://soundcloud.com/oddsamples/bus-brakes', - 'md5': '7624f2351f8a3b2e7cd51522496e7631', - 'info_dict': { - 'id': '128590877', - 'ext': 'mp3', - 'title': 'Bus Brakes', - 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', - 'uploader': 'oddsamples', - 'uploader_id': '73680509', - 'timestamp': 1389232924, - 'upload_date': '20140109', - 'duration': 17.346, - 'license': 'cc-by-sa', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - # private link, downloadable format - { - 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', - 'md5': '64a60b16e617d41d0bef032b7f55441e', - 'info_dict': { - 'id': '340344461', - 'ext': 'wav', - 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', - 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', - 'uploader': 'Ori Uplift Music', - 'uploader_id': '12563093', - 'timestamp': 1504206263, - 'upload_date': '20170831', - 'duration': 7449.096, - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - # no album art, use avatar pic for thumbnail - { - 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', - 'md5': '59c7872bc44e5d99b7211891664760c2', - 'info_dict': { - 'id': '309699954', - 'ext': 'mp3', - 'title': 'Sideways (Prod. 
Mad Real)', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'uploader': 'garyvee', - 'uploader_id': '2366352', - 'timestamp': 1488152409, - 'upload_date': '20170226', - 'duration': 207.012, - 'thumbnail': r're:https?://.*\.jpg', - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', - 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', - 'info_dict': { - 'id': '583011102', - 'ext': 'mp3', - 'title': 'Mezzo Valzer', - 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Micronie', - 'uploader_id': '3352531', - 'timestamp': 1551394171, - 'upload_date': '20190228', - 'duration': 180.157, - 'thumbnail': r're:https?://.*\.jpg', - 'license': 'all-rights-reserved', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - { - # with AAC HQ format available via OAuth token - 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', - 'only_matching': True, - }, - ] - - _API_V2_BASE = 'https://api-v2.soundcloud.com/' - _BASE_URL = 'https://soundcloud.com/' - _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' - - _ARTWORK_MAP = { - 'mini': 16, - 'tiny': 20, - 'small': 32, - 'badge': 47, - 't67x67': 67, - 'large': 100, - 't300x300': 300, - 'crop': 400, - 't500x500': 500, - 'original': 0, - } - - def _store_client_id(self, client_id): - self._downloader.cache.store('soundcloud', 'client_id', client_id) - - def _update_client_id(self): - webpage = self._download_webpage('https://soundcloud.com/', None) - for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): - script = self._download_webpage(src, None, fatal=False) - if script: - client_id = self._search_regex( - r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', - script, 'client id', default=None) - if client_id: - self._CLIENT_ID = client_id - self._store_client_id(client_id) - return - raise ExtractorError('Unable to extract client id') - - def _download_json(self, *args, **kwargs): - non_fatal = kwargs.get('fatal') is False - if non_fatal: - del kwargs['fatal'] - query = kwargs.get('query', {}).copy() - for _ in range(2): - query['client_id'] = self._CLIENT_ID - kwargs['query'] = query - try: - return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self._store_client_id(None) - self._update_client_id() - continue - elif non_fatal: - self._downloader.report_warning(error_to_compat_str(e)) - return False - raise - - def _real_initialize(self): - self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' - - @classmethod - def _resolv_url(cls, url): - return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url - - def _extract_info_dict(self, info, full_title=None, secret_token=None): - track_id = compat_str(info['id']) - title = info['title'] - - format_urls = set() - formats = [] - query = {'client_id': self._CLIENT_ID} - if secret_token: - query['secret_token'] = secret_token - - if info.get('downloadable') and info.get('has_downloads_left'): - download_url = update_url_query( - self._API_V2_BASE + 'tracks/' + track_id + '/download', query) - redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') - if redirect_url: - urlh = self._request_webpage( - 
HEADRequest(redirect_url), track_id, fatal=False) - if urlh: - format_url = urlh.geturl() - format_urls.add(format_url) - formats.append({ - 'format_id': 'download', - 'ext': urlhandle_detect_ext(urlh) or 'mp3', - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - 'url': format_url, - 'preference': 10, - }) - - def invalid_url(url): - return not url or url in format_urls - - def add_format(f, protocol, is_preview=False): - mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) - if mobj: - for k, v in mobj.groupdict().items(): - if not f.get(k): - f[k] = v - format_id_list = [] - if protocol: - format_id_list.append(protocol) - ext = f.get('ext') - if ext == 'aac': - f['abr'] = '256' - for k in ('ext', 'abr'): - v = f.get(k) - if v: - format_id_list.append(v) - preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) - if preview: - format_id_list.append('preview') - abr = f.get('abr') - if abr: - f['abr'] = int(abr) - if protocol == 'hls': - protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' - else: - protocol = 'http' - f.update({ - 'format_id': '_'.join(format_id_list), - 'protocol': protocol, - 'preference': -10 if preview else None, - }) - formats.append(f) - - # New API - transcodings = try_get( - info, lambda x: x['media']['transcodings'], list) or [] - for t in transcodings: - if not isinstance(t, dict): - continue - format_url = url_or_none(t.get('url')) - if not format_url: - continue - stream = self._download_json( - format_url, track_id, query=query, fatal=False) - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('url')) - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - stream_format = t.get('format') or {} - protocol = stream_format.get('protocol') - if protocol != 'hls' and '/hls' in format_url: - protocol = 'hls' - ext = None - preset = str_or_none(t.get('preset')) - if preset: - ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - ext = mimetype2ext(stream_format.get('mime_type')) - add_format({ - 'url': stream_url, - 'ext': ext, - }, 'http' if protocol == 'progressive' else protocol, - t.get('snipped') or '/preview/' in format_url) - - for f in formats: - f['vcodec'] = 'none' - - if not formats and info.get('policy') == 'BLOCK': - self.raise_geo_restricted() - self._sort_formats(formats) - - user = info.get('user') or {} - - thumbnails = [] - artwork_url = info.get('artwork_url') - thumbnail = artwork_url or user.get('avatar_url') - if isinstance(thumbnail, compat_str): - if re.search(self._IMAGE_REPL_RE, thumbnail): - for image_id, size in self._ARTWORK_MAP.items(): - i = { - 'id': image_id, - 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), - } - if image_id == 'tiny' and not artwork_url: - size = 18 - elif image_id == 'original': - i['preference'] = 10 - if size: - i.update({ - 'width': size, - 'height': size, - }) - thumbnails.append(i) - else: - thumbnails = [{'url': thumbnail}] - - def extract_count(key): - return int_or_none(info.get('%s_count' % key)) - - return { - 'id': track_id, - 'uploader': user.get('username'), - 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), - 'uploader_url': user.get('permalink_url'), - 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, - 'description': info.get('description'), - 'thumbnails': thumbnails, - 'duration': float_or_none(info.get('duration'), 1000), - 'webpage_url': info.get('permalink_url'), - 'license': info.get('license'), - 'view_count': 
extract_count('playback'), - 'like_count': extract_count('favoritings') or extract_count('likes'), - 'comment_count': extract_count('comment'), - 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), - 'formats': formats - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - track_id = mobj.group('track_id') - - query = {} - if track_id: - info_json_url = self._API_V2_BASE + 'tracks/' + track_id - full_title = track_id - token = mobj.group('secret_token') - if token: - query['secret_token'] = token - else: - full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title') - token = mobj.group('token') - if token: - resolve_title += '/%s' % token - info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - - info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query) - - return self._extract_info_dict(info, full_title, token) - - -class SoundcloudPlaylistBaseIE(SoundcloudIE): - def _extract_set(self, playlist, token=None): - playlist_id = compat_str(playlist['id']) - tracks = playlist.get('tracks') or [] - if not all([t.get('permalink_url') for t in tracks]) and token: - tracks = self._download_json( - self._API_V2_BASE + 'tracks', playlist_id, - 'Downloading tracks', query={ - 'ids': ','.join([compat_str(t['id']) for t in tracks]), - 'playlistId': playlist_id, - 'playlistSecretToken': token, - }) - entries = [] - for track in tracks: - track_id = str_or_none(track.get('id')) - url = track.get('permalink_url') - if not url: - if not track_id: - continue - url = self._API_V2_BASE + 'tracks/' + track_id - if token: - url += '?secret_token=' + token - entries.append(self.url_result( - url, SoundcloudIE.ie_key(), track_id)) - return self.playlist_result( - entries, playlist_id, - playlist.get('title'), - playlist.get('description')) - - -class SoundcloudSetIE(SoundcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' - IE_NAME = 'soundcloud:set' - _TESTS = [{ - 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', - 'info_dict': { - 'id': '2284613', - 'title': 'The Royal Concept EP', - 'description': 'md5:71d07087c7a449e8941a70a29e34671e', - }, - 'playlist_mincount': 5, - }, { - 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title') - token = mobj.group('token') - if token: - full_title += '/' + token - - info = self._download_json(self._resolv_url( - self._BASE_URL + full_title), full_title) - - if 'errors' in info: - msgs = (compat_str(err['error_message']) for err in info['errors']) - raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - - return self._extract_set(info, token) - - -class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): - def _extract_playlist(self, base_url, playlist_id, playlist_title): - # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. 
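# In other words: ask for pages of at most 200 items with linked
# partitioning enabled, read each page's 'collection', and follow
# 'next_href' until it disappears. A minimal standalone sketch of that
# contract (hypothetical helper; CLIENT_ID handling is a placeholder,
# not a real credential):

import json
from urllib.parse import urlencode
from urllib.request import urlopen

def iter_collection(endpoint, client_id, limit=200):
    # First page: explicit limit/offset plus linked partitioning.
    url = '%s?%s' % (endpoint, urlencode({
        'client_id': client_id,
        'limit': limit,
        'offset': 0,
        'linked_partitioning': 1,
    }))
    while url:
        page = json.load(urlopen(url))
        # Items live in 'collection'; an empty page is possible
        # mid-stream, in which case we go straight to the next cursor.
        for item in page.get('collection') or []:
            yield item
        # 'next_href' is the cursor to the next page (absent on the
        # last one); the client id must be re-appended on every hop.
        url = page.get('next_href')
        if url:
            url += ('&' if '?' in url else '?') + 'client_id=' + client_id

# Reference: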
- # https://developers.soundcloud.com/blog/offset-pagination-deprecated - COMMON_QUERY = { - 'limit': 200, - 'linked_partitioning': '1', - } - - query = COMMON_QUERY.copy() - query['offset'] = 0 - - next_href = base_url - - entries = [] - for i in itertools.count(): - response = self._download_json( - next_href, playlist_id, - 'Downloading track page %s' % (i + 1), query=query) - - collection = response['collection'] - - if not isinstance(collection, list): - collection = [] - - # Empty collection may be returned, in this case we proceed - # straight to next_href - - def resolve_entry(candidates): - for cand in candidates: - if not isinstance(cand, dict): - continue - permalink_url = url_or_none(cand.get('permalink_url')) - if not permalink_url: - continue - return self.url_result( - permalink_url, - SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, - str_or_none(cand.get('id')), cand.get('title')) - - for e in collection: - entry = resolve_entry((e, e.get('track'), e.get('playlist'))) - if entry: - entries.append(entry) - - next_href = response.get('next_href') - if not next_href: - break - - next_href = response['next_href'] - parsed_next_href = compat_urlparse.urlparse(next_href) - query = compat_urlparse.parse_qs(parsed_next_href.query) - query.update(COMMON_QUERY) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } - - -class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|m)\.)?soundcloud\.com/ - (?P<user>[^/]+) - (?:/ - (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) - )? - /?(?:[?#].*)?$ - ''' - IE_NAME = 'soundcloud:user' - _TESTS = [{ - 'url': 'https://soundcloud.com/soft-cell-official', - 'info_dict': { - 'id': '207965082', - 'title': 'Soft Cell (All)', - }, - 'playlist_mincount': 28, - }, { - 'url': 'https://soundcloud.com/soft-cell-official/tracks', - 'info_dict': { - 'id': '207965082', - 'title': 'Soft Cell (Tracks)', - }, - 'playlist_mincount': 27, - }, { - 'url': 'https://soundcloud.com/soft-cell-official/albums', - 'info_dict': { - 'id': '207965082', - 'title': 'Soft Cell (Albums)', - }, - 'playlist_mincount': 1, - }, { - 'url': 'https://soundcloud.com/jcv246/sets', - 'info_dict': { - 'id': '12982173', - 'title': 'Jordi / cv (Sets)', - }, - 'playlist_mincount': 2, - }, { - 'url': 'https://soundcloud.com/jcv246/reposts', - 'info_dict': { - 'id': '12982173', - 'title': 'Jordi / cv (Reposts)', - }, - 'playlist_mincount': 6, - }, { - 'url': 'https://soundcloud.com/clalberg/likes', - 'info_dict': { - 'id': '11817582', - 'title': 'clalberg (Likes)', - }, - 'playlist_mincount': 5, - }, { - 'url': 'https://soundcloud.com/grynpyret/spotlight', - 'info_dict': { - 'id': '7098329', - 'title': 'Grynpyret (Spotlight)', - }, - 'playlist_mincount': 1, - }] - - _BASE_URL_MAP = { - 'all': 'stream/users/%s', - 'tracks': 'users/%s/tracks', - 'albums': 'users/%s/albums', - 'sets': 'users/%s/playlists', - 'reposts': 'stream/users/%s/reposts', - 'likes': 'users/%s/likes', - 'spotlight': 'users/%s/spotlight', - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader = mobj.group('user') - - user = self._download_json( - self._resolv_url(self._BASE_URL + uploader), - uploader, 'Downloading user info') - - resource = mobj.group('rsrc') or 'all' - - return self._extract_playlist( - self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'], - str_or_none(user.get('id')), - '%s (%s)' % (user['username'], resource.capitalize())) - - 
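The user extractor above has two moving parts worth calling out: the api-v2 resolve endpoint turns a public profile URL into a numeric user id, and _BASE_URL_MAP turns that id plus the URL's trailing path segment into the collection endpoint that _extract_playlist pages through. A rough standalone sketch of that lookup, assuming urllib and a placeholder client id (CLIENT_ID and user_endpoint are illustrative names, not part of the module):

import json
from urllib.parse import urlencode
from urllib.request import urlopen

API_V2 = 'https://api-v2.soundcloud.com/'
CLIENT_ID = '...'  # placeholder; a valid client id is required in practice

BASE_URL_MAP = {  # mirrors _BASE_URL_MAP above
    'all': 'stream/users/%s',
    'tracks': 'users/%s/tracks',
    'albums': 'users/%s/albums',
    'sets': 'users/%s/playlists',
    'reposts': 'stream/users/%s/reposts',
    'likes': 'users/%s/likes',
    'spotlight': 'users/%s/spotlight',
}

def user_endpoint(profile_url, resource='all'):
    # Step 1: resolve the public profile URL to the user object.
    resolve = API_V2 + 'resolve?' + urlencode(
        {'url': profile_url, 'client_id': CLIENT_ID})
    user = json.load(urlopen(resolve))
    # Step 2: build the collection endpoint for the requested resource.
    return API_V2 + BASE_URL_MAP[resource] % user['id']

# e.g. user_endpoint('https://soundcloud.com/soft-cell-official', 'tracks')
# would yield 'https://api-v2.soundcloud.com/users/207965082/tracks',
# matching the playlist id in the tests above.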
-class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' - IE_NAME = 'soundcloud:trackstation' - _TESTS = [{ - 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', - 'info_dict': { - 'id': '286017854', - 'title': 'Track station: your text', - }, - 'playlist_mincount': 47, - }] - - def _real_extract(self, url): - track_name = self._match_id(url) - - track = self._download_json(self._resolv_url(url), track_name) - track_id = self._search_regex( - r'soundcloud:track-stations:(\d+)', track['id'], 'track id') - - return self._extract_playlist( - self._API_V2_BASE + 'stations/%s/tracks' % track['id'], - track_id, 'Track station: %s' % track['title']) - - -class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): - _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' - IE_NAME = 'soundcloud:playlist' - _TESTS = [{ - 'url': 'https://api.soundcloud.com/playlists/4110309', - 'info_dict': { - 'id': '4110309', - 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', - 'description': 're:.*?TILT Brass - Bowery Poetry Club', - }, - 'playlist_count': 6, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - query = {} - token = mobj.group('token') - if token: - query['secret_token'] = token - - data = self._download_json( - self._API_V2_BASE + 'playlists/' + playlist_id, - playlist_id, 'Downloading playlist', query=query) - - return self._extract_set(data, token) - - -class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): - IE_NAME = 'soundcloud:search' - IE_DESC = 'Soundcloud search' - _MAX_RESULTS = float('inf') - _TESTS = [{ - 'url': 'scsearch15:post-avant jazzcore', - 'info_dict': { - 'title': 'post-avant jazzcore', - }, - 'playlist_count': 15, - }] - - _SEARCH_KEY = 'scsearch' - _MAX_RESULTS_PER_PAGE = 200 - _DEFAULT_RESULTS_PER_PAGE = 50 - - def _get_collection(self, endpoint, collection_id, **query): - limit = min( - query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), - self._MAX_RESULTS_PER_PAGE) - query.update({ - 'limit': limit, - 'linked_partitioning': 1, - 'offset': 0, - }) - next_url = update_url_query(self._API_V2_BASE + endpoint, query) - - collected_results = 0 - - for i in itertools.count(1): - response = self._download_json( - next_url, collection_id, 'Downloading page {0}'.format(i), - 'Unable to download API page') - - collection = response.get('collection', []) - if not collection: - break - - collection = list(filter(bool, collection)) - collected_results += len(collection) - - for item in collection: - yield self.url_result(item['uri'], SoundcloudIE.ie_key()) - - if not collection or collected_results >= limit: - break - - next_url = response.get('next_href') - if not next_url: - break - - def _get_n_results(self, query, n): - tracks = self._get_collection('search/tracks', query, limit=n, q=query) - return self.playlist_result(tracks, playlist_title=query) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py deleted file mode 100644 index 3d78a9d76..000000000 --- a/youtube_dl/extractor/soundgasm.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class SoundgasmIE(InfoExtractor): - IE_NAME = 'soundgasm' - _VALID_URL = 
r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' - _TEST = { - 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', - 'md5': '010082a2c802c5275bb00030743e75ad', - 'info_dict': { - 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', - 'ext': 'm4a', - 'title': 'Piano sample', - 'description': 'Royalty Free Sample Music', - 'uploader': 'ytdl', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - audio_url = self._html_search_regex( - r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'audio URL', group='url') - - title = self._search_regex( - r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', - webpage, 'title', default=display_id) - - description = self._html_search_regex( - (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', - r'(?s)<li>Description:\s(.*?)<\/li>'), - webpage, 'description', fatal=False) - - audio_id = self._search_regex( - r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) - - return { - 'id': audio_id, - 'display_id': display_id, - 'url': audio_url, - 'vcodec': 'none', - 'title': title, - 'description': description, - 'uploader': mobj.group('user'), - } - - -class SoundgasmProfileIE(InfoExtractor): - IE_NAME = 'soundgasm:profile' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' - _TEST = { - 'url': 'http://soundgasm.net/u/ytdl', - 'info_dict': { - 'id': 'ytdl', - }, - 'playlist_count': 1, - } - - def _real_extract(self, url): - profile_id = self._match_id(url) - - webpage = self._download_webpage(url, profile_id) - - entries = [ - self.url_result(audio_url, 'Soundgasm') - for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] - - return self.playlist_result(entries, profile_id) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py deleted file mode 100644 index 0774da06e..000000000 --- a/youtube_dl/extractor/southpark.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor - - -class SouthParkIE(MTVServicesInfoExtractor): - IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' - - _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - - _TESTS = [{ - 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', - 'info_dict': { - 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'South Park|Bat Daded', - 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', - 'timestamp': 1112760000, - 'upload_date': '20050406', - }, - }, { - 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', - 'only_matching': True, - }, { - 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', - 'only_matching': True, - }] - - def _get_feed_query(self, uri): - return { - 'accountOverride': 'intl.mtvi.com', - 'arcEp': 'shared.southpark.global', - 'ep': '90877963', - 'imageEp': 'shared.southpark.global', - 'mgid': uri, - } - - -class SouthParkEsIE(SouthParkIE): - IE_NAME = 'southpark.cc.com:español' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' - _LANG = 'es' - - _TESTS = [{ - 'url': 
'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', - 'info_dict': { - 'title': 'Cartman Consigue Una Sonda Anal', - 'description': 'Cartman Consigue Una Sonda Anal', - }, - 'playlist_count': 4, - 'skip': 'Geo-restricted', - }] - - -class SouthParkDeIE(SouthParkIE): - IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))' - _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' - - _TESTS = [{ - 'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', - 'info_dict': { - 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', - 'ext': 'mp4', - 'title': 'South Park|The Government Won\'t Respect My Privacy', - 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', - 'timestamp': 1380160800, - 'upload_date': '20130926', - }, - }, { - # non-ASCII characters in initial URL - 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', - 'info_dict': { - 'title': 'Hashtag „Aufwärmen“', - 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', - }, - 'playlist_count': 3, - }, { - # non-ASCII characters in redirect URL - 'url': 'http://www.southpark.de/alle-episoden/s18e09', - 'info_dict': { - 'title': 'Hashtag „Aufwärmen“', - 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', - }, - 'playlist_count': 3, - }, { - 'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1', - 'only_matching': True, - }] - - -class SouthParkNlIE(SouthParkIE): - IE_NAME = 'southpark.nl' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' - _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' - - _TESTS = [{ - 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', - 'info_dict': { - 'title': 'Freemium Isn\'t Free', - 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', - }, - 'playlist_mincount': 3, - }] - - -class SouthParkDkIE(SouthParkIE): - IE_NAME = 'southparkstudios.dk' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' - _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' - - _TESTS = [{ - 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', - 'info_dict': { - 'title': 'Grounded Vindaloop', - 'description': 'Butters is convinced he\'s living in a virtual reality.', - }, - 'playlist_mincount': 3, - }, { - 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', - 'only_matching': True, - }, { - 'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py deleted file mode 100644 index 37cb8c839..000000000 --- a/youtube_dl/extractor/spankbang.py +++ /dev/null @@ -1,198 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - merge_dicts, - parse_duration, - parse_resolution, - 
str_to_int, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class SpankBangIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:[^/]+\.)?spankbang\.com/ - (?: - (?P<id>[\da-z]+)/(?:video|play|embed)\b| - [\da-z]+-(?P<id_2>[\da-z]+)/playlist/[^/?#&]+ - ) - ''' - _TESTS = [{ - 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', - 'md5': '1cc433e1d6aa14bc376535b8679302f7', - 'info_dict': { - 'id': '3vvn', - 'ext': 'mp4', - 'title': 'fantasy solo', - 'description': 'dillion harper masturbates on a bed', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'silly2587', - 'timestamp': 1422571989, - 'upload_date': '20150129', - 'age_limit': 18, - } - }, { - # 480p only - 'url': 'http://spankbang.com/1vt0/video/solvane+gangbang', - 'only_matching': True, - }, { - # no uploader - 'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2', - 'only_matching': True, - }, { - # mobile page - 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', - 'only_matching': True, - }, { - # 4k - 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', - 'only_matching': True, - }, { - 'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/', - 'only_matching': True, - }, { - 'url': 'https://m.spankbang.com/3vvn/play', - 'only_matching': True, - }, { - 'url': 'https://spankbang.com/2y3td/embed/', - 'only_matching': True, - }, { - 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_2') - webpage = self._download_webpage( - url.replace('/%s/embed' % video_id, '/%s/video' % video_id), - video_id, headers={'Cookie': 'country=US'}) - - if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): - raise ExtractorError( - 'Video %s is not available' % video_id, expected=True) - - formats = [] - - def extract_format(format_id, format_url): - f_url = url_or_none(format_url) - if not f_url: - return - f = parse_resolution(format_id) - ext = determine_ext(f_url) - if format_id.startswith('m3u8') or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif format_id.startswith('mpd') or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - f_url, video_id, mpd_id='dash', fatal=False)) - elif ext == 'mp4' or f.get('width') or f.get('height'): - f.update({ - 'url': f_url, - 'format_id': format_id, - }) - formats.append(f) - - STREAM_URL_PREFIX = 'stream_url_' - - for mobj in re.finditer( - r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2' - % STREAM_URL_PREFIX, webpage): - extract_format(mobj.group('id', 'url')) - - if not formats: - stream_key = self._search_regex( - r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'stream key', group='value') - - stream = self._download_json( - 'https://spankbang.com/api/videos/stream', video_id, - 'Downloading stream JSON', data=urlencode_postdata({ - 'id': stream_key, - 'data': 0, - }), headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - for format_id, format_url in stream.items(): - if format_url and isinstance(format_url, list): - format_url = format_url[0] - extract_format(format_id, format_url) - - self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) - - info = self._search_json_ld(webpage, video_id, default={}) - - title = self._html_search_regex( - 
r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None) - description = self._search_regex( - r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', - webpage, 'description', default=None) - thumbnail = self._og_search_thumbnail(webpage, default=None) - uploader = self._html_search_regex( - (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>', - r'class="user"[^>]*><img[^>]+>([^<]+)'), - webpage, 'uploader', default=None) - duration = parse_duration(self._search_regex( - r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', - webpage, 'duration', default=None)) - view_count = str_to_int(self._search_regex( - r'([\d,.]+)\s+plays', webpage, 'view count', default=None)) - - age_limit = self._rta_search(webpage) - - return merge_dicts({ - 'id': video_id, - 'title': title or video_id, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - 'age_limit': age_limit, - }, info - ) - - -class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)' - _TEST = { - 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', - 'info_dict': { - 'id': 'ug0k', - 'title': 'Big Ass Titties', - }, - 'playlist_mincount': 40, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage( - url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) - - entries = [self.url_result( - urljoin(url, mobj.group('path')), - ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) - for mobj in re.finditer( - r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' - % re.escape(display_id), webpage)] - - title = self._html_search_regex( - r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title', - fatal=False) - - return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py deleted file mode 100644 index 35ab9ec37..000000000 --- a/youtube_dl/extractor/spankwire.py +++ /dev/null @@ -1,182 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - merge_dicts, - str_or_none, - str_to_int, - url_or_none, -) - - -class SpankwireIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?spankwire\.com/ - (?: - [^/]+/video| - EmbedPlayer\.aspx/?\?.*?\bArticleId= - ) - (?P<id>\d+) - ''' - _TESTS = [{ - # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 - 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', - 'info_dict': { - 'id': '103545', - 'ext': 'mp4', - 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', - 'description': 'Crazy Bitch X rated music video.', - 'duration': 222, - 'uploader': 'oreusz', - 'uploader_id': '124697', - 'timestamp': 1178587885, - 'upload_date': '20070508', - 'average_rating': float, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - 'categories': list, - 'tags': list, - }, - }, { - # download URL pattern: */mp4_<format_id>_<video_id>.mp4 - 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', - 'md5': '09b3c20833308b736ae8902db2f8d7e6', - 'info_dict': { - 'id': '1921551', - 'ext': 'mp4', - 'title': 'Titcums Compiloation I', - 
'description': 'cum on tits', - 'uploader': 'dannyh78999', - 'uploader_id': '3056053', - 'upload_date': '20150822', - 'age_limit': 18, - }, - 'params': { - 'proxy': '127.0.0.1:8118' - }, - 'skip': 'removed', - }, { - 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - - title = video['title'] - - formats = [] - videos = video.get('videos') - if isinstance(videos, dict): - for format_id, format_url in videos.items(): - video_url = url_or_none(format_url) - if not format_url: - continue - height = int_or_none(self._search_regex( - r'(\d+)[pP]', format_id, 'height', default=None)) - m = re.search( - r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) - if m: - tbr = int(m.group('tbr')) - height = height or int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else format_id, - 'height': height, - 'tbr': tbr, - }) - m3u8_url = url_or_none(video.get('HLS')) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) - - view_count = str_to_int(video.get('viewed')) - - thumbnails = [] - for preference, t in enumerate(('', '2x'), start=0): - thumbnail_url = url_or_none(video.get('poster%s' % t)) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'preference': preference, - }) - - def extract_names(key): - entries_list = video.get(key) - if not isinstance(entries_list, list): - return - entries = [] - for entry in entries_list: - name = str_or_none(entry.get('name')) - if name: - entries.append(name) - return entries - - categories = extract_names('categories') - tags = extract_names('tags') - - uploader = None - info = {} - - webpage = self._download_webpage( - 'https://www.spankwire.com/_/video%s/' % video_id, video_id, - fatal=False) - if webpage: - info = self._search_json_ld(webpage, video_id, default={}) - thumbnail_url = None - if 'thumbnail' in info: - thumbnail_url = url_or_none(info['thumbnail']) - del info['thumbnail'] - if not thumbnail_url: - thumbnail_url = self._og_search_thumbnail(webpage) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'preference': 10, - }) - uploader = self._html_search_regex( - r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - if not view_count: - view_count = str_to_int(self._search_regex( - r'data-views=["\']([\d,.]+)', webpage, 'view count', - fatal=False)) - - return merge_dicts({ - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'duration': int_or_none(video.get('duration')), - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': str_or_none(video.get('userId')), - 'timestamp': int_or_none(video.get('time_approved_on')), - 'average_rating': float_or_none(video.get('rating')), - 'view_count': view_count, - 'comment_count': int_or_none(video.get('comments')), - 'age_limit': 18, - 'categories': categories, - 'tags': tags, - 'formats': formats, - }, info) diff --git 
a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py deleted file mode 100644 index a417b5a4e..000000000 --- a/youtube_dl/extractor/sport5.py +++ /dev/null @@ -1,92 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class Sport5IE(InfoExtractor): - _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', - 'info_dict': { - 'id': 's5-Y59xx1-GUh2', - 'ext': 'mp4', - 'title': 'ולנסיה-קורדובה 0:3', - 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', - 'duration': 228, - 'categories': list, - }, - 'skip': 'Blocked outside of Israel', - }, { - 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', - 'info_dict': { - 'id': 's5-SiXxx1-hKh2', - 'ext': 'mp4', - 'title': 'GOALS_CELTIC_270914.mp4', - 'description': '', - 'duration': 87, - 'categories': list, - }, - 'skip': 'Blocked outside of Israel', - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - media_id = mobj.group('id') - - webpage = self._download_webpage(url, media_id) - - video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') - - metadata = self._download_xml( - 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, - video_id) - - error = metadata.find('./Error') - if error is not None: - raise ExtractorError( - '%s returned error: %s - %s' % ( - self.IE_NAME, - error.find('./Name').text, - error.find('./Description').text), - expected=True) - - title = metadata.find('./Title').text - description = metadata.find('./Description').text - duration = int(metadata.find('./Duration').text) - - posters_el = metadata.find('./PosterLinks') - thumbnails = [{ - 'url': thumbnail.text, - 'width': int(thumbnail.get('width')), - 'height': int(thumbnail.get('height')), - } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] - - categories_el = metadata.find('./Categories') - categories = [ - cat.get('name') for cat in categories_el.findall('./Category') - ] if categories_el is not None else [] - - formats = [{ - 'url': fmt.text, - 'ext': 'mp4', - 'vbr': int(fmt.get('bitrate')), - 'width': int(fmt.get('width')), - 'height': int(fmt.get('height')), - } for fmt in metadata.findall('./PlaybackLinks/FileURL')] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'duration': duration, - 'categories': categories, - 'formats': formats, - } diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py deleted file mode 100644 index 3e497a939..000000000 --- a/youtube_dl/extractor/sportdeutschland.py +++ /dev/null @@ -1,105 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - clean_html, - float_or_none, - int_or_none, - parse_iso8601, - strip_or_none, - try_get, -) - - -class SportDeutschlandIE(InfoExtractor): - _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)' - _TESTS = [{ - 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', - 'info_dict': { - 'id': '5318cac0275701382770543d7edaf0a0', - 'ext': 'mp4', - 
'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', - 'duration': 16106.36, - }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', - 'info_dict': { - 'id': 'c6e2fdd01f63013854c47054d2ab776f', - 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', - 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', - 'duration': 31397, - }, - 'playlist_count': 2, - }, { - 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - data = self._download_json( - 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, - display_id, query={'access_token': 'true'}) - asset = data['asset'] - title = (asset.get('title') or asset['label']).strip() - asset_id = asset.get('id') or asset.get('uuid') - info = { - 'id': asset_id, - 'title': title, - 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), - 'duration': int_or_none(asset.get('seconds')), - } - videos = asset.get('videos') or [] - if len(videos) > 1: - playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] - if playlist_id: - if self._downloader.params.get('noplaylist'): - videos = [videos[int(playlist_id)]] - self.to_screen('Downloading just a single video because of --no-playlist') - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) - - def entries(): - for i, video in enumerate(videos, 1): - video_id = video.get('uuid') - video_url = video.get('url') - if not (video_id and video_url): - continue - formats = self._extract_m3u8_formats( - video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) - if not formats: - continue - yield { - 'id': video_id, - 'formats': formats, - 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), - 'duration': float_or_none(video.get('duration')), - } - info.update({ - '_type': 'multi_video', - 'entries': entries(), - }) - else: - formats = self._extract_m3u8_formats( - videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') - section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) - info.update({ - 'formats': formats, - 'display_id': asset.get('permalink'), - 'thumbnail': try_get(asset, lambda x: x['images'][0]), - 'categories': [section_title] if section_title else None, - 'view_count': int_or_none(asset.get('views')), - 'is_live': asset.get('is_live') is True, - 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), - }) - return info diff --git a/youtube_dl/extractor/springboardplatform.py b/youtube_dl/extractor/springboardplatform.py deleted file mode 100644 index 07d99b579..000000000 --- a/youtube_dl/extractor/springboardplatform.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - xpath_attr, - xpath_text, - xpath_element, - unescapeHTML, - unified_timestamp, -) - - -class SpringboardPlatformIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - cms\.springboardplatform\.com/ - (?: - (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)| - xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) - ) - ''' - _TESTS = [{ - 'url': 
'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', - 'md5': '5c3cb7b5c55740d482561099e920f192', - 'info_dict': { - 'id': '981017', - 'ext': 'mp4', - 'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', - 'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1409132328, - 'upload_date': '20140827', - 'duration': 193, - }, - }, { - 'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1', - 'only_matching': True, - }, { - 'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10', - 'only_matching': True, - }, { - 'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', - webpage)] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_2') - index = mobj.group('index') or mobj.group('index_2') - - video = self._download_xml( - 'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s' - % (index, video_id), video_id) - - item = xpath_element(video, './/item', 'item', fatal=True) - - content = xpath_element( - item, './{http://search.yahoo.com/mrss/}content', 'content', - fatal=True) - title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True)) - - video_url = content.attrib['url'] - - if 'error_video.mp4' in video_url: - raise ExtractorError( - 'Video %s no longer exists' % video_id, expected=True) - - duration = int_or_none(content.get('duration')) - tbr = int_or_none(content.get('bitrate')) - filesize = int_or_none(content.get('fileSize')) - width = int_or_none(content.get('width')) - height = int_or_none(content.get('height')) - - description = unescapeHTML(xpath_text( - item, './description', 'description')) - thumbnail = xpath_attr( - item, './{http://search.yahoo.com/mrss/}thumbnail', 'url', - 'thumbnail') - - timestamp = unified_timestamp(xpath_text( - item, './{http://cms.springboardplatform.com/namespaces.html}created', - 'timestamp')) - - formats = [{ - 'url': video_url, - 'format_id': 'http', - 'tbr': tbr, - 'filesize': filesize, - 'width': width, - 'height': height, - }] - - m3u8_format = formats[0].copy() - m3u8_format.update({ - 'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8', - 'ext': 'mp4', - 'format_id': 'hls', - 'protocol': 'm3u8_native', - }) - formats.append(m3u8_format) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py deleted file mode 100644 index ac018e740..000000000 --- a/youtube_dl/extractor/srgssr.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, - qualities, - try_get, -) - - -class SRGSSRIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| - srgssr - ): - (?P<bu> - srf|rts|rsi|rtr|swi - ):(?:[^:]+:)? 
- (?P<type> - video|audio - ): - (?P<id> - [0-9a-f\-]{36}|\d+ - ) - ''' - _GEO_BYPASS = False - _GEO_COUNTRIES = ['CH'] - - _ERRORS = { - 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', - 'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.', - # 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.', - 'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.', - 'LEGAL': 'The video cannot be transmitted for legal reasons.', - 'STARTDATE': 'This video is not yet available. Please try again later.', - } - _DEFAULT_LANGUAGE_CODES = { - 'srf': 'de', - 'rts': 'fr', - 'rsi': 'it', - 'rtr': 'rm', - 'swi': 'en', - } - - def _get_tokenized_src(self, url, video_id, format_id): - token = self._download_json( - 'http://tp.srgssr.ch/akahd/token?acl=*', - video_id, 'Downloading %s token' % format_id, fatal=False) or {} - auth_params = try_get(token, lambda x: x['token']['authparams']) - if auth_params: - url += ('?' if '?' not in url else '&') + auth_params - return url - - def _get_media_data(self, bu, media_type, media_id): - query = {'onlyChapters': True} if media_type == 'video' else {} - full_media_data = self._download_json( - 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' - % (bu, media_type, media_id), - media_id, query=query)['chapterList'] - try: - media_data = next( - x for x in full_media_data if x.get('id') == media_id) - except StopIteration: - raise ExtractorError('No media information found') - - block_reason = media_data.get('blockReason') - if block_reason and block_reason in self._ERRORS: - message = self._ERRORS[block_reason] - if block_reason == 'GEOBLOCK': - self.raise_geo_restricted( - msg=message, countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message), expected=True) - - return media_data - - def _real_extract(self, url): - bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - media_data = self._get_media_data(bu, media_type, media_id) - title = media_data['title'] - - formats = [] - q = qualities(['SD', 'HD']) - for source in (media_data.get('resourceList') or []): - format_url = source.get('url') - if not format_url: - continue - protocol = source.get('protocol') - quality = source.get('quality') - format_id = [] - for e in (protocol, source.get('encoding'), quality): - if e: - format_id.append(e) - format_id = '-'.join(format_id) - - if protocol in ('HDS', 'HLS'): - if source.get('tokenType') == 'AKAMAI': - format_url = self._get_tokenized_src( - format_url, media_id, format_id) - formats.extend(self._extract_akamai_formats( - format_url, media_id)) - elif protocol == 'HLS': - formats.extend(self._extract_m3u8_formats( - format_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif protocol in ('HTTP', 'HTTPS'): - formats.append({ - 'format_id': format_id, - 'url': format_url, - 'quality': q(quality), - }) - - # This is needed because for audio medias the podcast url is usually - # always included, even if is only an audio segment and not the - # whole episode. 
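# The ranking fed into 'quality' here and in the resourceList loop above
# comes from the qualities() helper imported from utils; it behaves
# essentially like this small sketch (position in the preference list,
# with unknown labels sorting last), so 'HD' outranks 'SD':

def _quality_rank(preference):  # illustrative stand-in for qualities()
    def rank(label):
        try:
            return preference.index(label)
        except ValueError:
            return -1  # unlisted labels sort below everything else
    return rank

rank = _quality_rank(['SD', 'HD'])
assert rank('HD') > rank('SD') > rank('bogus')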
- if int_or_none(media_data.get('position')) == 0: - for p in ('S', 'H'): - podcast_url = media_data.get('podcast%sdUrl' % p) - if not podcast_url: - continue - quality = p + 'D' - formats.append({ - 'format_id': 'PODCAST-' + quality, - 'url': podcast_url, - 'quality': q(quality), - }) - self._sort_formats(formats) - - subtitles = {} - if media_type == 'video': - for sub in (media_data.get('subtitleList') or []): - sub_url = sub.get('url') - if not sub_url: - continue - lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] - subtitles.setdefault(lang, []).append({ - 'url': sub_url, - }) - - return { - 'id': media_id, - 'title': title, - 'description': media_data.get('description'), - 'timestamp': parse_iso8601(media_data.get('date')), - 'thumbnail': media_data.get('imageUrl'), - 'duration': float_or_none(media_data.get('duration'), 1000), - 'subtitles': subtitles, - 'formats': formats, - } - - -class SRGSSRPlayIE(InfoExtractor): - IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|play)\.)? - (?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/ - (?: - [^/]+/(?P<type>video|audio)/[^?]+| - popup(?P<type_2>video|audio)player - ) - \?.*?\b(?:id=|urn=urn:[^:]+:video:)(?P<id>[0-9a-f\-]{36}|\d+) - ''' - - _TESTS = [{ - 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': '6db2226ba97f62ad42ce09783680046c', - 'info_dict': { - 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'ext': 'mp4', - 'upload_date': '20130701', - 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372708215, - 'duration': 113.827, - 'thumbnail': r're:^https?://.*1383719781\.png$', - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', - 'info_dict': { - 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', - 'ext': 'mp3', - 'upload_date': '20151013', - 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444709160, - 'duration': 336.816, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', - 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', - 'info_dict': { - 'id': '6348260', - 'display_id': '6348260', - 'ext': 'mp4', - 'duration': 1796.76, - 'title': 'Le 19h30', - 'upload_date': '20141201', - 'timestamp': 1417458600, - 'thumbnail': r're:^https?://.*\.image', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', - 'info_dict': { - 'id': '42960270', - 'ext': 'mp4', - 'title': 'Why people were against tax reforms', - 'description': 'md5:7ac442c558e9630e947427469c4b824d', - 'duration': 94.0, - 'upload_date': '20170215', - 'timestamp': 1487173560, - 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', - 'subtitles': 'count:9', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', - 'only_matching': True, - }, { - 'url': 'https://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?urn=urn:srf:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, - }, { - 'url': 
'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', - 'only_matching': True, - }, { - # audio segment, has podcastSdUrl of the full episode - 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - bu = mobj.group('bu') - media_type = mobj.group('type') or mobj.group('type_2') - media_id = mobj.group('id') - return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py deleted file mode 100644 index ae3dd1380..000000000 --- a/youtube_dl/extractor/stanfordoc.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - orderedSet, - unescapeHTML, -) - - -class StanfordOpenClassroomIE(InfoExtractor): - IE_NAME = 'stanfordoc' - IE_DESC = 'Stanford Open ClassRoom' - _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' - _TEST = { - 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', - 'md5': '544a9468546059d4e80d76265b0443b8', - 'info_dict': { - 'id': 'PracticalUnix_intro-environment', - 'ext': 'mp4', - 'title': 'Intro Environment', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - if mobj.group('course') and mobj.group('video'): # A specific video - course = mobj.group('course') - video = mobj.group('video') - info = { - 'id': course + '_' + video, - 'uploader': None, - 'upload_date': None, - } - - baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' - xmlUrl = baseUrl + video + '.xml' - mdoc = self._download_xml(xmlUrl, info['id']) - try: - info['title'] = mdoc.findall('./title')[0].text - info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text - except IndexError: - raise ExtractorError('Invalid metadata XML file') - return info - elif mobj.group('course'): # A course page - course = mobj.group('course') - info = { - 'id': course, - '_type': 'playlist', - 'uploader': None, - 'upload_date': None, - } - - coursepage = self._download_webpage( - url, info['id'], - note='Downloading course info page', - errnote='Unable to download course info page') - - info['title'] = self._html_search_regex( - r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - - info['description'] = self._html_search_regex( - r'(?s)<description>([^<]+)</description>', - coursepage, 'description', fatal=False) - - links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) - info['entries'] = [self.url_result( - 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) - ) for l in links] - return info - else: # Root page - info = { - 'id': 'Stanford OpenClassroom', - '_type': 'playlist', - 'uploader': None, - 'upload_date': None, - } - info['title'] = info['id'] - - rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' - rootpage = self._download_webpage(rootURL, info['id'], - errnote='Unable to download course info page') - - links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) - info['entries'] = [self.url_result( - 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) - ) 
for l in links] - return info diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py deleted file mode 100644 index a6a191ceb..000000000 --- a/youtube_dl/extractor/steam.py +++ /dev/null @@ -1,149 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - ExtractorError, - get_element_by_class, - js_to_json, -) - - -class SteamIE(InfoExtractor): - _VALID_URL = r"""(?x) - https?://store\.steampowered\.com/ - (agecheck/)? - (?P<urltype>video|app)/ #If the page is only for videos or for a game - (?P<gameID>\d+)/? - (?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID - | - https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+) - """ - _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' - _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' - _TESTS = [{ - 'url': 'http://store.steampowered.com/video/105600/', - 'playlist': [ - { - 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', - 'info_dict': { - 'id': '2040428', - 'ext': 'mp4', - 'title': 'Terraria 1.3 Trailer', - 'playlist_index': 1, - } - }, - { - 'md5': '911672b20064ca3263fa89650ba5a7aa', - 'info_dict': { - 'id': '2029566', - 'ext': 'mp4', - 'title': 'Terraria 1.2 Trailer', - 'playlist_index': 2, - } - } - ], - 'info_dict': { - 'id': '105600', - 'title': 'Terraria', - }, - 'params': { - 'playlistend': 2, - } - }, { - 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', - 'info_dict': { - 'id': 'X8kpJBlzD2E', - 'ext': 'mp4', - 'upload_date': '20140617', - 'title': 'FRONTIERS - Trapping', - 'description': 'md5:bf6f7f773def614054089e5769c12a6e', - 'uploader': 'AAD Productions', - 'uploader_id': 'AtomicAgeDogGames', - } - }] - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - fileID = m.group('fileID') - if fileID: - videourl = url - playlist_id = fileID - else: - gameID = m.group('gameID') - playlist_id = gameID - videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id - - self._set_cookie('steampowered.com', 'mature_content', '1') - - webpage = self._download_webpage(videourl, playlist_id) - - if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: - videourl = self._AGECHECK_TEMPLATE % playlist_id - self.report_age_confirmation() - webpage = self._download_webpage(videourl, playlist_id) - - flash_vars = self._parse_json(self._search_regex( - r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, - 'flash vars'), playlist_id, js_to_json) - - playlist_title = None - entries = [] - if fileID: - playlist_title = get_element_by_class('workshopItemTitle', webpage) - for movie in flash_vars.values(): - if not movie: - continue - youtube_id = movie.get('YOUTUBE_VIDEO_ID') - if not youtube_id: - continue - entries.append({ - '_type': 'url', - 'url': youtube_id, - 'ie_key': 'Youtube', - }) - else: - playlist_title = get_element_by_class('apphub_AppName', webpage) - for movie_id, movie in flash_vars.items(): - if not movie: - continue - video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) - title = movie.get('MOVIE_NAME') - if not title or not video_id: - continue - entry = { - 'id': video_id, - 'title': title.replace('+', ' '), - } - formats = [] - flv_url = movie.get('FILENAME') - if flv_url: - formats.append({ - 'format_id': 'flv', - 'url': flv_url, - }) - highlight_element = 
self._search_regex( - r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, - webpage, 'highlight element', fatal=False) - if highlight_element: - highlight_attribs = extract_attributes(highlight_element) - if highlight_attribs: - entry['thumbnail'] = highlight_attribs.get('data-poster') - for quality in ('', '-hd'): - for ext in ('webm', 'mp4'): - video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) - if video_url: - formats.append({ - 'format_id': ext + quality, - 'url': video_url, - }) - if not formats: - continue - entry['formats'] = formats - entries.append(entry) - if not entries: - raise ExtractorError('Could not find any videos') - - return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py deleted file mode 100644 index 34725274e..000000000 --- a/youtube_dl/extractor/streamable.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, -) - - -class StreamableIE(InfoExtractor): - _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' - _TESTS = [ - { - 'url': 'https://streamable.com/dnd1', - 'md5': '3e3bc5ca088b48c2d436529b64397fef', - 'info_dict': { - 'id': 'dnd1', - 'ext': 'mp4', - 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', - 'thumbnail': r're:https?://.*\.jpg$', - 'uploader': 'teabaker', - 'timestamp': 1454964157.35115, - 'upload_date': '20160208', - 'duration': 61.516, - 'view_count': int, - } - }, - # older video without bitrate, width/height, etc. info - { - 'url': 'https://streamable.com/moo', - 'md5': '2cf6923639b87fba3279ad0df3a64e73', - 'info_dict': { - 'id': 'moo', - 'ext': 'mp4', - 'title': '"Please don\'t eat me!"', - 'thumbnail': r're:https?://.*\.jpg$', - 'timestamp': 1426115495, - 'upload_date': '20150311', - 'duration': 12, - 'view_count': int, - } - }, - { - 'url': 'https://streamable.com/e/dnd1', - 'only_matching': True, - }, - { - 'url': 'https://streamable.com/s/okkqk/drxjds', - 'only_matching': True, - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', - webpage) - if mobj: - return mobj.group('src') - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Note: Using the ajax API, as the public Streamable API doesn't seem - # to return video info like the title properly sometimes, and doesn't - # include info like the video duration - video = self._download_json( - 'https://ajax.streamable.com/videos/%s' % video_id, video_id) - - # Format IDs: - # 0 The video is being uploaded - # 1 The video is being processed - # 2 The video has at least one file ready - # 3 The video is unavailable due to an error - status = video.get('status') - if status != 2: - raise ExtractorError( - 'This video is currently unavailable. 
It may still be uploading or processing.', - expected=True) - - title = video.get('reddit_title') or video['title'] - - formats = [] - for key, info in video['files'].items(): - if not info.get('url'): - continue - formats.append({ - 'format_id': key, - 'url': self._proto_relative_url(info['url']), - 'width': int_or_none(info.get('width')), - 'height': int_or_none(info.get('height')), - 'filesize': int_or_none(info.get('size')), - 'fps': int_or_none(info.get('framerate')), - 'vbr': float_or_none(info.get('bitrate'), 1000) - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), - 'uploader': video.get('owner', {}).get('user_name'), - 'timestamp': float_or_none(video.get('date_added')), - 'duration': float_or_none(video.get('duration')), - 'view_count': int_or_none(video.get('plays')), - 'formats': formats - } diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py deleted file mode 100644 index 539220a94..000000000 --- a/youtube_dl/extractor/stv.py +++ /dev/null @@ -1,95 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - compat_str, - float_or_none, - int_or_none, - smuggle_url, - str_or_none, - try_get, -) - - -class STVPlayerIE(InfoExtractor): - IE_NAME = 'stv:player' - _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' - _TESTS = [{ - # shortform - 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', - 'md5': '5adf9439c31d554f8be0707c7abe7e0a', - 'info_dict': { - 'id': '5333973339001', - 'ext': 'mp4', - 'upload_date': '20170301', - 'title': '60 seconds on set with Laura Norton', - 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? 
Let\'s find out!", - 'timestamp': 1488388054, - 'uploader_id': '1486976045', - }, - 'skip': 'this resource is unavailable outside of the UK', - }, { - # episodes - 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' - _PTYPE_MAP = { - 'episode': 'episodes', - 'video': 'shortform', - } - - def _real_extract(self, url): - ptype, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, video_id, fatal=False) or '' - props = (self._parse_json(self._search_regex( - r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', - webpage, 'next data', default='{}'), video_id, - fatal=False) or {}).get('props') or {} - player_api_cache = try_get( - props, lambda x: x['initialReduxState']['playerApiCache']) or {} - - api_path, resp = None, {} - for k, v in player_api_cache.items(): - if k.startswith('/episodes/') or k.startswith('/shortform/'): - api_path, resp = k, v - break - else: - episode_id = str_or_none(try_get( - props, lambda x: x['pageProps']['episodeId'])) - api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) - - result = resp.get('results') - if not result: - resp = self._download_json( - 'https://player.api.stv.tv/v1' + api_path, video_id) - result = resp['results'] - - video = result['video'] - video_id = compat_str(video['id']) - - subtitles = {} - _subtitles = result.get('_subtitles') or {} - for ext, sub_url in _subtitles.items(): - subtitles.setdefault('en', []).append({ - 'ext': 'vtt' if ext == 'webvtt' else ext, - 'url': sub_url, - }) - - programme = result.get('programme') or {} - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), - 'description': result.get('summary'), - 'duration': float_or_none(video.get('length'), 1000), - 'subtitles': subtitles, - 'view_count': int_or_none(result.get('views')), - 'series': programme.get('name') or programme.get('shortName'), - 'ie_key': 'BrightcoveNew', - } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py deleted file mode 100644 index a5bb6daa7..000000000 --- a/youtube_dl/extractor/svt.py +++ /dev/null @@ -1,425 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - dict_get, - int_or_none, - unified_timestamp, - str_or_none, - strip_or_none, - try_get, -) - - -class SVTBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['SE'] - - def _extract_video(self, video_info, video_id): - is_live = dict_get(video_info, ('live', 'simulcast'), default=False) - m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' - formats = [] - for vr in video_info['videoReferences']: - player_type = vr.get('playerType') or vr.get('format') - vurl = vr['url'] - ext = determine_ext(vurl) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - vurl, video_id, - ext='mp4', entry_protocol=m3u8_protocol, - m3u8_id=player_type, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - vurl + '?hdcore=3.3.0', video_id, - f4m_id=player_type, fatal=False)) - elif ext == 'mpd': - if player_type == 'dashhbbtv': - formats.extend(self._extract_mpd_formats( - vurl, video_id, mpd_id=player_type, fatal=False)) - else: - formats.append({ - 'format_id': player_type, - 'url': vurl, - }) - rights = 
try_get(video_info, lambda x: x['rights'], dict) or {} - if not formats and rights.get('geoBlockedSweden'): - self.raise_geo_restricted( - 'This video is only available in Sweden', - countries=self._GEO_COUNTRIES) - self._sort_formats(formats) - - subtitles = {} - subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) - if isinstance(subtitle_references, list): - for sr in subtitle_references: - subtitle_url = sr.get('url') - subtitle_lang = sr.get('language', 'sv') - if subtitle_url: - if determine_ext(subtitle_url) == 'm3u8': - # TODO(yan12125): handle WebVTT in m3u8 manifests - continue - - subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) - - title = video_info.get('title') - - series = video_info.get('programTitle') - season_number = int_or_none(video_info.get('season')) - episode = video_info.get('episodeTitle') - episode_number = int_or_none(video_info.get('episodeNumber')) - - timestamp = unified_timestamp(rights.get('validFrom')) - duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) - age_limit = None - adult = dict_get( - video_info, ('inappropriateForChildren', 'blockedForChildren'), - skip_false_values=False) - if adult is not None: - age_limit = 18 if adult else 0 - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'duration': duration, - 'timestamp': timestamp, - 'age_limit': age_limit, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'is_live': is_live, - } - - -class SVTIE(SVTBaseIE): - _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' - _TEST = { - 'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', - 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', - 'info_dict': { - 'id': '2900353', - 'ext': 'mp4', - 'title': 'Stjärnorna skojar till det - under SVT-intervjun', - 'duration': 27, - 'age_limit': 0, - }, - } - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - widget_id = mobj.group('widget_id') - article_id = mobj.group('id') - - info = self._download_json( - 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), - article_id) - - info_dict = self._extract_video(info['video'], article_id) - info_dict['title'] = info['context']['title'] - return info_dict - - -class SVTPlayBaseIE(SVTBaseIE): - _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n' - - -class SVTPlayIE(SVTPlayBaseIE): - IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'''(?x) - (?: - (?: - svt:| - https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ - ) - (?P<svt_id>[^/?#&]+)| - https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) - (?:.*?(?:modalId|id)=(?P<modal_id>[\da-zA-Z-]+))? 
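-                    # (?x) verbose mode: the optional modalId/id query parameter above, when present, carries the clip id used for extraction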
- ) - ''' - _TESTS = [{ - 'url': 'https://www.svtplay.se/video/30479064', - 'md5': '2382036fd6f8c994856c323fe51c426e', - 'info_dict': { - 'id': '8zVbDPA', - 'ext': 'mp4', - 'title': 'Designdrömmar i Stenungsund', - 'timestamp': 1615770000, - 'upload_date': '20210315', - 'duration': 3519, - 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', - 'age_limit': 0, - 'subtitles': { - 'sv': [{ - 'ext': 'vtt', - }] - }, - }, - 'params': { - 'format': 'bestvideo', - # skip for now due to download test asserts that segment is > 10000 bytes and svt uses - # init segments that are smaller - # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B - 'skip_download': True, - }, - }, { - 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', - 'only_matching': True, - }, { - 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', - 'only_matching': True, - }, { - # geo restricted to Sweden - 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', - 'only_matching': True, - }, { - 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', - 'only_matching': True, - }, { - 'url': 'https://www.svtplay.se/kanaler/svt1', - 'only_matching': True, - }, { - 'url': 'svt:1376446-003A', - 'only_matching': True, - }, { - 'url': 'svt:14278044', - 'only_matching': True, - }, { - 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', - 'only_matching': True, - }, { - 'url': 'svt:eWv5MLX', - 'only_matching': True, - }] - - def _adjust_title(self, info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - - def _extract_by_video_id(self, video_id, webpage=None): - data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, - video_id, headers=self.geo_verification_headers()) - info_dict = self._extract_video(data, video_id) - if not info_dict.get('title'): - title = dict_get(info_dict, ('episode', 'series')) - if not title and webpage: - title = re.sub( - r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) - if not title: - title = video_id - info_dict['title'] = title - self._adjust_title(info_dict) - return info_dict - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - svt_id = mobj.group('svt_id') or mobj.group('modal_id') - - if svt_id: - return self._extract_by_video_id(svt_id) - - webpage = self._download_webpage(url, video_id) - - data = self._parse_json( - self._search_regex( - self._SVTPLAY_RE, webpage, 'embedded data', default='{}', - group='json'), - video_id, fatal=False) - - thumbnail = self._og_search_thumbnail(webpage) - - if data: - video_info = try_get( - data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], - dict) - if video_info: - info_dict = self._extract_video(video_info, video_id) - info_dict.update({ - 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], - 'thumbnail': thumbnail, - }) - self._adjust_title(info_dict) - return info_dict - - svt_id = try_get( - data, lambda x: x['statistics']['dataLake']['content']['id'], - compat_str) - - if not svt_id: - svt_id = self._search_regex( - (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), - r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', - 
r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', - r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', - r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), - webpage, 'video id') - - info_dict = self._extract_by_video_id(svt_id, webpage) - info_dict['thumbnail'] = thumbnail - - return info_dict - - -class SVTSeriesIE(SVTPlayBaseIE): - _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?' - _TESTS = [{ - 'url': 'https://www.svtplay.se/rederiet', - 'info_dict': { - 'id': '14445680', - 'title': 'Rederiet', - 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', - }, - 'playlist_mincount': 318, - }, { - 'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680', - 'info_dict': { - 'id': 'season-2-14445680', - 'title': 'Rederiet - Säsong 2', - 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', - }, - 'playlist_mincount': 12, - }] - - @classmethod - def suitable(cls, url): - return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) - - def _real_extract(self, url): - series_slug, season_id = re.match(self._VALID_URL, url).groups() - - series = self._download_json( - 'https://api.svt.se/contento/graphql', series_slug, - 'Downloading series page', query={ - 'query': '''{ - listablesBySlug(slugs: ["%s"]) { - associatedContent(include: [productionPeriod, season]) { - items { - item { - ... on Episode { - videoSvtId - } - } - } - id - name - } - id - longDescription - name - shortDescription - } -}''' % series_slug, - })['data']['listablesBySlug'][0] - - season_name = None - - entries = [] - for season in series['associatedContent']: - if not isinstance(season, dict): - continue - if season_id: - if season.get('id') != season_id: - continue - season_name = season.get('name') - items = season.get('items') - if not isinstance(items, list): - continue - for item in items: - video = item.get('item') or {} - content_id = video.get('videoSvtId') - if not content_id or not isinstance(content_id, compat_str): - continue - entries.append(self.url_result( - 'svt:' + content_id, SVTPlayIE.ie_key(), content_id)) - - title = series.get('name') - season_name = season_name or season_id - - if title and season_name: - title = '%s - %s' % (title, season_name) - elif season_id: - title = season_id - - return self.playlist_result( - entries, season_id or series.get('id'), title, - dict_get(series, ('longDescription', 'shortDescription'))) - - -class SVTPageIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))' - _TESTS = [{ - 'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa', - 'info_dict': { - 'id': '25298267', - 'title': 'Bakom masken – Lehners kamp mot mental ohälsa', - }, - 'playlist_count': 4, - }, { - 'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien', - 'info_dict': { - 'id': '24243746', - 'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien', - }, - 'playlist_count': 2, - }, { - # only programTitle - 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', - 'info_dict': { - 'id': '8439V2K', - 'ext': 'mp4', - 'title': 'Stjärnorna skojar till det - under SVT-intervjun', - 'duration': 27, - 'age_limit': 0, - }, - }, { - 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', - 'only_matching': True, - }, { - 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', - 
'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) - - def _real_extract(self, url): - path, display_id = re.match(self._VALID_URL, url).groups() - - article = self._download_json( - 'https://api.svt.se/nss-api/page/' + path, display_id, - query={'q': 'articles'})['articles']['content'][0] - - entries = [] - - def _process_content(content): - if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'): - video_id = compat_str(content['image']['svtId']) - entries.append(self.url_result( - 'svt:' + video_id, SVTPlayIE.ie_key(), video_id)) - - for media in article.get('media', []): - _process_content(media) - - for obj in article.get('structuredBody', []): - _process_content(obj.get('content') or {}) - - return self.playlist_result( - entries, str_or_none(article.get('id')), - strip_or_none(article.get('title'))) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py deleted file mode 100644 index 8ceab7e35..000000000 --- a/youtube_dl/extractor/tagesschau.py +++ /dev/null @@ -1,311 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - js_to_json, - parse_iso8601, - parse_filesize, -) - - -class TagesschauPlayerIE(InfoExtractor): - IE_NAME = 'tagesschau:player' - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' - - _TESTS = [{ - 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', - 'md5': '8d09548d5c15debad38bee3a4d15ca21', - 'info_dict': { - 'id': '179517', - 'ext': 'mp4', - 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', - 'thumbnail': r're:^https?:.*\.jpg$', - 'formats': 'mincount:6', - }, - }, { - 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', - 'md5': '76e6eec6ebd40740671cf0a2c88617e5', - 'info_dict': { - 'id': '29417', - 'ext': 'mp3', - 'title': 'Trabi - Bye, bye Rennpappe', - 'thumbnail': r're:^https?:.*\.jpg$', - 'formats': 'mincount:2', - }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', - 'only_matching': True, - }] - - _FORMATS = { - 'xs': {'quality': 0}, - 's': {'width': 320, 'height': 180, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 540, 'quality': 3}, - 'xl': {'width': 1280, 'height': 720, 'quality': 4}, - 'xxl': {'quality': 5}, - } - - def _extract_via_api(self, kind, video_id): - info = self._download_json( - 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), - video_id) - title = info['headline'] - formats = [] - for media in info['mediadata']: - for format_id, format_url in media.items(): - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls')) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'vcodec': 'none' if kind == 'audio' else None, - }) - self._sort_formats(formats) - timestamp = parse_iso8601(info.get('date')) - return { - 'id': video_id, - 'title': title, - 'timestamp': timestamp, - 'formats': formats, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - # kind = mobj.group('kind').lower() - # if kind == 'video': - # 
return self._extract_via_api(kind, video_id) - - # JSON api does not provide some audio formats (e.g. ogg) thus - # extracting audio via webpage - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage).strip() - formats = [] - - for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): - media = self._parse_json(js_to_json(media_json), video_id, fatal=False) - if not media: - continue - src = media.get('src') - if not src: - return - quality = media.get('quality') - kind = media.get('type', '').split('/')[0] - ext = determine_ext(src) - f = { - 'url': src, - 'format_id': '%s_%s' % (quality, ext) if quality else ext, - 'ext': ext, - 'vcodec': 'none' if kind == 'audio' else None, - } - f.update(self._FORMATS.get(quality, {})) - formats.append(f) - - self._sort_formats(formats) - - thumbnail = self._og_search_thumbnail(webpage) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } - - -class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' - - _TESTS = [{ - 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', - 'info_dict': { - 'id': 'video-102143', - 'ext': 'mp4', - 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': '18.07.2015 20:10 Uhr', - 'thumbnail': r're:^https?:.*\.jpg$', - }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', - 'md5': '3c54c1f6243d279b706bde660ceec633', - 'info_dict': { - 'id': 'ts-5727', - 'ext': 'mp4', - 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', - 'thumbnail': r're:^https?:.*\.jpg$', - }, - }, { - # exclusive audio - 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', - 'md5': '76e6eec6ebd40740671cf0a2c88617e5', - 'info_dict': { - 'id': 'audio-29417', - 'ext': 'mp3', - 'title': 'Trabi - Bye, bye Rennpappe', - 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', - 'thumbnail': r're:^https?:.*\.jpg$', - }, - }, { - # audio in article - 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': 'e0916c623e85fc1d2b26b78f299d3958', - 'info_dict': { - 'id': 'bnd-303', - 'ext': 'mp3', - 'title': 'Viele Baustellen für neuen BND-Chef', - 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', - 'thumbnail': r're:^https?:.*\.jpg$', - }, - }, { - 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', - 'info_dict': { - 'id': 'afd-parteitag-135', - 'title': 'Möchtegern-Underdog mit Machtanspruch', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', - 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', - 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', - 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html', - 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', - 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', - 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', - 'only_matching': True, - }, { - 'url': 
'http://www.tagesschau.de/100sekunden/index.html', - 'only_matching': True, - }, { - # playlist article with collapsing sections - 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) - - def _extract_formats(self, download_text, media_kind): - links = re.finditer( - r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', - download_text) - formats = [] - for l in links: - link_url = l.group('url') - if not link_url: - continue - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', - default=determine_ext(link_url)) - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - title = l.group('title') - if title: - if media_kind.lower() == 'video': - m = re.match( - r'''(?x) - Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; - (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; - (?P<vbr>[0-9]+)kbps&\#10; - Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', - title) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - else: - m = re.match( - r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', - title) - if m: - format.update({ - 'format_note': '%s, %s' % (m.group('format'), m.group('note')), - 'vcodec': 'none', - 'abr': int(m.group('abr')), - }) - formats.append(format) - self._sort_formats(formats) - return formats - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('path') - display_id = video_id.lstrip('-') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_regex( - r'<span[^>]*class="headline"[^>]*>(.+?)</span>', - webpage, 'title', default=None) or self._og_search_title(webpage) - - DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' - - webpage_type = self._og_search_property('type', webpage, default=None) - if webpage_type == 'website': # Article - entries = [] - for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, - webpage), 1): - entries.append({ - 'id': '%s-%d' % (display_id, num), - 'title': '%s' % entry_title, - 'formats': self._extract_formats(download_text, media_kind), - }) - if len(entries) > 1: - return self.playlist_result(entries, display_id, title) - formats = entries[0]['formats'] - else: # Assume single video - download_text = self._search_regex( - DOWNLOAD_REGEX, webpage, 'download links', group='links') - media_kind = self._search_regex( - DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') - formats = self._extract_formats(download_text, media_kind) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', default=None) - - self._sort_formats(formats) - - return { - 'id': 
display_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'description': description, - } diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py deleted file mode 100644 index e8a7c65e0..000000000 --- a/youtube_dl/extractor/tbs.py +++ /dev/null @@ -1,89 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .turner import TurnerBaseIE -from ..compat import ( - compat_urllib_parse_urlparse, - compat_parse_qs, -) -from ..utils import ( - float_or_none, - int_or_none, - strip_or_none, -) - - -class TBSIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))' - _TESTS = [{ - 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', - 'info_dict': { - 'id': '8d384cde33b89f3a43ce5329de42903ed5099887', - 'ext': 'mp4', - 'title': 'Monster', - 'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.', - 'timestamp': 1508175329, - 'upload_date': '20171016', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew', - 'only_matching': True, - }, { - 'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope', - 'only_matching': True, - }] - - def _real_extract(self, url): - site, path, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', - webpage, 'drupal setting'), display_id) - video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path) - - media_id = video_data['mediaID'] - title = video_data['title'] - tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( - drupal_settings['ngtv_token_url']).query) - - info = self._extract_ngtv_info( - media_id, tokenizer_query, { - 'url': url, - 'site_name': site[:3].upper(), - 'auth_required': video_data.get('authRequired') == '1', - }) - - thumbnails = [] - for image_id, image in video_data.get('images', {}).items(): - image_url = image.get('url') - if not image_url or image.get('type') != 'video': - continue - i = { - 'id': image_id, - 'url': image_url, - } - mobj = re.search(r'(\d+)x(\d+)', image_url) - if mobj: - i.update({ - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - }) - thumbnails.append(i) - - info.update({ - 'id': media_id, - 'title': title, - 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), - 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), - 'timestamp': int_or_none(video_data.get('created')), - 'season_number': int_or_none(video_data.get('season')), - 'episode_number': int_or_none(video_data.get('episode')), - 'thumbnails': thumbnails, - }) - return info diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py deleted file mode 100644 index 2394f86d4..000000000 --- a/youtube_dl/extractor/teachable.py +++ /dev/null @@ -1,298 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .wistia import WistiaIE -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - get_element_by_class, - 
strip_or_none, - urlencode_postdata, - urljoin, -) - - -class TeachableBaseIE(InfoExtractor): - _NETRC_MACHINE = 'teachable' - _URL_PREFIX = 'teachable:' - - _SITES = { - # Only notable ones here - 'v1.upskillcourses.com': 'upskill', - 'gns3.teachable.com': 'gns3', - 'academyhacker.com': 'academyhacker', - 'stackskills.com': 'stackskills', - 'market.saleshacker.com': 'saleshacker', - 'learnability.org': 'learnability', - 'edurila.com': 'edurila', - 'courses.workitdaily.com': 'workitdaily', - } - - _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) - - def _real_initialize(self): - self._logged_in = False - - def _login(self, site): - if self._logged_in: - return - - username, password = self._get_login_info( - netrc_machine=self._SITES.get(site, site)) - if username is None: - return - - login_page, urlh = self._download_webpage_handle( - 'https://%s/sign_in' % site, None, - 'Downloading %s login page' % site) - - def is_logged(webpage): - return any(re.search(p, webpage) for p in ( - r'class=["\']user-signout', - r'<a[^>]+\bhref=["\']/sign_out', - r'Log\s+[Oo]ut\s*<')) - - if is_logged(login_page): - self._logged_in = True - return - - login_url = urlh.geturl() - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'user[email]': username, - 'user[password]': password, - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, - 'post url', default=login_url, group='url') - - if not post_url.startswith('http'): - post_url = urljoin(login_url, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in to %s' % site, - data=urlencode_postdata(login_form), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': login_url, - }) - - if '>I accept the new Privacy Policy<' in response: - raise ExtractorError( - 'Unable to login: %s asks you to accept new Privacy Policy. ' - 'Go to https://%s/ and accept.' 
% (site, site), expected=True) - - # Successful login - if is_logged(response): - self._logged_in = True - return - - message = get_element_by_class('alert', response) - if message is not None: - raise ExtractorError( - 'Unable to login: %s' % clean_html(message), expected=True) - - raise ExtractorError('Unable to log in') - - -class TeachableIE(TeachableBaseIE): - _VALID_URL = r'''(?x) - (?: - %shttps?://(?P<site_t>[^/]+)| - https?://(?:www\.)?(?P<site>%s) - ) - /courses/[^/]+/lectures/(?P<id>\d+) - ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE - - _TESTS = [{ - 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', - 'info_dict': { - 'id': 'untlgzk1v7', - 'ext': 'bin', - 'title': 'Overview', - 'description': 'md5:071463ff08b86c208811130ea1c2464c', - 'duration': 736.4, - 'timestamp': 1542315762, - 'upload_date': '20181115', - 'chapter': 'Welcome', - 'chapter_number': 1, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', - 'only_matching': True, - }, { - 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', - 'only_matching': True, - }, { - 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', - 'only_matching': True, - }] - - @staticmethod - def _is_teachable(webpage): - return 'teachableTracker.linker:autoLink' in webpage and re.search( - r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', - webpage) - - @staticmethod - def _extract_url(webpage, source_url): - if not TeachableIE._is_teachable(webpage): - return - if re.match(r'https?://[^/]+/(?:courses|p)', source_url): - return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') or mobj.group('site_t') - video_id = mobj.group('id') - - self._login(site) - - prefixed = url.startswith(self._URL_PREFIX) - if prefixed: - url = url[len(self._URL_PREFIX):] - - webpage = self._download_webpage(url, video_id) - - wistia_urls = WistiaIE._extract_urls(webpage) - if not wistia_urls: - if any(re.search(p, webpage) for p in ( - r'class=["\']lecture-contents-locked', - r'>\s*Lecture contents locked', - r'id=["\']lecture-locked', - # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313 - r'class=["\'](?:inner-)?lesson-locked', - r'>LESSON LOCKED<')): - self.raise_login_required('Lecture contents locked') - raise ExtractorError('Unable to find video URL') - - title = self._og_search_title(webpage, default=None) - - chapter = None - chapter_number = None - section_item = self._search_regex( - r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id, - webpage, 'section item', default=None, group='li') - if section_item: - chapter_number = int_or_none(self._search_regex( - r'data-ss-position=["\'](\d+)', section_item, 'section id', - default=None)) - if chapter_number is not None: - sections = [] - for s in re.findall( - r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage): - section = strip_or_none(clean_html(s)) - if not section: - sections = [] - break - sections.append(section) - if chapter_number <= len(sections): - chapter = sections[chapter_number - 1] - - entries = [{ - '_type': 'url_transparent', - 'url': wistia_url, - 'ie_key': WistiaIE.ie_key(), - 'title': title, - 'chapter': chapter, - 'chapter_number': chapter_number, - } for wistia_url in wistia_urls] - - return self.playlist_result(entries, 
video_id, title) - - -class TeachableCourseIE(TeachableBaseIE): - _VALID_URL = r'''(?x) - (?: - %shttps?://(?P<site_t>[^/]+)| - https?://(?:www\.)?(?P<site>%s) - ) - /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) - ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE - _TESTS = [{ - 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', - 'info_dict': { - 'id': 'essential-web-developer-course', - 'title': 'The Essential Web Developer Course (Free)', - }, - 'playlist_count': 192, - }, { - 'url': 'http://v1.upskillcourses.com/courses/119763/', - 'only_matching': True, - }, { - 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', - 'only_matching': True, - }, { - 'url': 'https://gns3.teachable.com/courses/enrolled/423415', - 'only_matching': True, - }, { - 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', - 'only_matching': True, - }, { - 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if TeachableIE.suitable(url) else super( - TeachableCourseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') or mobj.group('site_t') - course_id = mobj.group('id') - - self._login(site) - - prefixed = url.startswith(self._URL_PREFIX) - if prefixed: - prefix = self._URL_PREFIX - url = url[len(prefix):] - - webpage = self._download_webpage(url, course_id) - - url_base = 'https://%s/' % site - - entries = [] - - for mobj in re.finditer( - r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', - webpage): - li = mobj.group('li') - if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): - continue - lecture_url = self._search_regex( - r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, - 'lecture url', default=None, group='url') - if not lecture_url: - continue - lecture_id = self._search_regex( - r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) - title = self._html_search_regex( - r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, - 'title', default=None) - entry_url = urljoin(url_base, lecture_url) - if prefixed: - entry_url = self._URL_PREFIX + entry_url - entries.append( - self.url_result( - entry_url, - ie=TeachableIE.ie_key(), video_id=lecture_id, - video_title=clean_html(title))) - - course_title = self._html_search_regex( - (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', - r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), - webpage, 'course title', fatal=False) - - return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py deleted file mode 100644 index 1272078c5..000000000 --- a/youtube_dl/extractor/teachertube.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - qualities, -) - - -class TeacherTubeIE(InfoExtractor): - IE_NAME = 'teachertube' - IE_DESC = 'teachertube.com videos' - - _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)' - - _TESTS = [{ - # flowplayer - 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', - 'md5': 'f9434ef992fd65936d72999951ee254c', - 'info_dict': { - 'id': '339997', - 'ext': 'mp4', - 'title': 'Measures of dispersion from a 
frequency table', - 'description': 'Measures of dispersion from a frequency table', - 'thumbnail': r're:https?://.*\.(?:jpg|png)', - }, - }, { - # jwplayer - 'url': 'http://www.teachertube.com/music.php?music_id=8805', - 'md5': '01e8352006c65757caf7b961f6050e21', - 'info_dict': { - 'id': '8805', - 'ext': 'mp3', - 'title': 'PER ASPERA AD ASTRA', - 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', - }, - }, { - # unavailable video - 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - error = self._search_regex( - r'<div\b[^>]+\bclass=["\']msgBox error[^>]+>([^<]+)', webpage, - 'error', default=None) - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - title = self._html_search_meta('title', webpage, 'title', fatal=True) - TITLE_SUFFIX = ' - TeacherTube' - if title.endswith(TITLE_SUFFIX): - title = title[:-len(TITLE_SUFFIX)].strip() - - description = self._html_search_meta('description', webpage, 'description') - if description: - description = description.strip() - - quality = qualities(['mp3', 'flv', 'mp4']) - - media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage) - media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage)) - media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage)) - - formats = [ - { - 'url': media_url, - 'quality': quality(determine_ext(media_url)) - } for media_url in set(media_urls) - ] - - self._sort_formats(formats) - - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'thumbnail', webpage) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - } - - -class TeacherTubeUserIE(InfoExtractor): - IE_NAME = 'teachertube:user:collection' - IE_DESC = 'teachertube.com user and collection videos' - - _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' 
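-    # Matches a sidebar entry's duration stamp followed by the link to its
-    # video/audio page; re.findall() below collects the captured page URLs.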
- - _MEDIA_RE = r'''(?sx) - class="?sidebar_thumb_time"?>[0-9:]+</div> - \s* - <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)" - ''' - _TEST = { - 'url': 'http://www.teachertube.com/user/profile/rbhagwati2', - 'info_dict': { - 'id': 'rbhagwati2' - }, - 'playlist_mincount': 179, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user') - - urls = [] - webpage = self._download_webpage(url, user_id) - urls.extend(re.findall(self._MEDIA_RE, webpage)) - - pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1] - for p in pages: - more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) - webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages))) - video_urls = re.findall(self._MEDIA_RE, webpage) - urls.extend(video_urls) - - entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls] - return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py deleted file mode 100644 index a5b62c717..000000000 --- a/youtube_dl/extractor/techtalks.py +++ /dev/null @@ -1,82 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - get_element_by_attribute, - clean_html, -) - - -class TechTalksIE(InfoExtractor): - _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', - 'info_dict': { - 'id': '57758', - 'title': 'Learning Topic Models --- Going beyond SVD', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '57758', - 'ext': 'flv', - 'title': 'Learning Topic Models --- Going beyond SVD', - }, - }, - { - 'info_dict': { - 'id': '57758-slides', - 'ext': 'flv', - 'title': 'Learning Topic Models --- Going beyond SVD', - }, - }, - ], - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://techtalks.tv/talks/57758', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - talk_id = mobj.group('id') - webpage = self._download_webpage(url, talk_id) - rtmp_url = self._search_regex( - r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url') - play_path = self._search_regex( - r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"', - webpage, 'presenter play path') - title = clean_html(get_element_by_attribute('class', 'title', webpage)) - video_info = { - 'id': talk_id, - 'title': title, - 'url': rtmp_url, - 'play_path': play_path, - 'ext': 'flv', - } - m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage) - if m_slides is None: - return video_info - else: - return { - '_type': 'playlist', - 'id': talk_id, - 'title': title, - 'entries': [ - video_info, - # The slides video - { - 'id': talk_id + '-slides', - 'title': title, - 'url': rtmp_url, - 'play_path': m_slides.group(1), - 'ext': 'flv', - }, - ], - } diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py deleted file mode 100644 index a29a64b6d..000000000 --- a/youtube_dl/extractor/tele13.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..utils import ( - js_to_json, - qualities, - determine_ext, -) - - -class Tele13IE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' - _TESTS = [ - { - 'url': 
'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'md5': '4cb1fa38adcad8fea88487a078831755', - 'info_dict': { - 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'ext': 'mp4', - 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', - }, - 'params': { - # HTTP Error 404: Not Found - 'skip_download': True, - }, - }, - { - 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', - 'md5': '867adf6a3b3fef932c68a71d70b70946', - 'info_dict': { - 'id': 'rOoKv2OMpOw', - 'ext': 'mp4', - 'title': 'Shooting star seen on 7-Sep-2015', - 'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', - 'uploader': 'Porjai Jaturongkhakun', - 'upload_date': '20150906', - 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', - }, - 'add_ie': ['Youtube'], - } - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - setup_js = self._search_regex( - r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", - webpage, 'setup code') - sources = self._parse_json(self._search_regex( - r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), - display_id, js_to_json) - - preference = qualities(['Móvil', 'SD', 'HD']) - formats = [] - urls = [] - for f in sources: - format_url = f['file'] - if format_url and format_url not in urls: - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif YoutubeIE.suitable(format_url): - return self.url_result(format_url, 'Youtube') - else: - formats.append({ - 'url': format_url, - 'format_id': f.get('label'), - 'preference': preference(f.get('label')), - 'ext': ext, - }) - urls.append(format_url) - self._sort_formats(formats) - - return { - 'id': display_id, - 'title': self._search_regex( - r'title\s*:\s*"([^"]+)"', setup_js, 'title'), - 'description': self._html_search_meta( - 'description', webpage, 'description'), - 'thumbnail': self._search_regex( - r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), - 'formats': formats, - } diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py deleted file mode 100644 index 3e1a7a9e6..000000000 --- a/youtube_dl/extractor/tele5.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .nexx import NexxIE -from ..compat import compat_urlparse -from ..utils import ( - NO_DEFAULT, - smuggle_url, -) - - -class Tele5IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _GEO_COUNTRIES = ['DE'] - _TESTS = [{ - 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', - 'info_dict': { - 'id': '1549416', - 'ext': 'mp4', - 'upload_date': '20180814', - 'timestamp': 1534290623, - 'title': 'Pandorum', - }, - 'params': { - 'skip_download': True, - }, - }, { - # jwplatform, nexx unavailable - 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', - 'info_dict': { - 'id': 'WJuiOlUp', - 'ext': 'mp4', - 'upload_date': '20200603', - 'timestamp': 1591214400, - 'title': 'Ghoul - Das Geheimnis des Friedhofmonsters', - 'description': 'md5:42002af1d887ff3d5b2b3ca1f8137d97', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, { - 'url': 
'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', - 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/video-clip/?ve_id=1609440', - 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/', - 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/', - 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/star-trek/raumschiff-voyager/ganze-folge/das-vinculum/', - 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/anders-ist-sevda/', - 'only_matching': True, - }] - - def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - - NEXX_ID_RE = r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - - return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) diff --git a/youtube_dl/extractor/telemb.py b/youtube_dl/extractor/telemb.py deleted file mode 100644 index 9bcac4ec0..000000000 --- a/youtube_dl/extractor/telemb.py +++ /dev/null @@ -1,78 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import remove_start - - -class TeleMBIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telemb\.be/(?P<display_id>.+?)_d_(?P<id>\d+)\.html' - _TESTS = [ - { - 'url': 'http://www.telemb.be/mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-_d_13466.html', - 'md5': 'f45ea69878516ba039835794e0f8f783', - 'info_dict': { - 'id': '13466', - 'display_id': 'mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-', - 'ext': 'mp4', - 'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! 
- Les reportages', - 'description': 'md5:bc5225f47b17c309761c856ad4776265', - 'thumbnail': r're:^http://.*\.(?:jpg|png)$', - } - }, - { - # non-ASCII characters in download URL - 'url': 'http://telemb.be/les-reportages-havre-incendie-mortel_d_13514.html', - 'md5': '6e9682736e5ccd4eab7f21e855350733', - 'info_dict': { - 'id': '13514', - 'display_id': 'les-reportages-havre-incendie-mortel', - 'ext': 'mp4', - 'title': 'Havré - Incendie mortel - Les reportages', - 'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a', - 'thumbnail': r're:^http://.*\.(?:jpg|png)$', - } - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - formats = [] - for video_url in re.findall(r'file\s*:\s*"([^"]+)"', webpage): - fmt = { - 'url': video_url, - 'format_id': video_url.split(':')[0] - } - rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) - if rtmp: - fmt.update({ - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'player_url': 'http://p.jwpcdn.com/6/10/jwplayer.flash.swf', - 'page_url': 'http://www.telemb.be', - 'preference': -1, - }) - formats.append(fmt) - self._sort_formats(formats) - - title = remove_start(self._og_search_title(webpage), 'TéléMB : ') - description = self._html_search_regex( - r'<meta property="og:description" content="(.+?)" />', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py deleted file mode 100644 index a586f30ad..000000000 --- a/youtube_dl/extractor/tennistv.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor - -from ..utils import ( - ExtractorError, - unified_timestamp, -) - - -class TennisTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' - _TEST = { - 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', - 'info_dict': { - 'id': 'indian-wells-2018-verdasco-fritz', - 'ext': 'mp4', - 'title': 'Fernando Verdasco v Taylor Fritz', - 'description': 're:^After his stunning victory.{174}$', - 'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0', - 'timestamp': 1521017381, - 'upload_date': '20180314', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires email and password of a subscribed account', - } - _NETRC_MACHINE = 'tennistv' - - def _login(self): - username, password = self._get_login_info() - if not username or not password: - raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) - - login_form = { - 'Email': username, - 'Password': password, - } - login_json = json.dumps(login_form).encode('utf-8') - headers = { - 'content-type': 'application/json', - 'Referer': 'https://www.tennistv.com/login', - 'Origin': 'https://www.tennistv.com', - } - - login_result = self._download_json( - 'https://www.tennistv.com/api/users/v1/login', None, - note='Logging in', - errnote='Login failed (wrong password?)', - headers=headers, - data=login_json) - - if login_result['error']['errorCode']: - raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) - - if login_result['entitlement'] != 'SUBSCRIBED': - self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME)) - - self._session_token = login_result['sessionToken'] - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id') - - headers = { - 'Origin': 'https://www.tennistv.com', - 'authorization': 'ATP %s' % self._session_token, - 'content-type': 'application/json', - 'Referer': url, - } - check_data = { - 'videoID': internal_id, - 'VideoUrlType': 'HLSV3', - } - check_json = json.dumps(check_data).encode('utf-8') - check_result = self._download_json( - 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', - video_id, note='Checking video authorization', headers=headers, data=check_json) - formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') - - vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id - vdata = self._download_json(vdata_url, video_id) - - timestamp = unified_timestamp(vdata['timestamp']) - thumbnail = vdata['video']['thumbnailUrl'] - description = vdata['displayText']['description'] - title = vdata['video']['title'] - - series = vdata['tour'] - venue = vdata['displayText']['venue'] - round_str = vdata['seo']['round'] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'series': series, - 'season': venue, - 'episode': round_str, - } diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py deleted file mode 100644 index cd30d57f4..000000000 --- a/youtube_dl/extractor/tenplay.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - HEADRequest, - parse_age_limit, - parse_iso8601, - # smuggle_url, -) - - -class TenPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' - _TESTS = [{ - 'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga', - 'info_dict': { - 'id': '6060533435001', - 'ext': 'mp4', - 'title': 'MasterChef - S1 Ep. 
1', - 'description': 'md5:4fe7b78e28af8f2d900cd20d900ef95c', - 'age_limit': 10, - 'timestamp': 1240828200, - 'upload_date': '20090427', - 'uploader_id': '2199827728001', - }, - 'params': { - # 'format': 'bestvideo', - 'skip_download': True, - } - }, { - 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', - 'only_matching': True, - }] - # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' - _GEO_BYPASS = False - _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect' - - def _real_extract(self, url): - content_id = self._match_id(url) - data = self._download_json( - 'https://10play.com.au/api/video/' + content_id, content_id) - video = data.get('video') or {} - metadata = data.get('metaData') or {} - brightcove_id = video.get('videoId') or metadata['showContentVideoId'] - # brightcove_url = smuggle_url( - # self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - # {'geo_countries': ['AU']}) - m3u8_url = self._request_webpage(HEADRequest( - self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl() - if '10play-not-in-oz' in m3u8_url: - self.raise_geo_restricted(countries=['AU']) - formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4') - self._sort_formats(formats) - - return { - # '_type': 'url_transparent', - # 'url': brightcove_url, - 'formats': formats, - 'id': brightcove_id, - 'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'], - 'description': video.get('description'), - 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), - 'series': metadata.get('showName'), - 'season': metadata.get('showContentSeason'), - 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), - 'thumbnail': video.get('poster'), - 'uploader_id': '2199827728001', - # 'ie_key': 'BrightcoveNew', - } diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py deleted file mode 100644 index 84a14a0bd..000000000 --- a/youtube_dl/extractor/testurl.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class TestURLIE(InfoExtractor): - """ Allows addressing of the test cases as test:yout.*be_1 """ - - IE_DESC = False # Do not list - _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$' - - def _real_extract(self, url): - from ..extractor import gen_extractors - - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - extractor_id = mobj.group('extractor') - all_extractors = gen_extractors() - - rex = re.compile(extractor_id, flags=re.IGNORECASE) - matching_extractors = [ - e for e in all_extractors if rex.search(e.IE_NAME)] - - if len(matching_extractors) == 0: - raise ExtractorError( - 'No extractors matching %r found' % extractor_id, - expected=True) - elif len(matching_extractors) > 1: - # Is it obvious which one to pick? 
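- # Prefer an exact (case-insensitive) IE_NAME match; otherwise the
- # ambiguity is reported to the user below.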
- try: - extractor = next( - ie for ie in matching_extractors - if ie.IE_NAME.lower() == extractor_id.lower()) - except StopIteration: - raise ExtractorError( - ('Found multiple matching extractors: %s' % - ' '.join(ie.IE_NAME for ie in matching_extractors)), - expected=True) - else: - extractor = matching_extractors[0] - - num_str = mobj.group('num') - num = int(num_str) if num_str else 0 - - testcases = [] - t = getattr(extractor, '_TEST', None) - if t: - testcases.append(t) - testcases.extend(getattr(extractor, '_TESTS', [])) - - try: - tc = testcases[num] - except IndexError: - raise ExtractorError( - ('Test case %d not found, got only %d tests' % - (num, len(testcases))), - expected=True) - - self.to_screen('Test URL: %s' % tc['url']) - - return self.url_result(tc['url'], video_id=video_id) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py deleted file mode 100644 index 23c2808a1..000000000 --- a/youtube_dl/extractor/tf1.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, - try_get, -) - - -class TF1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P<program_slug>[^/]+)/videos/(?P<id>[^/?&#]+)\.html' - _TESTS = [{ - 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', - 'info_dict': { - 'id': '13641379', - 'ext': 'mp4', - 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 'md5:a02cdb217141fb2d469d6216339b052f', - 'upload_date': '20190611', - 'timestamp': 1560273989, - 'duration': 1738, - 'series': 'Quotidien avec Yann Barthès', - 'tags': ['intégrale', 'quotidien', 'Replay'], - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - 'format': 'bestvideo', - }, - }, { - 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', - 'only_matching': True, - }, { - 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - program_slug, slug = re.match(self._VALID_URL, url).groups() - video = self._download_json( - 'https://www.tf1.fr/graphql/web', slug, query={ - 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', - 'variables': json.dumps({ - 'programSlug': program_slug, - 'slug': slug, - }) - })['data']['videoBySlug'] - wat_id = video['streamId'] - - tags = [] - for tag in (video.get('tags') or []): - label = tag.get('label') - if not label: - continue - tags.append(label) - - decoration = video.get('decoration') or {} - - thumbnails = [] - for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): - source_url = source.get('url') - if not source_url: - continue - thumbnails.append({ - 'url': source_url, - 'width': int_or_none(source.get('width')), - }) - - return { - '_type': 'url_transparent', - 'id': wat_id, - 'url': 'wat:' + wat_id, - 'title': video.get('title'), - 'thumbnails': thumbnails, - 'description': decoration.get('description'), - 'timestamp': parse_iso8601(video.get('date')), - 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), - 'tags': tags, - 'series': decoration.get('programLabel'), - 'season_number': int_or_none(video.get('season')), - 'episode_number': int_or_none(video.get('episode')), - } diff --git a/youtube_dl/extractor/theplatform.py 
b/youtube_dl/extractor/theplatform.py deleted file mode 100644 index adfe11e31..000000000 --- a/youtube_dl/extractor/theplatform.py +++ /dev/null @@ -1,414 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import time -import hmac -import binascii -import hashlib - - -from .once import OnceIE -from .adobepass import AdobePassIE -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - int_or_none, - sanitized_Request, - unsmuggle_url, - update_url_query, - xpath_with_ns, - mimetype2ext, - find_xpath_attr, -) - -default_ns = 'http://www.w3.org/2005/SMIL21/Language' -_x = lambda p: xpath_with_ns(p, {'smil': default_ns}) - - -class ThePlatformBaseIE(OnceIE): - _TP_TLD = 'com' - - def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - meta = self._download_xml( - smil_url, video_id, note=note, query={'format': 'SMIL'}, - headers=self.geo_verification_headers()) - error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') - if error_element is not None: - exception = find_xpath_attr( - error_element, _x('.//smil:param'), 'name', 'exception') - if exception is not None: - if exception.get('value') == 'GeoLocationBlocked': - self.raise_geo_restricted(error_element.attrib['abstract']) - elif error_element.attrib['src'].startswith( - 'http://link.theplatform.%s/s/errorFiles/Unavailable.' - % self._TP_TLD): - raise ExtractorError( - error_element.attrib['abstract'], expected=True) - - smil_formats = self._parse_smil_formats( - meta, smil_url, video_id, namespace=default_ns, - # the parameters are from syfy.com, other sites may use others, - # they also work for nbc.com - f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, - transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - - formats = [] - for _format in smil_formats: - if OnceIE.suitable(_format['url']): - formats.extend(self._extract_once_formats(_format['url'])) - else: - media_url = _format['url'] - if determine_ext(media_url) == 'm3u8': - hdnea2 = self._get_cookies(media_url).get('hdnea2') - if hdnea2: - _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) - - formats.append(_format) - - subtitles = self._parse_smil_subtitles(meta, default_ns) - - return formats, subtitles - - def _download_theplatform_metadata(self, path, video_id): - info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) - return self._download_json(info_url, video_id) - - def _parse_theplatform_metadata(self, info): - subtitles = {} - captions = info.get('captions') - if isinstance(captions, list): - for caption in captions: - lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles.setdefault(lang, []).append({ - 'ext': mimetype2ext(mime), - 'url': src, - }) - - duration = info.get('duration') - tp_chapters = info.get('chapters', []) - chapters = [] - if tp_chapters: - def _add_chapter(start_time, end_time): - start_time = float_or_none(start_time, 1000) - end_time = float_or_none(end_time, 1000) - if start_time is None or end_time is None: - return - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - }) - - for chapter in tp_chapters[:-1]: - _add_chapter(chapter.get('startTime'), chapter.get('endTime')) - _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) - - return { - 'title': info['title'], - 'subtitles': subtitles, - 'description': 
info['description'], - 'thumbnail': info['defaultThumbnailUrl'], - 'duration': float_or_none(duration, 1000), - 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, - 'uploader': info.get('billingCode'), - 'chapters': chapters, - } - - def _extract_theplatform_metadata(self, path, video_id): - info = self._download_theplatform_metadata(path, video_id) - return self._parse_theplatform_metadata(info) - - -class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): - _VALID_URL = r'''(?x) - (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? - |theplatform:)(?P<id>[^/\?&]+)''' - - _TESTS = [{ - # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ - 'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', - 'info_dict': { - 'id': 'e9I_cZgTgIPd', - 'ext': 'flv', - 'title': 'Blackberry\'s big, bold Z30', - 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', - 'duration': 247, - 'timestamp': 1383239700, - 'upload_date': '20131031', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': '404 Not Found', - }, { - # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/ - 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT', - 'info_dict': { - 'id': '22d_qsQ6MIRT', - 'ext': 'flv', - 'description': 'md5:ac330c9258c04f9d7512cf26b9595409', - 'title': 'Tesla Model S: A second step towards a cleaner motoring future', - 'timestamp': 1426176191, - 'upload_date': '20150312', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', - 'info_dict': { - 'id': 'yMBg9E8KFxZD', - 'ext': 'mp4', - 'description': 'md5:644ad9188d655b742f942bf2e06b002d', - 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', - 'uploader': 'EGSM', - } - }, { - 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', - 'only_matching': True, - }, { - 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', - 'md5': 'fb96bb3d85118930a5b055783a3bd992', - 'info_dict': { - 'id': 'tdy_or_siri_150701', - 'ext': 'mp4', - 'title': 'iPhone Siri’s sassy response to a math question has people talking', - 'description': 'md5:a565d1deadd5086f3331d57298ec6333', - 'duration': 83.0, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1435752600, - 'upload_date': '20150701', - 'uploader': 'NBCU-NEWS', - }, - }, { - # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 - # geo-restricted (US), HLS encrypted with AES-128 - 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', - 'only_matching': True, - }] - - @classmethod - def _extract_urls(cls, webpage): - m = re.search( - r'''(?x) - <meta\s+ - property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ - content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 - ''', webpage) - if m: - return [m.group('url')] - - # Are whitespaces ignored in URLs? 
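- # (Some pages embed the player URL with stray whitespace inside the
- # src attribute; the re.sub below strips it before returning the URL.)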
- # https://github.com/ytdl-org/youtube-dl/issues/12044 - matches = re.findall( - r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) - if matches: - return [re.sub(r'\s', '', list(zip(*matches))[1][0])] - - @staticmethod - def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): - flags = '10' if include_qs else '00' - expiration_date = '%x' % (int(time.time()) + life) - - def str_to_hex(str): - return binascii.b2a_hex(str.encode('ascii')).decode('ascii') - - def hex_to_bytes(hex): - return binascii.a2b_hex(hex.encode('ascii')) - - relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1) - clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) - checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() - sig = flags + expiration_date + checksum + str_to_hex(sig_secret) - return '%s&sig=%s' % (url, sig) - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass({ - 'countries': smuggled_data.get('geo_countries'), - }) - - mobj = re.match(self._VALID_URL, url) - provider_id = mobj.group('provider_id') - video_id = mobj.group('id') - - if not provider_id: - provider_id = 'dJ5BDC' - - path = provider_id + '/' - if mobj.group('media'): - path += mobj.group('media') - path += video_id - - qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - if 'guid' in qs_dict: - webpage = self._download_webpage(url, video_id) - scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage) - feed_id = None - # feed id usually locates in the last script. - # Seems there's no pattern for the interested script filename, so - # I try one by one - for script in reversed(scripts): - feed_script = self._download_webpage( - self._proto_relative_url(script, 'http:'), - video_id, 'Downloading feed script') - feed_id = self._search_regex( - r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, - 'default feed id', default=None) - if feed_id is not None: - break - if feed_id is None: - raise ExtractorError('Unable to find feed id') - return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % ( - provider_id, feed_id, qs_dict['guid'][0])) - - if smuggled_data.get('force_smil_url', False): - smil_url = url - # Explicitly specified SMIL (see https://github.com/ytdl-org/youtube-dl/issues/7385) - elif '/guid/' in url: - headers = {} - source_url = smuggled_data.get('source_url') - if source_url: - headers['Referer'] = source_url - request = sanitized_Request(url, headers=headers) - webpage = self._download_webpage(request, video_id) - smil_url = self._search_regex( - r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml', - webpage, 'smil url', group='url') - path = self._search_regex( - r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') - smil_url += '?' if '?' 
not in smil_url else '&' + 'formats=m3u,mpeg4' - elif mobj.group('config'): - config_url = url + '&form=json' - config_url = config_url.replace('swf/', 'config/') - config_url = config_url.replace('onsite/', 'onsite/config/') - config = self._download_json(config_url, video_id, 'Downloading config') - if 'releaseUrl' in config: - release_url = config['releaseUrl'] - else: - release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - smil_url = release_url + '&formats=MPEG4&manifest=f4m' - else: - smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - - sig = smuggled_data.get('sig') - if sig: - smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) - - formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) - self._sort_formats(formats) - - ret = self._extract_theplatform_metadata(path, video_id) - combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) - ret.update({ - 'id': video_id, - 'formats': formats, - 'subtitles': combined_subtitles, - }) - - return ret - - -class ThePlatformFeedIE(ThePlatformBaseIE): - _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' - _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))' - _TESTS = [{ - # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 - 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', - 'md5': '6e32495b5073ab414471b615c5ded394', - 'info_dict': { - 'id': 'n_hardball_5biden_140207', - 'ext': 'mp4', - 'title': 'The Biden factor: will Joe run in 2016?', - 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? 
Mark Halperin and Sam Stein weigh in.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140208', - 'timestamp': 1391824260, - 'duration': 467.0, - 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], - 'uploader': 'NBCU-NEWS', - }, - }, { - 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01', - 'only_matching': True, - }] - - def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): - real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) - entry = self._download_json(real_url, video_id)['entries'][0] - main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl') - - formats = [] - subtitles = {} - first_video_id = None - duration = None - asset_types = [] - for item in entry['media$content']: - smil_url = item['plfile$url'] - cur_video_id = ThePlatformIE._match_id(smil_url) - if first_video_id is None: - first_video_id = cur_video_id - duration = float_or_none(item.get('plfile$duration')) - file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes'] - for asset_type in file_asset_types: - if asset_type in asset_types: - continue - asset_types.append(asset_type) - query = { - 'mbr': 'true', - 'formats': item['plfile$format'], - 'assetTypes': asset_type, - } - if asset_type in asset_types_query: - query.update(asset_types_query[asset_type]) - cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query( - main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type) - formats.extend(cur_formats) - subtitles = self._merge_subtitles(subtitles, cur_subtitles) - - self._sort_formats(formats) - - thumbnails = [{ - 'url': thumbnail['plfile$url'], - 'width': int_or_none(thumbnail.get('plfile$width')), - 'height': int_or_none(thumbnail.get('plfile$height')), - } for thumbnail in entry.get('media$thumbnails', [])] - - timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) - categories = [item['media$name'] for item in entry.get('media$categories', [])] - - ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id) - subtitles = self._merge_subtitles(subtitles, ret['subtitles']) - ret.update({ - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'duration': duration, - 'timestamp': timestamp, - 'categories': categories, - }) - if custom_fields: - ret.update(custom_fields(entry)) - - return ret - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - provider_id = mobj.group('provider_id') - feed_id = mobj.group('feed_id') - filter_query = mobj.group('filter') - - return self._extract_feed_info(provider_id, feed_id, filter_query, video_id) diff --git a/youtube_dl/extractor/theweatherchannel.py b/youtube_dl/extractor/theweatherchannel.py deleted file mode 100644 index b2a8c3797..000000000 --- a/youtube_dl/extractor/theweatherchannel.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .theplatform import ThePlatformIE -from ..utils import ( - determine_ext, - parse_duration, - parse_iso8601, -) - - -class TheWeatherChannelIE(ThePlatformIE): - _VALID_URL = 
r'https?://(?:www\.)?weather\.com(?P<asset_name>(?:/(?P<locale>[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P<id>[^/?#]+))' - _TESTS = [{ - 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', - 'info_dict': { - 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', - 'ext': 'mp4', - 'title': 'Ice Climber Is In For A Shock', - 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', - 'uploader': 'TWC - Digital (No Distro)', - 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', - 'upload_date': '20160720', - 'timestamp': 1469018835, - } - }, { - 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', - 'only_matching': True, - }] - - def _real_extract(self, url): - asset_name, locale, display_id = re.match(self._VALID_URL, url).groups() - if not locale: - locale = 'en-US' - video_data = list(self._download_json( - 'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{ - 'name': 'getCMSAssetsUrlConfig', - 'params': { - 'language': locale.replace('-', '_'), - 'query': { - 'assetName': { - '$in': asset_name, - }, - }, - } - }]).encode(), headers={ - 'Content-Type': 'application/json', - })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0] - video_id = video_data['id'] - seo_meta = video_data.get('seometa', {}) - title = video_data.get('title') or seo_meta['title'] - - urls = [] - thumbnails = [] - formats = [] - for variant_id, variant_url in video_data.get('variants', []).items(): - variant_url = variant_url.strip() - if not variant_url or variant_url in urls: - continue - urls.append(variant_url) - ext = determine_ext(variant_url) - if ext == 'jpg': - thumbnails.append({ - 'url': variant_url, - 'id': variant_id, - }) - elif ThePlatformIE.suitable(variant_url): - tp_formats, _ = self._extract_theplatform_smil(variant_url, video_id) - formats.extend(tp_formats) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - variant_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=variant_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - variant_url, video_id, f4m_id=variant_id, fatal=False)) - else: - formats.append({ - 'url': variant_url, - 'format_id': variant_id, - }) - self._sort_formats(formats) - - cc_url = video_data.get('cc_url') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description') or seo_meta.get('description') or seo_meta.get('og:description'), - 'duration': parse_duration(video_data.get('duration')), - 'uploader': video_data.get('providername'), - 'uploader_id': video_data.get('providerid'), - 'timestamp': parse_iso8601(video_data.get('publishdate')), - 'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py deleted file mode 100644 index dc3dd03c8..000000000 --- a/youtube_dl/extractor/thisav.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import remove_end - - -class ThisAVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' - _TESTS = [{ - # jwplayer - 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', - 'md5': '0480f1ef3932d901f0e0e719f188f19b', - 'info_dict': { - 'id': '47734', - 'ext': 'flv', - 'title': '高樹マリア - 
Just fit', - 'uploader': 'dj7970', - 'uploader_id': 'dj7970' - } - }, { - # html5 media - 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', - 'md5': 'ba90c076bd0f80203679e5b60bf523ee', - 'info_dict': { - 'id': '242352', - 'ext': 'mp4', - 'title': 'Nerdy 18yo Big Ass Tattoos and Glasses', - 'uploader': 'cybersluts', - 'uploader_id': 'cybersluts', - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), - ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') - video_url = self._html_search_regex( - r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) - if video_url: - info_dict = { - 'formats': [{ - 'url': video_url, - }], - } - else: - entries = self._parse_html5_media_entries(url, webpage, video_id) - if entries: - info_dict = entries[0] - else: - info_dict = self._extract_jwplayer_data( - webpage, video_id, require_title=False) - uploader = self._html_search_regex( - r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', - webpage, 'uploader name', fatal=False) - uploader_id = self._html_search_regex( - r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', - webpage, 'uploader id', fatal=False) - - info_dict.update({ - 'id': video_id, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'title': title, - }) - - return info_dict diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py deleted file mode 100644 index f6d37bb9e..000000000 --- a/youtube_dl/extractor/threeqsdn.py +++ /dev/null @@ -1,164 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, -) - - -class ThreeQSDNIE(InfoExtractor): - IE_NAME = '3qsdn' - IE_DESC = '3Q SDN' - _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - # https://player.3qsdn.com/demo.html - 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', - 'md5': '64a57396b16fa011b15e0ea60edce918', - 'info_dict': { - 'id': '7201c779-6b3c-11e7-a40e-002590c750be', - 'ext': 'mp4', - 'title': 'Video Ads', - 'is_live': False, - 'description': 'Video Ads Demo', - 'timestamp': 1500334803, - 'upload_date': '20170717', - 'duration': 888.032, - 'subtitles': { - 'eng': 'count:1', - }, - }, - 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], - }, { - # live video stream - 'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be', - 'info_dict': { - 'id': '66e68995-11ca-11e8-9273-002590c750be', - 'ext': 'mp4', - 'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'is_live': True, - }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, - }, { - # live audio stream - 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', - 'only_matching': True, - }, { - # live audio stream with some 404 URLs - 'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48', - 'only_matching': True, - }, { - # geo restricted with 'This content is not available in your country' - 'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48', - 'only_matching': True, - }, { - # geo 
restricted with 'playout.3qsdn.com/forbidden' - 'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48', - 'only_matching': True, - }, { - # live video with rtmp link - 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', - 'only_matching': True, - }, { - # ondemand from http://www.philharmonie.tv/veranstaltung/26/ - 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', - 'only_matching': True, - }, { - # live video stream - 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - config = self._download_json( - url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self.raise_geo_restricted() - raise - - live = config.get('streamContent') == 'live' - aspect = float_or_none(config.get('aspect')) - - formats = [] - for source_type, source in (config.get('sources') or {}).items(): - if not source: - continue - if source_type == 'dash': - formats.extend(self._extract_mpd_formats( - source, video_id, mpd_id='mpd', fatal=False)) - elif source_type == 'hls': - formats.extend(self._extract_m3u8_formats( - source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif source_type == 'progressive': - for s in source: - src = s.get('src') - if not (src and self._is_valid_url(src, video_id)): - continue - width = None - format_id = ['http'] - ext = determine_ext(src) - if ext: - format_id.append(ext) - height = int_or_none(s.get('height')) - if height: - format_id.append('%dp' % height) - if aspect: - width = int(height * aspect) - formats.append({ - 'ext': ext, - 'format_id': '-'.join(format_id), - 'height': height, - 'source_preference': 0, - 'url': src, - 'vcodec': 'none' if height == 0 else None, - 'width': width, - }) - for f in formats: - if f.get('acodec') == 'none': - f['preference'] = -40 - elif f.get('vcodec') == 'none': - f['preference'] = -50 - self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id')) - - subtitles = {} - for subtitle in (config.get('subtitles') or []): - src = subtitle.get('src') - if not src: - continue - subtitles.setdefault(subtitle.get('label') or 'eng', []).append({ - 'url': src, - }) - - title = config.get('title') or video_id - - return { - 'id': video_id, - 'title': self._live_title(title) if live else title, - 'thumbnail': config.get('poster') or None, - 'description': config.get('description') or None, - 'timestamp': parse_iso8601(config.get('upload_date')), - 'duration': float_or_none(config.get('vlength')) or None, - 'is_live': live, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py deleted file mode 100644 index 4faa6de54..000000000 --- a/youtube_dl/extractor/tiktok.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - compat_str, - ExtractorError, - float_or_none, - int_or_none, - str_or_none, - try_get, - url_or_none, -) - - -class TikTokBaseIE(InfoExtractor): - def 
_extract_video(self, data, video_id=None): - video = data['video'] - description = str_or_none(try_get(data, lambda x: x['desc'])) - width = int_or_none(try_get(data, lambda x: video['width'])) - height = int_or_none(try_get(data, lambda x: video['height'])) - - format_urls = set() - formats = [] - for format_id in ('download', 'play'): - format_url = url_or_none(video.get('%sAddr' % format_id)) - if not format_url: - continue - if format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - 'http_headers': { - 'Referer': 'https://www.tiktok.com/', - } - }) - self._sort_formats(formats) - - thumbnail = url_or_none(video.get('cover')) - duration = float_or_none(video.get('duration')) - - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - uploader_id = try_get(data, lambda x: x['author']['id'], compat_str) - - timestamp = int_or_none(data.get('createTime')) - - def stats(key): - return int_or_none(try_get( - data, lambda x: x['stats']['%sCount' % key])) - - view_count = stats('play') - like_count = stats('digg') - comment_count = stats('comment') - repost_count = stats('share') - - aweme_id = data.get('id') or video_id - - return { - 'id': aweme_id, - 'title': uploader or aweme_id, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'timestamp': timestamp, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, - 'repost_count': repost_count, - 'formats': formats, - } - - -class TikTokIE(TikTokBaseIE): - _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@[^/]+/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.tiktok.com/@zureeal/video/6606727368545406213', - 'md5': '163ceff303bb52de60e6887fe399e6cd', - 'info_dict': { - 'id': '6606727368545406213', - 'ext': 'mp4', - 'title': 'Zureeal', - 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', - 'thumbnail': r're:^https?://.*', - 'duration': 15, - 'uploader': 'Zureeal', - 'uploader_id': '188294915489964032', - 'timestamp': 1538248586, - 'upload_date': '20180929', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - } - }] - - def _real_initialize(self): - # Setup session (will set necessary cookies) - self._request_webpage( - 'https://www.tiktok.com/', None, note='Setting up session') - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - page_props = self._parse_json(self._search_regex( - r'<script[^>]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*</script', - webpage, 'data'), video_id)['props']['pageProps'] - data = try_get(page_props, lambda x: x['itemInfo']['itemStruct'], dict) - if not data and page_props.get('statusCode') == 10216: - raise ExtractorError('This video is private', expected=True) - return self._extract_video(data, video_id) - - -class TikTokUserIE(TikTokBaseIE): - _VALID_URL = r'https://(?:www\.)?tiktok\.com/@(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.tiktok.com/@zureeal', - 'info_dict': { - 'id': '188294915489964032', - }, - 'playlist_mincount': 24, - }] - _WORKING = False - - @classmethod - def suitable(cls, url): - return False if TikTokIE.suitable(url) else super(TikTokUserIE, cls).suitable(url) - - def _real_extract(self, url): - user_id = self._match_id(url) - data = self._download_json( - 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, 
user_id, - query={'_signature': '_'}) - entries = [] - for aweme in data['aweme_list']: - try: - entry = self._extract_video(aweme) - except ExtractorError: - continue - entry['extractor_key'] = TikTokIE.ie_key() - entries.append(entry) - return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py deleted file mode 100644 index bc2def508..000000000 --- a/youtube_dl/extractor/tinypic.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class TinyPicIE(InfoExtractor): - IE_NAME = 'tinypic' - IE_DESC = 'tinypic.com videos' - _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' - - _TESTS = [ - { - 'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8', - 'md5': '609b74432465364e72727ebc6203f044', - 'info_dict': { - 'id': '6xw7tc', - 'ext': 'flv', - 'title': 'shadow phenomenon weird', - }, - }, - { - 'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id, 'Downloading page') - - mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n' - r'\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage) - if mobj is None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - file_id = mobj.group('fileid') - server_id = mobj.group('serverid') - - KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting' - keywords = self._html_search_meta('keywords', webpage, 'title') - title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else '' - - video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id) - thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id) - - return { - 'id': file_id, - 'url': video_url, - 'thumbnail': thumbnail, - 'title': title - } diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py deleted file mode 100644 index 3d1bf75ff..000000000 --- a/youtube_dl/extractor/tmz.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .kaltura import KalturaIE -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class TMZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.tmz.com/videos/0-cegprt2p/', - 'md5': '31f9223e20eef55954973359afa61a20', - 'info_dict': { - 'id': 'P6YjLBLk', - 'ext': 'mp4', - 'title': "No Charges Against Hillary Clinton? 
Harvey Says It Ain't Over Yet", - 'description': 'md5:b714359fc18607715ebccbd2da8ff488', - 'timestamp': 1467831837, - 'upload_date': '20160706', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, { - 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'only_matching': True, - }, { - 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', - 'only_matching': True, - }, { - 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url).replace('-', '_') - - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - tmz_video_id = self._search_regex( - r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})', - webpage, 'video id', default=None) - video = self._download_json( - 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id, - fatal=False) - if video: - message = video['message'] - info = { - '_type': 'url_transparent', - 'title': message.get('title'), - 'description': message.get('description'), - 'timestamp': unified_timestamp(message.get('published_at')), - 'duration': int_or_none(message.get('duration')), - } - jwplatform_id = message.get('jwplayer_media_id') - if jwplatform_id: - info.update({ - 'url': 'jwplatform:%s' % jwplatform_id, - 'ie_key': JWPlatformIE.ie_key(), - }) - else: - kaltura_entry_id = message.get('kaltura_entry_id') or video_id - kaltura_partner_id = message.get('kaltura_partner_id') or '591531' - info.update({ - 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id), - 'ie_key': KalturaIE.ie_key(), - }) - return info - - return self.url_result( - 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id) - - -class TMZArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'info_dict': { - 'id': 'PAKZa97W', - 'ext': 'mp4', - 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', - 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. 
She\'s watching me."', - 'timestamp': 1429466400, - 'upload_date': '20150419', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [JWPlatformIE.ie_key()], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - tmz_url = self._search_regex( - r'clickLink\s*\(\s*["\'](?P<url>%s)' % TMZIE._VALID_URL, webpage, - 'video id', default=None, group='url') - if tmz_url: - return self.url_result(tmz_url, ie=TMZIE.ie_key()) - - embedded_video_info = self._parse_json(self._html_search_regex( - r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), - video_id) - return self.url_result( - 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'], - ie=TMZIE.ie_key()) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py deleted file mode 100644 index b3573c6e0..000000000 --- a/youtube_dl/extractor/tnaflix.py +++ /dev/null @@ -1,327 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - fix_xml_ampersands, - float_or_none, - int_or_none, - parse_duration, - str_to_int, - unescapeHTML, - xpath_text, -) - - -class TNAFlixNetworkBaseIE(InfoExtractor): - # May be overridden in descendants if necessary - _CONFIG_REGEX = [ - r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"', - r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"', - r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', - ] - _HOST = 'tna' - _VKEY_SUFFIX = '' - _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' - _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' - _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' - _VIEW_COUNT_REGEX = None - _COMMENT_COUNT_REGEX = None - _AVERAGE_RATING_REGEX = None - _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>' - - def _extract_thumbnails(self, flix_xml): - - def get_child(elem, names): - for name in names: - child = elem.find(name) - if child is not None: - return child - - timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage']) - if timeline is None: - return - - pattern_el = get_child(timeline, ['imagePattern', 'pattern']) - if pattern_el is None or not pattern_el.text: - return - - first_el = get_child(timeline, ['imageFirst', 'first']) - last_el = get_child(timeline, ['imageLast', 'last']) - if first_el is None or last_el is None: - return - - first_text = first_el.text - last_text = last_el.text - if not first_text.isdigit() or not last_text.isdigit(): - return - - first = int(first_text) - last = int(last_text) - if first > last: - return - - width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width')) - height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height')) - - return [{ - 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'), - 'width': width, - 'height': height, - } for i in range(first, last + 1)] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - for display_id_key in ('display_id', 'display_id_2'): - if display_id_key in mobj.groupdict(): - display_id = mobj.group(display_id_key) - if display_id: - break - else: - display_id = video_id - - webpage = self._download_webpage(url, display_id) - - cfg_url = self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config', 
default=None, - group='url'), 'http:') - - if not cfg_url: - inputs = self._hidden_inputs(webpage) - cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' - % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) - - cfg_xml = self._download_xml( - cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands, headers={'Referer': url}) - - formats = [] - - def extract_video_url(vl): - # Any URL modification now results in HTTP Error 403: Forbidden - return unescapeHTML(vl.text) - - video_link = cfg_xml.find('./videoLink') - if video_link is not None: - formats.append({ - 'url': extract_video_url(video_link), - 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), - }) - - for item in cfg_xml.findall('./quality/item'): - video_link = item.find('./videoLink') - if video_link is None: - continue - res = item.find('res') - format_id = None if res is None else res.text - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) - formats.append({ - 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), - 'format_id': format_id, - 'height': height, - }) - - self._sort_formats(formats) - - thumbnail = self._proto_relative_url( - xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') - thumbnails = self._extract_thumbnails(cfg_xml) - - title = None - if self._TITLE_REGEX: - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title', default=None) - if not title: - title = self._og_search_title(webpage) - - age_limit = self._rta_search(webpage) or 18 - - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', default=None)) - - def extract_field(pattern, name): - return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None - - description = extract_field(self._DESCRIPTION_REGEX, 'description') - uploader = extract_field(self._UPLOADER_REGEX, 'uploader') - view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) - comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) - average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) - - categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') - categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'thumbnails': thumbnails, - 'duration': duration, - 'age_limit': age_limit, - 'uploader': uploader, - 'view_count': view_count, - 'comment_count': comment_count, - 'average_rating': average_rating, - 'categories': categories, - 'formats': formats, - } - - -class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' - - _TITLE_REGEX = r'<title>([^<]+)</title>' - - _TESTS = [{ - 'url': 'https://player.tnaflix.com/video/6538', - 'info_dict': { - 'id': '6538', - 'display_id': '6538', - 'ext': 'mp4', - 'title': 'Educational xxx video', - 'thumbnail': r're:https?://.*\.jpg$', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://player.empflix.com/video/33051', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - 
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', - webpage)] - - -class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): - _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<' - _UPLOADER_REGEX = r'<span>by\s*<a[^>]+\bhref=["\']/profile/[^>]+>([^<]+)<' - _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>' - - -class TNAFlixIE(TNAEMPFlixBaseIE): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' - - _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)</title>' - - _TESTS = [{ - # anonymous uploader, no categories - 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': '7e569419fe6d69543d01e6be22f5f7c4', - 'info_dict': { - 'id': '553878', - 'display_id': 'Carmella-Decesare-striptease', - 'ext': 'mp4', - 'title': 'Carmella Decesare - striptease', - 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 91, - 'age_limit': 18, - 'categories': ['Porn Stars'], - } - }, { - # non-anonymous uploader, categories - 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': '0f5d4d490dbfd117b8607054248a07c0', - 'info_dict': { - 'id': '6538', - 'display_id': 'Educational-xxx-video', - 'ext': 'mp4', - 'title': 'Educational xxx video', - 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', - 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 164, - 'age_limit': 18, - 'uploader': 'bobwhite39', - 'categories': list, - } - }, { - 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', - 'only_matching': True, - }] - - -class EMPFlixIE(TNAEMPFlixBaseIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)' - - _HOST = 'emp' - _VKEY_SUFFIX = '-1' - - _TESTS = [{ - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'bc30d48b91a7179448a0bda465114676', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 83, - 'age_limit': 18, - 'uploader': 'cwbike', - 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], - } - }, { - 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', - 'only_matching': True, - }, { - 'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', - 'only_matching': True, - }] - - -class MovieFapIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' - - _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>' - _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' - _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>' - _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>' - - _TESTS = [{ - # normal, multi-format video - 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', - 'md5': '26624b4e2523051b550067d547615906', - 'info_dict': { - 'id': 'be9867c9416c19f54a4a', - 'display_id': 'experienced-milf-amazing-handjob', - 'ext': 'mp4', - 'title': 'Experienced MILF Amazing Handjob', - 'description': 'Experienced MILF giving an Amazing Handjob', - 'thumbnail': r're:https?://.*\.jpg$', - 'age_limit': 18, - 'uploader': 'darvinfred06', - 'view_count': int, - 'comment_count': 
int, - 'average_rating': float, - 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'], - } - }, { - # quirky single-format case where the extension is given as fid, but the video is really an flv - 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', - 'md5': 'fa56683e291fc80635907168a743c9ad', - 'info_dict': { - 'id': 'e5da0d3edce5404418f5', - 'display_id': 'jeune-couple-russe', - 'ext': 'flv', - 'title': 'Jeune Couple Russe', - 'description': 'Amateur', - 'thumbnail': r're:https?://.*\.jpg$', - 'age_limit': 18, - 'uploader': 'whiskeyjar', - 'view_count': int, - 'comment_count': int, - 'average_rating': float, - 'categories': ['Amateur', 'Teen'], - } - }] diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py deleted file mode 100644 index 270c84daa..000000000 --- a/youtube_dl/extractor/toggle.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, - strip_or_none, -) - - -class ToggleIE(InfoExtractor): - IE_NAME = 'toggle' - _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', - 'info_dict': { - 'id': '343115', - 'ext': 'mp4', - 'title': 'Lion Moms Premiere', - 'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b', - 'upload_date': '20150910', - 'timestamp': 1441858274, - }, - 'params': { - 'skip_download': 'm3u8 download', - } - }, { - 'note': 'DRM-protected video', - 'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413', - 'info_dict': { - 'id': '341413', - 'ext': 'wvm', - 'title': 'Dug\'s Special Mission', - 'description': 'md5:e86c6f4458214905c1772398fabc93e0', - 'upload_date': '20150827', - 'timestamp': 1440644006, - }, - 'params': { - 'skip_download': 'DRM-protected wvm download', - } - }, { - # this also tests correct video id extraction - 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', - 'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', - 'info_dict': { - 'id': '332861', - 'ext': 'mp4', - 'title': '28th SEA Games (5 Show) - Episode 11', - 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', - 'upload_date': '20150605', - 'timestamp': 1433480166, - }, - 'params': { - 'skip_download': 'DRM-protected wvm download', - }, - 'skip': 'm3u8 links are geo-restricted' - }, { - 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', - 'only_matching': True, - }, { - 'url': 'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', - 'only_matching': True, - }, { - 'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367', - 'only_matching': True, - }, { - 'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', - 'only_matching': True, - }, { - 'url': 'http://www.mewatch.sg/en/movies/seven-days/321936', - 'only_matching': True, - }, { - 'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', - 'only_matching': True, - }, { - 'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585', - 'only_matching': True, - }] - - _API_USER = 
'tvpapi_147' - _API_PASS = '11111' - - def _real_extract(self, url): - video_id = self._match_id(url) - - params = { - 'initObj': { - 'Locale': { - 'LocaleLanguage': '', - 'LocaleCountry': '', - 'LocaleDevice': '', - 'LocaleUserState': 0 - }, - 'Platform': 0, - 'SiteGuid': 0, - 'DomainID': '0', - 'UDID': '', - 'ApiUser': self._API_USER, - 'ApiPass': self._API_PASS - }, - 'MediaID': video_id, - 'mediaType': 0, - } - - info = self._download_json( - 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', - video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8')) - - title = info['MediaName'] - - formats = [] - for video_file in info.get('Files', []): - video_url, vid_format = video_file.get('URL'), video_file.get('Format') - if not video_url or video_url == 'NA' or not vid_format: - continue - ext = determine_ext(video_url) - vid_format = vid_format.replace(' ', '') - # if geo-restricted, m3u8 is inaccessible, but mp4 is okay - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, ext='mp4', m3u8_id=vid_format, - note='Downloading %s m3u8 information' % vid_format, - errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False) - for f in m3u8_formats: - # Apple FairPlay Streaming - if '/fpshls/' in f['url']: - continue - formats.append(f) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id=vid_format, - note='Downloading %s MPD manifest' % vid_format, - errnote='Failed to download %s MPD manifest' % vid_format, - fatal=False)) - elif ext == 'ism': - formats.extend(self._extract_ism_formats( - video_url, video_id, ism_id=vid_format, - note='Downloading %s ISM manifest' % vid_format, - errnote='Failed to download %s ISM manifest' % vid_format, - fatal=False)) - elif ext == 'mp4': - formats.append({ - 'ext': ext, - 'url': video_url, - 'format_id': vid_format, - }) - if not formats: - for meta in (info.get('Metas') or []): - if meta.get('Key') == 'Encryption' and meta.get('Value') == '1': - raise ExtractorError( - 'This video is DRM protected.', expected=True) - # Most likely because geo-blocked - raise ExtractorError('No downloadable videos found', expected=True) - self._sort_formats(formats) - - thumbnails = [] - for picture in info.get('Pictures', []): - if not isinstance(picture, dict): - continue - pic_url = picture.get('URL') - if not pic_url: - continue - thumbnail = { - 'url': pic_url, - } - pic_size = picture.get('PicSize', '') - m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size) - if m: - thumbnail.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - thumbnails.append(thumbnail) - - def counter(prefix): - return int_or_none( - info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(info.get('Description')), - 'duration': int_or_none(info.get('Duration')), - 'timestamp': parse_iso8601(info.get('CreationDate') or None), - 'average_rating': float_or_none(info.get('Rating')), - 'view_count': counter('View'), - 'like_count': counter('Like'), - 'thumbnails': thumbnails, - 'formats': formats, - } - - -class MeWatchIE(InfoExtractor): - IE_NAME = 'mewatch' - _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', - 'info_dict': { - 'id': '1008625', - 'ext': 'mp4', - 'title': 'Recipe Of Life 味之道', - 'timestamp': 
1603306526, - 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', - 'upload_date': '20201021', - }, - 'params': { - 'skip_download': 'm3u8 download', - }, - }, { - 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-搜密。打卡。小红点-S2-E1-176232', - 'only_matching': True, - }, { - 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232', - 'only_matching': True, - }, { - 'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759', - 'only_matching': True, - }] - - def _real_extract(self, url): - item_id = self._match_id(url) - custom_id = self._download_json( - 'https://cdn.mewatch.sg/api/items/' + item_id, - item_id, query={'segments': 'all'})['customId'] - return self.url_result( - 'toggle:' + custom_id, ToggleIE.ie_key(), custom_id) diff --git a/youtube_dl/extractor/toongoggles.py b/youtube_dl/extractor/toongoggles.py deleted file mode 100644 index b5ba1c01d..000000000 --- a/youtube_dl/extractor/toongoggles.py +++ /dev/null @@ -1,81 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, -) - - -class ToonGogglesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?' - _TESTS = [{ - 'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football', - 'md5': '18289fc2b951eff6b953a9d8f01e6831', - 'info_dict': { - 'id': '217147', - 'ext': 'mp4', - 'title': 'Football', - 'uploader_id': '1', - 'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.', - 'upload_date': '20160718', - 'timestamp': 1468879330, - } - }, { - 'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world', - 'info_dict': { - 'id': '227759', - 'title': 'Om Nom Stories Around The World', - }, - 'playlist_mincount': 11, - }] - - def _call_api(self, action, page_id, query): - query.update({ - 'for_ng': 1, - 'for_web': 1, - 'show_meta': 1, - 'version': 7.0, - }) - return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query) - - def _parse_episode_data(self, episode_data): - title = episode_data['episode_name'] - - return { - '_type': 'url_transparent', - 'id': episode_data['episode_id'], - 'title': title, - 'url': 'kaltura:513551:' + episode_data['entry_id'], - 'thumbnail': episode_data.get('thumbnail_url'), - 'description': episode_data.get('description'), - 'duration': parse_duration(episode_data.get('hms')), - 'series': episode_data.get('show_name'), - 'season_number': int_or_none(episode_data.get('season_num')), - 'episode_id': episode_data.get('episode_id'), - 'episode': title, - 'episode_number': int_or_none(episode_data.get('episode_num')), - 'categories': episode_data.get('categories'), - 'ie_key': 'Kaltura', - } - - def _real_extract(self, url): - show_id, episode_id = re.match(self._VALID_URL, url).groups() - if episode_id: - episode_data = self._call_api('search', episode_id, { - 'filter': 'episode', - 'id': episode_id, - })['objects'][0] - return self._parse_episode_data(episode_data) - else: - show_data = self._call_api('getepisodesbyshow', show_id, { - 'max': 1000000000, - 'showid': show_id, - }) - entries = [] - for episode_data in show_data.get('objects', []): - entries.append(self._parse_episode_data(episode_data)) - return 
self.playlist_result(entries, show_id, show_data.get('show_name')) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py deleted file mode 100644 index 44b022fca..000000000 --- a/youtube_dl/extractor/toutv.py +++ /dev/null @@ -1,93 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .radiocanada import RadioCanadaIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - merge_dicts, -) - - -class TouTvIE(RadioCanadaIE): - _NETRC_MACHINE = 'toutv' - IE_NAME = 'tou.tv' - _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' - - _TESTS = [{ - 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', - 'info_dict': { - 'id': '122017', - 'ext': 'mp4', - 'title': 'Saison 2015 Épisode 17', - 'description': 'La photo de famille 2', - 'upload_date': '20100717', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': '404 Not Found', - }, { - 'url': 'http://ici.tou.tv/hackers', - 'only_matching': True, - }, { - 'url': 'https://ici.tou.tv/l-age-adulte/S01C501', - 'only_matching': True, - }] - _CLIENT_KEY = '90505c8d-9c34-4f34-8da1-3a85bdc6d4f4' - - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - try: - self._access_token = self._download_json( - 'https://services.radio-canada.ca/toutv/profiling/accounts/login', - None, 'Logging in', data=json.dumps({ - 'ClientId': self._CLIENT_KEY, - 'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20', - 'Email': email, - 'Password': password, - 'Scope': 'id.write media-validation.read', - }).encode(), headers={ - 'Authorization': 'client-key ' + self._CLIENT_KEY, - 'Content-Type': 'application/json;charset=utf-8', - })['access_token'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['Message'] - raise ExtractorError(error, expected=True) - raise - self._claims = self._call_api('validation/v2/getClaims')['claims'] - - def _real_extract(self, url): - path = self._match_id(url) - metadata = self._download_json( - 'https://services.radio-canada.ca/toutv/presentation/%s' % path, path, query={ - 'client_key': self._CLIENT_KEY, - 'device': 'web', - 'version': 4, - }) - # IsDrm does not necessarily mean the video is DRM protected (see - # https://github.com/ytdl-org/youtube-dl/issues/13994). 
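# A minimal standalone sketch of this warn-don't-fail pattern, assuming only
# the stdlib; finalize_formats and its arguments are illustrative names, not
# part of the extractor:
#
import warnings

def finalize_formats(formats, metadata, video_id):
    # The flag is advisory: warn but keep extracting, since flagged items
    # often still expose playable formats.
    if metadata.get('IsDrm'):
        warnings.warn('%s is probably DRM protected' % video_id)
    # Fail hard only once we know nothing playable was actually found
    # (compare the Encryption-meta check in ToggleIE further up).
    if not formats:
        raise RuntimeError('%s: no downloadable formats found' % video_id)
    return formats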
- if metadata.get('IsDrm'): - self.report_warning('This video is probably DRM protected.', path) - video_id = metadata['IdMedia'] - details = metadata['Details'] - - return merge_dicts({ - 'id': video_id, - 'title': details.get('OriginalTitle'), - 'description': details.get('Description'), - 'thumbnail': details.get('ImageUrl'), - 'duration': int_or_none(details.get('LengthInSeconds')), - 'series': metadata.get('ProgramTitle'), - 'season_number': int_or_none(metadata.get('SeasonNumber')), - 'season': metadata.get('SeasonTitle'), - 'episode_number': int_or_none(metadata.get('EpisodeNumber')), - 'episode': metadata.get('EpisodeTitle'), - }, self._extract_info(metadata.get('AppCode', 'toutv'), video_id)) diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py deleted file mode 100644 index 747370d12..000000000 --- a/youtube_dl/extractor/traileraddict.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class TrailerAddictIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' - _TEST = { - 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', - 'md5': '41365557f3c8c397d091da510e73ceb4', - 'info_dict': { - 'id': '76184', - 'ext': 'mp4', - 'title': 'Prince Avalanche Trailer', - 'description': 'Trailer for Prince Avalanche.\n\nTwo highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('movie') + '/' + mobj.group('trailer_name') - webpage = self._download_webpage(url, name) - - title = self._search_regex(r'<title>(.+?)</title>', - webpage, 'video title').replace(' - Trailer Addict', '') - view_count_str = self._search_regex( - r'<span class="views_n">([0-9,.]+)</span>', - webpage, 'view count', fatal=False) - view_count = ( - None if view_count_str is None - else int(view_count_str.replace(',', ''))) - video_id = self._search_regex( - r'<param\s+name="movie"\s+value="/emb/([0-9]+)"\s*/>', - webpage, 'video id') - - # Presence of (no)watchplus function indicates HD quality is available - if re.search(r'function (no)?watchplus()', webpage): - fvar = 'fvarhd' - else: - fvar = 'fvar' - - info_url = 'http://www.traileraddict.com/%s.php?tid=%s' % (fvar, str(video_id)) - info_webpage = self._download_webpage(info_url, video_id, 'Downloading the info webpage') - - final_url = self._search_regex(r'&fileurl=(.+)', - info_webpage, 'Download url').replace('%3F', '?') - thumbnail_url = self._search_regex(r'&image=(.+?)&', - info_webpage, 'thumbnail url') - - description = self._html_search_regex( - r'(?s)<div class="synopsis">.*?<div class="movie_label_info"[^>]*>(.*?)</div>', - webpage, 'description', fatal=False) - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - 'view_count': view_count, - } diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py deleted file mode 100644 index de0107aa9..000000000 --- a/youtube_dl/extractor/trovo.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - 
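# The Trovo extractors below drive a GraphQL endpoint with plain HTTP.
# Reduced to the stdlib, the round trip is roughly the sketch below;
# https://gql.trovo.live/ and the query shapes are taken from the code that
# follows, gql_post is an illustrative helper, and error handling is omitted:
#
import json
import urllib.request

def gql_post(queries):
    # TrovoVodIE posts a JSON list of query objects and receives a list of
    # {'data': ...} replies; TrovoIE instead passes a single query string as
    # a URL parameter on a GET request.
    req = urllib.request.Request(
        'https://gql.trovo.live/',
        data=json.dumps(queries).encode('utf-8'),
        headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode('utf-8'))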
str_or_none, - try_get, -) - - -class TrovoBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' - - def _extract_streamer_info(self, data): - streamer_info = data.get('streamerInfo') or {} - username = streamer_info.get('userName') - return { - 'uploader': streamer_info.get('nickName'), - 'uploader_id': str_or_none(streamer_info.get('uid')), - 'uploader_url': 'https://trovo.live/' + username if username else None, - } - - -class TrovoIE(TrovoBaseIE): - _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P<id>[^/?&#]+)' - - def _real_extract(self, url): - username = self._match_id(url) - live_info = self._download_json( - 'https://gql.trovo.live/', username, query={ - 'query': '''{ - getLiveInfo(params: {userName: "%s"}) { - isLive - programInfo { - coverUrl - id - streamInfo { - desc - playUrl - } - title - } - streamerInfo { - nickName - uid - userName - } - } -}''' % username, - })['data']['getLiveInfo'] - if live_info.get('isLive') == 0: - raise ExtractorError('%s is offline' % username, expected=True) - program_info = live_info['programInfo'] - program_id = program_info['id'] - title = self._live_title(program_info['title']) - - formats = [] - for stream_info in (program_info.get('streamInfo') or []): - play_url = stream_info.get('playUrl') - if not play_url: - continue - format_id = stream_info.get('desc') - formats.append({ - 'format_id': format_id, - 'height': int_or_none(format_id[:-1]) if format_id else None, - 'url': play_url, - }) - self._sort_formats(formats) - - info = { - 'id': program_id, - 'title': title, - 'formats': formats, - 'thumbnail': program_info.get('coverUrl'), - 'is_live': True, - } - info.update(self._extract_streamer_info(live_info)) - return info - - -class TrovoVodIE(TrovoBaseIE): - _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P<id>[^/?&#]+)' - _TESTS = [{ - 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', - 'info_dict': { - 'id': 'ltv-100095501_100095501_1609596043', - 'ext': 'mp4', - 'title': 'Spontaner 12 Stunden Stream! 
- Ok Boomer!', - 'uploader': 'Exsl', - 'timestamp': 1609640305, - 'upload_date': '20210103', - 'uploader_id': '100095501', - 'duration': 43977, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'comments': 'mincount:8', - 'categories': ['Grand Theft Auto V'], - }, - }, { - 'url': 'https://trovo.live/clip/lc-5285890810184026005', - 'only_matching': True, - }] - - def _real_extract(self, url): - vid = self._match_id(url) - resp = self._download_json( - 'https://gql.trovo.live/', vid, data=json.dumps([{ - 'query': '''{ - batchGetVodDetailInfo(params: {vids: ["%s"]}) { - VodDetailInfos - } -}''' % vid, - }, { - 'query': '''{ - getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { - commentList { - author { - nickName - uid - } - commentID - content - createdAt - parentID - } - } -}''' % vid, - }]).encode(), headers={ - 'Content-Type': 'application/json', - }) - vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] - vod_info = vod_detail_info['vodInfo'] - title = vod_info['title'] - - language = vod_info.get('languageName') - formats = [] - for play_info in (vod_info.get('playInfos') or []): - play_url = play_info.get('playUrl') - if not play_url: - continue - format_id = play_info.get('desc') - formats.append({ - 'ext': 'mp4', - 'filesize': int_or_none(play_info.get('fileSize')), - 'format_id': format_id, - 'height': int_or_none(format_id[:-1]) if format_id else None, - 'language': language, - 'protocol': 'm3u8_native', - 'tbr': int_or_none(play_info.get('bitrate')), - 'url': play_url, - 'http_headers': {'Origin': 'https://trovo.live'}, - }) - self._sort_formats(formats) - - category = vod_info.get('categoryName') - get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) - - comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] - comments = [] - for comment in comment_list: - content = comment.get('content') - if not content: - continue - author = comment.get('author') or {} - parent = comment.get('parentID') - comments.append({ - 'author': author.get('nickName'), - 'author_id': str_or_none(author.get('uid')), - 'id': str_or_none(comment.get('commentID')), - 'text': content, - 'timestamp': int_or_none(comment.get('createdAt')), - 'parent': 'root' if parent == 0 else str_or_none(parent), - }) - - info = { - 'id': vid, - 'title': title, - 'formats': formats, - 'thumbnail': vod_info.get('coverUrl'), - 'timestamp': int_or_none(vod_info.get('publishTs')), - 'duration': int_or_none(vod_info.get('duration')), - 'view_count': get_count('watch'), - 'like_count': get_count('like'), - 'comment_count': get_count('comment'), - 'comments': comments, - 'categories': [category] if category else None, - } - info.update(self._extract_streamer_info(vod_detail_info)) - return info diff --git a/youtube_dl/extractor/trutv.py b/youtube_dl/extractor/trutv.py deleted file mode 100644 index ce892c8c5..000000000 --- a/youtube_dl/extractor/trutv.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .turner import TurnerBaseIE -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class TruTVIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))' - _TEST = { - 'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html', - 'info_dict': { - 'id': 
'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1', - 'ext': 'mp4', - 'title': 'Sunlight-Activated Flower', - 'description': "A customer is stunned when he sees Michael's sunlight-activated flower.", - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups() - - if video_id: - path = 'episode' - display_id = video_id - else: - path = 'series/clip' - display_id = clip_slug - - data = self._download_json( - 'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id), - display_id) - video_data = data['episode'] if video_id else data['info'] - media_id = video_data['mediaId'] - title = video_data['title'].strip() - - info = self._extract_ngtv_info( - media_id, {}, { - 'url': url, - 'site_name': 'truTV', - 'auth_required': video_data.get('isAuthRequired'), - }) - - thumbnails = [] - for image in video_data.get('images', []): - image_url = image.get('srcUrl') - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - }) - - info.update({ - 'id': media_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnails': thumbnails, - 'timestamp': parse_iso8601(video_data.get('publicationDate')), - 'series': video_data.get('showTitle'), - 'season_number': int_or_none(video_data.get('seasonNum')), - 'episode_number': int_or_none(video_data.get('episodeNum')), - }) - return info diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py deleted file mode 100644 index ebfb05c63..000000000 --- a/youtube_dl/extractor/tubitv.py +++ /dev/null @@ -1,110 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - sanitized_Request, - urlencode_postdata, -) - - -class TubiTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/(?P<id>[0-9]+)' - _LOGIN_URL = 'http://tubitv.com/login' - _NETRC_MACHINE = 'tubitv' - _GEO_COUNTRIES = ['US'] - _TESTS = [{ - 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', - 'md5': '43ac06be9326f41912dc64ccf7a80320', - 'info_dict': { - 'id': '283829', - 'ext': 'mp4', - 'title': 'The Comedian at The Friday', - 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', - 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', - }, - }, { - 'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories', - 'only_matching': True, - }, { - 'url': 'http://tubitv.com/movies/383676/tracker', - 'only_matching': True, - }, { - 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', - 'info_dict': { - 'id': '560057', - 'ext': 'mp4', - 'title': 'Penitentiary', - 'description': 'md5:8d2fc793a93cc1575ff426fdcb8dd3f9', - 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', - 'release_year': 1979, - }, - 'params': { - 'skip_download': True, - }, - }] - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - self.report_login() - form_data = { - 'username': username, - 'password': password, - } - payload = urlencode_postdata(form_data) - request = sanitized_Request(self._LOGIN_URL, payload) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - login_page = self._download_webpage( - request, None, 
False, 'Wrong login info') - if not re.search(r'id="tubi-logout"', login_page): - raise ExtractorError( - 'Login failed (invalid username/password)', expected=True) - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - video_id = self._match_id(url) - video_data = self._download_json( - 'http://tubitv.com/oz/videos/%s/content' % video_id, video_id) - title = video_data['title'] - - formats = self._extract_m3u8_formats( - self._proto_relative_url(video_data['url']), - video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - - thumbnails = [] - for thumbnail_url in video_data.get('thumbnails', []): - if not thumbnail_url: - continue - thumbnails.append({ - 'url': self._proto_relative_url(thumbnail_url), - }) - - subtitles = {} - for sub in video_data.get('subtitles', []): - sub_url = sub.get('url') - if not sub_url: - continue - subtitles.setdefault(sub.get('lang', 'English'), []).append({ - 'url': self._proto_relative_url(sub_url), - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'description': video_data.get('description'), - 'duration': int_or_none(video_data.get('duration')), - 'uploader_id': video_data.get('publisher_id'), - 'release_year': int_or_none(video_data.get('year')), - } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py deleted file mode 100644 index ae584ad69..000000000 --- a/youtube_dl/extractor/tumblr.py +++ /dev/null @@ -1,213 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - urlencode_postdata -) - - -class TumblrIE(InfoExtractor): - _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' - _NETRC_MACHINE = 'tumblr' - _LOGIN_URL = 'https://www.tumblr.com/login' - _TESTS = [{ - 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', - 'md5': '479bb068e5b16462f5176a6828829767', - 'info_dict': { - 'id': '54196191430', - 'ext': 'mp4', - 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', - 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', - 'thumbnail': r're:http://.*\.jpg', - } - }, { - 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', - 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359', - 'info_dict': { - 'id': '90208453769', - 'ext': 'mp4', - 'title': '5SOS STRUM ;]', - 'description': 'md5:dba62ac8639482759c8eb10ce474586a', - 'thumbnail': r're:http://.*\.jpg', - } - }, { - 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', - 'md5': '7ae503065ad150122dc3089f8cf1546c', - 'info_dict': { - 'id': '130323439814', - 'ext': 'mp4', - 'title': 'HD Video Testing \u2014 Test description for my HD video', - 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', - 'thumbnail': r're:http://.*\.jpg', - }, - 'params': { - 'format': 'hd', - }, - }, { - 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', - 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', - 'info_dict': { - 'id': 'Wmur', - 'ext': 'mp4', - 'title': 'naked smoking & stretching', - 'upload_date': '20150506', - 'timestamp': 1430931613, - 'age_limit': 18, - 'uploader_id': '1638622', - 'uploader': 'naked-yogi', - }, - 'add_ie': ['Vidme'], - }, { - 'url': 'http://camdamage.tumblr.com/post/98846056295/', - 'md5': 
'a9e0c8371ea1ca306d6554e3fecf50b6', - 'info_dict': { - 'id': '105463834', - 'ext': 'mp4', - 'title': 'Cam Damage-HD 720p', - 'uploader': 'John Moyer', - 'uploader_id': 'user32021558', - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://sutiblr.tumblr.com/post/139638707273', - 'md5': '2dd184b3669e049ba40563a7d423f95c', - 'info_dict': { - 'id': 'ir7qBEIKqvq', - 'ext': 'mp4', - 'title': 'Vine by sutiblr', - 'alt_title': 'Vine by sutiblr', - 'uploader': 'sutiblr', - 'uploader_id': '1198993975374495744', - 'upload_date': '20160220', - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - 'add_ie': ['Vine'], - }, { - 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', - 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', - 'info_dict': { - 'id': '-7LnUPGlSo', - 'ext': 'mp4', - 'title': 'Video by victoriassecret', - 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', - 'uploader_id': 'victoriassecret', - 'thumbnail': r're:^https?://.*\.jpg' - }, - 'add_ie': ['Instagram'], - }] - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - login_form.update({ - 'user[email]': username, - 'user[password]': password - }) - - response, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL, - }) - - # Successful login - if '/dashboard' in urlh.geturl(): - return - - login_errors = self._parse_json( - self._search_regex( - r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, - 'login errors', default='[]'), - None, fatal=False) - if login_errors: - raise ExtractorError( - 'Unable to login: %s' % login_errors[0], expected=True) - - self.report_warning('Login has probably failed') - - def _real_extract(self, url): - m_url = re.match(self._VALID_URL, url) - video_id = m_url.group('id') - blog = m_url.group('blog_name') - - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) - webpage, urlh = self._download_webpage_handle(url, video_id) - - redirect_url = urlh.geturl() - if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): - raise ExtractorError( - 'This Tumblr may contain sensitive media. 
' - 'Disable safe mode in your account settings ' - 'at https://www.tumblr.com/settings/account#safe_mode', - expected=True) - - iframe_url = self._search_regex( - r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', - webpage, 'iframe url', default=None) - if iframe_url is None: - return self.url_result(redirect_url, 'Generic') - - iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') - - duration = None - sources = [] - - sd_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, - 'sd video url', default=None, group='url') - if sd_url: - sources.append((sd_url, 'sd')) - - options = self._parse_json( - self._search_regex( - r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, - 'hd video url', default='', group='options'), - video_id, fatal=False) - if options: - duration = int_or_none(options.get('duration')) - hd_url = options.get('hdUrl') - if hd_url: - sources.append((hd_url, 'hd')) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': format_id, - 'height': int_or_none(self._search_regex( - r'/(\d{3,4})$', video_url, 'height', default=None)), - 'quality': quality, - } for quality, (video_url, format_id) in enumerate(sources)] - - self._sort_formats(formats) - - # The only place where you can get a title, it's not complete, - # but searching in other places doesn't work for all videos - video_title = self._html_search_regex( - r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', - webpage, 'title') - - return { - 'id': video_id, - 'title': video_title, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py deleted file mode 100644 index be3eaa5c2..000000000 --- a/youtube_dl/extractor/turbo.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - qualities, - xpath_text, -) - - -class TurboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P<id>[0-9]+)-' - _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}' - _TEST = { - 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', - 'md5': '33f4b91099b36b5d5a91f84b5bcba600', - 'info_dict': { - 'id': '454443', - 'ext': 'mp4', - 'duration': 3715, - 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... 
', - 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - playlist = self._download_xml(self._API_URL.format(video_id), video_id) - item = playlist.find('./channel/item') - if item is None: - raise ExtractorError('Playlist item was not found', expected=True) - - title = xpath_text(item, './title', 'title') - duration = int_or_none(xpath_text(item, './durate', 'duration')) - thumbnail = xpath_text(item, './visuel_clip', 'thumbnail') - description = self._html_search_meta('description', webpage) - - formats = [] - get_quality = qualities(['3g', 'sd', 'hq']) - for child in item: - m = re.search(r'url_video_(?P<quality>.+)', child.tag) - if m: - quality = compat_str(m.group('quality')) - formats.append({ - 'format_id': quality, - 'url': child.text, - 'quality': get_quality(quality), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'formats': formats, - } diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py deleted file mode 100644 index 81229a54b..000000000 --- a/youtube_dl/extractor/turner.py +++ /dev/null @@ -1,260 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .adobepass import AdobePassIE -from ..compat import compat_str -from ..utils import ( - fix_xml_ampersands, - xpath_text, - int_or_none, - determine_ext, - float_or_none, - parse_duration, - xpath_attr, - update_url_query, - ExtractorError, - strip_or_none, - url_or_none, -) - - -class TurnerBaseIE(AdobePassIE): - _AKAMAI_SPE_TOKEN_CACHE = {} - - def _extract_timestamp(self, video_data): - return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): - secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' - token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) - if not token: - query = { - 'path': secure_path, - } - if custom_tokenizer_query: - query.update(custom_tokenizer_query) - else: - query['videoId'] = content_id - if ap_data.get('auth_required'): - query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) - auth = self._download_xml( - tokenizer_src, content_id, query=query) - error_msg = xpath_text(auth, 'error/msg') - if error_msg: - raise ExtractorError(error_msg, expected=True) - token = xpath_text(auth, 'token') - if not token: - return video_url - self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token - return video_url + '?hdnea=' + token - - def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): - video_data = self._download_xml( - data_src, video_id, - transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=fatal) - if not video_data: - return {} - video_id = video_data.attrib['id'] - title = xpath_text(video_data, 'headline', fatal=True) - content_id = xpath_text(video_data, 'contentId') or video_id - # rtmp_src = xpath_text(video_data, 'akamai/src') - # if rtmp_src: - # split_rtmp_src = rtmp_src.split(',') - # if len(split_rtmp_src) == 2: - # rtmp_src = split_rtmp_src[1] - # aifp = xpath_text(video_data, 'akamai/aifp', 
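# The Akamai SPE tokenizer flow in _add_akamai_spe_token above reduces to:
# derive a wildcard "secure path" from the media URL, ask the tokenizer for a
# token for that path once, cache it, and append it as ?hdnea=. A standalone
# approximation, assuming a caller-supplied get_token callable (the real code
# parses an XML reply from the tokenizer service):
#
_token_cache = {}

def tokenize(video_url, get_token):
    # e.g. 'https://host/path/to/file.m3u8' -> '/path/to/*'
    secure_path = video_url.split('/', 3)[-1]
    secure_path = '/' + secure_path.rsplit('/', 1)[0] + '/*'
    token = _token_cache.get(secure_path)
    if token is None:
        token = _token_cache[secure_path] = get_token(secure_path)
    return video_url + '?hdnea=' + token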
default='') - - urls = [] - formats = [] - thumbnails = [] - subtitles = {} - rex = re.compile( - r'(?P<width>[0-9]+)x(?P<height>[0-9]+)(?:_(?P<bitrate>[0-9]+))?') - # Possible formats locations: files/file, files/groupFiles/files - # and maybe others - for video_file in video_data.findall('.//file'): - video_url = url_or_none(video_file.text.strip()) - if not video_url: - continue - ext = determine_ext(video_url) - if video_url.startswith('/mp4:protected/'): - continue - # TODO Correct extraction for these files - # protected_path_data = path_data.get('protected') - # if not protected_path_data or not rtmp_src: - # continue - # protected_path = self._search_regex( - # r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path') - # auth = self._download_webpage( - # protected_path_data['tokenizer_src'], query={ - # 'path': protected_path, - # 'videoId': content_id, - # 'aifp': aifp, - # }) - # token = xpath_text(auth, 'token') - # if not token: - # continue - # video_url = rtmp_src + video_url + '?' + token - elif video_url.startswith('/secure/'): - secure_path_data = path_data.get('secure') - if not secure_path_data: - continue - video_url = self._add_akamai_spe_token( - secure_path_data['tokenizer_src'], - secure_path_data['media_src'] + video_url, - content_id, ap_data) - elif not re.match('https?://', video_url): - base_path_data = path_data.get(ext, path_data.get('default', {})) - media_src = base_path_data.get('media_src') - if not media_src: - continue - video_url = media_src + video_url - if video_url in urls: - continue - urls.append(video_url) - format_id = video_file.get('bitrate') - if ext in ('scc', 'srt', 'vtt'): - subtitles.setdefault('en', []).append({ - 'ext': ext, - 'url': video_url, - }) - elif ext == 'png': - thumbnails.append({ - 'id': format_id, - 'url': video_url, - }) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url): - formats.extend(self._extract_akamai_formats( - video_url, video_id, { - 'hds': path_data.get('f4m', {}).get('host'), - # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com - # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com - # ssl.cdn.turner.com - 'http': 'pmd.cdn.turner.com', - })) - elif ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', - m3u8_id=format_id or 'hls', fatal=False) - if '/secure/' in video_url and '?hdnea=' in video_url: - for f in m3u8_formats: - f['_seekable'] = False - formats.extend(m3u8_formats) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(video_url, {'hdcore': '3.7.0'}), - video_id, f4m_id=format_id or 'hds', fatal=False)) - else: - f = { - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - } - mobj = rex.search(video_url) - if mobj: - f.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - 'tbr': int_or_none(mobj.group('bitrate')), - }) - elif isinstance(format_id, compat_str): - if format_id.isdigit(): - f['tbr'] = int(format_id) - else: - mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) - if mobj: - if mobj.group(1) == 'audio': - f.update({ - 'vcodec': 'none', - 'ext': 'm4a', - }) - else: - f['tbr'] = int(mobj.group(1)) - formats.append(f) - self._sort_formats(formats) - - for source in video_data.findall('closedCaptions/source'): - for track in source.findall('track'): - track_url = url_or_none(track.get('url')) - if not track_url or track_url.endswith('/big'): - continue - lang = 
track.get('lang') or track.get('label') or 'en' - subtitles.setdefault(lang, []).append({ - 'url': track_url, - 'ext': { - 'scc': 'scc', - 'webvtt': 'vtt', - 'smptett': 'tt', - }.get(source.get('format')) - }) - - thumbnails.extend({ - 'id': image.get('cut') or image.get('name'), - 'url': image.text, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in video_data.findall('images/image')) - - is_live = xpath_text(video_data, 'isLive') == 'true' - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'thumbnail': xpath_text(video_data, 'poster'), - 'description': strip_or_none(xpath_text(video_data, 'description')), - 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), - 'timestamp': self._extract_timestamp(video_data), - 'upload_date': xpath_attr(video_data, 'metas', 'version'), - 'series': xpath_text(video_data, 'showTitle'), - 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), - 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), - 'is_live': is_live, - } - - def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): - streams_data = self._download_json( - 'http://medium.ngtv.io/media/%s/tv' % media_id, - media_id)['media']['tv'] - duration = None - chapters = [] - formats = [] - for supported_type in ('unprotected', 'bulkaes'): - stream_data = streams_data.get(supported_type, {}) - m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') - if not m3u8_url: - continue - if stream_data.get('playlistProtection') == 'spe': - m3u8_url = self._add_akamai_spe_token( - 'http://token.ngtv.io/token/token_spe', - m3u8_url, media_id, ap_data or {}, tokenizer_query) - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - duration = float_or_none(stream_data.get('totalRuntime')) - - if not chapters: - for chapter in stream_data.get('contentSegments', []): - start_time = float_or_none(chapter.get('start')) - chapter_duration = float_or_none(chapter.get('duration')) - if start_time is None or chapter_duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + chapter_duration, - }) - self._sort_formats(formats) - - return { - 'formats': formats, - 'chapters': chapters, - 'duration': duration, - } diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py deleted file mode 100644 index 334b7d540..000000000 --- a/youtube_dl/extractor/tv2.py +++ /dev/null @@ -1,248 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - float_or_none, - js_to_json, - parse_iso8601, - remove_end, - strip_or_none, - try_get, -) - - -class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.tv2.no/v/916509/', - 'info_dict': { - 'id': '916509', - 'ext': 'flv', - 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', - 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', - 'timestamp': 1431715610, - 'upload_date': '20150515', - 'duration': 156.967, - 'view_count': int, - 'categories': list, - }, - }] - _API_DOMAIN = 'sumo.tv2.no' - _PROTOCOLS = ('HDS', 'HLS', 'DASH') - _GEO_COUNTRIES = 
['NO'] - - def _real_extract(self, url): - video_id = self._match_id(url) - api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) - - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset.get('subtitle') or asset['title'] - is_live = asset.get('live') is True - - formats = [] - format_urls = [] - for protocol in self._PROTOCOLS: - try: - data = self._download_json( - api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol, - video_id, 'Downloading play JSON')['playback'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), video_id)['error'] - error_code = error.get('code') - if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - elif error_code == 'SESSION_NOT_AUTHENTICATED': - self.raise_login_required() - raise ExtractorError(error['description']) - raise - items = try_get(data, lambda x: x['items']['item']) - if not items: - continue - if not isinstance(items, list): - items = [items] - for item in items: - if not isinstance(item, dict): - continue - video_url = item.get('url') - if not video_url or video_url in format_urls: - continue - format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat')) - if not self._is_valid_url(video_url, video_id, format_id): - continue - format_urls.append(video_url) - ext = determine_ext(video_url) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - elif ext == 'm3u8': - if not data.get('drmProtected'): - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, format_id, fatal=False)) - elif ext == 'ism' or video_url.endswith('.ism/Manifest'): - pass - else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'tbr': int_or_none(item.get('bitrate')), - 'filesize': int_or_none(item.get('fileSize')), - }) - if not formats and data.get('drmProtected'): - raise ExtractorError('This video is DRM protected.', expected=True) - self._sort_formats(formats) - - thumbnails = [{ - 'id': thumbnail.get('@type'), - 'url': thumbnail.get('url'), - } for _, thumbnail in (asset.get('imageVersions') or {}).items()] - - return { - 'id': video_id, - 'url': video_url, - 'title': self._live_title(title) if is_live else title, - 'description': strip_or_none(asset.get('description')), - 'thumbnails': thumbnails, - 'timestamp': parse_iso8601(asset.get('createTime')), - 'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')), - 'view_count': int_or_none(asset.get('views')), - 'categories': asset.get('keywords', '').split(','), - 'formats': formats, - 'is_live': is_live, - } - - -class TV2ArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', - 'info_dict': { - 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', - 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', - }, - 'playlist_count': 2, - }, { - 'url': 'http://www.tv2.no/a/6930542', - 'only_matching': 
True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - # Old embed pattern (looks unused nowadays) - assets = re.findall(r'data-assetid=["\'](\d+)', webpage) - - if not assets: - # New embed pattern - for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage): - video = self._parse_json( - v, playlist_id, transform_source=js_to_json, fatal=False) - if not video: - continue - asset = video.get('assetId') - if asset: - assets.append(asset) - - entries = [ - self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2') - for asset_id in assets] - - title = remove_end(self._og_search_title(webpage), ' - TV2.no') - description = remove_end(self._og_search_description(webpage), ' - TV2.no') - - return self.playlist_result(entries, playlist_id, title, description) - - -class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', - 'info_dict': { - 'id': '1181321', - 'ext': 'mp4', - 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', - 'description': 'Päätöksen teki Pelicansin hallitus.', - 'timestamp': 1575116484, - 'upload_date': '20191130', - 'duration': 37.12, - 'view_count': int, - 'categories': list, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', - 'only_matching': True, - }, { - 'url': 'https://www.mtvuutiset.fi/video/prog1311159', - 'only_matching': True, - }, { - 'url': 'https://www.katsomo.fi/#!/jakso/1311159', - 'only_matching': True, - }] - _API_DOMAIN = 'api.katsomo.fi' - _PROTOCOLS = ('HLS', 'MPD') - _GEO_COUNTRIES = ['FI'] - - -class MTVUutisetArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', - 'info_dict': { - 'id': '1311159', - 'ext': 'mp4', - 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', - 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', - 'timestamp': 1600608966, - 'upload_date': '20200920', - 'duration': 153.7886666, - 'view_count': int, - 'categories': list, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # multiple Youtube embeds - 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', - 'only_matching': True, - }] - - def _real_extract(self, url): - article_id = self._match_id(url) - article = self._download_json( - 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, - article_id) - - def entries(): - for video in (article.get('videos') or []): - video_type = video.get('videotype') - video_url = video.get('url') - if not (video_url and video_type in ('katsomo', 'youtube')): - continue - yield self.url_result( - video_url, video_type.capitalize(), video.get('video_id')) - - return self.playlist_result( - entries(), article_id, article.get('title'), article.get('description')) diff --git a/youtube_dl/extractor/tv2hu.py b/youtube_dl/extractor/tv2hu.py deleted file mode 100644 index 
86017b757..000000000 --- a/youtube_dl/extractor/tv2hu.py +++ /dev/null @@ -1,62 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import int_or_none - - -class TV2HuIE(InfoExtractor): - IE_NAME = 'tv2.hu' - _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html' - _TESTS = [{ - 'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html', - 'md5': '585e58e2e090f34603804bb2c48e98d8', - 'info_dict': { - 'id': '217679', - 'ext': 'mp4', - 'title': 'Ezek megőrültek! - 1. adás 1. rész', - 'upload_date': '20160826', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - 'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html', - 'only_matching': True - }, { - 'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html', - 'only_matching': True - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - json_url = self._search_regex( - r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url') - json_data = self._download_json(json_url, video_id) - - formats = [] - for b in ('bitrates', 'backupBitrates'): - bitrates = json_data.get(b, {}) - m3u8_url = bitrates.get('hls') - if m3u8_url: - formats.extend(self._extract_wowza_formats( - m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp'])) - - for mp4_url in bitrates.get('mp4', []): - height = int_or_none(self._search_regex( - r'\.(\d+)p\.mp4', mp4_url, 'height', default=None)) - formats.append({ - 'format_id': 'http' + ('-%d' % height if height else ''), - 'url': mp4_url, - 'height': height, - 'width': int_or_none(height / 9.0 * 16.0 if height else None), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._og_search_title(webpage).strip(), - 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': self._search_regex( - r'/vod/(\d{8})/', json_url, 'upload_date', default=None), - 'formats': formats, - } diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py deleted file mode 100644 index b73bab9a8..000000000 --- a/youtube_dl/extractor/tv4.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class TV4IE(InfoExtractor): - IE_DESC = 'tv4.se and tv4play.se' - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?: - tv4\.se/(?:[^/]+)/klipp/(?:.*)-| - tv4play\.se/ - (?: - (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| - iframe/video/| - film/| - sport/| - ) - )(?P<id>[0-9]+)''' - _GEO_COUNTRIES = ['SE'] - _TESTS = [ - { - 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', - 'md5': 'cb837212f342d77cec06e6dad190e96d', - 'info_dict': { - 'id': '2491650', - 'ext': 'mp4', - 'title': 'Kalla Fakta 5 (english subtitles)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': int, - 'upload_date': '20131125', - }, - }, - { - 'url': 'http://www.tv4play.se/iframe/video/3054113', - 'md5': 'cb837212f342d77cec06e6dad190e96d', - 'info_dict': { - 'id': '3054113', - 'ext': 'mp4', - 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder', - 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. 
Två experter på ficktjuvarna avslöjar knepen du ska se upp för.', - 'timestamp': int, - 'upload_date': '20150130', - }, - }, - { - 'url': 'http://www.tv4play.se/sport/3060959', - 'only_matching': True, - }, - { - 'url': 'http://www.tv4play.se/film/2378136', - 'only_matching': True, - }, - { - 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', - 'only_matching': True, - }, - { - 'url': 'http://www.tv4play.se/program/farang/3922081', - 'only_matching': True, - }, - { - 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = self._download_json( - 'https://playback-api.b17g.net/asset/%s' % video_id, - video_id, 'Downloading video info JSON', query={ - 'service': 'tv4', - 'device': 'browser', - 'protocol': 'hls,dash', - 'drm': 'widevine', - })['metadata'] - - title = info['title'] - - manifest_url = self._download_json( - 'https://playback-api.b17g.net/media/' + video_id, - video_id, query={ - 'service': 'tv4', - 'device': 'browser', - 'protocol': 'hls', - })['playbackItem']['manifestUrl'] - formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(self._extract_mpd_formats( - manifest_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_f4m_formats( - manifest_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_ism_formats( - re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), - video_id, ism_id='mss', fatal=False)) - - if not formats and info.get('is_geo_restricted'): - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - # 'subtitles': subtitles, - 'description': info.get('description'), - 'timestamp': parse_iso8601(info.get('broadcast_date_time')), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': info.get('image'), - 'is_live': info.get('isLive') is True, - 'series': info.get('seriesTitle'), - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode': info.get('episodeTitle'), - 'episode_number': int_or_none(info.get('episodeNumber')), - } diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py deleted file mode 100644 index b7fe082b9..000000000 --- a/youtube_dl/extractor/tv5mondeplus.py +++ /dev/null @@ -1,117 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, - parse_duration, -) - - -class TV5MondePlusIE(InfoExtractor): - IE_DESC = 'TV5MONDE+' - _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' - _TESTS = [{ - # movie - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit', - 'md5': '8cbde5ea7b296cf635073e27895e227f', - 'info_dict': { - 'id': '822a4756-0712-7329-1859-a13ac7fd1407', - 'display_id': 'rendez-vous-a-atlit', - 'ext': 'mp4', - 'title': 'Rendez-vous à Atlit', - 'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb', - 'upload_date': '20200130', - }, - }, { - # series episode - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree', - 'info_dict': { - 'id': '0df7007c-4900-3936-c601-87a13a93a068', - 'display_id': 'c-est-la-vie-ennemie-juree', 
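# One pattern worth noting from TV4IE above: every manifest flavour is
# derived from the single HLS manifest URL by suffix substitution. In
# isolation (the re.sub pattern is copied from the extractor; the example
# URL is made up):
#
import re

def manifest_variants(m3u8_url):
    return {
        'hls': m3u8_url,
        'dash': m3u8_url.replace('.m3u8', '.mpd'),
        'hds': m3u8_url.replace('.m3u8', '.f4m'),
        'mss': re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', m3u8_url),
    }

# manifest_variants('https://cdn.example.com/v.ism/index.m3u8')['mss']
# -> 'https://cdn.example.com/v.ism/Manifest'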
- 'ext': 'mp4', - 'title': "C'est la vie - Ennemie jurée", - 'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e', - 'upload_date': '20200130', - 'series': "C'est la vie", - 'episode': 'Ennemie jurée', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', - 'only_matching': True, - }, { - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: - self.raise_geo_restricted(countries=['FR']) - - title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') - vpl_data = extract_attributes(self._search_regex( - r'(<[^>]+class="video_player_loader"[^>]+>)', - webpage, 'video player loader')) - - video_files = self._parse_json( - vpl_data['data-broadcast'], display_id).get('files', []) - formats = [] - for video_file in video_files: - v_url = video_file.get('url') - if not v_url: - continue - video_format = video_file.get('format') or determine_ext(v_url) - if video_format == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': v_url, - 'format_id': video_format, - }) - self._sort_formats(formats) - - description = self._html_search_regex( - r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage, - 'description', fatal=False) - - series = self._html_search_regex( - r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage, - 'series', default=None) - - if series and series != title: - title = '%s - %s' % (series, title) - - upload_date = self._search_regex( - r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', - webpage, 'upload date', default=None) - if upload_date: - upload_date = upload_date.replace('_', '') - - video_id = self._search_regex( - (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', - default=display_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': vpl_data.get('data-image'), - 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), - 'upload_date': upload_date, - 'formats': formats, - 'series': series, - 'episode': episode, - } diff --git a/youtube_dl/extractor/tv5unis.py b/youtube_dl/extractor/tv5unis.py deleted file mode 100644 index eabdc2271..000000000 --- a/youtube_dl/extractor/tv5unis.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_age_limit, - smuggle_url, - try_get, -) - - -class TV5UnisBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['CA'] - - def _real_extract(self, url): - groups = re.match(self._VALID_URL, url).groups() - product = self._download_json( - 'https://api.tv5unis.ca/graphql', groups[0], query={ - 'query': '''{ - %s(%s) { - collection { - title - } - episodeNumber - rating { - name - } - seasonNumber - tags - title - videoElement { - ... 
on Video { - mediaId - } - } - } -}''' % (self._GQL_QUERY_NAME, self._gql_args(groups)), - })['data'][self._GQL_QUERY_NAME] - media_id = product['videoElement']['mediaId'] - - return { - '_type': 'url_transparent', - 'id': media_id, - 'title': product.get('title'), - 'url': smuggle_url('limelight:media:' + media_id, {'geo_countries': self._GEO_COUNTRIES}), - 'age_limit': parse_age_limit(try_get(product, lambda x: x['rating']['name'])), - 'tags': product.get('tags'), - 'series': try_get(product, lambda x: x['collection']['title']), - 'season_number': int_or_none(product.get('seasonNumber')), - 'episode_number': int_or_none(product.get('episodeNumber')), - 'ie_key': 'LimelightMedia', - } - - -class TV5UnisVideoIE(TV5UnisBaseIE): - IE_NAME = 'tv5unis:video' - _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://www.tv5unis.ca/videos/bande-annonces/71843', - 'md5': '3d794164928bda97fb87a17e89923d9b', - 'info_dict': { - 'id': 'a883684aecb2486cad9bdc7bbe17f861', - 'ext': 'mp4', - 'title': 'Watatatow', - 'duration': 10.01, - } - } - _GQL_QUERY_NAME = 'productById' - - @staticmethod - def _gql_args(groups): - return 'id: %s' % groups - - -class TV5UnisIE(TV5UnisBaseIE): - IE_NAME = 'tv5unis' - _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)' - _TESTS = [{ - 'url': 'https://www.tv5unis.ca/videos/watatatow/saisons/6/episodes/1', - 'md5': 'a479907d2e531a73e1f8dc48d6388d02', - 'info_dict': { - 'id': 'e5ee23a586c44612a56aad61accf16ef', - 'ext': 'mp4', - 'title': 'Je ne peux pas lui résister', - 'description': "Atys, le nouveau concierge de l'école, a réussi à ébranler la confiance de Mado en affirmant qu\'une médaille, ce n'est que du métal. Comme Mado essaie de lui prouver que ses valeurs sont solides, il veut la mettre à l'épreuve...", - 'subtitles': { - 'fr': 'count:1', - }, - 'duration': 1370, - 'age_limit': 8, - 'tags': 'count:3', - 'series': 'Watatatow', - 'season_number': 6, - 'episode_number': 1, - }, - }, { - 'url': 'https://www.tv5unis.ca/videos/le-voyage-de-fanny', - 'md5': '9ca80ebb575c681d10cae1adff3d4774', - 'info_dict': { - 'id': '726188eefe094d8faefb13381d42bc06', - 'ext': 'mp4', - 'title': 'Le voyage de Fanny', - 'description': "Fanny, 12 ans, cachée dans un foyer loin de ses parents, s'occupe de ses deux soeurs. 
Devant fuir, Fanny prend la tête d'un groupe de huit enfants et s'engage dans un dangereux périple à travers la France occupée pour rejoindre la frontière suisse.", - 'subtitles': { - 'fr': 'count:1', - }, - 'duration': 5587.034, - 'tags': 'count:4', - }, - }] - _GQL_QUERY_NAME = 'productByRootProductSlug' - - @staticmethod - def _gql_args(groups): - args = 'rootProductSlug: "%s"' % groups[0] - if groups[1]: - args += ', seasonNumber: %s, episodeNumber: %s' % groups[1:] - return args diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py deleted file mode 100644 index a4a30b1e6..000000000 --- a/youtube_dl/extractor/tver.py +++ /dev/null @@ -1,61 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - remove_start, - smuggle_url, - try_get, -) - - -class TVerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))' - # videos are only available for 7 days - _TESTS = [{ - 'url': 'https://tver.jp/corner/f0062178', - 'only_matching': True, - }, { - 'url': 'https://tver.jp/feature/f0062413', - 'only_matching': True, - }, { - 'url': 'https://tver.jp/episode/79622438', - 'only_matching': True, - }, { - # subtitle = ' ' - 'url': 'https://tver.jp/corner/f0068870', - 'only_matching': True, - }] - _TOKEN = None - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - - def _real_initialize(self): - self._TOKEN = self._download_json( - 'https://tver.jp/api/access_token.php', None)['token'] - - def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - main = self._download_json( - 'https://api.tver.jp/v4/' + path, video_id, - query={'token': self._TOKEN})['main'] - p_id = main['publisher_id'] - service = remove_start(main['service'], 'ts_') - - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - - return { - '_type': 'url_transparent', - 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), - 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - } diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py deleted file mode 100644 index 180259aba..000000000 --- a/youtube_dl/extractor/tvigle.py +++ /dev/null @@ -1,138 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_age_limit, - try_get, - url_or_none, -) - - -class TvigleIE(InfoExtractor): - IE_NAME = 'tvigle' - IE_DESC = 'Интернет-телевидение Tvigle.ru' - _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))' - - _GEO_BYPASS = False - _GEO_COUNTRIES = ['RU'] - - _TESTS = [ - { - 'url': 'http://www.tvigle.ru/video/sokrat/', - 'info_dict': { - 'id': '1848932', - 'display_id': 'sokrat', - 'ext': 'mp4', - 'title': 'Сократ', - 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17', - 'duration': 6586, - 'age_limit': 12, - }, - 'skip': 'georestricted', - }, - { - 'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/', - 
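# --- Editor's sketch (not part of the deleted file): TvigleIE._VALID_URL
# above has two mutually exclusive branches, so re.match() fills exactly one
# of the named groups; _real_extract() then scrapes the numeric id from the
# page when only display_id is present. A standalone check with the two
# URL shapes from the tests:
import re

_VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))'
for sample in ('http://www.tvigle.ru/video/sokrat/',
               'https://cloud.tvigle.ru/video/5267604/'):
    m = re.match(_VALID_URL, sample)
    # site URLs populate only display_id; cloud URLs only the numeric id
    print(m.group('display_id'), m.group('id'))
# prints: "sokrat None", then "None 5267604"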
'info_dict': { - 'id': '5142516', - 'ext': 'flv', - 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком', - 'description': 'md5:027f7dc872948f14c96d19b4178428a4', - 'duration': 186.080, - 'age_limit': 0, - }, - 'skip': 'georestricted', - }, { - 'url': 'https://cloud.tvigle.ru/video/5267604/', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - if not video_id: - webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - (r'<div[^>]+class=["\']player["\'][^>]+id=["\'](\d+)', - r'cloudId\s*=\s*["\'](\d+)', - r'class="video-preview current_playing" id="(\d+)"'), - webpage, 'video id') - - video_data = self._download_json( - 'http://cloud.tvigle.ru/api/play/video/%s/' % video_id, display_id) - - item = video_data['playlist']['items'][0] - - videos = item.get('videos') - - error_message = item.get('errorMessage') - if not videos and error_message: - if item.get('isGeoBlocked') is True: - self.raise_geo_restricted( - msg=error_message, countries=self._GEO_COUNTRIES) - else: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), - expected=True) - - title = item['title'] - description = item.get('description') - thumbnail = item.get('thumbnail') - duration = float_or_none(item.get('durationMilliseconds'), 1000) - age_limit = parse_age_limit(item.get('ageRestrictions')) - - formats = [] - for vcodec, url_or_fmts in item['videos'].items(): - if vcodec == 'hls': - m3u8_url = url_or_none(url_or_fmts) - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif vcodec == 'dash': - mpd_url = url_or_none(url_or_fmts) - if not mpd_url: - continue - formats.extend(self._extract_mpd_formats( - mpd_url, video_id, mpd_id='dash', fatal=False)) - else: - if not isinstance(url_or_fmts, dict): - continue - for format_id, video_url in url_or_fmts.items(): - if format_id == 'm3u8': - continue - video_url = url_or_none(video_url) - if not video_url: - continue - height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) - filesize = int_or_none(try_get( - item, lambda x: x['video_files_size'][vcodec][format_id])) - formats.append({ - 'url': video_url, - 'format_id': '%s-%s' % (vcodec, format_id), - 'vcodec': vcodec, - 'height': int_or_none(height), - 'filesize': filesize, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py deleted file mode 100644 index 791144128..000000000 --- a/youtube_dl/extractor/tvland.py +++ /dev/null @@ -1,37 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .spike import ParamountNetworkIE - - -class TVLandIE(ParamountNetworkIE): - IE_NAME = 'tvland.com' - _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)' - _FEED_URL = 'http://www.tvland.com/feeds/mrss/' - _TESTS = [{ - # Geo-restricted. Without a proxy metadata are still there. 
With a - # proxy it redirects to http://m.tvland.com/app/ - 'url': 'https://www.tvland.com/episodes/s04pzf/everybody-loves-raymond-the-dog-season-1-ep-19', - 'info_dict': { - 'description': 'md5:84928e7a8ad6649371fbf5da5e1ad75a', - 'title': 'The Dog', - }, - 'playlist_mincount': 5, - }, { - 'url': 'https://www.tvland.com/video-clips/4n87f2/younger-a-first-look-at-younger-season-6', - 'md5': 'e2c6389401cf485df26c79c247b08713', - 'info_dict': { - 'id': '891f7d3c-5b5b-4753-b879-b7ba1a601757', - 'ext': 'mp4', - 'title': 'Younger|April 30, 2019|6|NO-EPISODE#|A First Look at Younger Season 6', - 'description': 'md5:595ea74578d3a888ae878dfd1c7d4ab2', - 'upload_date': '20190430', - 'timestamp': 1556658000, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.tvland.com/full-episodes/iu0hz6/younger-a-kiss-is-just-a-kiss-season-3-ep-301', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py deleted file mode 100644 index 9c8a8a0dc..000000000 --- a/youtube_dl/extractor/tvnow.py +++ /dev/null @@ -1,486 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, - parse_duration, - str_or_none, - update_url_query, - urljoin, -) - - -class TVNowBaseIE(InfoExtractor): - _VIDEO_FIELDS = ( - 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', - 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', - 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') - - def _call_api(self, path, video_id, query): - return self._download_json( - 'https://api.tvnow.de/v3/' + path, video_id, query=query) - - def _extract_video(self, info, display_id): - video_id = compat_str(info['id']) - title = info['title'] - - paths = [] - for manifest_url in (info.get('manifest') or {}).values(): - if not manifest_url: - continue - manifest_url = update_url_query(manifest_url, {'filter': ''}) - path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') - if path in paths: - continue - paths.append(path) - - def url_repl(proto, suffix): - return re.sub( - r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( - r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', - '.ism/' + suffix, manifest_url)) - - def make_urls(proto, suffix): - urls = [url_repl(proto, suffix)] - hd_url = urls[0].replace('/manifest/', '/ngvod/') - if hd_url != urls[0]: - urls.append(hd_url) - return urls - - for man_url in make_urls('dash', '.mpd'): - formats = self._extract_mpd_formats( - man_url, video_id, mpd_id='dash', fatal=False) - for man_url in make_urls('hss', 'Manifest'): - formats.extend(self._extract_ism_formats( - man_url, video_id, ism_id='mss', fatal=False)) - for man_url in make_urls('hls', '.m3u8'): - formats.extend(self._extract_m3u8_formats( - man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', - fatal=False)) - if formats: - break - else: - if info.get('isDrm'): - raise ExtractorError( - 'Video %s is DRM protected' % video_id, expected=True) - if info.get('geoblocked'): - raise self.raise_geo_restricted() - if not info.get('free', True): - raise ExtractorError( - 'Video %s is not available for free' % video_id, expected=True) - self._sort_formats(formats) - - description = info.get('articleLong') or info.get('articleShort') - timestamp = parse_iso8601(info.get('broadcastStartDate'), 
' ') - duration = parse_duration(info.get('duration')) - - f = info.get('format', {}) - - thumbnails = [{ - 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, - }] - thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') - if thumbnail: - thumbnails.append({ - 'url': thumbnail, - }) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'duration': duration, - 'series': f.get('title'), - 'season_number': int_or_none(info.get('season')), - 'episode_number': int_or_none(info.get('episode')), - 'episode': title, - 'formats': formats, - } - - -class TVNowIE(TVNowBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ - (?P<show_id>[^/]+)/ - (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) - ''' - - @classmethod - def suitable(cls, url): - return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url) - else super(TVNowIE, cls).suitable(url)) - - _TESTS = [{ - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', - 'info_dict': { - 'id': '331082', - 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', - 'ext': 'mp4', - 'title': 'Der neue Porsche 911 GT 3', - 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', - 'timestamp': 1495994400, - 'upload_date': '20170528', - 'duration': 5283, - 'series': 'GRIP - Das Motormagazin', - 'season_number': 14, - 'episode_number': 405, - 'episode': 'Der neue Porsche 911 GT 3', - }, - }, { - # rtl2 - 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', - 'only_matching': True, - }, { - # rtlnitro - 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', - 'only_matching': True, - }, { - # superrtl - 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', - 'only_matching': True, - }, { - # ntv - 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', - 'only_matching': True, - }, { - # vox - 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', - 'only_matching': True, - }, { - # rtlplus - 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = '%s/%s' % mobj.group(2, 3) - - info = self._call_api( - 'movies/' + display_id, display_id, query={ - 'fields': ','.join(self._VIDEO_FIELDS), - }) - - return self._extract_video(info, display_id) - - -class TVNowNewIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?P<base_url>https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/ - (?:shows|serien))/ - (?P<show>[^/]+)-\d+/ - [^/]+/ - episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url')) - show, episode = mobj.group('show', 'episode') - return self.url_result( - # Rewrite new URLs to the old format and use extraction via old API - # at api.tvnow.de as a 
loophole for bypassing premium content checks - '%s/%s/%s' % (base_url, show, episode), - ie=TVNowIE.ie_key(), video_id=mobj.group('id')) - - -class TVNowNewBaseIE(InfoExtractor): - def _call_api(self, path, video_id, query={}): - result = self._download_json( - 'https://apigw.tvnow.de/module/' + path, video_id, query=query) - error = result.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - return result - - -r""" -TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it -when api.tvnow.de is shut down. This version can't bypass premium checks though. -class TVNowIE(TVNowNewBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/ - (?:shows|serien)/[^/]+/ - (?:[^/]+/)+ - (?P<display_id>[^/?$&]+)-(?P<id>\d+) - ''' - - _TESTS = [{ - # episode with annual navigation - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', - 'info_dict': { - 'id': '331082', - 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', - 'ext': 'mp4', - 'title': 'Der neue Porsche 911 GT 3', - 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1495994400, - 'upload_date': '20170528', - 'duration': 5283, - 'series': 'GRIP - Das Motormagazin', - 'season_number': 14, - 'episode_number': 405, - 'episode': 'Der neue Porsche 911 GT 3', - }, - }, { - # rtl2, episode with season navigation - 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124', - 'only_matching': True, - }, { - # rtlnitro - 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822', - 'only_matching': True, - }, { - # superrtl - 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120', - 'only_matching': True, - }, { - # ntv - 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630', - 'only_matching': True, - }, { - # vox - 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', - 'only_matching': True, - }] - - def _extract_video(self, info, url, display_id): - config = info['config'] - source = config['source'] - - video_id = compat_str(info.get('id') or source['videoId']) - title = source['title'].strip() - - paths = [] - for manifest_url in (info.get('manifest') or {}).values(): - if not manifest_url: - continue - manifest_url = update_url_query(manifest_url, {'filter': ''}) - path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') - if path in paths: - continue - paths.append(path) - - def url_repl(proto, suffix): - return re.sub( - r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( - r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', - '.ism/' + suffix, manifest_url)) - - formats = self._extract_mpd_formats( - url_repl('dash', '.mpd'), video_id, - mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - url_repl('hss', 'Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - url_repl('hls', '.m3u8'), video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - 
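# --- Editor's sketch (illustrative only; the sample URL is invented, the
# rewriting logic is copied from _extract_video above): a single manifest
# URL is fanned out into DASH/MSS/HLS variants by swapping the protocol
# token and the suffix after ".ism/":
import re

def url_repl(manifest_url, proto, suffix):
    # same two-step rewrite as in the deleted code, with manifest_url
    # passed explicitly instead of closed over
    return re.sub(
        r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
            r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
            '.ism/' + suffix, manifest_url))

sample = 'https://example-cdn.net/hls-vod/video.ism/video.m3u8'  # hypothetical
print(url_repl(sample, 'dash', '.mpd'))     # .../dash-vod/video.ism/.mpd
print(url_repl(sample, 'hss', 'Manifest'))  # .../hss-vod/video.ism/Manifest
print(url_repl(sample, 'hls', '.m3u8'))     # .../hls-vod/video.ism/.m3u8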
if formats: - break - else: - if try_get(info, lambda x: x['rights']['isDrm']): - raise ExtractorError( - 'Video %s is DRM protected' % video_id, expected=True) - if try_get(config, lambda x: x['boards']['geoBlocking']['block']): - raise self.raise_geo_restricted() - if not info.get('free', True): - raise ExtractorError( - 'Video %s is not available for free' % video_id, expected=True) - self._sort_formats(formats) - - description = source.get('description') - thumbnail = url_or_none(source.get('poster')) - timestamp = unified_timestamp(source.get('previewStart')) - duration = parse_duration(source.get('length')) - - series = source.get('format') - season_number = int_or_none(self._search_regex( - r'staffel-(\d+)', url, 'season number', default=None)) - episode_number = int_or_none(self._search_regex( - r'episode-(\d+)', url, 'episode number', default=None)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'episode': title, - 'formats': formats, - } - - def _real_extract(self, url): - display_id, video_id = re.match(self._VALID_URL, url).groups() - info = self._call_api('player/' + video_id, video_id) - return self._extract_video(info, video_id, display_id) -""" - - -class TVNowListBaseIE(TVNowNewBaseIE): - _SHOW_VALID_URL = r'''(?x) - (?P<base_url> - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/ - [^/?#&]+-(?P<show_id>\d+) - ) - ''' - - @classmethod - def suitable(cls, url): - return (False if TVNowNewIE.suitable(url) - else super(TVNowListBaseIE, cls).suitable(url)) - - def _extract_items(self, url, show_id, list_id, query): - items = self._call_api( - 'teaserrow/format/episode/' + show_id, list_id, - query=query)['items'] - - entries = [] - for item in items: - if not isinstance(item, dict): - continue - item_url = urljoin(url, item.get('url')) - if not item_url: - continue - video_id = str_or_none(item.get('id') or item.get('videoId')) - item_title = item.get('subheadline') or item.get('text') - entries.append(self.url_result( - item_url, ie=TVNowNewIE.ie_key(), video_id=video_id, - video_title=item_title)) - - return self.playlist_result(entries, '%s/%s' % (show_id, list_id)) - - -class TVNowSeasonIE(TVNowListBaseIE): - _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL - _TESTS = [{ - 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13', - 'info_dict': { - 'id': '1815/13', - }, - 'playlist_mincount': 22, - }] - - def _real_extract(self, url): - _, show_id, season_id = re.match(self._VALID_URL, url).groups() - return self._extract_items( - url, show_id, season_id, {'season': season_id}) - - -class TVNowAnnualIE(TVNowListBaseIE): - _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL - _TESTS = [{ - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05', - 'info_dict': { - 'id': '1669/2017-05', - }, - 'playlist_mincount': 2, - }] - - def _real_extract(self, url): - _, show_id, year, month = re.match(self._VALID_URL, url).groups() - return self._extract_items( - url, show_id, '%s-%s' % (year, month), { - 'year': int(year), - 'month': int(month), - }) - - -class TVNowShowIE(TVNowListBaseIE): - _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL - _TESTS = [{ - # annual navigationType - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669', - 
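# --- Editor's sketch (standalone; pattern copied from TVNowListBaseIE
# above): TVNowSeasonIE and TVNowAnnualIE both extend the shared
# _SHOW_VALID_URL fragment by string formatting, so one match yields the
# base URL, the show id and the list-specific id in a single pass:
import re

_SHOW_VALID_URL = r'''(?x)
    (?P<base_url>
        https?://
        (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/
        [^/?#&]+-(?P<show_id>\d+)
    )
'''
season_re = r'%s/staffel-(?P<id>\d+)' % _SHOW_VALID_URL
m = re.match(season_re,
             'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13')
print(m.group('show_id'), m.group('id'))  # -> 1815 13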
'info_dict': { - 'id': '1669', - }, - 'playlist_mincount': 73, - }, { - # season navigationType - 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471', - 'info_dict': { - 'id': '11471', - }, - 'playlist_mincount': 3, - }] - - @classmethod - def suitable(cls, url): - return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) - else super(TVNowShowIE, cls).suitable(url)) - - def _real_extract(self, url): - base_url, show_id = re.match(self._VALID_URL, url).groups() - - result = self._call_api( - 'teaserrow/format/navigation/' + show_id, show_id) - - items = result['items'] - - entries = [] - navigation = result.get('navigationType') - if navigation == 'annual': - for item in items: - if not isinstance(item, dict): - continue - year = int_or_none(item.get('year')) - if year is None: - continue - months = item.get('months') - if not isinstance(months, list): - continue - for month_dict in months: - if not isinstance(month_dict, dict) or not month_dict: - continue - month_number = int_or_none(list(month_dict.keys())[0]) - if month_number is None: - continue - entries.append(self.url_result( - '%s/%04d-%02d' % (base_url, year, month_number), - ie=TVNowAnnualIE.ie_key())) - elif navigation == 'season': - for item in items: - if not isinstance(item, dict): - continue - season_number = int_or_none(item.get('season')) - if season_number is None: - continue - entries.append(self.url_result( - '%s/staffel-%d' % (base_url, season_number), - ie=TVNowSeasonIE.ie_key())) - else: - raise ExtractorError('Unknown navigationType') - - return self.playlist_result(entries, show_id) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py deleted file mode 100644 index accff75b5..000000000 --- a/youtube_dl/extractor/tvp.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - ExtractorError, - get_element_by_attribute, - orderedSet, -) - - -class TVPIE(InfoExtractor): - IE_NAME = 'tvp' - IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' - - _TESTS = [{ - 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', - 'info_dict': { - 'id': '194536', - 'ext': 'mp4', - 'title': 'Czas honoru, odc. 13 – Władek', - 'description': 'md5:437f48b93558370b031740546b696e24', - }, - }, { - 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'b0005b542e5b4de643a9690326ab1257', - 'info_dict': { - 'id': '17916176', - 'ext': 'mp4', - 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', - 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', - }, - }, { - # page id is not the same as video id(#7799) - 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', - 'md5': '84cd3c8aec4840046e5ab712416b73d0', - 'info_dict': { - 'id': '33908820', - 'ext': 'mp4', - 'title': 'Wiadomości, 28.09.2017, 19:30', - 'description': 'Wydanie główne codziennego serwisu informacyjnego.' 
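# --- Editor's sketch (the markup and the page id below are invented; the
# regex list is the one used by TVPIE._real_extract above): because the
# numeric id in a tvp.pl URL is not always the video id (see #7799), the id
# is re-extracted from the page markup and the URL id only serves as a
# fallback before being handed off as 'tvp:' + video_id:
import re

page_id = '12345678'  # hypothetical id taken from the URL
webpage = '<iframe src="//www.tvp.pl/sess/tvplayer.php?object_id=33908820"></iframe>'
video_id = page_id  # default, as in _search_regex(..., default=page_id)
for pattern in (r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
                r"object_id\s*:\s*'(\d+)'",
                r'data-video-id="(\d+)"'):
    m = re.search(pattern, webpage)
    if m:
        video_id = m.group(1)
        break
print(video_id)  # -> 33908820, not the 12345678 from the URL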
- }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', - 'only_matching': True, - }, { - 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', - 'only_matching': True, - }, { - 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', - 'only_matching': True, - }, { - 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', - 'only_matching': True, - }, { - 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', - 'only_matching': True, - }, { - 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', - 'only_matching': True, - }] - - def _real_extract(self, url): - page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex([ - r'<iframe[^>]+src="[^"]*?object_id=(\d+)', - r"object_id\s*:\s*'(\d+)'", - r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) - return { - '_type': 'url_transparent', - 'url': 'tvp:' + video_id, - 'description': self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'ie_key': 'TVPEmbed', - } - - -class TVPEmbedIE(InfoExtractor): - IE_NAME = 'tvp:embed' - IE_DESC = 'Telewizja Polska' - _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' - - _TESTS = [{ - 'url': 'tvp:194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', - 'info_dict': { - 'id': '194536', - 'ext': 'mp4', - 'title': 'Czas honoru, odc. 13 – Władek', - }, - }, { - # not available - 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', - 'md5': '8c9cd59d16edabf39331f93bf8a766c7', - 'info_dict': { - 'id': '22670268', - 'ext': 'mp4', - 'title': 'Panorama, 07.12.2015, 15:40', - }, - 'skip': 'Transmisja została zakończona lub materiał niedostępny', - }, { - 'url': 'tvp:22670268', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - - error = self._html_search_regex( - r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', - webpage, 'error', default=None) or clean_html( - get_element_by_attribute('class', 'msg error', webpage)) - if error: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error)), expected=True) - - title = self._search_regex( - r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', - webpage, 'title', group='title') - series_title = self._search_regex( - r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', - webpage, 'series', group='series', default=None) - if series_title: - title = '%s, %s' % (series_title, title) - - thumbnail = self._search_regex( - r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) - - video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, - 'formats', group='url', default=None) - if not video_url or 'material_niedostepny.mp4' in video_url: - video_url = self._download_json( - 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, - video_id)['video_url'] - - formats = [] - video_url_base = self._search_regex( - r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', - video_url, 'video base url', default=None) - if 
video_url_base: - # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. - # It's not mentioned in MPEG-DASH standard. Figure that out. - # formats.extend(self._extract_mpd_formats( - # video_url_base + '.ism/video.mpd', - # video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - video_url_base + '.ism/Manifest', - video_id, 'mss', fatal=False)) - formats.extend(self._extract_f4m_formats( - video_url_base + '.ism/video.f4m', - video_id, f4m_id='hds', fatal=False)) - m3u8_formats = self._extract_m3u8_formats( - video_url_base + '.ism/video.m3u8', video_id, - 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - formats.extend(m3u8_formats) - for i, m3u8_format in enumerate(m3u8_formats, 2): - http_url = '%s-%d.mp4' % (video_url_base, i) - if self._is_valid_url(http_url, video_id): - f = m3u8_format.copy() - f.update({ - 'url': http_url, - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - formats = [{ - 'format_id': 'direct', - 'url': video_url, - 'ext': determine_ext(video_url, 'mp4'), - }] - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } - - -class TVPWebsiteIE(InfoExtractor): - IE_NAME = 'tvp:series' - _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' - - _TESTS = [{ - # series - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', - 'info_dict': { - 'id': '38678312', - }, - 'playlist_count': 115, - }, { - # film - 'url': 'https://vod.tvp.pl/website/gloria,35139666', - 'info_dict': { - 'id': '36637049', - 'ext': 'mp4', - 'title': 'Gloria, Gloria', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['TVPEmbed'], - }, { - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', - 'only_matching': True, - }] - - def _entries(self, display_id, playlist_id): - url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) - for page_num in itertools.count(1): - page = self._download_webpage( - url, display_id, 'Downloading page %d' % page_num, - query={'page': page_num}) - - video_ids = orderedSet(re.findall( - r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, - page)) - - if not video_ids: - break - - for video_id in video_ids: - yield self.url_result( - 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), - video_id=video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id, playlist_id = mobj.group('display_id', 'id') - return self.playlist_result( - self._entries(display_id, playlist_id), playlist_id) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py deleted file mode 100644 index 0d858c025..000000000 --- a/youtube_dl/extractor/tvplay.py +++ /dev/null @@ -1,492 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urlparse, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_duration, - parse_iso8601, - qualities, - try_get, - update_url_query, - url_or_none, - urljoin, -) - - -class TVPlayIE(InfoExtractor): - IE_NAME = 'mtg' - IE_DESC = 'MTG services' - _VALID_URL = r'''(?x) - (?: - mtg:| - https?:// - (?:www\.)? 
- (?: - tvplay(?:\.skaties)?\.lv(?:/parraides)?| - (?:tv3play|play\.tv3)\.lt(?:/programos)?| - tv3play(?:\.tv3)?\.ee/sisu| - (?:tv(?:3|6|8|10)play|viafree)\.se/program| - (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| - play\.nova(?:tv)?\.bg/programi - ) - /(?:[^/]+/)+ - ) - (?P<id>\d+) - ''' - _TESTS = [ - { - 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', - 'md5': 'a1612fe0849455423ad8718fe049be21', - 'info_dict': { - 'id': '418113', - 'ext': 'mp4', - 'title': 'Kādi ir īri? - Viņas melo labāk', - 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.', - 'series': 'Viņas melo labāk', - 'season': '2.sezona', - 'season_number': 2, - 'duration': 25, - 'timestamp': 1406097056, - 'upload_date': '20140723', - }, - }, - { - 'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true', - 'info_dict': { - 'id': '409229', - 'ext': 'flv', - 'title': 'Moterys meluoja geriau', - 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e', - 'series': 'Moterys meluoja geriau', - 'episode_number': 47, - 'season': '1 sezonas', - 'season_number': 1, - 'duration': 1330, - 'timestamp': 1403769181, - 'upload_date': '20140626', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv3play.ee/sisu/kodu-keset-linna/238551?autostart=true', - 'info_dict': { - 'id': '238551', - 'ext': 'flv', - 'title': 'Kodu keset linna 398537', - 'description': 'md5:7df175e3c94db9e47c0d81ffa5d68701', - 'duration': 1257, - 'timestamp': 1292449761, - 'upload_date': '20101215', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true', - 'info_dict': { - 'id': '395385', - 'ext': 'mp4', - 'title': 'Husräddarna S02E07', - 'description': 'md5:f210c6c89f42d4fc39faa551be813777', - 'duration': 2574, - 'timestamp': 1400596321, - 'upload_date': '20140520', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true', - 'info_dict': { - 'id': '266636', - 'ext': 'mp4', - 'title': 'Den sista dokusåpan S01E08', - 'description': 'md5:295be39c872520221b933830f660b110', - 'duration': 1492, - 'timestamp': 1330522854, - 'upload_date': '20120229', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true', - 'info_dict': { - 'id': '282756', - 'ext': 'mp4', - 'title': 'Antikjakten S01E10', - 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8', - 'duration': 2646, - 'timestamp': 1348575868, - 'upload_date': '20120925', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true', - 'info_dict': { - 'id': '230898', - 'ext': 'mp4', - 'title': 'Anna Anka søker assistent - Ep. 
8', - 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474', - 'duration': 2656, - 'timestamp': 1277720005, - 'upload_date': '20100628', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true', - 'info_dict': { - 'id': '21873', - 'ext': 'mp4', - 'title': 'Budbringerne program 10', - 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d', - 'duration': 1297, - 'timestamp': 1254205102, - 'upload_date': '20090929', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true', - 'info_dict': { - 'id': '361883', - 'ext': 'mp4', - 'title': 'Hotelinspektør Alex Polizzi - Ep. 10', - 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81', - 'duration': 2594, - 'timestamp': 1393236292, - 'upload_date': '20140224', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true', - 'info_dict': { - 'id': '624952', - 'ext': 'flv', - 'title': 'Здравей, България (12.06.2015 г.) ', - 'description': 'md5:99f3700451ac5bb71a260268b8daefd7', - 'duration': 8838, - 'timestamp': 1434100372, - 'upload_date': '20150612', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - { - 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', - 'only_matching': True, - }, - { - 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', - 'only_matching': True, - }, - { - 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true', - 'only_matching': True, - }, - { - # views is null - 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', - 'only_matching': True, - }, - { - 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', - 'only_matching': True, - }, - { - 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', - 'only_matching': True, - }, - { - 'url': 'mtg:418113', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - geo_country = self._search_regex( - r'https?://[^/]+\.([a-z]{2})', url, - 'geo country', default=None) - if geo_country: - self._initialize_geo_bypass({'countries': [geo_country.upper()]}) - video = self._download_json( - 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') - - title = video['title'] - - try: - streams = self._download_json( - 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, - video_id, 'Downloading streams JSON') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - msg = self._parse_json(e.cause.read().decode('utf-8'), video_id) - raise ExtractorError(msg['msg'], expected=True) - raise - - quality = qualities(['hls', 'medium', 'high']) - formats = [] - for format_id, video_url in streams.get('streams', {}).items(): - video_url = url_or_none(video_url) - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(video_url, { - 'hdcore': '3.5.0', - 'plugin': 'aasp-3.5.0.151.81' - }), video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - fmt = { - 'format_id': format_id, - 'quality': quality(format_id), - 'ext': ext, - } - if 
video_url.startswith('rtmp'): - m = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) - if not m: - continue - fmt.update({ - 'ext': 'flv', - 'url': m.group('url'), - 'app': m.group('app'), - 'play_path': m.group('playpath'), - 'preference': -1, - }) - else: - fmt.update({ - 'url': video_url, - }) - formats.append(fmt) - - if not formats and video.get('is_geo_blocked'): - self.raise_geo_restricted( - 'This content might not be available in your country due to copyright reasons') - - self._sort_formats(formats) - - # TODO: webvtt in m3u8 - subtitles = {} - sami_path = video.get('sami_path') - if sami_path: - lang = self._search_regex( - r'_([a-z]{2})\.xml', sami_path, 'lang', - default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]) - subtitles[lang] = [{ - 'url': sami_path, - }] - - series = video.get('format_title') - episode_number = int_or_none(video.get('format_position', {}).get('episode')) - season = video.get('_embedded', {}).get('season', {}).get('title') - season_number = int_or_none(video.get('format_position', {}).get('season')) - - return { - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'series': series, - 'episode_number': episode_number, - 'season': season, - 'season_number': season_number, - 'duration': int_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('created_at')), - 'view_count': try_get(video, lambda x: x['views']['total'], int), - 'age_limit': int_or_none(video.get('age_limit', 0)), - 'formats': formats, - 'subtitles': subtitles, - } - - -class ViafreeIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - viafree\.(?P<country>dk|no|se) - /(?P<id>program(?:mer)?/(?:[^/]+/)+[^/?#&]+) - ''' - _TESTS = [{ - 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', - 'info_dict': { - 'id': '757786', - 'ext': 'mp4', - 'title': 'Det beste vorspielet - Sesong 2 - Episode 1', - 'description': 'md5:b632cb848331404ccacd8cd03e83b4c3', - 'series': 'Det beste vorspielet', - 'season_number': 2, - 'duration': 1116, - 'timestamp': 1471200600, - 'upload_date': '20160814', - }, - 'params': { - 'skip_download': True, - }, - }, { - # with relatedClips - 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', - 'only_matching': True, - }, { - # Different og:image URL schema - 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', - 'only_matching': True, - }] - _GEO_BYPASS = False - - @classmethod - def suitable(cls, url): - return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) - - def _real_extract(self, url): - country, path = re.match(self._VALID_URL, url).groups() - content = self._download_json( - 'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path) - program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program'] - guid = program['guid'] - meta = content['meta'] - title = meta['title'] - - try: - stream_href = self._download_json( - program['_links']['streamLink']['href'], guid, - headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href'] - except ExtractorError as e: - if isinstance(e.cause, 
compat_HTTPError) and e.cause.code == 403: - self.raise_geo_restricted(countries=[country]) - raise - - formats = self._extract_m3u8_formats(stream_href, guid, 'mp4') - self._sort_formats(formats) - episode = program.get('episode') or {} - - return { - 'id': guid, - 'title': title, - 'thumbnail': meta.get('image'), - 'description': meta.get('description'), - 'series': episode.get('seriesTitle'), - 'episode_number': int_or_none(episode.get('episodeNumber')), - 'season_number': int_or_none(episode.get('seasonNumber')), - 'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000), - 'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])), - 'formats': formats, - } - - -class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', - 'info_dict': { - 'id': '366367', - 'ext': 'mp4', - 'title': 'Aferistai', - 'description': 'Aferistai. Kalėdinė pasaka.', - 'series': 'Aferistai [N-7]', - 'season': '1 sezonas', - 'season_number': 1, - 'duration': 464, - 'timestamp': 1394209658, - 'upload_date': '20140307', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', - 'only_matching': True, - }, { - 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', - 'only_matching': True, - }, { - 'url': 'https://play.tv3.lt/aferistai-10047125', - 'only_matching': True, - }, { - 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', - 'only_matching': True, - }, { - 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - asset = self._download_json( - urljoin(url, '/sb/public/asset/' + video_id), video_id) - - m3u8_url = asset['movie']['contentUrl'] - video_id = asset['assetId'] - asset_title = asset['title'] - title = asset_title['title'] - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) - - thumbnails = None - image_url = asset.get('imageUrl') - if image_url: - thumbnails = [{ - 'url': urljoin(url, image_url), - 'ext': 'jpg', - }] - - metadata = asset.get('metadata') or {} - - return { - 'id': video_id, - 'title': title, - 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), - 'thumbnails': thumbnails, - 'duration': parse_duration(asset_title.get('runTime')), - 'series': asset.get('tvSeriesTitle'), - 'season': asset.get('tvSeasonTitle'), - 'season_number': int_or_none(metadata.get('seasonNumber')), - 'episode': asset_title.get('titleBrief'), - 'episode_number': int_or_none(metadata.get('episodeNumber')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py deleted file mode 100644 index 74d14049b..000000000 --- a/youtube_dl/extractor/twentyfourvideo.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - int_or_none, - xpath_attr, - xpath_element, -) - - -class TwentyFourVideoIE(InfoExtractor): - IE_NAME = '24video' - _VALID_URL = r'''(?x) - https?:// - (?P<host> - (?:(?:www|porno?)\.)?24video\. 
- (?:net|me|xxx|sexy?|tube|adult|site|vip) - )/ - (?: - video/(?:(?:view|xml)/)?| - player/new24_play\.swf\?id= - ) - (?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': '1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, - }, { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - }, { - 'url': 'http://www.24video.me/video/view/1044982', - 'only_matching': True, - }, { - 'url': 'http://www.24video.tube/video/view/2363750', - 'only_matching': True, - }, { - 'url': 'https://www.24video.site/video/view/2640421', - 'only_matching': True, - }, { - 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', - 'only_matching': True, - }, { - 'url': 'https://www.24video.vip/video/view/1044982', - 'only_matching': True, - }, { - 'url': 'https://porn.24video.net/video/2640421-vsya-takay', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') - - webpage = self._download_webpage( - 'http://%s/video/view/%s' % (host, video_id), video_id) - - title = self._og_search_title(webpage) - description = self._html_search_regex( - r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>', - webpage, 'description', fatal=False, group='description') - thumbnail = self._og_search_thumbnail(webpage) - duration = int_or_none(self._og_search_property( - 'duration', webpage, 'duration', fatal=False)) - timestamp = parse_iso8601(self._search_regex( - r'<time[^>]+\bdatetime="([^"]+)"[^>]+itemprop="uploadDate"', - webpage, 'upload date', fatal=False)) - - uploader = self._html_search_regex( - r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>', - webpage, 'uploader', fatal=False) - - view_count = int_or_none(self._html_search_regex( - r'<span class="video-views">(\d+) просмотр', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._html_search_regex( - r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари', - webpage, 'comment count', default=None)) - - # Sets some cookies - self._download_xml( - r'http://%s/video/xml/%s?mode=init' % (host, video_id), - video_id, 'Downloading init XML') - - video_xml = self._download_xml( - 'http://%s/video/xml/%s?mode=play' % (host, video_id), - video_id, 'Downloading video XML') - - video = xpath_element(video_xml, './/video', 'video', fatal=True) - - formats = [{ - 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), - }] - - like_count = int_or_none(video.get('ratingPlus')) - dislike_count = int_or_none(video.get('ratingMinus')) - age_limit = 18 if video.get('adult') == 'true' else 0 - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, - 'timestamp': timestamp, - 'view_count': view_count, - 'comment_count': comment_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/twentythreevideo.py b/youtube_dl/extractor/twentythreevideo.py deleted file mode 100644 index 
dc5609192..000000000 --- a/youtube_dl/extractor/twentythreevideo.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class TwentyThreeVideoIE(InfoExtractor): - IE_NAME = '23video' - _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' - _TESTS = [{ - 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', - 'md5': '75fcf216303eb1dae9920d651f85ced4', - 'info_dict': { - 'id': '20448876', - 'ext': 'mp4', - 'title': 'Video Marketing Minute: Personalized Video', - 'timestamp': 1513855354, - 'upload_date': '20171221', - 'uploader_id': '12258964', - 'uploader': 'Rasmus Bysted', - } - }, { - 'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, query, photo_id = re.match(self._VALID_URL, url).groups() - base_url = 'https://%s' % domain - photo_data = self._download_json( - base_url + '/api/photo/list?' + query, photo_id, query={ - 'format': 'json', - }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo'] - title = photo_data['title'] - - formats = [] - - audio_path = photo_data.get('audio_download') - if audio_path: - formats.append({ - 'format_id': 'audio', - 'url': base_url + audio_path, - 'filesize': int_or_none(photo_data.get('audio_size')), - 'vcodec': 'none', - }) - - def add_common_info_to_list(l, template, id_field, id_value): - f_base = template % id_value - f_path = photo_data.get(f_base + 'download') - if not f_path: - return - l.append({ - id_field: id_value, - 'url': base_url + f_path, - 'width': int_or_none(photo_data.get(f_base + 'width')), - 'height': int_or_none(photo_data.get(f_base + 'height')), - 'filesize': int_or_none(photo_data.get(f_base + 'size')), - }) - - for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'): - add_common_info_to_list(formats, 'video_%s_', 'format_id', f) - - thumbnails = [] - for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'): - add_common_info_to_list(thumbnails, '%s_', 'id', t) - - return { - 'id': photo_id, - 'title': title, - 'timestamp': int_or_none(photo_data.get('creation_date_epoch')), - 'duration': int_or_none(photo_data.get('video_length')), - 'view_count': int_or_none(photo_data.get('view_count')), - 'comment_count': int_or_none(photo_data.get('number_of_comments')), - 'uploader_id': photo_data.get('user_id'), - 'uploader': photo_data.get('display_name'), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py deleted file mode 100644 index 6596eef9f..000000000 --- a/youtube_dl/extractor/twitcasting.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - float_or_none, - get_element_by_class, - get_element_by_id, - parse_duration, - str_to_int, - unified_timestamp, - urlencode_postdata, -) - - -class TwitCastingIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' - _TESTS = [{ - 'url': 
'https://twitcasting.tv/ivetesangalo/movie/2357609', - 'md5': '745243cad58c4681dc752490f7540d7f', - 'info_dict': { - 'id': '2357609', - 'ext': 'mp4', - 'title': 'Live #2357609', - 'uploader_id': 'ivetesangalo', - 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20110822', - 'timestamp': 1314010824, - 'duration': 32, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://twitcasting.tv/mttbernardini/movie/3689740', - 'info_dict': { - 'id': '3689740', - 'ext': 'mp4', - 'title': 'Live playing something #3689740', - 'uploader_id': 'mttbernardini', - 'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20120212', - 'timestamp': 1329028024, - 'duration': 681, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - 'videopassword': 'abc', - }, - }] - - def _real_extract(self, url): - uploader_id, video_id = re.match(self._VALID_URL, url).groups() - - video_password = self._downloader.params.get('videopassword') - request_data = None - if video_password: - request_data = urlencode_postdata({ - 'password': video_password, - }) - webpage = self._download_webpage(url, video_id, data=request_data) - - title = clean_html(get_element_by_id( - 'movietitle', webpage)) or self._html_search_meta( - ['og:title', 'twitter:title'], webpage, fatal=True) - - video_js_data = {} - m3u8_url = self._search_regex( - r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'm3u8 url', group='url', default=None) - if not m3u8_url: - video_js_data = self._parse_json(self._search_regex( - r"data-movie-playlist='(\[[^']+\])'", - webpage, 'movie playlist'), video_id)[0] - m3u8_url = video_js_data['source']['url'] - - # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls') - - thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage) - description = clean_html(get_element_by_id( - 'authorcomment', webpage)) or self._html_search_meta( - ['description', 'og:description', 'twitter:description'], webpage) - duration = float_or_none(video_js_data.get( - 'duration'), 1000) or parse_duration(clean_html( - get_element_by_class('tw-player-duration-time', webpage))) - view_count = str_to_int(self._search_regex( - r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None)) - timestamp = unified_timestamp(self._search_regex( - r'data-toggle="true"[^>]+datetime="([^"]+)"', - webpage, 'datetime', None)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py deleted file mode 100644 index a378bd6dc..000000000 --- a/youtube_dl/extractor/twitch.py +++ /dev/null @@ -1,988 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import collections -import itertools -import json -import random -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, -) -from ..utils import ( - clean_html, - 
dict_get, - ExtractorError, - float_or_none, - int_or_none, - parse_duration, - parse_iso8601, - qualities, - try_get, - unified_timestamp, - update_url_query, - url_or_none, - urljoin, -) - - -class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:(?:www|go|m)\.)?twitch\.tv' - - _API_BASE = 'https://api.twitch.tv' - _USHER_BASE = 'https://usher.ttvnw.net' - _LOGIN_FORM_URL = 'https://www.twitch.tv/login' - _LOGIN_POST_URL = 'https://passport.twitch.tv/login' - _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' - _NETRC_MACHINE = 'twitch' - - _OPERATION_HASHES = { - 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', - 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', - 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', - 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', - 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', - 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', - 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', - 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', - } - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - def fail(message): - raise ExtractorError( - 'Unable to login. Twitch said: %s' % message, expected=True) - - def login_step(page, urlh, note, data): - form = self._hidden_inputs(page) - form.update(data) - - page_url = urlh.geturl() - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, - 'post url', default=self._LOGIN_POST_URL, group='url') - post_url = urljoin(page_url, post_url) - - headers = { - 'Referer': page_url, - 'Origin': 'https://www.twitch.tv', - 'Content-Type': 'text/plain;charset=UTF-8', - } - - response = self._download_json( - post_url, None, note, data=json.dumps(form).encode(), - headers=headers, expected_status=400) - error = dict_get(response, ('error', 'error_description', 'error_code')) - if error: - fail(error) - - if 'Authenticated successfully' in response.get('message', ''): - return None, None - - redirect_url = urljoin( - post_url, - response.get('redirect') or response['redirect_path']) - return self._download_webpage_handle( - redirect_url, None, 'Downloading login redirect page', - headers=headers) - - login_page, handle = self._download_webpage_handle( - self._LOGIN_FORM_URL, None, 'Downloading login page') - - # Some TOR nodes and public proxies are blocked completely - if 'blacklist_message' in login_page: - fail(clean_html(login_page)) - - redirect_page, handle = login_step( - login_page, handle, 'Logging in', { - 'username': username, - 'password': password, - 'client_id': self._CLIENT_ID, - }) - - # Successful login - if not redirect_page: - return - - if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None: - # TODO: Add mechanism to request an SMS or phone call - tfa_token = self._get_tfa_info('two-factor authentication token') - login_step(redirect_page, handle, 'Submitting TFA token', { - 'authy_token': tfa_token, - 'remember_2fa': 'true', - }) - - def _prefer_source(self, formats): - try: - source = next(f for f in formats if f['format_id'] == 
'Source') - source['quality'] = 10 - except StopIteration: - for f in formats: - if '/chunked/' in f['url']: - f.update({ - 'quality': 10, - 'format_note': 'Source', - }) - self._sort_formats(formats) - - def _download_base_gql(self, video_id, ops, note, fatal=True): - headers = { - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - } - gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token') - if gql_auth: - headers['Authorization'] = 'OAuth ' + gql_auth.value - return self._download_json( - 'https://gql.twitch.tv/gql', video_id, note, - data=json.dumps(ops).encode(), - headers=headers, fatal=fatal) - - def _download_gql(self, video_id, ops, note, fatal=True): - for op in ops: - op['extensions'] = { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': self._OPERATION_HASHES[op['operationName']], - } - } - return self._download_base_gql(video_id, ops, note) - - def _download_access_token(self, video_id, token_kind, param_name): - method = '%sPlaybackAccessToken' % token_kind - ops = { - 'query': '''{ - %s( - %s: "%s", - params: { - platform: "web", - playerBackend: "mediaplayer", - playerType: "site" - } - ) - { - value - signature - } - }''' % (method, param_name, video_id), - } - return self._download_base_gql( - video_id, ops, - 'Downloading %s access token GraphQL' % token_kind)['data'][method] - - -class TwitchVodIE(TwitchBaseIE): - IE_NAME = 'twitch:vod' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| - player\.twitch\.tv/\?.*?\bvideo=v? - ) - (?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', - 'info_dict': { - 'id': 'v6528877', - 'ext': 'mp4', - 'title': 'LCK Summer Split - Week 6 Day 1', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 17208, - 'timestamp': 1435131734, - 'upload_date': '20150624', - 'uploader': 'Riot Games', - 'uploader_id': 'riotgames', - 'view_count': int, - 'start_time': 310, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # Untitled broadcast (title is None) - 'url': 'http://www.twitch.tv/belkao_o/v/11230755', - 'info_dict': { - 'id': 'v11230755', - 'ext': 'mp4', - 'title': 'Untitled Broadcast', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1638, - 'timestamp': 1439746708, - 'upload_date': '20150816', - 'uploader': 'BelkAO_o', - 'uploader_id': 'belkao_o', - 'view_count': int, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877', - 'only_matching': True, - }, { - 'url': 'https://www.twitch.tv/videos/6528877', - 'only_matching': True, - }, { - 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', - 'only_matching': True, - }, { - 'url': 'https://www.twitch.tv/northernlion/video/291940395', - 'only_matching': True, - }, { - 'url': 'https://player.twitch.tv/?video=480452374', - 'only_matching': True, - }] - - def _download_info(self, item_id): - data = self._download_gql( - item_id, [{ - 'operationName': 'VideoMetadata', - 'variables': { - 'channelLogin': '', - 'videoID': item_id, - }, - }], - 'Downloading stream metadata GraphQL')[0]['data'] - video = data.get('video') - if video is None: - raise ExtractorError( - 'Video %s does not exist' % item_id, expected=True) - return self._extract_info_gql(video, item_id) - - @staticmethod - def _extract_info(info): - status = info.get('status') - if status == 'recording': - is_live = True - elif status == 'recorded': - 
is_live = False - else: - is_live = None - _QUALITIES = ('small', 'medium', 'large') - quality_key = qualities(_QUALITIES) - thumbnails = [] - preview = info.get('preview') - if isinstance(preview, dict): - for thumbnail_id, thumbnail_url in preview.items(): - thumbnail_url = url_or_none(thumbnail_url) - if not thumbnail_url: - continue - if thumbnail_id not in _QUALITIES: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'preference': quality_key(thumbnail_id), - }) - return { - 'id': info['_id'], - 'title': info.get('title') or 'Untitled Broadcast', - 'description': info.get('description'), - 'duration': int_or_none(info.get('length')), - 'thumbnails': thumbnails, - 'uploader': info.get('channel', {}).get('display_name'), - 'uploader_id': info.get('channel', {}).get('name'), - 'timestamp': parse_iso8601(info.get('recorded_at')), - 'view_count': int_or_none(info.get('views')), - 'is_live': is_live, - } - - @staticmethod - def _extract_info_gql(info, item_id): - vod_id = info.get('id') or item_id - # id backward compatibility for download archives - if vod_id[0] != 'v': - vod_id = 'v%s' % vod_id - thumbnail = url_or_none(info.get('previewThumbnailURL')) - if thumbnail: - for p in ('width', 'height'): - thumbnail = thumbnail.replace('{%s}' % p, '0') - return { - 'id': vod_id, - 'title': info.get('title') or 'Untitled Broadcast', - 'description': info.get('description'), - 'duration': int_or_none(info.get('lengthSeconds')), - 'thumbnail': thumbnail, - 'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str), - 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), - 'timestamp': unified_timestamp(info.get('publishedAt')), - 'view_count': int_or_none(info.get('viewCount')), - } - - def _real_extract(self, url): - vod_id = self._match_id(url) - - info = self._download_info(vod_id) - access_token = self._download_access_token(vod_id, 'video', 'id') - - formats = self._extract_m3u8_formats( - '%s/vod/%s.m3u8?%s' % ( - self._USHER_BASE, vod_id, - compat_urllib_parse_urlencode({ - 'allow_source': 'true', - 'allow_audio_only': 'true', - 'allow_spectre': 'true', - 'player': 'twitchweb', - 'playlist_include_framerate': 'true', - 'nauth': access_token['value'], - 'nauthsig': access_token['signature'], - })), - vod_id, 'mp4', entry_protocol='m3u8_native') - - self._prefer_source(formats) - info['formats'] = formats - - parsed_url = compat_urllib_parse_urlparse(url) - query = compat_parse_qs(parsed_url.query) - if 't' in query: - info['start_time'] = parse_duration(query['t'][0]) - - if info.get('timestamp') is not None: - info['subtitles'] = { - 'rechat': [{ - 'url': update_url_query( - 'https://api.twitch.tv/v5/videos/%s/comments' % vod_id, { - 'client_id': self._CLIENT_ID, - }), - 'ext': 'json', - }], - } - - return info - - -def _make_video_result(node): - assert isinstance(node, dict) - video_id = node.get('id') - if not video_id: - return - return { - '_type': 'url_transparent', - 'ie_key': TwitchVodIE.ie_key(), - 'id': video_id, - 'url': 'https://www.twitch.tv/videos/%s' % video_id, - 'title': node.get('title'), - 'thumbnail': node.get('previewThumbnailURL'), - 'duration': float_or_none(node.get('lengthSeconds')), - 'view_count': int_or_none(node.get('viewCount')), - } - - -class TwitchCollectionIE(TwitchBaseIE): - _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)' - - _TESTS = [{ - 'url': 'https://www.twitch.tv/collections/wlDCoH0zEBZZbQ', - 'info_dict': { - 'id': 'wlDCoH0zEBZZbQ', - 'title': 'Overthrow Nook, capitalism 
for children', - }, - 'playlist_mincount': 13, - }] - - _OPERATION_NAME = 'CollectionSideBar' - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._download_gql( - collection_id, [{ - 'operationName': self._OPERATION_NAME, - 'variables': {'collectionID': collection_id}, - }], - 'Downloading collection GraphQL')[0]['data']['collection'] - title = collection.get('title') - entries = [] - for edge in collection['items']['edges']: - if not isinstance(edge, dict): - continue - node = edge.get('node') - if not isinstance(node, dict): - continue - video = _make_video_result(node) - if video: - entries.append(video) - return self.playlist_result( - entries, playlist_id=collection_id, playlist_title=title) - - -class TwitchPlaylistBaseIE(TwitchBaseIE): - _PAGE_LIMIT = 100 - - def _entries(self, channel_name, *args): - cursor = None - variables_common = self._make_variables(channel_name, *args) - entries_key = '%ss' % self._ENTRY_KIND - for page_num in itertools.count(1): - variables = variables_common.copy() - variables['limit'] = self._PAGE_LIMIT - if cursor: - variables['cursor'] = cursor - page = self._download_gql( - channel_name, [{ - 'operationName': self._OPERATION_NAME, - 'variables': variables, - }], - 'Downloading %ss GraphQL page %s' % (self._NODE_KIND, page_num), - fatal=False) - if not page: - break - edges = try_get( - page, lambda x: x[0]['data']['user'][entries_key]['edges'], list) - if not edges: - break - for edge in edges: - if not isinstance(edge, dict): - continue - if edge.get('__typename') != self._EDGE_KIND: - continue - node = edge.get('node') - if not isinstance(node, dict): - continue - if node.get('__typename') != self._NODE_KIND: - continue - entry = self._extract_entry(node) - if entry: - cursor = edge.get('cursor') - yield entry - if not cursor or not isinstance(cursor, compat_str): - break - - -class TwitchVideosIE(TwitchPlaylistBaseIE): - _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)' - - _TESTS = [{ - # All Videos sorted by Date - 'url': 'https://www.twitch.tv/spamfish/videos?filter=all', - 'info_dict': { - 'id': 'spamfish', - 'title': 'spamfish - All Videos sorted by Date', - }, - 'playlist_mincount': 924, - }, { - # All Videos sorted by Popular - 'url': 'https://www.twitch.tv/spamfish/videos?filter=all&sort=views', - 'info_dict': { - 'id': 'spamfish', - 'title': 'spamfish - All Videos sorted by Popular', - }, - 'playlist_mincount': 931, - }, { - # Past Broadcasts sorted by Date - 'url': 'https://www.twitch.tv/spamfish/videos?filter=archives', - 'info_dict': { - 'id': 'spamfish', - 'title': 'spamfish - Past Broadcasts sorted by Date', - }, - 'playlist_mincount': 27, - }, { - # Highlights sorted by Date - 'url': 'https://www.twitch.tv/spamfish/videos?filter=highlights', - 'info_dict': { - 'id': 'spamfish', - 'title': 'spamfish - Highlights sorted by Date', - }, - 'playlist_mincount': 901, - }, { - # Uploads sorted by Date - 'url': 'https://www.twitch.tv/esl_csgo/videos?filter=uploads&sort=time', - 'info_dict': { - 'id': 'esl_csgo', - 'title': 'esl_csgo - Uploads sorted by Date', - }, - 'playlist_mincount': 5, - }, { - # Past Premieres sorted by Date - 'url': 'https://www.twitch.tv/spamfish/videos?filter=past_premieres', - 'info_dict': { - 'id': 'spamfish', - 'title': 'spamfish - Past Premieres sorted by Date', - }, - 'playlist_mincount': 1, - }, { - 'url': 'https://www.twitch.tv/spamfish/videos/all', - 'only_matching': True, - }, { - 'url': 'https://m.twitch.tv/spamfish/videos/all', - 
'only_matching': True, - }, { - 'url': 'https://www.twitch.tv/spamfish/videos', - 'only_matching': True, - }] - - Broadcast = collections.namedtuple('Broadcast', ['type', 'label']) - - _DEFAULT_BROADCAST = Broadcast(None, 'All Videos') - _BROADCASTS = { - 'archives': Broadcast('ARCHIVE', 'Past Broadcasts'), - 'highlights': Broadcast('HIGHLIGHT', 'Highlights'), - 'uploads': Broadcast('UPLOAD', 'Uploads'), - 'past_premieres': Broadcast('PAST_PREMIERE', 'Past Premieres'), - 'all': _DEFAULT_BROADCAST, - } - - _DEFAULT_SORTED_BY = 'Date' - _SORTED_BY = { - 'time': _DEFAULT_SORTED_BY, - 'views': 'Popular', - } - - _OPERATION_NAME = 'FilterableVideoTower_Videos' - _ENTRY_KIND = 'video' - _EDGE_KIND = 'VideoEdge' - _NODE_KIND = 'Video' - - @classmethod - def suitable(cls, url): - return (False - if any(ie.suitable(url) for ie in ( - TwitchVideosClipsIE, - TwitchVideosCollectionsIE)) - else super(TwitchVideosIE, cls).suitable(url)) - - @staticmethod - def _make_variables(channel_name, broadcast_type, sort): - return { - 'channelOwnerLogin': channel_name, - 'broadcastType': broadcast_type, - 'videoSort': sort.upper(), - } - - @staticmethod - def _extract_entry(node): - return _make_video_result(node) - - def _real_extract(self, url): - channel_name = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - filter = qs.get('filter', ['all'])[0] - sort = qs.get('sort', ['time'])[0] - broadcast = self._BROADCASTS.get(filter, self._DEFAULT_BROADCAST) - return self.playlist_result( - self._entries(channel_name, broadcast.type, sort), - playlist_id=channel_name, - playlist_title='%s - %s sorted by %s' - % (channel_name, broadcast.label, - self._SORTED_BY.get(sort, self._DEFAULT_SORTED_BY))) - - -class TwitchVideosClipsIE(TwitchPlaylistBaseIE): - _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:clips|videos/*?\?.*?\bfilter=clips)' - - _TESTS = [{ - # Clips - 'url': 'https://www.twitch.tv/vanillatv/clips?filter=clips&range=all', - 'info_dict': { - 'id': 'vanillatv', - 'title': 'vanillatv - Clips Top All', - }, - 'playlist_mincount': 1, - }, { - 'url': 'https://www.twitch.tv/dota2ruhub/videos?filter=clips&range=7d', - 'only_matching': True, - }] - - Clip = collections.namedtuple('Clip', ['filter', 'label']) - - _DEFAULT_CLIP = Clip('LAST_WEEK', 'Top 7D') - _RANGE = { - '24hr': Clip('LAST_DAY', 'Top 24H'), - '7d': _DEFAULT_CLIP, - '30d': Clip('LAST_MONTH', 'Top 30D'), - 'all': Clip('ALL_TIME', 'Top All'), - } - - # NB: values other than 20 result in skipped videos - _PAGE_LIMIT = 20 - - _OPERATION_NAME = 'ClipsCards__User' - _ENTRY_KIND = 'clip' - _EDGE_KIND = 'ClipEdge' - _NODE_KIND = 'Clip' - - @staticmethod - def _make_variables(channel_name, filter): - return { - 'login': channel_name, - 'criteria': { - 'filter': filter, - }, - } - - @staticmethod - def _extract_entry(node): - assert isinstance(node, dict) - clip_url = url_or_none(node.get('url')) - if not clip_url: - return - return { - '_type': 'url_transparent', - 'ie_key': TwitchClipsIE.ie_key(), - 'id': node.get('id'), - 'url': clip_url, - 'title': node.get('title'), - 'thumbnail': node.get('thumbnailURL'), - 'duration': float_or_none(node.get('durationSeconds')), - 'timestamp': unified_timestamp(node.get('createdAt')), - 'view_count': int_or_none(node.get('viewCount')), - 'language': node.get('language'), - } - - def _real_extract(self, url): - channel_name = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - range = qs.get('range', ['7d'])[0] - clip = 
self._RANGE.get(range, self._DEFAULT_CLIP) - return self.playlist_result( - self._entries(channel_name, clip.filter), - playlist_id=channel_name, - playlist_title='%s - Clips %s' % (channel_name, clip.label)) - - -class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE): - _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/videos/*?\?.*?\bfilter=collections' - - _TESTS = [{ - # Collections - 'url': 'https://www.twitch.tv/spamfish/videos?filter=collections', - 'info_dict': { - 'id': 'spamfish', - 'title': 'spamfish - Collections', - }, - 'playlist_mincount': 3, - }] - - _OPERATION_NAME = 'ChannelCollectionsContent' - _ENTRY_KIND = 'collection' - _EDGE_KIND = 'CollectionsItemEdge' - _NODE_KIND = 'Collection' - - @staticmethod - def _make_variables(channel_name): - return { - 'ownerLogin': channel_name, - } - - @staticmethod - def _extract_entry(node): - assert isinstance(node, dict) - collection_id = node.get('id') - if not collection_id: - return - return { - '_type': 'url_transparent', - 'ie_key': TwitchCollectionIE.ie_key(), - 'id': collection_id, - 'url': 'https://www.twitch.tv/collections/%s' % collection_id, - 'title': node.get('title'), - 'thumbnail': node.get('thumbnailURL'), - 'duration': float_or_none(node.get('lengthSeconds')), - 'timestamp': unified_timestamp(node.get('updatedAt')), - 'view_count': int_or_none(node.get('viewCount')), - } - - def _real_extract(self, url): - channel_name = self._match_id(url) - return self.playlist_result( - self._entries(channel_name), playlist_id=channel_name, - playlist_title='%s - Collections' % channel_name) - - -class TwitchStreamIE(TwitchBaseIE): - IE_NAME = 'twitch:stream' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?:www|go|m)\.)?twitch\.tv/| - player\.twitch\.tv/\?.*?\bchannel= - ) - (?P<id>[^/#?]+) - ''' - - _TESTS = [{ - 'url': 'http://www.twitch.tv/shroomztv', - 'info_dict': { - 'id': '12772022048', - 'display_id': 'shroomztv', - 'ext': 'mp4', - 'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV', - 'is_live': True, - 'timestamp': 1421928037, - 'upload_date': '20150122', - 'uploader': 'ShroomzTV', - 'uploader_id': 'shroomztv', - 'view_count': int, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.twitch.tv/miracle_doto#profile-0', - 'only_matching': True, - }, { - 'url': 'https://player.twitch.tv/?channel=lotsofs', - 'only_matching': True, - }, { - 'url': 'https://go.twitch.tv/food', - 'only_matching': True, - }, { - 'url': 'https://m.twitch.tv/food', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False - if any(ie.suitable(url) for ie in ( - TwitchVodIE, - TwitchCollectionIE, - TwitchVideosIE, - TwitchVideosClipsIE, - TwitchVideosCollectionsIE, - TwitchClipsIE)) - else super(TwitchStreamIE, cls).suitable(url)) - - def _real_extract(self, url): - channel_name = self._match_id(url).lower() - - gql = self._download_gql( - channel_name, [{ - 'operationName': 'StreamMetadata', - 'variables': {'channelLogin': channel_name}, - }, { - 'operationName': 'ComscoreStreamingQuery', - 'variables': { - 'channel': channel_name, - 'clipSlug': '', - 'isClip': False, - 'isLive': True, - 'isVodOrCollection': False, - 'vodID': '', - }, - }, { - 'operationName': 'VideoPreviewOverlay', - 'variables': {'login': channel_name}, - }], - 'Downloading stream GraphQL') - - user = gql[0]['data']['user'] - - if not user: - raise ExtractorError( - '%s does not 
exist' % channel_name, expected=True) - - stream = user['stream'] - - if not stream: - raise ExtractorError('%s is offline' % channel_name, expected=True) - - access_token = self._download_access_token( - channel_name, 'stream', 'channelName') - token = access_token['value'] - - stream_id = stream.get('id') or channel_name - query = { - 'allow_source': 'true', - 'allow_audio_only': 'true', - 'allow_spectre': 'true', - 'p': random.randint(1000000, 10000000), - 'player': 'twitchweb', - 'playlist_include_framerate': 'true', - 'segment_preference': '4', - 'sig': access_token['signature'].encode('utf-8'), - 'token': token.encode('utf-8'), - } - formats = self._extract_m3u8_formats( - '%s/api/channel/hls/%s.m3u8' % (self._USHER_BASE, channel_name), - stream_id, 'mp4', query=query) - self._prefer_source(formats) - - view_count = stream.get('viewers') - timestamp = unified_timestamp(stream.get('createdAt')) - - sq_user = try_get(gql, lambda x: x[1]['data']['user'], dict) or {} - uploader = sq_user.get('displayName') - description = try_get( - sq_user, lambda x: x['broadcastSettings']['title'], compat_str) - - thumbnail = url_or_none(try_get( - gql, lambda x: x[2]['data']['user']['stream']['previewImageURL'], - compat_str)) - - title = uploader or channel_name - stream_type = stream.get('type') - if stream_type in ['rerun', 'live']: - title += ' (%s)' % stream_type - - return { - 'id': stream_id, - 'display_id': channel_name, - 'title': self._live_title(title), - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': channel_name, - 'timestamp': timestamp, - 'view_count': view_count, - 'formats': formats, - 'is_live': stream_type == 'live', - } - - -class TwitchClipsIE(TwitchBaseIE): - IE_NAME = 'twitch:clips' - _VALID_URL = r'''(?x) - https?:// - (?: - clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| - (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ - ) - (?P<id>[^/?#&]+) - ''' - - _TESTS = [{ - 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', - 'md5': '761769e1eafce0ffebfb4089cb3847cd', - 'info_dict': { - 'id': '42850523', - 'ext': 'mp4', - 'title': 'EA Play 2016 Live from the Novo Theatre', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1465767393, - 'upload_date': '20160612', - 'creator': 'EA', - 'uploader': 'stereotype_', - 'uploader_id': '43566419', - }, - }, { - # multiple formats - 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', - 'only_matching': True, - }, { - 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', - 'only_matching': True, - }, { - 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', - 'only_matching': True, - }, { - 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', - 'only_matching': True, - }, { - 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - clip = self._download_gql( - video_id, [{ - 'operationName': 'VideoAccessToken_Clip', - 'variables': { - 'slug': video_id, - }, - }], - 'Downloading clip access token GraphQL')[0]['data']['clip'] - - if not clip: - raise ExtractorError( - 'This clip is no longer available', expected=True) - - access_query = { - 'sig': clip['playbackAccessToken']['signature'], - 'token': clip['playbackAccessToken']['value'], - } - - data = self._download_base_gql( - video_id, { - 'query': '''{ - clip(slug: "%s") { - broadcaster { - displayName - } - createdAt - 
curator { - displayName - id - } - durationSeconds - id - tiny: thumbnailURL(width: 86, height: 45) - small: thumbnailURL(width: 260, height: 147) - medium: thumbnailURL(width: 480, height: 272) - title - videoQualities { - frameRate - quality - sourceURL - } - viewCount - } -}''' % video_id}, 'Downloading clip GraphQL', fatal=False) - - if data: - clip = try_get(data, lambda x: x['data']['clip'], dict) or clip - - formats = [] - for option in clip.get('videoQualities', []): - if not isinstance(option, dict): - continue - source = url_or_none(option.get('sourceURL')) - if not source: - continue - formats.append({ - 'url': update_url_query(source, access_query), - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - 'fps': int_or_none(option.get('frameRate')), - }) - self._sort_formats(formats) - - thumbnails = [] - for thumbnail_id in ('tiny', 'small', 'medium'): - thumbnail_url = clip.get(thumbnail_id) - if not thumbnail_url: - continue - thumb = { - 'id': thumbnail_id, - 'url': thumbnail_url, - } - mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url) - if mobj: - thumb.update({ - 'height': int(mobj.group(2)), - 'width': int(mobj.group(1)), - }) - thumbnails.append(thumb) - - return { - 'id': clip.get('id') or video_id, - 'title': clip.get('title') or video_id, - 'formats': formats, - 'duration': int_or_none(clip.get('durationSeconds')), - 'views': int_or_none(clip.get('viewCount')), - 'timestamp': unified_timestamp(clip.get('createdAt')), - 'thumbnails': thumbnails, - 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), - 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), - 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), - } diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py deleted file mode 100644 index cfa7a7326..000000000 --- a/youtube_dl/extractor/twitter.py +++ /dev/null @@ -1,669 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) -from ..utils import ( - dict_get, - ExtractorError, - float_or_none, - int_or_none, - try_get, - strip_or_none, - unified_timestamp, - update_url_query, - url_or_none, - xpath_text, -) - -from .periscope import ( - PeriscopeBaseIE, - PeriscopeIE, -) - - -class TwitterBaseIE(InfoExtractor): - _API_BASE = 'https://api.twitter.com/1.1/' - _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' - _GUEST_TOKEN = None - - def _extract_variant_formats(self, variant, video_id): - variant_url = variant.get('url') - if not variant_url: - return [] - elif '.m3u8' in variant_url: - return self._extract_m3u8_formats( - variant_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - else: - tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None - f = { - 'url': variant_url, - 'format_id': 'http' + ('-%d' % tbr if tbr else ''), - 'tbr': tbr, - } - self._search_dimensions_in_video_url(f, variant_url) - return [f] - - def _extract_formats_from_vmap_url(self, vmap_url, video_id): - vmap_url = url_or_none(vmap_url) - if not vmap_url: - return [] - vmap_data = self._download_xml(vmap_url, video_id) - formats = [] - urls = [] - for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): - video_variant.attrib['url'] = compat_urllib_parse_unquote( - 
video_variant.attrib['url']) - urls.append(video_variant.attrib['url']) - formats.extend(self._extract_variant_formats( - video_variant.attrib, video_id)) - video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) - if video_url not in urls: - formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) - return formats - - @staticmethod - def _search_dimensions_in_video_url(a_format, video_url): - m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) - if m: - a_format.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - - def _call_api(self, path, video_id, query={}): - headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', - } - if not self._GUEST_TOKEN: - self._GUEST_TOKEN = self._download_json( - self._API_BASE + 'guest/activate.json', video_id, - 'Downloading guest token', data=b'', - headers=headers)['guest_token'] - headers['x-guest-token'] = self._GUEST_TOKEN - try: - return self._download_json( - self._API_BASE + path, video_id, headers=headers, query=query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - raise ExtractorError(self._parse_json( - e.cause.read().decode(), - video_id)['errors'][0]['message'], expected=True) - raise - - -class TwitterCardIE(InfoExtractor): - IE_NAME = 'twitter:card' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - # MD5 checksums are different in different places - 'info_dict': { - 'id': '560070183650213889', - 'ext': 'mp4', - 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.", - 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', - 'uploader': 'Twitter', - 'uploader_id': 'Twitter', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 30.033, - 'timestamp': 1422366112, - 'upload_date': '20150127', - }, - }, - { - 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', - 'md5': '7137eca597f72b9abbe61e5ae0161399', - 'info_dict': { - 'id': '623160978427936768', - 'ext': 'mp4', - 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", - 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA", - 'uploader': 'NASA', - 'uploader_id': 'NASA', - 'timestamp': 1437408129, - 'upload_date': '20150720', - }, - }, - { - 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', - 'md5': 'b6d9683dd3f48e340ded81c0e917ad46', - 'info_dict': { - 'id': 'dq4Oj5quskI', - 'ext': 'mp4', - 'title': 'Ubuntu 11.10 Overview', - 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', - 'upload_date': '20111013', - 'uploader': 'OMG! 
UBUNTU!', - 'uploader_id': 'omgubuntu', - }, - 'add_ie': ['Youtube'], - }, - { - 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', - 'md5': '6dabeaca9e68cbb71c99c322a4b42a11', - 'info_dict': { - 'id': 'iBb2x00UVlv', - 'ext': 'mp4', - 'upload_date': '20151113', - 'uploader_id': '1189339351084113920', - 'uploader': 'ArsenalTerje', - 'title': 'Vine by ArsenalTerje', - 'timestamp': 1447451307, - }, - 'add_ie': ['Vine'], - }, { - 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', - 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88', - 'info_dict': { - 'id': '705235433198714880', - 'ext': 'mp4', - 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", - 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns", - 'uploader': 'Brent Yarina', - 'uploader_id': 'BTNBrentYarina', - 'timestamp': 1456976204, - 'upload_date': '20160303', - }, - 'skip': 'This content is no longer available.', - }, { - 'url': 'https://twitter.com/i/videos/752274308186120192', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - status_id = self._match_id(url) - return self.url_result( - 'https://twitter.com/statuses/' + status_id, - TwitterIE.ie_key(), status_id) - - -class TwitterIE(TwitterBaseIE): - IE_NAME = 'twitter' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'https://twitter.com/freethenipple/status/643211948184596480', - 'info_dict': { - 'id': '643211948184596480', - 'ext': 'mp4', - 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', - 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ', - 'uploader': 'FREE THE NIPPLE', - 'uploader_id': 'freethenipple', - 'duration': 12.922, - 'timestamp': 1442188653, - 'upload_date': '20150913', - 'age_limit': 18, - }, - }, { - 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', - 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', - 'info_dict': { - 'id': '657991469417025536', - 'ext': 'mp4', - 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', - 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', - 'thumbnail': r're:^https?://.*\.png', - 'uploader': 'Gifs', - 'uploader_id': 'giphz', - }, - 'expected_warnings': ['height', 'width'], - 'skip': 'Account suspended', - }, { - 'url': 'https://twitter.com/starwars/status/665052190608723968', - 'info_dict': { - 'id': '665052190608723968', - 'ext': 'mp4', - 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', - 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', - 'uploader_id': 'starwars', - 'uploader': 'Star Wars', - 'timestamp': 1447395772, - 'upload_date': '20151113', - }, - }, { - 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', - 'info_dict': { - 'id': '705235433198714880', - 'ext': 'mp4', - 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", - 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. 
https://t.co/OrxcJ28Bns", - 'uploader_id': 'BTNBrentYarina', - 'uploader': 'Brent Yarina', - 'timestamp': 1456976204, - 'upload_date': '20160303', - }, - 'params': { - # The same video as https://twitter.com/i/videos/tweet/705235433198714880 - # Test case of TwitterCardIE - 'skip_download': True, - }, - }, { - 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', - 'info_dict': { - 'id': '700207533655363584', - 'ext': 'mp4', - 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', - 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vertugo', - 'uploader_id': 'simonvertugo', - 'duration': 30.0, - 'timestamp': 1455777459, - 'upload_date': '20160218', - }, - }, { - 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', - 'md5': '89a15ed345d13b86e9a5a5e051fa308a', - 'info_dict': { - 'id': 'MIOxnrUteUd', - 'ext': 'mp4', - 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', - 'uploader': 'TAKUMA', - 'uploader_id': '1004126642786242560', - 'timestamp': 1402826626, - 'upload_date': '20140615', - }, - 'add_ie': ['Vine'], - }, { - 'url': 'https://twitter.com/captainamerica/status/719944021058060289', - 'info_dict': { - 'id': '719944021058060289', - 'ext': 'mp4', - 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', - 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI', - 'uploader_id': 'CaptainAmerica', - 'uploader': 'Captain America', - 'duration': 3.17, - 'timestamp': 1460483005, - 'upload_date': '20160412', - }, - }, { - 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', - 'info_dict': { - 'id': '1zqKVVlkqLaKB', - 'ext': 'mp4', - 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', - 'upload_date': '20160923', - 'uploader_id': '1PmKqpJdOJQoY', - 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', - 'timestamp': 1474613214, - }, - 'add_ie': ['Periscope'], - }, { - # has mp4 formats via mobile API - 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', - 'info_dict': { - 'id': '852138619213144067', - 'ext': 'mp4', - 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', - 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', - 'uploader': 'عالم الأخبار', - 'uploader_id': 'news_al3alm', - 'duration': 277.4, - 'timestamp': 1492000653, - 'upload_date': '20170412', - }, - 'skip': 'Account suspended', - }, { - 'url': 'https://twitter.com/i/web/status/910031516746514432', - 'info_dict': { - 'id': '910031516746514432', - 'ext': 'mp4', - 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', - 'thumbnail': r're:^https?://.*\.jpg', - 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. 
https://t.co/mwx01Rs4lo', - 'uploader': 'Préfet de Guadeloupe', - 'uploader_id': 'Prefet971', - 'duration': 47.48, - 'timestamp': 1505803395, - 'upload_date': '20170919', - }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, - }, { - # card via api.twitter.com/1.1/videos/tweet/config - 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', - 'info_dict': { - 'id': '1001551623938805763', - 'ext': 'mp4', - 'title': 're:.*?Shep is on a roll today.*?', - 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09', - 'uploader': 'Lis Power', - 'uploader_id': 'LisPower1', - 'duration': 111.278, - 'timestamp': 1527623489, - 'upload_date': '20180529', - }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, - }, { - 'url': 'https://twitter.com/foobar/status/1087791357756956680', - 'info_dict': { - 'id': '1087791357756956680', - 'ext': 'mp4', - 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', - 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', - 'uploader': 'Twitter', - 'uploader_id': 'Twitter', - 'duration': 61.567, - 'timestamp': 1548184644, - 'upload_date': '20190122', - }, - }, { - # not available in Periscope - 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', - 'info_dict': { - 'id': '1vOGwqejwoWxB', - 'ext': 'mp4', - 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', - 'uploader': 'Vivi', - 'uploader_id': '1eVjYOLGkGrQL', - }, - 'add_ie': ['TwitterBroadcast'], - }, { - # unified card - 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', - 'info_dict': { - 'id': '1349794411333394432', - 'ext': 'mp4', - 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', - 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', - 'uploader': 'Brooklyn Nets', - 'uploader_id': 'BrooklynNets', - 'duration': 324.484, - 'timestamp': 1610651040, - 'upload_date': '20210114', - }, - 'params': { - 'skip_download': True, - }, - }, { - # Twitch Clip Embed - 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', - 'only_matching': True, - }, { - # promo_video_website card - 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', - 'only_matching': True, - }, { - # promo_video_convo card - 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', - 'only_matching': True, - }, { - # appplayer card - 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', - 'only_matching': True, - }, { - # video_direct_message card - 'url': 'https://twitter.com/qarev001/status/1348948114569269251', - 'only_matching': True, - }, { - # poll2choice_video card - 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', - 'only_matching': True, - }, { - # poll3choice_video card - 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', - 'only_matching': True, - }, { - # poll4choice_video card - 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', - 'only_matching': True, - }] - - def _real_extract(self, url): - twid = self._match_id(url) - status = self._call_api( - 'statuses/show/%s.json' % twid, twid, { - 'cards_platform': 'Web-12', - 'include_cards': 1, - 'include_reply_count': 1, - 'include_user_entities': 0, - 'tweet_mode': 'extended', - }) - - 
title = description = status['full_text'].replace('\n', ' ') - # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - title = re.sub(r'\s+(https?://[^ ]+)', '', title) - user = status.get('user') or {} - uploader = user.get('name') - if uploader: - title = '%s - %s' % (uploader, title) - uploader_id = user.get('screen_name') - - tags = [] - for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): - hashtag_text = hashtag.get('text') - if not hashtag_text: - continue - tags.append(hashtag_text) - - info = { - 'id': twid, - 'title': title, - 'description': description, - 'uploader': uploader, - 'timestamp': unified_timestamp(status.get('created_at')), - 'uploader_id': uploader_id, - 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, - 'like_count': int_or_none(status.get('favorite_count')), - 'repost_count': int_or_none(status.get('retweet_count')), - 'comment_count': int_or_none(status.get('reply_count')), - 'age_limit': 18 if status.get('possibly_sensitive') else 0, - 'tags': tags, - } - - def extract_from_video_info(media): - video_info = media.get('video_info') or {} - - formats = [] - for variant in video_info.get('variants', []): - formats.extend(self._extract_variant_formats(variant, twid)) - self._sort_formats(formats) - - thumbnails = [] - media_url = media.get('media_url_https') or media.get('media_url') - if media_url: - def add_thumbnail(name, size): - thumbnails.append({ - 'id': name, - 'url': update_url_query(media_url, {'name': name}), - 'width': int_or_none(size.get('w') or size.get('width')), - 'height': int_or_none(size.get('h') or size.get('height')), - }) - for name, size in media.get('sizes', {}).items(): - add_thumbnail(name, size) - add_thumbnail('orig', media.get('original_info') or {}) - - info.update({ - 'formats': formats, - 'thumbnails': thumbnails, - 'duration': float_or_none(video_info.get('duration_millis'), 1000), - }) - - media = try_get(status, lambda x: x['extended_entities']['media'][0]) - if media and media.get('type') != 'photo': - extract_from_video_info(media) - else: - card = status.get('card') - if card: - binding_values = card['binding_values'] - - def get_binding_value(k): - o = binding_values.get(k) or {} - return try_get(o, lambda x: x[x['type'].lower() + '_value']) - - card_name = card['name'].split(':')[-1] - if card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - elif card_name == 'summary': - info.update({ - '_type': 'url', - 'url': get_binding_value('card_url'), - }) - elif card_name == 'unified_card': - media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] - extract_from_video_info(next(iter(media_entities.values()))) - # amplify, promo_video_website, promo_video_convo, appplayer, - # video_direct_message, poll2choice_video, poll3choice_video, - # poll4choice_video, ... 
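The card types listed in the comment above all fall through to the generic vmap/player branch below, and every branch in this chain reads card data through `get_binding_value`: each `binding_values` entry declares a `type` and stores its payload under the matching `<type>_value` key. A minimal sketch of that lookup follows, using an invented sample payload (real card payloads come from api.twitter.com and carry many more fields):

    # Hedged sketch of the binding_values lookup pattern used by TwitterIE;
    # the payload below is made up for illustration only.
    binding_values = {
        'player_url': {'type': 'STRING', 'string_value': 'https://example.com/player'},
        'amplify_url_vmap': {'type': 'STRING', 'string_value': 'https://example.com/video.vmap'},
    }

    def get_binding_value(k):
        # Entries look like {'type': 'STRING', 'string_value': ...}; derive the
        # payload key from the declared type, returning None when absent.
        o = binding_values.get(k) or {}
        return o.get(o.get('type', '').lower() + '_value')

    assert get_binding_value('player_url') == 'https://example.com/player'
    assert get_binding_value('missing') is None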
- else: - is_amplify = card_name == 'amplify' - vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') - content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) - formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) - self._sort_formats(formats) - - thumbnails = [] - for suffix in ('_small', '', '_large', '_x_large', '_original'): - image = get_binding_value('player_image' + suffix) or {} - image_url = image.get('url') - if not image_url or '/player-placeholder' in image_url: - continue - thumbnails.append({ - 'id': suffix[1:] if suffix else 'medium', - 'url': image_url, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - }) - - info.update({ - 'formats': formats, - 'thumbnails': thumbnails, - 'duration': int_or_none(get_binding_value( - 'content_duration_seconds')), - }) - else: - expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) - if not expanded_url: - raise ExtractorError("There's no video in this tweet.") - info.update({ - '_type': 'url', - 'url': expanded_url, - }) - return info - - -class TwitterAmplifyIE(TwitterBaseIE): - IE_NAME = 'twitter:amplify' - _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' - - _TEST = { - 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', - 'md5': '7df102d0b9fd7066b86f3159f8e81bf6', - 'info_dict': { - 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', - 'ext': 'mp4', - 'title': 'Twitter Video', - 'thumbnail': 're:^https?://.*', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - vmap_url = self._html_search_meta( - 'twitter:amplify:vmap', webpage, 'vmap url') - formats = self._extract_formats_from_vmap_url(vmap_url, video_id) - - thumbnails = [] - thumbnail = self._html_search_meta( - 'twitter:image:src', webpage, 'thumbnail', fatal=False) - - def _find_dimension(target): - w = int_or_none(self._html_search_meta( - 'twitter:%s:width' % target, webpage, fatal=False)) - h = int_or_none(self._html_search_meta( - 'twitter:%s:height' % target, webpage, fatal=False)) - return w, h - - if thumbnail: - thumbnail_w, thumbnail_h = _find_dimension('image') - thumbnails.append({ - 'url': thumbnail, - 'width': thumbnail_w, - 'height': thumbnail_h, - }) - - video_w, video_h = _find_dimension('player') - formats[0].update({ - 'width': video_w, - 'height': video_h, - }) - - return { - 'id': video_id, - 'title': 'Twitter Video', - 'formats': formats, - 'thumbnails': thumbnails, - } - - -class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): - IE_NAME = 'twitter:broadcast' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})' - - _TEST = { - # untitled Periscope video - 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', - 'info_dict': { - 'id': '1yNGaQLWpejGj', - 'ext': 'mp4', - 'title': 'Andrea May Sahouri - Periscope Broadcast', - 'uploader': 'Andrea May Sahouri', - 'uploader_id': '1PXEdBZWpGwKe', - }, - } - - def _real_extract(self, url): - broadcast_id = self._match_id(url) - broadcast = self._call_api( - 'broadcasts/show.json', broadcast_id, - {'ids': broadcast_id})['broadcasts'][broadcast_id] - info = self._parse_broadcast_data(broadcast, broadcast_id) - media_key = broadcast['media_key'] - source = self._call_api( - 'live_video_stream/status/' + media_key, media_key)['source'] - m3u8_url = source.get('noRedirectPlaybackUrl') or 
source['location'] - if '/live_video_stream/geoblocked/' in m3u8_url: - self.raise_geo_restricted() - m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse( - m3u8_url).query).get('type', [None])[0] - state, width, height = self._extract_common_format_info(broadcast) - info['formats'] = self._extract_pscp_m3u8_formats( - m3u8_url, broadcast_id, m3u8_id, state, width, height) - return info diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py deleted file mode 100644 index 2a4faecef..000000000 --- a/youtube_dl/extractor/udemy.py +++ /dev/null @@ -1,481 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_kwargs, - compat_str, - compat_urllib_request, - compat_urlparse, -) -from ..utils import ( - determine_ext, - extract_attributes, - ExtractorError, - float_or_none, - int_or_none, - js_to_json, - sanitized_Request, - try_get, - unescapeHTML, - url_or_none, - urlencode_postdata, -) - - -class UdemyIE(InfoExtractor): - IE_NAME = 'udemy' - _VALID_URL = r'''(?x) - https?:// - (?:[^/]+\.)?udemy\.com/ - (?: - [^#]+\#/lecture/| - lecture/view/?\?lectureId=| - [^/]+/learn/v4/t/lecture/ - ) - (?P<id>\d+) - ''' - _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' - _ORIGIN_URL = 'https://www.udemy.com' - _NETRC_MACHINE = 'udemy' - - _TESTS = [{ - 'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757', - 'md5': '98eda5b657e752cf945d8445e261b5c5', - 'info_dict': { - 'id': '160614', - 'ext': 'mp4', - 'title': 'Introduction and Installation', - 'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876', - 'duration': 579.29, - }, - 'skip': 'Requires udemy account credentials', - }, { - # new URL schema - 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', - 'only_matching': True, - }, { - # no url in outputs format entry - 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', - 'only_matching': True, - }, { - # only outputs rendition - 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', - 'only_matching': True, - }, { - 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', - 'only_matching': True, - }] - - def _extract_course_info(self, webpage, video_id): - course = self._parse_json( - unescapeHTML(self._search_regex( - r'ng-init=["\'].*\bcourse=({.+?})[;"\']', - webpage, 'course', default='{}')), - video_id, fatal=False) or {} - course_id = course.get('id') or self._search_regex( - [ - r'data-course-id=["\'](\d+)', - r'"courseId"\s*:\s*(\d+)' - ], webpage, 'course id') - return course_id, course.get('title') - - def _enroll_course(self, base_url, webpage, course_id): - def combine_url(base_url, url): - return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url - - checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1', - webpage, 'checkout url', group='url', default=None)) - if checkout_url: - raise ExtractorError( - 'Course %s is not free. You have to pay for it before you can download. 
' - 'Use this URL to confirm purchase: %s' - % (course_id, combine_url(base_url, checkout_url)), - expected=True) - - enroll_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', - webpage, 'enroll url', group='url', default=None)) - if enroll_url: - webpage = self._download_webpage( - combine_url(base_url, enroll_url), - course_id, 'Enrolling in the course', - headers={'Referer': base_url}) - if '>You have enrolled in' in webpage: - self.to_screen('%s: Successfully enrolled in the course' % course_id) - - def _download_lecture(self, course_id, lecture_id): - return self._download_json( - 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' - % (course_id, lecture_id), - lecture_id, 'Downloading lecture JSON', query={ - 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', - }) - - def _handle_error(self, response): - if not isinstance(response, dict): - return - error = response.get('error') - if error: - error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message')) - error_data = error.get('data') - if error_data: - error_str += ' - %s' % error_data.get('formErrors') - raise ExtractorError(error_str, expected=True) - - def _download_webpage_handle(self, *args, **kwargs): - headers = kwargs.get('headers', {}).copy() - headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' - kwargs['headers'] = headers - ret = super(UdemyIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - if not ret: - return ret - webpage, _ = ret - if any(p in webpage for p in ( - '>Please verify you are a human', - 'Access to this page has been denied because we believe you are using automation tools to browse the website', - '"_pxCaptcha"')): - raise ExtractorError( - 'Udemy asks you to solve a CAPTCHA. 
Login with browser, ' - 'solve CAPTCHA, then export cookies and pass cookie file to ' - 'youtube-dl with --cookies.', expected=True) - return ret - - def _download_json(self, url_or_request, *args, **kwargs): - headers = { - 'X-Udemy-Snail-Case': 'true', - 'X-Requested-With': 'XMLHttpRequest', - } - for cookie in self._downloader.cookiejar: - if cookie.name == 'client_id': - headers['X-Udemy-Client-Id'] = cookie.value - elif cookie.name == 'access_token': - headers['X-Udemy-Bearer-Token'] = cookie.value - headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value - - if isinstance(url_or_request, compat_urllib_request.Request): - for header, value in headers.items(): - url_or_request.add_header(header, value) - else: - url_or_request = sanitized_Request(url_or_request, headers=headers) - - response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) - self._handle_error(response) - return response - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_popup = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login popup') - - def is_logged(webpage): - return any(re.search(p, webpage) for p in ( - r'href=["\'](?:https://www\.udemy\.com)?/user/logout/', - r'>Logout<')) - - # already logged in - if is_logged(login_popup): - return - - login_form = self._form_hidden_inputs('login-form', login_popup) - - login_form.update({ - 'email': username, - 'password': password, - }) - - response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(login_form), - headers={ - 'Referer': self._ORIGIN_URL, - 'Origin': self._ORIGIN_URL, - }) - - if not is_logged(response): - error = self._html_search_regex( - r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') - - def _real_extract(self, url): - lecture_id = self._match_id(url) - - webpage = self._download_webpage(url, lecture_id) - - course_id, _ = self._extract_course_info(webpage, lecture_id) - - try: - lecture = self._download_lecture(course_id, lecture_id) - except ExtractorError as e: - # Error could possibly mean we are not enrolled in the course - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(url, webpage, course_id) - lecture = self._download_lecture(course_id, lecture_id) - else: - raise - - title = lecture['title'] - description = lecture.get('description') - - asset = lecture['asset'] - - asset_type = asset.get('asset_type') or asset.get('assetType') - if asset_type != 'Video': - raise ExtractorError( - 'Lecture %s is not a video' % lecture_id, expected=True) - - stream_url = asset.get('stream_url') or asset.get('streamUrl') - if stream_url: - youtube_url = self._search_regex( - r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) - if youtube_url: - return self.url_result(youtube_url, 'Youtube') - - video_id = compat_str(asset['id']) - thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') - duration = float_or_none(asset.get('data', {}).get('duration')) - - subtitles = {} - automatic_captions = {} - - formats = [] - - def extract_output_format(src, f_id): - return { - 'url': src.get('url'), - 'format_id': '%sp' % (src.get('height') or f_id), - 'width': int_or_none(src.get('width')), - 'height': 
int_or_none(src.get('height')), - 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), - 'vcodec': src.get('video_codec'), - 'fps': int_or_none(src.get('frame_rate')), - 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), - 'acodec': src.get('audio_codec'), - 'asr': int_or_none(src.get('audio_sample_rate')), - 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(src.get('file_size_in_bytes')), - } - - outputs = asset.get('data', {}).get('outputs') - if not isinstance(outputs, dict): - outputs = {} - - def add_output_format_meta(f, key): - output = outputs.get(key) - if isinstance(output, dict): - output_format = extract_output_format(output, key) - output_format.update(f) - return output_format - return f - - def extract_formats(source_list): - if not isinstance(source_list, list): - return - for source in source_list: - video_url = url_or_none(source.get('file') or source.get('src')) - if not video_url: - continue - if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - format_id = source.get('label') - f = { - 'url': video_url, - 'format_id': '%sp' % format_id, - 'height': int_or_none(format_id), - } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - f = add_output_format_meta(f, format_id) - formats.append(f) - - def extract_subtitles(track_list): - if not isinstance(track_list, list): - return - for track in track_list: - if not isinstance(track, dict): - continue - if track.get('kind') != 'captions': - continue - src = url_or_none(track.get('src')) - if not src: - continue - lang = track.get('language') or track.get( - 'srclang') or track.get('label') - sub_dict = automatic_captions if track.get( - 'autogenerated') is True else subtitles - sub_dict.setdefault(lang, []).append({ - 'url': src, - }) - - for url_kind in ('download', 'stream'): - urls = asset.get('%s_urls' % url_kind) - if isinstance(urls, dict): - extract_formats(urls.get('Video')) - - captions = asset.get('captions') - if isinstance(captions, list): - for cc in captions: - if not isinstance(cc, dict): - continue - cc_url = url_or_none(cc.get('url')) - if not cc_url: - continue - lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) - sub_dict = (automatic_captions if cc.get('source') == 'auto' - else subtitles) - sub_dict.setdefault(lang or 'en', []).append({ - 'url': cc_url, - }) - - view_html = lecture.get('view_html') - if view_html: - view_html_urls = set() - for source in re.findall(r'<source[^>]+>', view_html): - attributes = extract_attributes(source) - src = attributes.get('src') - if not src: - continue - res = attributes.get('data-res') - height = int_or_none(res) - if src in view_html_urls: - continue - view_html_urls.add(src) - if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in m3u8_formats: - m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url']) - if m: - if not f.get('height'): - f['height'] = int(m.group('height')) - if not f.get('tbr'): - f['tbr'] = int(m.group('tbr')) - formats.extend(m3u8_formats) - else: - formats.append(add_output_format_meta({ - 'url': src, - 'format_id': '%dp' % height if height else None, - 
'height': height, - }, res)) - - # react rendition since 2017.04.15 (see - # https://github.com/ytdl-org/youtube-dl/issues/12744) - data = self._parse_json( - self._search_regex( - r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html, - 'setup data', default='{}', group='data'), video_id, - transform_source=unescapeHTML, fatal=False) - if data and isinstance(data, dict): - extract_formats(data.get('sources')) - if not duration: - duration = int_or_none(data.get('duration')) - extract_subtitles(data.get('tracks')) - - if not subtitles and not automatic_captions: - text_tracks = self._parse_json( - self._search_regex( - r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html, - 'text tracks', default='{}', group='data'), video_id, - transform_source=lambda s: js_to_json(unescapeHTML(s)), - fatal=False) - extract_subtitles(text_tracks) - - if not formats and outputs: - for format_id, output in outputs.items(): - f = extract_output_format(output, format_id) - if f.get('url'): - formats.append(f) - - self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - 'automatic_captions': automatic_captions, - } - - -class UdemyCourseIE(UdemyIE): - IE_NAME = 'udemy:course' - _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.udemy.com/java-tutorial/', - 'only_matching': True, - }, { - 'url': 'https://wipro.udemy.com/java-tutorial/', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url) - - def _real_extract(self, url): - course_path = self._match_id(url) - - webpage = self._download_webpage(url, course_path) - - course_id, title = self._extract_course_info(webpage, course_path) - - self._enroll_course(url, webpage, course_id) - - response = self._download_json( - 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, - course_id, 'Downloading course curriculum', query={ - 'fields[chapter]': 'title,object_index', - 'fields[lecture]': 'title,asset', - 'page_size': '1000', - }) - - entries = [] - chapter, chapter_number = [None] * 2 - for entry in response['results']: - clazz = entry.get('_class') - if clazz == 'lecture': - asset = entry.get('asset') - if isinstance(asset, dict): - asset_type = asset.get('asset_type') or asset.get('assetType') - if asset_type != 'Video': - continue - lecture_id = entry.get('id') - if lecture_id: - entry = { - '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), - 'title': entry.get('title'), - 'ie_key': UdemyIE.ie_key(), - } - if chapter_number: - entry['chapter_number'] = chapter_number - if chapter: - entry['chapter'] = chapter - entries.append(entry) - elif clazz == 'chapter': - chapter_number = entry.get('object_index') - chapter = entry.get('title') - - return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py deleted file mode 100644 index 47948b6ce..000000000 --- a/youtube_dl/extractor/umg.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_filesize, - parse_iso8601, -) - - -class UMGDeIE(InfoExtractor): - IE_NAME = 'umg:de' 
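# A note on the storage-path scheme used in _real_extract below: the numeric
# video id is fanned out into one directory level per digit before the HLS
# rendition id is substituted in. Minimal standalone sketch, not part of the
# original file; the id is just the value from the test case above:
#
#     _sketch_video_id = '457803'
#     _sketch_hls_url_template = (
#         'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/'
#         + '/'.join(list(_sketch_video_id))  # '457803' -> '4/5/7/8/0/3'
#         + '/content/%s/file/playlist.m3u8')
#     # '940' is one of the fallback rendition ids tried when no formats are found:
#     assert (_sketch_hls_url_template % '940').endswith(
#         '/4/5/7/8/0/3/content/940/file/playlist.m3u8')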
- IE_DESC = 'Universal Music Deutschland' - _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P<id>\d+)' - _TEST = { - 'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803', - 'md5': 'ebd90f48c80dcc82f77251eb1902634f', - 'info_dict': { - 'id': '457803', - 'ext': 'mp4', - 'title': 'Jedes Wort ist Gold wert', - 'timestamp': 1513591800, - 'upload_date': '20171218', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - video_data = self._download_json( - 'https://graphql.universal-music.de/', - video_id, query={ - 'query': '''{ - universalMusic(channel:16) { - video(id:%s) { - headline - formats { - formatId - url - type - width - height - mimeType - fileSize - } - duration - createdDate - } - } -}''' % video_id})['data']['universalMusic']['video'] - - title = video_data['headline'] - hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8' - - thumbnails = [] - formats = [] - - def add_m3u8_format(format_id): - formats.extend(self._extract_m3u8_formats( - hls_url_template % format_id, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - - for f in video_data.get('formats', []): - f_url = f.get('url') - mime_type = f.get('mimeType') - if not f_url or mime_type == 'application/mxf': - continue - fmt = { - 'url': f_url, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'filesize': parse_filesize(f.get('fileSize')), - } - f_type = f.get('type') - if f_type == 'Image': - thumbnails.append(fmt) - elif f_type == 'Video': - format_id = f.get('formatId') - if format_id: - fmt['format_id'] = format_id - if mime_type == 'video/mp4': - add_m3u8_format(format_id) - urlh = self._request_webpage(f_url, video_id, fatal=False) - if urlh: - first_byte = urlh.read(1) - if first_byte not in (b'F', b'\x00'): - continue - formats.append(fmt) - if not formats: - for format_id in (867, 836, 940): - add_m3u8_format(format_id) - self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) - - return { - 'id': video_id, - 'title': title, - 'duration': int_or_none(video_data.get('duration')), - 'timestamp': parse_iso8601(video_data.get('createdDate'), ' '), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py deleted file mode 100644 index a724cdbef..000000000 --- a/youtube_dl/extractor/unistra.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import qualities - - -class UnistraIE(InfoExtractor): - _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' - - _TESTS = [ - { - 'url': 'http://utv.unistra.fr/video.php?id_video=154', - 'md5': '736f605cfdc96724d55bb543ab3ced24', - 'info_dict': { - 'id': '154', - 'ext': 'mp4', - 'title': 'M!ss Yella', - 'description': 'md5:104892c71bd48e55d70b902736b81bbf', - }, - }, - { - 'url': 'http://utv.unistra.fr/index.php?id_video=437', - 'md5': '1ddddd6cccaae76f622ce29b8779636d', - 'info_dict': { - 'id': '437', - 'ext': 'mp4', - 'title': 'Prix Louise Weiss 2014', - 'description': 'md5:cc3a8735f079f4fb6b0b570fc10c135a', - }, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - files = set(re.findall(r'file\s*:\s*"(/[^"]+)"', webpage)) - - quality = 
qualities(['SD', 'HD']) - formats = [] - for file_path in files: - format_id = 'HD' if file_path.endswith('-HD.mp4') else 'SD' - formats.append({ - 'url': 'http://vod-flash.u-strasbg.fr:8080%s' % file_path, - 'format_id': format_id, - 'quality': quality(format_id) - }) - self._sort_formats(formats) - - title = self._html_search_regex( - r'<title>UTV - (.*?)</', webpage, 'title') - description = self._html_search_regex( - r'<meta name="Description" content="(.*?)"', webpage, 'description', flags=re.DOTALL) - thumbnail = self._search_regex( - r'image: "(.*?)"', webpage, 'thumbnail') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats - } diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py deleted file mode 100644 index 628adf219..000000000 --- a/youtube_dl/extractor/uol.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - clean_html, - int_or_none, - parse_duration, - parse_iso8601, - qualities, - update_url_query, -) - - -class UOLIE(InfoExtractor): - IE_NAME = 'uol.com.br' - _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)' - _TESTS = [{ - 'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931', - 'md5': '4f1e26683979715ff64e4e29099cf020', - 'info_dict': { - 'id': '15951931', - 'ext': 'mp4', - 'title': 'Miss simpatia é encontrada morta', - 'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2', - 'timestamp': 1470421860, - 'upload_date': '20160805', - } - }, { - 'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', - 'md5': '2850a0e8dfa0a7307e04a96c5bdc5bc2', - 'info_dict': { - 'id': '15954259', - 'ext': 'mp4', - 'title': 'Incêndio destrói uma das maiores casas noturnas de Londres', - 'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. 
Não há informações sobre vítimas.', - 'timestamp': 1470674520, - 'upload_date': '20160808', - } - }, { - 'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931', - 'only_matching': True, - }, { - 'url': 'http://mais.uol.com.br/view/15954259', - 'only_matching': True, - }, { - 'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html', - 'only_matching': True, - }, { - 'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia', - 'only_matching': True, - }, { - 'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', - 'only_matching': True, - }, { - 'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326', - 'only_matching': True, - }, { - 'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_data = self._download_json( - # https://api.mais.uol.com.br/apiuol/v4/player/data/[MEDIA_ID] - 'https://api.mais.uol.com.br/apiuol/v3/media/detail/' + video_id, - video_id)['item'] - media_id = compat_str(video_data['mediaId']) - title = video_data['title'] - ver = video_data.get('revision', 2) - - uol_formats = self._download_json( - 'https://croupier.mais.uol.com.br/v3/formats/%s/jsonp' % media_id, - media_id) - quality = qualities(['mobile', 'WEBM', '360p', '720p', '1080p']) - formats = [] - for format_id, f in uol_formats.items(): - if not isinstance(f, dict): - continue - f_url = f.get('url') or f.get('secureUrl') - if not f_url: - continue - query = { - 'ver': ver, - 'r': 'http://mais.uol.com.br', - } - for k in ('token', 'sign'): - v = f.get(k) - if v: - query[k] = v - f_url = update_url_query(f_url, query) - format_id = format_id - if format_id == 'HLS': - m3u8_formats = self._extract_m3u8_formats( - f_url, media_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - encoded_query = compat_urllib_parse_urlencode(query) - for m3u8_f in m3u8_formats: - m3u8_f['extra_param_to_segment_url'] = encoded_query - m3u8_f['url'] = update_url_query(m3u8_f['url'], query) - formats.extend(m3u8_formats) - continue - formats.append({ - 'format_id': format_id, - 'url': f_url, - 'quality': quality(format_id), - 'preference': -1, - }) - self._sort_formats(formats) - - tags = [] - for tag in video_data.get('tags', []): - tag_description = tag.get('description') - if not tag_description: - continue - tags.append(tag_description) - - thumbnails = [] - for q in ('Small', 'Medium', 'Wmedium', 'Large', 'Wlarge', 'Xlarge'): - q_url = video_data.get('thumb' + q) - if not q_url: - continue - thumbnails.append({ - 'id': q, - 'url': q_url, - }) - - return { - 'id': media_id, - 'title': title, - 'description': clean_html(video_data.get('description')), - 'thumbnails': thumbnails, - 'duration': parse_duration(video_data.get('duration')), - 'tags': tags, - 'formats': formats, - 'timestamp': parse_iso8601(video_data.get('publishDate'), ' '), - 'view_count': int_or_none(video_data.get('viewsQtty')), - } diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py deleted file mode 100644 index f06bf5b12..000000000 --- a/youtube_dl/extractor/uplynk.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils 
import ( - float_or_none, - ExtractorError, -) - - -class UplynkIE(InfoExtractor): - IE_NAME = 'uplynk' - _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?' - _TEST = { - 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', - 'info_dict': { - 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', - 'ext': 'mp4', - 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', - 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _extract_uplynk_info(self, uplynk_content_url): - path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() - display_id = video_id or external_id - formats = self._extract_m3u8_formats( - 'http://content.uplynk.com/%s.m3u8' % path, - display_id, 'mp4', 'm3u8_native') - if session_id: - for f in formats: - f['extra_param_to_segment_url'] = 'pbs=' + session_id - self._sort_formats(formats) - asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) - if asset.get('error') == 1: - raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) - - return { - 'id': asset['asset'], - 'title': asset['desc'], - 'thumbnail': asset.get('default_poster_url'), - 'duration': float_or_none(asset.get('duration')), - 'uploader_id': asset.get('owner'), - 'formats': formats, - } - - def _real_extract(self, url): - return self._extract_uplynk_info(url) - - -class UplynkPreplayIE(UplynkIE): - IE_NAME = 'uplynk:preplay' - _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json' - _TEST = None - - def _real_extract(self, url): - path, external_id, video_id = re.match(self._VALID_URL, url).groups() - display_id = video_id or external_id - preplay = self._download_json(url, display_id) - content_url = 'http://content.uplynk.com/%s.m3u8' % path - session_id = preplay.get('sid') - if session_id: - content_url += '?pbs=' + session_id - return self._extract_uplynk_info(content_url) diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py deleted file mode 100644 index 8f6edab4b..000000000 --- a/youtube_dl/extractor/urort.py +++ /dev/null @@ -1,66 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) -from ..utils import ( - unified_strdate, -) - - -class UrortIE(InfoExtractor): - IE_DESC = 'NRK P3 Urørt' - _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$' - - _TEST = { - 'url': 'https://urort.p3.no/#!/Band/Gerilja', - 'md5': '5ed31a924be8a05e47812678a86e127b', - 'info_dict': { - 'id': '33124-24', - 'ext': 'mp3', - 'title': 'The Bomb', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': 'Gerilja', - 'uploader_id': 'Gerilja', - 'upload_date': '20100323', - }, - 'params': { - 'matchtitle': '^The Bomb$', # To test, we want just one video - } - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id) - json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr - songs = self._download_json(json_url, playlist_id) - entries = [] - for s in songs: - formats = [{ - 'tbr': f.get('Quality'), - 'ext': f['FileType'], - 'format_id': '%s-%s' % 
(f['FileType'], f.get('Quality', '')), - 'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'], - 'preference': 3 if f['FileType'] == 'mp3' else 2, - } for f in s['Files']] - self._sort_formats(formats) - e = { - 'id': '%d-%s' % (s['BandId'], s['$id']), - 'title': s['Title'], - 'uploader_id': playlist_id, - 'uploader': s.get('BandName', playlist_id), - 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'], - 'upload_date': unified_strdate(s.get('Released')), - 'formats': formats, - } - entries.append(e) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_id, - 'entries': entries, - } diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py deleted file mode 100644 index d6c79147e..000000000 --- a/youtube_dl/extractor/urplay.py +++ /dev/null @@ -1,107 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - dict_get, - int_or_none, - unified_timestamp, -) - - -class URPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand', - 'md5': 'ff5b0c89928f8083c74bbd5099c9292d', - 'info_dict': { - 'id': '203704', - 'ext': 'mp4', - 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', - 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513292400, - 'upload_date': '20171214', - 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', - 'duration': 2269, - 'categories': ['Kultur & historia'], - 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], - 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', - }, - }, { - 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', - 'info_dict': { - 'id': '190031', - 'ext': 'mp4', - 'title': 'Tripp, Trapp, Träd : Sovkudde', - 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440086400, - 'upload_date': '20150820', - 'series': 'Tripp, Trapp, Träd', - 'duration': 865, - 'tags': ['Sova'], - 'episode': 'Sovkudde', - }, - }, { - 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = url.replace('skola.se/Produkter', 'play.se/program') - webpage = self._download_webpage(url, video_id) - vid = int(video_id) - accessible_episodes = self._parse_json(self._html_search_regex( - r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'] - urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) - episode = urplayer_data['title'] - raw_streaming_info = urplayer_data['streamingInfo']['raw'] - host = self._download_json( - 'http://streaming-loadbalancer.ur.se/loadbalancer.json', - video_id)['redirect'] - - formats = [] - for k, v in raw_streaming_info.items(): - if not (k in ('sd', 'hd') and isinstance(v, dict)): - continue - file_http = v.get('location') - if file_http: - formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), - video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) - self._sort_formats(formats) - - image = urplayer_data.get('image') or {} - thumbnails = 
[] - for k, v in image.items(): - t = { - 'id': k, - 'url': v, - } - wh = k.split('x') - if len(wh) == 2: - t.update({ - 'width': int_or_none(wh[0]), - 'height': int_or_none(wh[1]), - }) - thumbnails.append(t) - - series = urplayer_data.get('series') or {} - series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) - - return { - 'id': video_id, - 'title': '%s : %s' % (series_title, episode) if series_title else episode, - 'description': urplayer_data.get('description'), - 'thumbnails': thumbnails, - 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), - 'series': series_title, - 'formats': formats, - 'duration': int_or_none(urplayer_data.get('duration')), - 'categories': urplayer_data.get('categories'), - 'tags': urplayer_data.get('keywords'), - 'season': series.get('label'), - 'episode': episode, - 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), - } diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py deleted file mode 100644 index e3784e55f..000000000 --- a/youtube_dl/extractor/usanetwork.py +++ /dev/null @@ -1,24 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .nbc import NBCIE - - -class USANetworkIE(NBCIE): - _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))' - _TESTS = [{ - 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', - 'info_dict': { - 'id': '4185302', - 'ext': 'mp4', - 'title': 'Intelligence (Trailer)', - 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', - 'upload_date': '20200715', - 'timestamp': 1594785600, - 'uploader': 'NBCU-MPAT', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py deleted file mode 100644 index 1e29cbe22..000000000 --- a/youtube_dl/extractor/ustream.py +++ /dev/null @@ -1,284 +0,0 @@ -from __future__ import unicode_literals - -import random -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - encode_data_uri, - ExtractorError, - int_or_none, - float_or_none, - mimetype2ext, - str_or_none, -) - - -class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' - IE_NAME = 'ustream' - _TESTS = [{ - 'url': 'http://www.ustream.tv/recorded/20274954', - 'md5': '088f151799e8f572f84eb62f17d73e5c', - 'info_dict': { - 'id': '20274954', - 'ext': 'flv', - 'title': 'Young Americans for Liberty February 7, 2012 2:28 AM', - 'description': 'Young Americans for Liberty February 7, 2012 2:28 AM', - 'timestamp': 1328577035, - 'upload_date': '20120207', - 'uploader': 'yaliberty', - 'uploader_id': '6780869', - }, - }, { - # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444 - # Title and uploader available only from params JSON - 'url': 'http://www.ustream.tv/embed/recorded/59307601?ub=ff0000&lc=ff0000&oc=ffffff&uc=ffffff&v=3&wmode=direct', - 'md5': '5a2abf40babeac9812ed20ae12d34e10', - 'info_dict': { - 'id': '59307601', - 'ext': 'flv', - 'title': '-CG11- Canada Games Figure Skating', - 'uploader': 'sportscanadatv', - }, - 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', - }, { - 'url': 'http://www.ustream.tv/embed/10299409', - 'info_dict': { - 'id': 
'10299409', - }, - 'playlist_count': 3, - }, { - 'url': 'http://www.ustream.tv/recorded/91343263', - 'info_dict': { - 'id': '91343263', - 'ext': 'mp4', - 'title': 'GitHub Universe - General Session - Day 1', - 'upload_date': '20160914', - 'description': 'GitHub Universe - General Session - Day 1', - 'timestamp': 1473872730, - 'uploader': 'wa0dnskeqkr', - 'uploader_id': '38977840', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) - if mobj is not None: - return mobj.group('url') - - def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): - def num_to_hex(n): - return hex(n)[2:] - - rnd = random.randrange - - if not extra_note: - extra_note = '' - - conn_info = self._download_json( - 'http://r%d-1-%s-recorded-lp-live.ums.ustream.tv/1/ustream' % (rnd(1e8), video_id), - video_id, note='Downloading connection info' + extra_note, - query={ - 'type': 'viewer', - 'appId': app_id_ver[0], - 'appVersion': app_id_ver[1], - 'rsid': '%s:%s' % (num_to_hex(rnd(1e8)), num_to_hex(rnd(1e8))), - 'rpin': '_rpin.%d' % rnd(1e15), - 'referrer': url, - 'media': video_id, - 'application': 'recorded', - }) - host = conn_info[0]['args'][0]['host'] - connection_id = conn_info[0]['args'][0]['connectionId'] - - return self._download_json( - 'http://%s/1/ustream?connectionId=%s' % (host, connection_id), - video_id, note='Downloading stream info' + extra_note) - - def _get_streams(self, url, video_id, app_id_ver): - # Sometimes the return dict does not have 'stream' - for trial_count in range(3): - stream_info = self._get_stream_info( - url, video_id, app_id_ver, - extra_note=' (try %d)' % (trial_count + 1) if trial_count > 0 else '') - if 'stream' in stream_info[0]['args'][0]: - return stream_info[0]['args'][0]['stream'] - return [] - - def _parse_segmented_mp4(self, dash_stream_info): - def resolve_dash_template(template, idx, chunk_hash): - return template.replace('%', compat_str(idx), 1).replace('%', chunk_hash) - - formats = [] - for stream in dash_stream_info['streams']: - # Use only one provider to avoid too many formats - provider = dash_stream_info['providers'][0] - fragments = [{ - 'url': resolve_dash_template( - provider['url'] + stream['initUrl'], 0, dash_stream_info['hashes']['0']) - }] - for idx in range(dash_stream_info['videoLength'] // dash_stream_info['chunkTime']): - fragments.append({ - 'url': resolve_dash_template( - provider['url'] + stream['segmentUrl'], idx, - dash_stream_info['hashes'][compat_str(idx // 10 * 10)]) - }) - content_type = stream['contentType'] - kind = content_type.split('/')[0] - f = { - 'format_id': '-'.join(filter(None, [ - 'dash', kind, str_or_none(stream.get('bitrate'))])), - 'protocol': 'http_dash_segments', - # TODO: generate a MPD doc for external players? 
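# Annotation: a stub '<MPD/>' document is inlined as a data: URI because the
# http_dash_segments protocol expects a manifest URL, while the media is
# already fully described by the 'fragments' list assembled above.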
- 'url': encode_data_uri(b'<MPD/>', 'text/xml'), - 'ext': mimetype2ext(content_type), - 'height': stream.get('height'), - 'width': stream.get('width'), - 'fragments': fragments, - } - if kind == 'video': - f.update({ - 'vcodec': stream.get('codec'), - 'acodec': 'none', - 'vbr': stream.get('bitrate'), - }) - else: - f.update({ - 'vcodec': 'none', - 'acodec': stream.get('codec'), - 'abr': stream.get('bitrate'), - }) - formats.append(f) - return formats - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - - # some sites use this embed format (see: https://github.com/ytdl-org/youtube-dl/issues/2990) - if m.group('type') == 'embed/recorded': - video_id = m.group('id') - desktop_url = 'http://www.ustream.tv/recorded/' + video_id - return self.url_result(desktop_url, 'Ustream') - if m.group('type') == 'embed': - video_id = m.group('id') - webpage = self._download_webpage(url, video_id) - content_video_ids = self._parse_json(self._search_regex( - r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage, - 'content video IDs'), video_id) - return self.playlist_result( - map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids), - video_id) - - params = self._download_json( - 'https://api.ustream.tv/videos/%s.json' % video_id, video_id) - - error = params.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - video = params['video'] - - title = video['title'] - filesize = float_or_none(video.get('file_size')) - - formats = [{ - 'id': video_id, - 'url': video_url, - 'ext': format_id, - 'filesize': filesize, - } for format_id, video_url in video['media_urls'].items() if video_url] - - if not formats: - hls_streams = self._get_streams(url, video_id, app_id_ver=(11, 2)) - if hls_streams: - # m3u8_native leads to intermittent ContentTooShortError - formats.extend(self._extract_m3u8_formats( - hls_streams[0]['url'], video_id, ext='mp4', m3u8_id='hls')) - - ''' - # DASH streams handling is incomplete as 'url' is missing - dash_streams = self._get_streams(url, video_id, app_id_ver=(3, 1)) - if dash_streams: - formats.extend(self._parse_segmented_mp4(dash_streams)) - ''' - - self._sort_formats(formats) - - description = video.get('description') - timestamp = int_or_none(video.get('created_at')) - duration = float_or_none(video.get('length')) - view_count = int_or_none(video.get('views')) - - uploader = video.get('owner', {}).get('username') - uploader_id = video.get('owner', {}).get('id') - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, - } - - -class UstreamChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ustream\.tv/channel/(?P<slug>.+)' - IE_NAME = 'ustream:channel' - _TEST = { - 'url': 'http://www.ustream.tv/channel/channeljapan', - 'info_dict': { - 'id': '10874166', - }, - 'playlist_mincount': 17, - } - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - display_id = m.group('slug') - webpage = self._download_webpage(url, display_id) - channel_id = self._html_search_meta('ustream:channel_id', webpage) - - BASE = 'http://www.ustream.tv' - next_url = 
'/ajax/socialstream/videos/%s/1.json' % channel_id - video_ids = [] - while next_url: - reply = self._download_json( - compat_urlparse.urljoin(BASE, next_url), display_id, - note='Downloading video information (next: %d)' % (len(video_ids) + 1)) - video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data'])) - next_url = reply['nextUrl'] - - entries = [ - self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream') - for vid in video_ids] - return { - '_type': 'playlist', - 'id': channel_id, - 'display_id': display_id, - 'entries': entries, - } diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py deleted file mode 100644 index 56509beed..000000000 --- a/youtube_dl/extractor/ustudio.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - unescapeHTML, -) - - -class UstudioIE(InfoExtractor): - IE_NAME = 'ustudio' - _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' - _TEST = { - 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', - 'md5': '58bbfca62125378742df01fc2abbdef6', - 'info_dict': { - 'id': 'Uxu2my9bgSph', - 'display_id': 'san_francisco_golden_gate_bridge', - 'ext': 'mp4', - 'title': 'San Francisco: Golden Gate Bridge', - 'description': 'md5:23925500697f2c6d4830e387ba51a9be', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20111107', - 'uploader': 'Tony Farley', - } - } - - def _real_extract(self, url): - video_id, display_id = re.match(self._VALID_URL, url).groups() - - config = self._download_xml( - 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, - display_id) - - def extract(kind): - return [{ - 'url': unescapeHTML(item.attrib['url']), - 'width': int_or_none(item.get('width')), - 'height': int_or_none(item.get('height')), - } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] - - formats = extract('video') - self._sort_formats(formats) - - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - upload_date = unified_strdate(self._search_regex( - r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>', - webpage, 'upload date', fatal=False)) - uploader = self._search_regex( - r'Uploaded by\s*<a[^>]*>([^<]+)<', - webpage, 'uploader', fatal=False) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnails': extract('image'), - 'upload_date': upload_date, - 'uploader': uploader, - 'formats': formats, - } - - -class UstudioEmbedIE(InfoExtractor): - IE_NAME = 'ustudio:embed' - _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', - 'md5': '47c0be52a09b23a7f40de9469cec58f4', - 'info_dict': { - 'id': 'Uw7G1kMCe65T', - 'ext': 'mp4', - 'title': '5 Things IT Should Know About Video', - 'description': 'md5:93d32650884b500115e158c5677d25ad', - 'uploader_id': 'DeN7VdYRDKhP', - } - } - - def _real_extract(self, url): - uploader_id, video_id = re.match(self._VALID_URL, url).groups() - video_data = self._download_json( - 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), - video_id)['videos'][0] - title = video_data['name'] - - formats = [] - for ext, qualities in video_data.get('transcodes', {}).items(): - for quality in qualities: - 
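# Annotation: 'transcodes' maps a container extension to a list of
# renditions; each rendition becomes one format, with its height folded
# into the format_id below.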
quality_url = quality.get('url') - if not quality_url: - continue - height = int_or_none(quality.get('height')) - formats.append({ - 'format_id': '%s-%dp' % (ext, height) if height else ext, - 'url': quality_url, - 'width': int_or_none(quality.get('width')), - 'height': height, - }) - self._sort_formats(formats) - - thumbnails = [] - for image in video_data.get('images', []): - image_url = image.get('url') - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'duration': int_or_none(video_data.get('duration')), - 'uploader_id': uploader_id, - 'tags': video_data.get('keywords'), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py deleted file mode 100644 index f474ed73f..000000000 --- a/youtube_dl/extractor/varzesh3.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_parse_qs, -) -from ..utils import ( - clean_html, - remove_start, -) - - -class Varzesh3IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' - _TESTS = [{ - 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', - 'md5': '2a933874cb7dce4366075281eb49e855', - 'info_dict': { - 'id': '76337', - 'ext': 'mp4', - 'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا', - 'description': 'فصل ۲۰۱۵-۲۰۱۴', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'HTTP 404 Error', - }, { - 'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87', - 'md5': '841b7cd3afbc76e61708d94e53a4a4e7', - 'info_dict': { - 'id': '112785', - 'ext': 'mp4', - 'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره', - 'description': 'فوتبال 120', - }, - 'expected_warnings': ['description'], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_url = self._search_regex( - r'<source[^>]+src="([^"]+)"', webpage, 'video url') - - title = remove_start(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ') - - description = self._html_search_regex( - r'(?s)<div class="matn">(.+?)</div>', - webpage, 'description', default=None) - if description is None: - description = clean_html(self._html_search_meta('description', webpage)) - - thumbnail = self._og_search_thumbnail(webpage, default=None) - if thumbnail is None: - fb_sharer_url = self._search_regex( - r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"', - webpage, 'facebook sharer URL', fatal=False) - sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query) - thumbnail = sharer_params.get('p[images][0]', [None])[0] - - video_id = self._search_regex( - r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", - webpage, display_id, default=None) - if video_id is None: - video_id = self._search_regex( - r'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', - default=display_id) - - return { 
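# Annotation: no formats list here -- the page exposes a single progressive
# <source> URL, so the top-level 'url' field is returned directly and the
# extension is inferred downstream.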
- 'url': video_url, - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py deleted file mode 100644 index 5ab716880..000000000 --- a/youtube_dl/extractor/vesti.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError -from .rutv import RUTVIE - - -class VestiIE(InfoExtractor): - IE_DESC = 'Вести.Ru' - _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)' - - _TESTS = [ - { - 'url': 'http://www.vesti.ru/videos?vid=575582&cid=1', - 'info_dict': { - 'id': '765035', - 'ext': 'mp4', - 'title': 'Вести.net: биткоины в России не являются законными', - 'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b', - 'duration': 302, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.vesti.ru/doc.html?id=1349233', - 'info_dict': { - 'id': '773865', - 'ext': 'mp4', - 'title': 'Участники митинга штурмуют Донецкую областную администрацию', - 'description': 'md5:1a160e98b3195379b4c849f2f4958009', - 'duration': 210, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.vesti.ru/only_video.html?vid=576180', - 'info_dict': { - 'id': '766048', - 'ext': 'mp4', - 'title': 'США заморозило, Британию затопило', - 'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1', - 'duration': 87, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://hitech.vesti.ru/news/view/id/4000', - 'info_dict': { - 'id': '766888', - 'ext': 'mp4', - 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', - 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', - 'duration': 279, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403', - 'info_dict': { - 'id': '766403', - 'ext': 'mp4', - 'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы', - 'description': 'md5:55805dfd35763a890ff50fa9e35e31b3', - 'duration': 271, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Blocked outside Russia', - }, - { - 'url': 'http://sochi2014.vesti.ru/live/play/live_id/301', - 'info_dict': { - 'id': '51499', - 'ext': 'flv', - 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. 
Мужчины ', - 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Translation has finished' - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - page = self._download_webpage(url, video_id, 'Downloading page') - - mobj = re.search( - r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)', - page) - if mobj: - video_id = mobj.group('id') - page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id, - 'Downloading video page') - - rutv_url = RUTVIE._extract_url(page) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py deleted file mode 100644 index 4ea9f1b4b..000000000 --- a/youtube_dl/extractor/vevo.py +++ /dev/null @@ -1,374 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, - compat_HTTPError, -) -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, -) - - -class VevoBaseIE(InfoExtractor): - def _extract_json(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>', - webpage, 'initial store'), - video_id) - - -class VevoIE(VevoBaseIE): - ''' - Accepts urls from vevo.com or in the format 'vevo:{id}' - (currently used by MTVIE and MySpaceIE) - ''' - _VALID_URL = r'''(?x) - (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| - https?://cache\.vevo\.com/m/html/embed\.html\?video=| - https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| - https?://embed\.vevo\.com/.*?[?&]isrc=| - vevo:) - (?P<id>[^&?#]+)''' - - _TESTS = [{ - 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - 'md5': '95ee28ee45e70130e3ab02b0f579ae23', - 'info_dict': { - 'id': 'GB1101300280', - 'ext': 'mp4', - 'title': 'Hurts - Somebody to Die For', - 'timestamp': 1372057200, - 'upload_date': '20130624', - 'uploader': 'Hurts', - 'track': 'Somebody to Die For', - 'artist': 'Hurts', - 'genre': 'Pop', - }, - 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], - }, { - 'note': 'v3 SMIL format', - 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', - 'md5': 'f6ab09b034f8c22969020b042e5ac7fc', - 'info_dict': { - 'id': 'USUV71302923', - 'ext': 'mp4', - 'title': 'Cassadee Pope - I Wish I Could Break Your Heart', - 'timestamp': 1392796919, - 'upload_date': '20140219', - 'uploader': 'Cassadee Pope', - 'track': 'I Wish I Could Break Your Heart', - 'artist': 'Cassadee Pope', - 'genre': 'Country', - }, - 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], - }, { - 'note': 'Age-limited video', - 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', - 'info_dict': { - 'id': 'USRV81300282', - 'ext': 'mp4', - 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', - 'age_limit': 18, - 'timestamp': 1372888800, - 'upload_date': '20130703', - 'uploader': 'Justin Timberlake', - 'track': 'Tunnel Vision (Explicit)', - 'artist': 'Justin Timberlake', - 'genre': 'Pop', - }, - 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], - }, { - 'note': 'No 
video_info', - 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', - 'md5': '8b83cc492d72fc9cf74a02acee7dc1b0', - 'info_dict': { - 'id': 'USUV71503000', - 'ext': 'mp4', - 'title': 'K Camp ft. T.I. - Till I Die', - 'age_limit': 18, - 'timestamp': 1449468000, - 'upload_date': '20151207', - 'uploader': 'K Camp', - 'track': 'Till I Die', - 'artist': 'K Camp', - 'genre': 'Hip-Hop', - }, - 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], - }, { - 'note': 'Featured test', - 'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190', - 'md5': 'd28675e5e8805035d949dc5cf161071d', - 'info_dict': { - 'id': 'USUV71402190', - 'ext': 'mp4', - 'title': 'Lemaitre ft. LoLo - Wait', - 'age_limit': 0, - 'timestamp': 1413432000, - 'upload_date': '20141016', - 'uploader': 'Lemaitre', - 'track': 'Wait', - 'artist': 'Lemaitre', - 'genre': 'Electronic', - }, - 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], - }, { - 'note': 'Only available via webpage', - 'url': 'http://www.vevo.com/watch/GBUV71600656', - 'md5': '67e79210613865b66a47c33baa5e37fe', - 'info_dict': { - 'id': 'GBUV71600656', - 'ext': 'mp4', - 'title': 'ABC - Viva Love', - 'age_limit': 0, - 'timestamp': 1461830400, - 'upload_date': '20160428', - 'uploader': 'ABC', - 'track': 'Viva Love', - 'artist': 'ABC', - 'genre': 'Pop', - }, - 'expected_warnings': ['Failed to download video versions info'], - }, { - # no genres available - 'url': 'http://www.vevo.com/watch/INS171400764', - 'only_matching': True, - }, { - # Another case available only via the webpage; using streams/streamsV3 formats - # Geo-restricted to Netherlands/Germany - 'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909', - 'only_matching': True, - }, { - 'url': 'https://embed.vevo.com/?isrc=USH5V1923499&partnerId=4d61b777-8023-4191-9ede-497ed6c24647&partnerAdCode=', - 'only_matching': True, - }] - _VERSIONS = { - 0: 'youtube', # only in AuthenticateVideo videoVersions - 1: 'level3', - 2: 'akamai', - 3: 'level3', - 4: 'amazon', - } - - def _initialize_api(self, video_id): - webpage = self._download_webpage( - 'https://accounts.vevo.com/token', None, - note='Retrieving oauth token', - errnote='Unable to retrieve oauth token', - data=json.dumps({ - 'client_id': 'SPupX1tvqFEopQ1YS6SS', - 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json', - }) - - if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): - self.raise_geo_restricted( - '%s said: This page is currently unavailable in your region' % self.IE_NAME) - - auth_info = self._parse_json(webpage, video_id) - self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token'] - - def _call_api(self, path, *args, **kwargs): - try: - data = self._download_json(self._api_url_template % path, *args, **kwargs) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errors = self._parse_json(e.cause.read().decode(), None)['errors'] - error_message = ', '.join([error['message'] for error in errors]) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) - raise - return data - - def _real_extract(self, url): - video_id = self._match_id(url) - - self._initialize_api(video_id) - - video_info = self._call_api( - 'video/%s' % video_id, video_id, 'Downloading api video info', - 'Failed to download video info') - - video_versions = self._call_api( - 
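# Annotation: fatal=False below because the streams endpoint is missing for
# some videos; in that case the webpage fallback that follows recovers the
# stream list from window.__INITIAL_STORE__ (or the apollo cache).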
'video/%s/streams' % video_id, video_id, - 'Downloading video versions info', - 'Failed to download video versions info', - fatal=False) - - # Some videos are only available via webpage (e.g. - # https://github.com/ytdl-org/youtube-dl/issues/9366) - if not video_versions: - webpage = self._download_webpage(url, video_id) - json_data = self._extract_json(webpage, video_id) - if 'streams' in json_data.get('default', {}): - video_versions = json_data['default']['streams'][video_id][0] - else: - video_versions = [ - value - for key, value in json_data['apollo']['data'].items() - if key.startswith('%s.streams' % video_id)] - - uploader = None - artist = None - featured_artist = None - artists = video_info.get('artists') - for curr_artist in artists: - if curr_artist.get('role') == 'Featured': - featured_artist = curr_artist['name'] - else: - artist = uploader = curr_artist['name'] - - formats = [] - for video_version in video_versions: - version = self._VERSIONS.get(video_version.get('version'), 'generic') - version_url = video_version.get('url') - if not version_url: - continue - - if '.ism' in version_url: - continue - elif '.mpd' in version_url: - formats.extend(self._extract_mpd_formats( - version_url, video_id, mpd_id='dash-%s' % version, - note='Downloading %s MPD information' % version, - errnote='Failed to download %s MPD information' % version, - fatal=False)) - elif '.m3u8' in version_url: - formats.extend(self._extract_m3u8_formats( - version_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - else: - m = re.search(r'''(?xi) - _(?P<width>[0-9]+)x(?P<height>[0-9]+) - _(?P<vcodec>[a-z0-9]+) - _(?P<vbr>[0-9]+) - _(?P<acodec>[a-z0-9]+) - _(?P<abr>[0-9]+) - \.(?P<ext>[a-z0-9]+)''', version_url) - if not m: - continue - - formats.append({ - 'url': version_url, - 'format_id': 'http-%s-%s' % (version, video_version['quality']), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - self._sort_formats(formats) - - track = video_info['title'] - if featured_artist: - artist = '%s ft. 
%s' % (artist, featured_artist) - title = '%s - %s' % (artist, track) if artist else track - - genres = video_info.get('genres') - genre = ( - genres[0] if genres and isinstance(genres, list) - and isinstance(genres[0], compat_str) else None) - - is_explicit = video_info.get('isExplicit') - if is_explicit is True: - age_limit = 18 - elif is_explicit is False: - age_limit = 0 - else: - age_limit = None - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'), - 'timestamp': parse_iso8601(video_info.get('releaseDate')), - 'uploader': uploader, - 'duration': int_or_none(video_info.get('duration')), - 'view_count': int_or_none(video_info.get('views', {}).get('total')), - 'age_limit': age_limit, - 'track': track, - 'artist': uploader, - 'genre': genre, - } - - -class VevoPlaylistIE(VevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', - 'info_dict': { - 'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29', - 'title': 'Best-Of: Birdman', - }, - 'playlist_count': 10, - }, { - 'url': 'http://www.vevo.com/watch/genre/rock', - 'info_dict': { - 'id': 'rock', - 'title': 'Rock', - }, - 'playlist_count': 20, - }, { - 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0', - 'md5': '32dcdfddddf9ec6917fc88ca26d36282', - 'info_dict': { - 'id': 'USCMV1100073', - 'ext': 'mp4', - 'title': 'Birdman - Y.U. MAD', - 'timestamp': 1323417600, - 'upload_date': '20111209', - 'uploader': 'Birdman', - 'track': 'Y.U. MAD', - 'artist': 'Birdman', - 'genre': 'Rap/Hip-Hop', - }, - 'expected_warnings': ['Unable to download SMIL file'], - }, { - 'url': 'http://www.vevo.com/watch/genre/rock?index=0', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - playlist_kind = mobj.group('kind') - - webpage = self._download_webpage(url, playlist_id) - - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - index = qs.get('index', [None])[0] - - if index: - video_id = self._search_regex( - r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>', - webpage, 'video id', default=None, group='id') - if video_id: - return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) - - playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind] - - playlist = (list(playlists.values())[0] - if playlist_kind == 'playlist' else playlists[playlist_id]) - - entries = [ - self.url_result('vevo:%s' % src, VevoIE.ie_key()) - for src in playlist['isrcs']] - - return self.playlist_result( - entries, playlist.get('playlistId') or playlist_id, - playlist.get('name'), playlist.get('description')) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py deleted file mode 100644 index 22e99e8f0..000000000 --- a/youtube_dl/extractor/vgtv.py +++ /dev/null @@ -1,313 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .xstream import XstreamIE -from ..utils import ( - ExtractorError, - float_or_none, - try_get, -) - - -class VGTVIE(XstreamIE): - IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' - _GEO_BYPASS = False - - _HOST_TO_APPNAME = { - 'vgtv.no': 'vgtv', - 'bt.no/tv': 'bttv', - 'aftenbladet.no/tv': 'satv', - 'fvn.no/fvntv': 'fvntv', - 'aftenposten.no/webtv': 'aptv', - 
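# Annotation: all of these hosts front the same Schibsted SVP API; the
# appName resolved here is mapped to a vendor below, and both are
# interpolated into the assets request in _real_extract.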
'ap.vgtv.no/webtv': 'aptv', - 'tv.aftonbladet.se': 'abtv', - # obsolete URL schemas, kept in order to save one HTTP redirect - 'tv.aftonbladet.se/abtv': 'abtv', - 'www.aftonbladet.se/tv': 'abtv', - } - - _APP_NAME_TO_VENDOR = { - 'vgtv': 'vgtv', - 'bttv': 'bt', - 'satv': 'sa', - 'fvntv': 'fvn', - 'aptv': 'ap', - 'abtv': 'ab', - } - - _VALID_URL = r'''(?x) - (?:https?://(?:www\.)? - (?P<host> - %s - ) - /? - (?: - (?:\#!/)?(?:video|live)/| - embed?.*id=| - a(?:rticles)?/ - )| - (?P<appname> - %s - ):) - (?P<id>\d+) - ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) - - _TESTS = [ - { - # streamType: vod - 'url': 'http://www.vgtv.no/#!/video/84196/hevnen-er-soet-episode-10-abu', - 'md5': 'b8be7a234cebb840c0d512c78013e02f', - 'info_dict': { - 'id': '84196', - 'ext': 'mp4', - 'title': 'Hevnen er søt: Episode 10 - Abu', - 'description': 'md5:e25e4badb5f544b04341e14abdc72234', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 648.000, - 'timestamp': 1404626400, - 'upload_date': '20140706', - 'view_count': int, - }, - }, - { - # streamType: wasLive - 'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen', - 'info_dict': { - 'id': '100764', - 'ext': 'flv', - 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', - 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 9103.0, - 'timestamp': 1410113864, - 'upload_date': '20140907', - 'view_count': int, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Video is no longer available', - }, - { - # streamType: wasLive - 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', - 'info_dict': { - 'id': '113063', - 'ext': 'mp4', - 'title': 'V75 fra Solvalla 30.05.15', - 'description': 'md5:b3743425765355855f88e096acc93231', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 25966, - 'timestamp': 1432975582, - 'upload_date': '20150530', - 'view_count': int, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': 'fd828cd29774a729bf4d4425fe192972', - 'info_dict': { - 'id': '21039', - 'ext': 'mp4', - 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', - 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', - 'duration': 66, - 'timestamp': 1417002452, - 'upload_date': '20141126', - 'view_count': int, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', - 'only_matching': True, - }, - { - 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', - 'only_matching': True, - }, - { - # geoblocked - 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', - 'only_matching': True, - }, - { - 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', - 'only_matching': True, - }, - { - 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', - 'only_matching': True, - }, - { - 'url': 'https://www.aftonbladet.se/tv/a/36015', - 'only_matching': True, - }, - { - 'url': 'abtv:140026', - 'only_matching': True, - }, - { - 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = 
mobj.group('host') - appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') - vendor = self._APP_NAME_TO_VENDOR[appname] - - data = self._download_json( - 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (vendor, video_id, appname), - video_id, 'Downloading media JSON') - - if data.get('status') == 'inactive': - raise ExtractorError( - 'Video %s is no longer available' % video_id, expected=True) - - info = { - 'formats': [], - } - if len(video_id) == 5: - if appname == 'bttv': - info = self._extract_video_info('btno', video_id) - - streams = data['streamUrls'] - stream_type = data.get('streamType') - is_live = stream_type == 'live' - formats = [] - - hls_url = streams.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False)) - - hds_url = streams.get('hds') - if hds_url: - hdcore_sign = 'hdcore=3.7.0' - f4m_formats = self._extract_f4m_formats( - hds_url + '?%s' % hdcore_sign, video_id, f4m_id='hds', fatal=False) - if f4m_formats: - for entry in f4m_formats: - # URLs without the extra param induce an 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.append(entry) - - mp4_urls = streams.get('pseudostreaming') or [] - mp4_url = streams.get('mp4') - if mp4_url: - mp4_urls.append(mp4_url) - for mp4_url in mp4_urls: - format_info = { - 'url': mp4_url, - } - mobj = re.search(r'(\d+)_(\d+)_(\d+)', mp4_url) - if mobj: - tbr = int(mobj.group(3)) - format_info.update({ - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - 'tbr': tbr, - 'format_id': 'mp4-%s' % tbr, - }) - formats.append(format_info) - - info['formats'].extend(formats) - - if not info['formats']: - properties = try_get( - data, lambda x: x['streamConfiguration']['properties'], list) - if properties and 'geoblocked' in properties: - raise self.raise_geo_restricted( - countries=[host.rpartition('.')[-1].partition('/')[0].upper()]) - - self._sort_formats(info['formats']) - - info.update({ - 'id': video_id, - 'title': self._live_title(data['title']) if is_live else data['title'], - 'description': data['description'], - 'thumbnail': data['images']['main'] + '?t[]=900x506q80', - 'timestamp': data['published'], - 'duration': float_or_none(data['duration'], 1000), - 'view_count': data['displays'], - 'is_live': is_live, - }) - return info - - -class BTArticleIE(InfoExtractor): - IE_NAME = 'bt:article' - IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' - _TEST = { - 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', - 'md5': '2acbe8ad129b3469d5ae51b1158878df', - 'info_dict': { - 'id': '23199', - 'ext': 'mp4', - 'title': 'Alrekstad internat', - 'description': 'md5:dc81a9056c874fedb62fc48a300dac58', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 191, - 'timestamp': 1289991323, - 'upload_date': '20101117', - 'view_count': int, - }, - } - - def _real_extract(self, url): - webpage = self._download_webpage(url, self._match_id(url)) - video_id = self._search_regex( - r'<video[^>]+data-id="(\d+)"', webpage, 'video id') - return self.url_result('bttv:%s' % video_id, 'VGTV') - - -class BTVestlendingenIE(InfoExtractor): - IE_NAME = 'bt:vestlendingen' - IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', - 'md5': 
'd7d17e3337dc80de6d3a540aefbe441b', - 'info_dict': { - 'id': '86588', - 'ext': 'mov', - 'title': 'Otto Wollertsen', - 'description': 'Vestlendingen Otto Fredrik Wollertsen', - 'timestamp': 1430473209, - 'upload_date': '20150501', - }, - 'skip': '404 Error', - }, { - 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', - 'md5': 'a2893f8632e96389f4bdf36aa9463ceb', - 'info_dict': { - 'id': '86255', - 'ext': 'mov', - 'title': 'Du må tåle å fryse og være sulten', - 'description': 'md5:b8046f4d022d5830ddab04865791d063', - 'upload_date': '20150321', - 'timestamp': 1426942023, - }, - }] - - def _real_extract(self, url): - return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py deleted file mode 100644 index dff94a2b8..000000000 --- a/youtube_dl/extractor/vh1.py +++ /dev/null @@ -1,41 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor - - -class VH1IE(MTVServicesInfoExtractor): - IE_NAME = 'vh1.com' - _FEED_URL = 'http://www.vh1.com/feeds/mrss/' - _TESTS = [{ - 'url': 'http://www.vh1.com/episodes/0umwpq/hip-hop-squares-kent-jones-vs-nick-young-season-1-ep-120', - 'info_dict': { - 'title': 'Kent Jones vs. Nick Young', - 'description': 'Come to Play. Stay to Party. With Mike Epps, TIP, O’Shea Jackson Jr., T-Pain, Tisha Campbell-Martin and more.', - }, - 'playlist_mincount': 4, - }, { - # Clip - 'url': 'http://www.vh1.com/video-clips/t74mif/scared-famous-scared-famous-extended-preview', - 'info_dict': { - 'id': '0a50c2d2-a86b-4141-9565-911c7e2d0b92', - 'ext': 'mp4', - 'title': 'Scared Famous|October 9, 2017|1|NO-EPISODE#|Scared Famous + Extended Preview', - 'description': 'md5:eff5551a274c473a29463de40f7b09da', - 'upload_date': '20171009', - 'timestamp': 1507574700, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - _VALID_URL = r'https?://(?:www\.)?vh1\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage) - videos_info = self._get_videos_info(mgid) - return videos_info diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py deleted file mode 100644 index e37499512..000000000 --- a/youtube_dl/extractor/vice.py +++ /dev/null @@ -1,337 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import functools -import hashlib -import json -import random -import re -import time - -from .adobepass import AdobePassIE -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - OnDemandPagedList, - parse_age_limit, - str_or_none, - try_get, -) - - -class ViceBaseIE(InfoExtractor): - def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''): - return self._download_json( - 'https://video.vice.com/api/v1/graphql', resource_id, query={ - 'query': '''{ - %s(locale: "%s", %s: "%s"%s) { - %s - } -}''' % (resource, locale, resource_key, resource_id, args, fields), - })['data'][resource] - - -class ViceIE(ViceBaseIE, AdobePassIE): - IE_NAME = 'vice' - _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' - _TESTS = [{ - 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', - 'info_dict': { 
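
Aside: ViceBaseIE._call_api builds its GraphQL document by plain string interpolation and sends it as the "query" GET parameter, reading the payload from data.<resource>. A standard-library sketch of the same request shape (no auth, retries or error handling; interpolating IDs is safe here only because they come from the URL regex, and GraphQL variables would be the injection-safe general form):

    import json
    import urllib.parse
    import urllib.request

    def call_api(resource, resource_key, resource_id, locale, fields):
        # Same interpolation as the removed _call_api; the document is
        # URL-encoded into the "query" parameter of a GET request.
        query = '{\n%s(locale: "%s", %s: "%s") {\n%s\n}\n}' % (
            resource, locale, resource_key, resource_id, fields)
        url = ('https://video.vice.com/api/v1/graphql?'
               + urllib.parse.urlencode({'query': query}))
        with urllib.request.urlopen(url) as resp:
            return json.load(resp)['data'][resource]
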
- 'id': '58c69e38a55424f1227dc3f7', - 'ext': 'mp4', - 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', - 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', - 'uploader': 'vice', - 'uploader_id': '57a204088cb727dec794c67b', - 'timestamp': 1489664942, - 'upload_date': '20170316', - 'age_limit': 14, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # geo restricted to US - 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', - 'info_dict': { - 'id': '5816510690b70e6c5fd39a56', - 'ext': 'mp4', - 'uploader': 'vice', - 'title': 'The Signal From Tölva', - 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', - 'uploader_id': '57a204088cb727dec794c67b', - 'timestamp': 1477941983, - 'upload_date': '20161031', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', - 'info_dict': { - 'id': '581b12b60a0e1f4c0fb6ea2f', - 'ext': 'mp4', - 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', - 'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.', - 'uploader': 'vice', - 'uploader_id': '57a204088cb727dec794c67b', - 'timestamp': 1485368119, - 'upload_date': '20170125', - 'age_limit': 14, - }, - 'params': { - # AES-encrypted m3u8 - 'skip_download': True, - }, - }, { - 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', - 'only_matching': True, - }, { - 'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060', - 'only_matching': True, - }, { - 'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7', - 'only_matching': True, - }, { - 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', - webpage) - - @staticmethod - def _extract_url(webpage): - urls = ViceIE._extract_urls(webpage) - return urls[0] if urls else None - - def _real_extract(self, url): - locale, video_id = re.match(self._VALID_URL, url).groups() - - video = self._call_api('videos', 'id', video_id, locale, '''body - locked - rating - thumbnail_url - title''')[0] - title = video['title'].strip() - rating = video.get('rating') - - query = {} - if video.get('locked'): - resource = self._get_mvpd_resource( - 'VICELAND', title, video_id, rating) - query['tvetoken'] = self._extract_mvpd_auth( - url, video_id, 'VICELAND', resource) - - # signature generation algorithm is reverse engineered from signatureGenerator in - # webpack:///../shared/~/vice-player/dist/js/vice-player.js in - # https://www.viceland.com/assets/common/js/web.vendor.bundle.js - # new JS is located here https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js - exp = int(time.time()) + 1440 - - query.update({ - 'exp': exp, - 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), - 'skipadstitching': 1, - 'platform': 'desktop', - 'rn': random.randint(10000, 100000), - }) - - try: - preplay = self._download_json( - 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), - video_id, query=query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): - error = json.loads(e.cause.read().decode()) - 
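
Aside: the comment in the removed ViceIE notes that the preplay signature scheme was reverse engineered from the player JS. It is self-contained and easy to restate: no secret enters the hash, only the video id, the literal "GET" and a short-lived expiry (1440 seconds in the removed code). A standalone sketch (preplay_query is an illustrative name):

    import hashlib
    import random
    import time

    def preplay_query(video_id):
        # Signature per the removed ViceIE: SHA-512 over "<id>:GET:<expiry>".
        exp = int(time.time()) + 1440
        return {
            'exp': exp,
            'sign': hashlib.sha512(
                ('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
            'skipadstitching': 1,
            'platform': 'desktop',
            'rn': random.randint(10000, 100000),
        }
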
error_message = error.get('error_description') or error['details'] - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, error_message), expected=True) - raise - - video_data = preplay['video'] - formats = self._extract_m3u8_formats( - preplay['playURL'], video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - episode = video_data.get('episode') or {} - channel = video_data.get('channel') or {} - season = video_data.get('season') or {} - - subtitles = {} - for subtitle in preplay.get('subtitleURLs', []): - cc_url = subtitle.get('url') - if not cc_url: - continue - language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en' - subtitles.setdefault(language_code, []).append({ - 'url': cc_url, - }) - - return { - 'formats': formats, - 'id': video_id, - 'title': title, - 'description': clean_html(video.get('body')), - 'thumbnail': video.get('thumbnail_url'), - 'duration': int_or_none(video_data.get('video_duration')), - 'timestamp': int_or_none(video_data.get('created_at'), 1000), - 'age_limit': parse_age_limit(video_data.get('video_rating') or rating), - 'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str), - 'episode_number': int_or_none(episode.get('episode_number')), - 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), - 'season_number': int_or_none(season.get('season_number')), - 'season_id': str_or_none(season.get('id') or video_data.get('season_id')), - 'uploader': channel.get('name'), - 'uploader_id': str_or_none(channel.get('id')), - 'subtitles': subtitles, - } - - -class ViceShowIE(ViceBaseIE): - IE_NAME = 'vice:show' - _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)' - _PAGE_SIZE = 25 - _TESTS = [{ - 'url': 'https://video.vice.com/en_us/show/fck-thats-delicious', - 'info_dict': { - 'id': '57a2040c8cb727dec794c901', - 'title': 'F*ck, That’s Delicious', - 'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.', - }, - 'playlist_mincount': 64, - }, { - 'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious', - 'only_matching': True, - }] - - def _fetch_page(self, locale, show_id, page): - videos = self._call_api('videos', 'show_id', show_id, locale, '''body - id - url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE)) - for video in videos: - yield self.url_result( - video['url'], ViceIE.ie_key(), video.get('id')) - - def _real_extract(self, url): - locale, display_id = re.match(self._VALID_URL, url).groups() - show = self._call_api('shows', 'slug', display_id, locale, '''dek - id - title''')[0] - show_id = show['id'] - - entries = OnDemandPagedList( - functools.partial(self._fetch_page, locale, show_id), - self._PAGE_SIZE) - - return self.playlist_result( - entries, show_id, show.get('title'), show.get('dek')) - - -class ViceArticleIE(ViceBaseIE): - IE_NAME = 'vice:article' - _VALID_URL = r'https://(?:www\.)?vice\.com/(?P<locale>[^/]+)/article/(?:[0-9a-z]{6}/)?(?P<id>[^?#]+)' - - _TESTS = [{ - 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', - 'info_dict': { - 'id': '58dc0a3dee202d2a0ccfcbd8', - 'ext': 'mp4', - 'title': 'Mormon War on Porn', - 'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf', - 'uploader': 'vice', - 'uploader_id': '57a204088cb727dec794c67b', - 'timestamp': 1491883129, - 'upload_date': '20170411', - 'age_limit': 17, - }, - 'params': { - # AES-encrypted m3u8 - 'skip_download': True, - }, - 
'add_ie': [ViceIE.ie_key()], - }, { - 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', - 'md5': '13010ee0bc694ea87ec40724397c2349', - 'info_dict': { - 'id': '3jstaBeXgAs', - 'ext': 'mp4', - 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', - 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', - 'uploader': 'Motherboard', - 'uploader_id': 'MotherboardTV', - 'upload_date': '20140529', - }, - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', - 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', - 'info_dict': { - 'id': '57f41d3556a0a80f54726060', - 'ext': 'mp4', - 'title': "Making The World's First Male Sex Doll", - 'description': 'md5:19b00b215b99961cf869c40fbe9df755', - 'uploader': 'vice', - 'uploader_id': '57a204088cb727dec794c67b', - 'timestamp': 1476919911, - 'upload_date': '20161019', - 'age_limit': 17, - }, - 'params': { - 'skip_download': True, - 'format': 'bestvideo', - }, - 'add_ie': [ViceIE.ie_key()], - }, { - 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', - 'only_matching': True, - }, { - 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', - 'only_matching': True, - }] - - def _real_extract(self, url): - locale, display_id = re.match(self._VALID_URL, url).groups() - - article = self._call_api('articles', 'slug', display_id, locale, '''body - embed_code''')[0] - body = article['body'] - - def _url_res(video_url, ie_key): - return { - '_type': 'url_transparent', - 'url': video_url, - 'display_id': display_id, - 'ie_key': ie_key, - } - - vice_url = ViceIE._extract_url(body) - if vice_url: - return _url_res(vice_url, ViceIE.ie_key()) - - embed_code = self._search_regex( - r'embedCode=([^&\'"]+)', body, - 'ooyala embed code', default=None) - if embed_code: - return _url_res('ooyala:%s' % embed_code, 'Ooyala') - - youtube_url = YoutubeIE._extract_url(body) - if youtube_url: - return _url_res(youtube_url, YoutubeIE.ie_key()) - - video_url = self._html_search_regex( - r'data-video-url="([^"]+)"', - article['embed_code'], 'video URL') - - return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py deleted file mode 100644 index 642358433..000000000 --- a/youtube_dl/extractor/viddler.py +++ /dev/null @@ -1,138 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, -) - - -class ViddlerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?' 
- _TESTS = [{ - 'url': 'http://www.viddler.com/v/43903784', - 'md5': '9eee21161d2c7f5b39690c3e325fab2f', - 'info_dict': { - 'id': '43903784', - 'ext': 'mov', - 'title': 'Video Made Easy', - 'description': 'md5:6a697ebd844ff3093bd2e82c37b409cd', - 'uploader': 'viddler', - 'timestamp': 1335371429, - 'upload_date': '20120425', - 'duration': 100.89, - 'thumbnail': r're:^https?://.*\.jpg$', - 'view_count': int, - 'comment_count': int, - 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], - } - }, { - 'url': 'http://www.viddler.com/v/4d03aad9/', - 'md5': 'f12c5a7fa839c47a79363bfdf69404fb', - 'info_dict': { - 'id': '4d03aad9', - 'ext': 'ts', - 'title': 'WALL-TO-GORTAT', - 'upload_date': '20150126', - 'uploader': 'deadspin', - 'timestamp': 1422285291, - 'view_count': int, - 'comment_count': int, - } - }, { - 'url': 'http://www.viddler.com/player/221ebbbd/0/', - 'md5': '740511f61d3d1bb71dc14a0fe01a1c10', - 'info_dict': { - 'id': '221ebbbd', - 'ext': 'mov', - 'title': 'LETeens-Grammar-snack-third-conditional', - 'description': ' ', - 'upload_date': '20140929', - 'uploader': 'BCLETeens', - 'timestamp': 1411997190, - 'view_count': int, - 'comment_count': int, - } - }, { - # secret protected - 'url': 'http://www.viddler.com/v/890c0985?secret=34051570', - 'info_dict': { - 'id': '890c0985', - 'ext': 'mp4', - 'title': 'Complete Property Training - Traineeships', - 'description': ' ', - 'upload_date': '20130606', - 'uploader': 'TiffanyBowtell', - 'timestamp': 1370496993, - 'view_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id, secret = re.match(self._VALID_URL, url).groups() - - query = { - 'video_id': video_id, - 'key': 'v0vhrt7bg2xq1vyxhkct', - } - if secret: - query['secret'] = secret - - data = self._download_json( - 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', - video_id, headers={'Referer': url}, query=query)['video'] - - formats = [] - for filed in data['files']: - if filed.get('status', 'ready') != 'ready': - continue - format_id = filed.get('profile_id') or filed['profile_name'] - f = { - 'format_id': format_id, - 'format_note': filed['profile_name'], - 'url': self._proto_relative_url(filed['url']), - 'width': int_or_none(filed.get('width')), - 'height': int_or_none(filed.get('height')), - 'filesize': int_or_none(filed.get('size')), - 'ext': filed.get('ext'), - 'source_preference': -1, - } - formats.append(f) - - if filed.get('cdn_url'): - f = f.copy() - f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:') - f['format_id'] = format_id + '-cdn' - f['source_preference'] = 1 - formats.append(f) - - if filed.get('html5_video_source'): - f = f.copy() - f['url'] = self._proto_relative_url(filed['html5_video_source']) - f['format_id'] = format_id + '-html5' - f['source_preference'] = 0 - formats.append(f) - self._sort_formats(formats) - - categories = [ - t.get('text') for t in data.get('tags', []) if 'text' in t] - - return { - 'id': video_id, - 'title': data['title'], - 'formats': formats, - 'description': data.get('description'), - 'timestamp': int_or_none(data.get('upload_time')), - 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), - 'uploader': data.get('author'), - 'duration': float_or_none(data.get('length')), - 'view_count': int_or_none(data.get('view_count')), - 'comment_count': int_or_none(data.get('comment_count')), - 'categories': categories, - } diff --git 
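
Aside: ViddlerIE emits up to three format entries per API file record, the origin URL plus optional CDN and HTML5 sources, each a copy of the base dict with its own format_id suffix and source_preference. The same pattern as a small helper (expand_variants is an illustrative name; the protocol-relative URL handling of the original is omitted):

    def expand_variants(base_format, filed):
        # One API entry -> up to three playable variants, ranked via
        # source_preference (origin -1, HTML5 0, CDN 1 in the removed code).
        variants = [base_format]
        for key, suffix, preference in (
                ('cdn_url', '-cdn', 1),
                ('html5_video_source', '-html5', 0)):
            url = filed.get(key)
            if url:
                f = dict(base_format)
                f.update({
                    'url': url,
                    'format_id': base_format['format_id'] + suffix,
                    'source_preference': preference,
                })
                variants.append(f)
        return variants
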
a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py deleted file mode 100644 index ab2c15cde..000000000 --- a/youtube_dl/extractor/videa.py +++ /dev/null @@ -1,173 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import random -import re -import string - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - mimetype2ext, - parse_codecs, - update_url_query, - xpath_element, - xpath_text, -) -from ..compat import ( - compat_b64decode, - compat_ord, - compat_struct_pack, -) - - -class VideaIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - videa(?:kid)?\.hu/ - (?: - videok/(?:[^/]+/)*[^?#&]+-| - (?:videojs_)?player\?.*?\bv=| - player/v/ - ) - (?P<id>[^?#&]+) - ''' - _TESTS = [{ - 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', - 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', - 'info_dict': { - 'id': '8YfIAjxwWGwT8HVQ', - 'ext': 'mp4', - 'title': 'Az őrült kígyász 285 kígyót enged szabadon', - 'thumbnail': r're:^https?://.*', - 'duration': 21, - }, - }, { - 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', - 'only_matching': True, - }, { - 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', - 'only_matching': True, - }, { - 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', - 'only_matching': True, - }, { - 'url': 'https://videakid.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', - 'only_matching': True, - }, { - 'url': 'https://videakid.hu/player?v=8YfIAjxwWGwT8HVQ', - 'only_matching': True, - }, { - 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', - 'only_matching': True, - }] - _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', - webpage)] - - @staticmethod - def rc4(cipher_text, key): - res = b'' - - key_len = len(key) - S = list(range(256)) - - j = 0 - for i in range(256): - j = (j + S[i] + ord(key[i % key_len])) % 256 - S[i], S[j] = S[j], S[i] - - i = 0 - j = 0 - for m in range(len(cipher_text)): - i = (i + 1) % 256 - j = (j + S[i]) % 256 - S[i], S[j] = S[j], S[i] - k = S[(S[i] + S[j]) % 256] - res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) - - return res.decode() - - def _real_extract(self, url): - video_id = self._match_id(url) - query = {'v': video_id} - player_page = self._download_webpage( - 'https://videa.hu/player', video_id, query=query) - - nonce = self._search_regex( - r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') - l = nonce[:32] - s = nonce[32:] - result = '' - for i in range(0, 32): - result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] - - random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) - query['_s'] = random_seed - query['_t'] = result[:16] - - b64_info, handle = self._download_webpage_handle( - 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) - if b64_info.startswith('<?xml'): - info = self._parse_xml(b64_info, video_id) - else: - key = result[16:] + random_seed + handle.headers['x-videa-xs'] - info = self._parse_xml(self.rc4( - compat_b64decode(b64_info), key), video_id) - - video = xpath_element(info, './video', 'video') - if not video: - raise ExtractorError(xpath_element( - info, './error', fatal=True), expected=True) - sources = xpath_element( - info, './video_sources', 'sources', fatal=True) - 
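
Aside: VideaIE.rc4 below is a textbook RC4 implementation (key scheduling plus the output loop), used to decrypt the base64 playlist XML with a key derived from the scrambled nonce, the random seed and the x-videa-xs response header. A bytes-native standalone version, checked against the well-known "Key"/"Plaintext" test vector (the original takes a str key and returns a decoded str):

    def rc4(data, key):
        # RC4 key-scheduling algorithm (KSA)
        S = list(range(256))
        j = 0
        for i in range(256):
            j = (j + S[i] + key[i % len(key)]) % 256
            S[i], S[j] = S[j], S[i]
        # Pseudo-random generation: XOR the keystream into the data. RC4 is
        # symmetric, so the same function encrypts and decrypts.
        out = bytearray()
        i = j = 0
        for c in data:
            i = (i + 1) % 256
            j = (j + S[i]) % 256
            S[i], S[j] = S[j], S[i]
            out.append(c ^ S[(S[i] + S[j]) % 256])
        return bytes(out)

    assert rc4(b'Plaintext', b'Key').hex() == 'bbf316e8d940af0ad3'
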
hash_values = xpath_element( - info, './hash_values', 'hash values', fatal=True) - - title = xpath_text(video, './title', fatal=True) - - formats = [] - for source in sources.findall('./video_source'): - source_url = source.text - source_name = source.get('name') - source_exp = source.get('exp') - if not (source_url and source_name and source_exp): - continue - hash_value = xpath_text(hash_values, 'hash_value_' + source_name) - if not hash_value: - continue - source_url = update_url_query(source_url, { - 'md5': hash_value, - 'expires': source_exp, - }) - f = parse_codecs(source.get('codecs')) - f.update({ - 'url': self._proto_relative_url(source_url), - 'ext': mimetype2ext(source.get('mimetype')) or 'mp4', - 'format_id': source.get('name'), - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - }) - formats.append(f) - self._sort_formats(formats) - - thumbnail = self._proto_relative_url(xpath_text(video, './poster_src')) - - age_limit = None - is_adult = xpath_text(video, './is_adult_content', default=None) - if is_adult: - age_limit = 18 if is_adult == '1' else 0 - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': int_or_none(xpath_text(video, './duration')), - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py deleted file mode 100644 index e0c10aa5b..000000000 --- a/youtube_dl/extractor/videomore.py +++ /dev/null @@ -1,322 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - int_or_none, -) - - -class VideomoreBaseIE(InfoExtractor): - _API_BASE_URL = 'https://more.tv/api/v3/web/' - _VALID_URL_BASE = r'https?://(?:videomore\.ru|more\.tv)/' - - def _download_page_data(self, display_id): - return self._download_json( - self._API_BASE_URL + 'PageData', display_id, query={ - 'url': '/' + display_id, - })['attributes']['response']['data'] - - def _track_url_result(self, track): - track_vod = track['trackVod'] - video_url = track_vod.get('playerLink') or track_vod['link'] - return self.url_result( - video_url, VideomoreIE.ie_key(), track_vod.get('hubId')) - - -class VideomoreIE(InfoExtractor): - IE_NAME = 'videomore' - _VALID_URL = r'''(?x) - videomore:(?P<sid>\d+)$| - https?:// - (?: - videomore\.ru/ - (?: - embed| - [^/]+/[^/]+ - )/| - (?: - (?:player\.)?videomore\.ru| - siren\.more\.tv/player - )/[^/]*\?.*?\btrack_id=| - odysseus\.more.tv/player/(?P<partner_id>\d+)/ - ) - (?P<id>\d+) - (?:[/?#&]|\.(?:xml|json)|$) - ''' - _TESTS = [{ - 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', - 'md5': '44455a346edc0d509ac5b5a5b531dc35', - 'info_dict': { - 'id': '367617', - 'ext': 'flv', - 'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук', - 'series': 'Кино в деталях', - 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2910, - 'view_count': int, - 'comment_count': int, - 'age_limit': 16, - }, - 'skip': 'The video is not available for viewing.', - }, { - 'url': 'http://videomore.ru/embed/259974', - 'info_dict': { - 'id': '259974', - 'ext': 'mp4', - 'title': 'Молодежка 2 сезон 40 серия', - 'series': 'Молодежка', - 'season': '2 сезон', - 'episode': '40 серия', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2789, - 'view_count': int, - 'age_limit': 
16, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://videomore.ru/molodezhka/sezon_promo/341073', - 'info_dict': { - 'id': '341073', - 'ext': 'flv', - 'title': 'Промо Команда проиграла из-за Бакина?', - 'episode': 'Команда проиграла из-за Бакина?', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 29, - 'age_limit': 16, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'The video is not available for viewing.', - }, { - 'url': 'http://videomore.ru/elki_3?track_id=364623', - 'only_matching': True, - }, { - 'url': 'http://videomore.ru/embed/364623', - 'only_matching': True, - }, { - 'url': 'http://videomore.ru/video/tracks/364623.xml', - 'only_matching': True, - }, { - 'url': 'http://videomore.ru/video/tracks/364623.json', - 'only_matching': True, - }, { - 'url': 'http://videomore.ru/video/tracks/158031/quotes/33248', - 'only_matching': True, - }, { - 'url': 'videomore:367617', - 'only_matching': True, - }, { - 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', - 'only_matching': True, - }, { - 'url': 'https://odysseus.more.tv/player/1788/352317', - 'only_matching': True, - }, { - 'url': 'https://siren.more.tv/player/config?track_id=352317&partner_id=1788&user_token=', - 'only_matching': True, - }] - _GEO_BYPASS = False - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', - webpage) - if not mobj: - mobj = re.search( - r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)', - webpage) - - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('sid') or mobj.group('id') - partner_id = mobj.group('partner_id') or compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('partner_id', [None])[0] or '97' - - item = self._download_json( - 'https://siren.more.tv/player/config', video_id, query={ - 'partner_id': partner_id, - 'track_id': video_id, - })['data']['playlist']['items'][0] - - title = item.get('title') - series = item.get('project_name') - season = item.get('season_name') - episode = item.get('episode_name') - if not title: - title = [] - for v in (series, season, episode): - if v: - title.append(v) - title = ' '.join(title) - - streams = item.get('streams') or [] - for protocol in ('DASH', 'HLS'): - stream_url = item.get(protocol.lower() + '_url') - if stream_url: - streams.append({'protocol': protocol, 'url': stream_url}) - - formats = [] - for stream in streams: - stream_url = stream.get('url') - if not stream_url: - continue - protocol = stream.get('protocol') - if protocol == 'DASH': - formats.extend(self._extract_mpd_formats( - stream_url, video_id, mpd_id='dash', fatal=False)) - elif protocol == 'HLS': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif protocol == 'MSS': - formats.extend(self._extract_ism_formats( - stream_url, video_id, ism_id='mss', fatal=False)) - - if not formats: - error = item.get('error') - if error: - if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'): - self.raise_geo_restricted(countries=['RU']) - raise ExtractorError(error, expected=True) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'series': series, - 'season': 
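
Aside: the Videomore stream loop dispatches on the reported protocol name (DASH, HLS, MSS) to the matching manifest parser. The same logic in table-driven form, which reduces adding a protocol to a one-line change (a sketch assuming an InfoExtractor instance; the helper calls are the ones the removed code uses):

    PROTOCOL_HANDLERS = {
        'DASH': lambda ie, url, vid: ie._extract_mpd_formats(
            url, vid, mpd_id='dash', fatal=False),
        'HLS': lambda ie, url, vid: ie._extract_m3u8_formats(
            url, vid, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False),
        'MSS': lambda ie, url, vid: ie._extract_ism_formats(
            url, vid, ism_id='mss', fatal=False),
    }

    def collect_formats(ie, streams, video_id):
        formats = []
        for stream in streams:
            handler = PROTOCOL_HANDLERS.get(stream.get('protocol'))
            if handler and stream.get('url'):
                formats.extend(handler(ie, stream['url'], video_id))
        return formats
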
season, - 'episode': episode, - 'thumbnail': item.get('thumbnail_url'), - 'duration': int_or_none(item.get('duration')), - 'view_count': int_or_none(item.get('views')), - 'age_limit': int_or_none(item.get('min_age')), - 'formats': formats, - } - - -class VideomoreVideoIE(VideomoreBaseIE): - IE_NAME = 'videomore:video' - _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?P<id>(?:(?:[^/]+/){2})?[^/?#&]+)(?:/*|[?#&].*?)$' - _TESTS = [{ - # single video with og:video:iframe - 'url': 'http://videomore.ru/elki_3', - 'info_dict': { - 'id': '364623', - 'ext': 'flv', - 'title': 'Ёлки 3', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 5579, - 'age_limit': 6, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires logging in', - }, { - # season single series with og:video:iframe - 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', - 'info_dict': { - 'id': '352317', - 'ext': 'mp4', - 'title': 'Последний мент 1 сезон 14 серия', - 'series': 'Последний мент', - 'season': '1 сезон', - 'episode': '14 серия', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2464, - 'age_limit': 16, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk', - 'only_matching': True, - }, { - # single video without og:video:iframe - 'url': 'http://videomore.ru/marin_i_ego_druzya', - 'info_dict': { - 'id': '359073', - 'ext': 'flv', - 'title': '1 серия. Здравствуй, Аквавилль!', - 'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 754, - 'age_limit': 6, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'redirects to https://more.tv/' - }, { - 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', - 'only_matching': True, - }, { - 'url': 'https://more.tv/poslednii_ment/1_sezon/14_seriya', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if VideomoreIE.suitable(url) else super(VideomoreVideoIE, cls).suitable(url) - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._track_url_result(self._download_page_data(display_id)) - - -class VideomoreSeasonIE(VideomoreBaseIE): - IE_NAME = 'videomore:season' - _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' - _TESTS = [{ - 'url': 'http://videomore.ru/molodezhka/film_o_filme', - 'info_dict': { - 'id': 'molodezhka/film_o_filme', - 'title': 'Фильм о фильме', - }, - 'playlist_mincount': 3, - }, { - 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', - 'only_matching': True, - }, { - 'url': 'https://more.tv/molodezhka/film_o_filme', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)) - else super(VideomoreSeasonIE, cls).suitable(url)) - - def _real_extract(self, url): - display_id = self._match_id(url) - season = self._download_page_data(display_id) - season_id = compat_str(season['id']) - tracks = self._download_json( - self._API_BASE_URL + 'seasons/%s/tracks' % season_id, - season_id)['data'] - entries = [] - for track in tracks: - entries.append(self._track_url_result(track)) - return self.playlist_result(entries, display_id, season.get('title')) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py deleted file mode 100644 index b1243e847..000000000 --- a/youtube_dl/extractor/vidio.py 
+++ /dev/null @@ -1,89 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, - str_or_none, - strip_or_none, - try_get, -) - - -class VidioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015', - 'md5': 'cd2801394afc164e9775db6a140b91fe', - 'info_dict': { - 'id': '165683', - 'display_id': 'dj_ambred-booyah-live-2015', - 'ext': 'mp4', - 'title': 'DJ_AMBRED - Booyah (Live 2015)', - 'description': 'md5:27dc15f819b6a78a626490881adbadf8', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 149, - 'like_count': int, - 'uploader': 'TWELVE Pic', - 'timestamp': 1444902800, - 'upload_date': '20151015', - 'uploader_id': 'twelvepictures', - 'channel': 'Cover Music Video', - 'channel_id': '280236', - 'view_count': int, - 'dislike_count': int, - 'comment_count': int, - 'tags': 'count:4', - }, - }, { - 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', - 'only_matching': True, - }] - - def _real_initialize(self): - self._api_key = self._download_json( - 'https://www.vidio.com/auth', None, data=b'')['api_key'] - - def _real_extract(self, url): - video_id, display_id = re.match(self._VALID_URL, url).groups() - data = self._download_json( - 'https://api.vidio.com/videos/' + video_id, display_id, headers={ - 'Content-Type': 'application/vnd.api+json', - 'X-API-KEY': self._api_key, - }) - video = data['videos'][0] - title = video['title'].strip() - - formats = self._extract_m3u8_formats( - data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - - get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} - channel = get_first('channel') - user = get_first('user') - username = user.get('username') - get_count = lambda x: int_or_none(video.get('total_' + x)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': strip_or_none(video.get('description')), - 'thumbnail': video.get('image_url_medium'), - 'duration': int_or_none(video.get('duration')), - 'like_count': get_count('likes'), - 'formats': formats, - 'uploader': user.get('name'), - 'timestamp': parse_iso8601(video.get('created_at')), - 'uploader_id': username, - 'uploader_url': 'https://www.vidio.com/@' + username if username else None, - 'channel': channel.get('name'), - 'channel_id': str_or_none(channel.get('id')), - 'view_count': get_count('view_count'), - 'dislike_count': get_count('dislikes'), - 'comment_count': get_count('comments'), - 'tags': video.get('tag_list'), - } diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py deleted file mode 100644 index 174e69cd6..000000000 --- a/youtube_dl/extractor/vidme.py +++ /dev/null @@ -1,295 +0,0 @@ -from __future__ import unicode_literals - -import itertools - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - float_or_none, - parse_iso8601, - url_or_none, -) - - -class VidmeIE(InfoExtractor): - IE_NAME = 'vidme' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{,5})(?:[^\da-zA-Z]|$)' - _TESTS = [{ - 'url': 'https://vid.me/QNB', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', - 'info_dict': { - 'id': 'QNB', - 'ext': 'mp4', - 'title': 'Fishing for piranha - the easy way', - 'description': 'source: 
https://www.facebook.com/photo.php?v=312276045600871', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1406313244, - 'upload_date': '20140725', - 'age_limit': 0, - 'duration': 119.92, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - }, { - 'url': 'https://vid.me/Gc6M', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', - 'info_dict': { - 'id': 'Gc6M', - 'ext': 'mp4', - 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1441211642, - 'upload_date': '20150902', - 'uploader': 'SunshineM', - 'uploader_id': '3552827', - 'age_limit': 0, - 'duration': 223.72, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # tests uploader field - 'url': 'https://vid.me/4Iib', - 'info_dict': { - 'id': '4Iib', - 'ext': 'mp4', - 'title': 'The Carver', - 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1433203629, - 'upload_date': '20150602', - 'uploader': 'Thomas', - 'uploader_id': '109747', - 'age_limit': 0, - 'duration': 97.859999999999999, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching - 'url': 'https://vid.me/e/Wmur', - 'info_dict': { - 'id': 'Wmur', - 'ext': 'mp4', - 'title': 'naked smoking & stretching', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1430931613, - 'upload_date': '20150506', - 'uploader': 'naked-yogi', - 'uploader_id': '1638622', - 'age_limit': 18, - 'duration': 653.26999999999998, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # nsfw, user-disabled - 'url': 'https://vid.me/dzGJ', - 'only_matching': True, - }, { - # suspended - 'url': 'https://vid.me/Ox3G', - 'only_matching': True, - }, { - # deleted - 'url': 'https://vid.me/KTPm', - 'only_matching': True, - }, { - # no formats in the API response - 'url': 'https://vid.me/e5g', - 'info_dict': { - 'id': 'e5g', - 'ext': 'mp4', - 'title': 'Video upload (e5g)', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1401480195, - 'upload_date': '20140530', - 'uploader': None, - 'uploader_id': None, - 'age_limit': 0, - 'duration': 483, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - response = self._download_json( - 'https://api.vid.me/videoByUrl/%s' % video_id, video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json(e.cause.read(), video_id) - else: - raise - - error = response.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - video = response['video'] - - if video.get('state') == 'deleted': - raise ExtractorError( - 'Vidme said: Sorry, this video has been deleted.', - expected=True) - - if video.get('state') in ('user-disabled', 'suspended'): - raise ExtractorError( - 'Vidme said: This video has been suspended either due to a copyright claim, ' - 'or for violating the terms of use.', - expected=True) - - formats = [] - for f in video.get('formats', []): - format_url = url_or_none(f.get('uri')) - if not format_url: - continue - format_type = f.get('type') - if format_type == 'dash': - 
formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif format_type == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': f.get('type'), - 'url': format_url, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'preference': 0 if f.get('type', '').endswith( - 'clip') else 1, - }) - - if not formats and video.get('complete_url'): - formats.append({ - 'url': video.get('complete_url'), - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - }) - - self._sort_formats(formats) - - title = video['title'] - description = video.get('description') - thumbnail = video.get('thumbnail_url') - timestamp = parse_iso8601(video.get('date_created'), ' ') - uploader = video.get('user', {}).get('username') - uploader_id = video.get('user', {}).get('user_id') - age_limit = 18 if video.get('nsfw') is True else 0 - duration = float_or_none(video.get('duration')) - view_count = int_or_none(video.get('view_count')) - like_count = int_or_none(video.get('likes_count')) - comment_count = int_or_none(video.get('comment_count')) - - return { - 'id': video_id, - 'title': title or 'Video upload (%s)' % video_id, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'age_limit': age_limit, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, - 'formats': formats, - } - - -class VidmeListBaseIE(InfoExtractor): - # Max possible limit according to https://docs.vid.me/#api-Videos-List - _LIMIT = 100 - - def _entries(self, user_id, user_name): - for page_num in itertools.count(1): - page = self._download_json( - 'https://api.vid.me/videos/%s?user=%s&limit=%d&offset=%d' - % (self._API_ITEM, user_id, self._LIMIT, (page_num - 1) * self._LIMIT), - user_name, 'Downloading user %s page %d' % (self._API_ITEM, page_num)) - - videos = page.get('videos', []) - if not videos: - break - - for video in videos: - video_url = video.get('full_url') or video.get('embed_url') - if video_url: - yield self.url_result(video_url, VidmeIE.ie_key()) - - total = int_or_none(page.get('page', {}).get('total')) - if total and self._LIMIT * page_num >= total: - break - - def _real_extract(self, url): - user_name = self._match_id(url) - - user_id = self._download_json( - 'https://api.vid.me/userByUsername?username=%s' % user_name, - user_name)['user']['user_id'] - - return self.playlist_result( - self._entries(user_id, user_name), user_id, - '%s - %s' % (user_name, self._TITLE)) - - -class VidmeUserIE(VidmeListBaseIE): - IE_NAME = 'vidme:user' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)' - _API_ITEM = 'list' - _TITLE = 'Videos' - _TESTS = [{ - 'url': 'https://vid.me/MasakoX', - 'info_dict': { - 'id': '16112341', - 'title': 'MasakoX - %s' % _TITLE, - }, - 'playlist_mincount': 191, - }, { - 'url': 'https://vid.me/unsQuare_netWork', - 'only_matching': True, - }] - - -class VidmeUserLikesIE(VidmeListBaseIE): - IE_NAME = 'vidme:user:likes' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes' - _API_ITEM = 'likes' - _TITLE = 'Likes' - _TESTS = [{ - 'url': 'https://vid.me/ErinAlexis/likes', - 'info_dict': { - 'id': '6483530', - 'title': 'ErinAlexis - %s' % _TITLE, - }, - 'playlist_mincount': 415, - }, { - 
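
Aside: two API patterns from the removed vid.me code are worth isolating. First, videoByUrl returned its normal JSON envelope even on HTTP 400, so the extractor parses the error body rather than failing outright; with the standard library only (a sketch, no retries):

    import json
    import urllib.error
    import urllib.request

    def fetch_json(url):
        try:
            with urllib.request.urlopen(url) as resp:
                return json.load(resp)
        except urllib.error.HTTPError as e:
            # vid.me put the error details in the body of the 400 response
            if e.code == 400:
                return json.loads(e.read().decode())
            raise

Second, the list endpoints page by offset with a hard per-request cap of 100 items, stopping on an empty page or once the reported total is covered. Generic form of VidmeListBaseIE._entries (fetch_page is an illustrative callable taking limit and offset and returning the decoded page):

    import itertools

    def paged_items(fetch_page, limit=100):
        for page_num in itertools.count(1):
            page = fetch_page(limit, (page_num - 1) * limit)
            videos = page.get('videos') or []
            if not videos:
                return
            for video in videos:
                yield video
            # Stop early once the advertised total has been reached
            total = (page.get('page') or {}).get('total')
            if total and limit * page_num >= total:
                return
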
'url': 'https://vid.me/Kaleidoscope-Ish/likes', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py deleted file mode 100644 index dbd5ba9ba..000000000 --- a/youtube_dl/extractor/vier.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import itertools - -from .common import InfoExtractor -from ..utils import ( - urlencode_postdata, - int_or_none, - unified_strdate, -) - - -class VierIE(InfoExtractor): - IE_NAME = 'vier' - IE_DESC = 'vier.be and vijf.be' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?(?P<site>vier|vijf)\.be/ - (?: - (?: - [^/]+/videos| - video(?:/[^/]+)* - )/ - (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| - (?: - video/v3/embed| - embed/video/public - )/(?P<embed_id>\d+) - ) - ''' - _NETRC_MACHINE = 'vier' - _TESTS = [{ - 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', - 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', - 'info_dict': { - 'id': '16129', - 'display_id': 'het-wordt-warm-de-moestuin', - 'ext': 'mp4', - 'title': 'Het wordt warm in De Moestuin', - 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - 'upload_date': '20121025', - 'series': 'Plan B', - 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], - }, - }, { - 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', - 'info_dict': { - 'id': '2561614', - 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', - 'ext': 'mp4', - 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', - 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', - 'upload_date': '20170228', - 'series': 'Temptation Island', - 'tags': list, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', - 'info_dict': { - 'id': '2674839', - 'display_id': 'jani-gaat-naar-tokio-aflevering-4', - 'ext': 'mp4', - 'title': 'Jani gaat naar Tokio - Aflevering 4', - 'description': 'md5:aa8d611541db6ae9e863125704511f88', - 'upload_date': '20170501', - 'series': 'Jani gaat', - 'episode_number': 4, - 'tags': ['Jani Gaat', 'Volledige Aflevering'], - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', - }, { - # Requires account credentials but bypassed extraction via v3/embed page - # without metadata - 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', - 'info_dict': { - 'id': '2674839', - 'display_id': 'jani-gaat-naar-tokio-aflevering-4', - 'ext': 'mp4', - 'title': 'jani-gaat-naar-tokio-aflevering-4', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Log in to extract metadata'], - }, { - # Without video id in URL - 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', - 'only_matching': True, - }, { - 'url': 'http://www.vier.be/video/v3/embed/16129', - 'only_matching': True, - }, { - 'url': 'https://www.vijf.be/embed/video/public/4093', - 'only_matching': True, - }, { - 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', - 'only_matching': True, - }, { - 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', - 'only_matching': True, - }] - - def _real_initialize(self): - self._logged_in = False - - def _login(self, site): - username, password = self._get_login_info() - if username is None or password is None: - 
return - - login_page = self._download_webpage( - 'http://www.%s.be/user/login' % site, - None, note='Logging in', errnote='Unable to log in', - data=urlencode_postdata({ - 'form_id': 'user_login', - 'name': username, - 'pass': password, - }), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - login_error = self._html_search_regex( - r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', - login_page, 'login error', default=None) - if login_error: - self.report_warning('Unable to log in: %s' % login_error) - else: - self._logged_in = True - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - embed_id = mobj.group('embed_id') - display_id = mobj.group('display_id') or embed_id - video_id = mobj.group('id') or embed_id - site = mobj.group('site') - - if not self._logged_in: - self._login(site) - - webpage = self._download_webpage(url, display_id) - - if r'id="user-login"' in webpage: - self.report_warning( - 'Log in to extract metadata', video_id=display_id) - webpage = self._download_webpage( - 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), - display_id) - - video_id = self._search_regex( - [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id', default=video_id or display_id) - - playlist_url = self._search_regex( - r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', - webpage, 'm3u8 url', default=None, group='url') - - if not playlist_url: - application = self._search_regex( - [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default=site + '_vod') - filename = self._search_regex( - [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], - webpage, 'filename') - playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - - formats = self._extract_wowza_formats( - playlist_url, display_id, skip_protocols=['dash']) - self._sort_formats(formats) - - title = self._og_search_title(webpage, default=display_id) - description = self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', - webpage, 'description', default=None, group='value') - thumbnail = self._og_search_thumbnail(webpage, default=None) - upload_date = unified_strdate(self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', - webpage, 'upload date', default=None, group='value')) - - series = self._search_regex( - r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'series', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?i)aflevering (\d+)', title, 'episode number', default=None)) - tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'series': series, - 'episode_number': episode_number, - 'tags': tags, - 'formats': formats, - } - - -class VierVideosIE(InfoExtractor): - IE_NAME = 'vier:videos' - _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' - _TESTS = [{ - 'url': 'http://www.vier.be/demoestuin/videos', - 'info_dict': { - 'id': 'demoestuin', - }, - 'playlist_mincount': 153, - }, { - 'url': 'http://www.vijf.be/temptationisland/videos', - 'info_dict': { - 'id': 'temptationisland', - }, - 
'playlist_mincount': 159, - }, { - 'url': 'http://www.vier.be/demoestuin/videos?page=6', - 'info_dict': { - 'id': 'demoestuin-page6', - }, - 'playlist_mincount': 20, - }, { - 'url': 'http://www.vier.be/demoestuin/videos?page=7', - 'info_dict': { - 'id': 'demoestuin-page7', - }, - 'playlist_mincount': 13, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - program = mobj.group('program') - site = mobj.group('site') - - page_id = mobj.group('page') - if page_id: - page_id = int(page_id) - start_page = page_id - playlist_id = '%s-page%d' % (program, page_id) - else: - start_page = 0 - playlist_id = program - - entries = [] - for current_page_id in itertools.count(start_page): - current_page = self._download_webpage( - 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), - program, - 'Downloading page %d' % (current_page_id + 1)) - page_entries = [ - self.url_result('http://www.' + site + '.be' + video_url, 'Vier') - for video_url in re.findall( - r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] - entries.extend(page_entries) - if page_id or '>Meer<' not in current_page: - break - - return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py deleted file mode 100644 index d6b92b1c8..000000000 --- a/youtube_dl/extractor/viewlift.py +++ /dev/null @@ -1,250 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, -) - - -class ViewLiftBaseIE(InfoExtractor): - _API_BASE = 'https://prod-api.viewlift.com/' - _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' - _SITE_MAP = { - 'ftfnext': 'lax', - 'funnyforfree': 'snagfilms', - 'hoichoi': 'hoichoitv', - 'kiddovid': 'snagfilms', - 'laxsportsnetwork': 'lax', - 'legapallacanestro': 'lnp', - 'marquee': 'marquee-tv', - 'monumentalsportsnetwork': 'monumental-network', - 'moviespree': 'bingeflix', - 'pflmma': 'pfl', - 'snagxtreme': 'snagfilms', - 'theidentitytb': 'tampabay', - 'vayafilm': 'snagfilms', - } - _TOKENS = {} - - def _call_api(self, site, path, video_id, query): - token = self._TOKENS.get(site) - if not token: - token_query = {'site': site} - email, password = self._get_login_info(netrc_machine=site) - if email: - resp = self._download_json( - self._API_BASE + 'identity/signin', video_id, - 'Logging in', query=token_query, data=json.dumps({ - 'email': email, - 'password': password, - }).encode()) - else: - resp = self._download_json( - self._API_BASE + 'identity/anonymous-token', video_id, - 'Downloading authorization token', query=token_query) - self._TOKENS[site] = token = resp['authorizationToken'] - return self._download_json( - self._API_BASE + path, video_id, - headers={'Authorization': token}, query=query) - - -class ViewLiftEmbedIE(ViewLiftBaseIE): - IE_NAME = 'viewlift:embed' - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX - _TESTS = [{ - 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', - 'md5': 
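
Aside: ViewLiftBaseIE lazily obtains one authorization token per site (signed-in when credentials exist, anonymous otherwise) and caches it in a class-level dict for the rest of the run. The caching shape in isolation (fetch_signin and fetch_anonymous are illustrative stand-ins for the identity/signin and identity/anonymous-token requests):

    _TOKENS = {}

    def get_token(site, fetch_anonymous, fetch_signin, credentials=None):
        # One token per site; later API calls reuse it via the
        # Authorization header instead of re-authenticating.
        token = _TOKENS.get(site)
        if not token:
            if credentials:
                token = fetch_signin(site, *credentials)
            else:
                token = fetch_anonymous(site)
            _TOKENS[site] = token
        return token
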
'2924e9215c6eff7a55ed35b72276bd93', - 'info_dict': { - 'id': '74849a00-85a9-11e1-9660-123139220831', - 'ext': 'mp4', - 'title': '#whilewewatch', - 'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8', - 'timestamp': 1334350096, - 'upload_date': '20120413', - } - }, { - # invalid labels, 360p is better that 480p - 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', - 'md5': '882fca19b9eb27ef865efeeaed376a48', - 'info_dict': { - 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', - 'ext': 'mp4', - 'title': 'Life in Limbo', - }, - 'skip': 'The video does not exist', - }, { - 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', - 'only_matching': True, - }] - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - domain, film_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[-2] - if site in self._SITE_MAP: - site = self._SITE_MAP[site] - try: - content_data = self._call_api( - site, 'entitlement/video/status', film_id, { - 'id': film_id - })['video'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') - if error_message == 'User does not have a valid subscription or has not purchased this content.': - self.raise_login_required() - raise ExtractorError(error_message, expected=True) - raise - gist = content_data['gist'] - title = gist['title'] - video_assets = content_data['streamingInfo']['videoAssets'] - - formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: - video_asset_url = video_asset.get('url') - if not video_asset: - continue - bitrate = int_or_none(video_asset.get('bitrate')) - height = int_or_none(self._search_regex( - r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), - 'height', default=None)) - formats.append({ - 'url': video_asset_url, - 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), - 'tbr': bitrate, - 'height': height, - 'vcodec': video_asset.get('codec'), - }) - - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'format_id')) - - info = { - 'id': film_id, - 'title': title, - 'description': gist.get('description'), - 'thumbnail': gist.get('videoImageUrl'), - 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating')), - 'timestamp': int_or_none(gist.get('publishDate'), 1000), - 'formats': formats, - } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info - - -class ViewLiftIE(ViewLiftBaseIE): - IE_NAME = 'viewlift' - _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX - _TESTS = [{ - 'url': 'http://www.snagfilms.com/films/title/lost_for_life', - 'md5': '19844f897b35af219773fd63bdec2942', - 'info_dict': { - 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', - 'display_id': 'lost_for_life', - 'ext': 'mp4', - 'title': 'Lost for Life', - 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', - 
'thumbnail': r're:^https?://.*\.jpg', - 'duration': 4489, - 'categories': 'mincount:3', - 'age_limit': 14, - 'upload_date': '20150421', - 'timestamp': 1429656820, - } - }, { - 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', - 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', - 'info_dict': { - 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', - 'display_id': 'the_world_cut_project/india', - 'ext': 'mp4', - 'title': 'India', - 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 979, - 'timestamp': 1399478279, - 'upload_date': '20140507', - } - }, { - 'url': 'http://main.snagfilms.com/augie_alone/s_2_ep_12_love', - 'info_dict': { - 'id': '00000148-7b53-de26-a9fb-fbf306f70020', - 'display_id': 'augie_alone/s_2_ep_12_love', - 'ext': 'mp4', - 'title': 'S. 2 Ep. 12 - Love', - 'description': 'Augie finds love.', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 107, - 'upload_date': '20141012', - 'timestamp': 1413129540, - 'age_limit': 17, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://main.snagfilms.com/films/title/the_freebie', - 'only_matching': True, - }, { - # Film is not playable in your area. - 'url': 'http://www.snagfilms.com/films/title/inside_mecca', - 'only_matching': True, - }, { - # Film is not available. - 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', - 'only_matching': True, - }, { - 'url': 'http://www.winnersview.com/videos/the-good-son', - 'only_matching': True, - }, { - # Was once Kaltura embed - 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', - 'only_matching': True, - }, { - 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) - - def _real_extract(self, url): - domain, path, display_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[-2] - if site in self._SITE_MAP: - site = self._SITE_MAP[site] - modules = self._call_api( - site, 'content/pages', display_id, { - 'includeContent': 'true', - 'moduleOffset': 1, - 'path': path, - 'site': site, - })['modules'] - film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'ie_key': 'ViewLiftEmbed', - } diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py deleted file mode 100644 index a0abbae60..000000000 --- a/youtube_dl/extractor/viidea.py +++ /dev/null @@ -1,202 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - js_to_json, - parse_duration, - parse_iso8601, -) - - -class ViideaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?: - videolectures\.net| - flexilearn\.viidea\.net| - presentations\.ocwconsortium\.org| - video\.travel-zoom\.si| - video\.pomp-forum\.si| - tv\.nil\.si| - video\.hekovnik.com| - video\.szko\.si| - kpk\.viidea\.com| - inside\.viidea\.net| - video\.kiberpipa\.org| - bvvideo\.si| - kongres\.viidea\.net| - edemokracija\.viidea\.com - )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$''' - - _TESTS = [{ - 'url': 
'http://videolectures.net/promogram_igor_mekjavic_eng/', - 'info_dict': { - 'id': '20171', - 'display_id': 'promogram_igor_mekjavic_eng', - 'ext': 'mp4', - 'title': 'Automatics, robotics and biocybernetics', - 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 'thumbnail': r're:http://.*\.jpg', - 'timestamp': 1372349289, - 'upload_date': '20130627', - 'duration': 565, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # video with invalid direct format links (HTTP 403) - 'url': 'http://videolectures.net/russir2010_filippova_nlp/', - 'info_dict': { - 'id': '14891', - 'display_id': 'russir2010_filippova_nlp', - 'ext': 'flv', - 'title': 'NLP at Google', - 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'thumbnail': r're:http://.*\.jpg', - 'timestamp': 1284375600, - 'upload_date': '20100913', - 'duration': 5352, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - # event playlist - 'url': 'http://videolectures.net/deeplearning2015_montreal/', - 'info_dict': { - 'id': '23181', - 'title': 'Deep Learning Summer School, Montreal 2015', - 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', - 'thumbnail': r're:http://.*\.jpg', - 'timestamp': 1438560000, - }, - 'playlist_count': 30, - }, { - # multi part lecture - 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', - 'info_dict': { - 'id': '9737', - 'display_id': 'mlss09uk_bishop_ibi', - 'title': 'Introduction To Bayesian Inference', - 'thumbnail': r're:http://.*\.jpg', - 'timestamp': 1251622800, - }, - 'playlist': [{ - 'info_dict': { - 'id': '9737_part1', - 'display_id': 'mlss09uk_bishop_ibi_part1', - 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference (Part 1)', - 'thumbnail': r're:http://.*\.jpg', - 'duration': 4622, - 'timestamp': 1251622800, - 'upload_date': '20090830', - }, - }, { - 'info_dict': { - 'id': '9737_part2', - 'display_id': 'mlss09uk_bishop_ibi_part2', - 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference (Part 2)', - 'thumbnail': r're:http://.*\.jpg', - 'duration': 5641, - 'timestamp': 1251622800, - 'upload_date': '20090830', - }, - }], - 'playlist_count': 2, - }] - - def _real_extract(self, url): - lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, lecture_slug) - - cfg = self._parse_json(self._search_regex( - [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', - r'cfg\s*:\s*({[^}]+})'], - webpage, 'cfg'), lecture_slug, js_to_json) - - lecture_id = compat_str(cfg['obj_id']) - - base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - - try: - lecture_data = self._download_json( - '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), - lecture_id)['lecture'][0] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - msg = self._parse_json( - e.cause.read().decode('utf-8'), lecture_id) - raise ExtractorError(msg['detail'], expected=True) - raise - - lecture_info = { - 'id': lecture_id, - 'display_id': lecture_slug, - 'title': lecture_data['title'], - 'timestamp': parse_iso8601(lecture_data.get('time')), - 'description': lecture_data.get('description_wiki'), - 'thumbnail': lecture_data.get('thumb'), - } - - playlist_entries = [] - lecture_type = lecture_data.get('type') - parts = [compat_str(video) for video in cfg.get('videos', [])] - if parts: - multipart = len(parts) > 1 - - def extract_part(part_id): - smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) - smil = 
self._download_smil(smil_url, lecture_id) - info = self._parse_smil(smil, smil_url, lecture_id) - self._sort_formats(info['formats']) - info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) - info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) - if multipart: - info['title'] += ' (Part %s)' % part_id - switch = smil.find('.//switch') - if switch is not None: - info['duration'] = parse_duration(switch.attrib.get('dur')) - item_info = lecture_info.copy() - item_info.update(info) - return item_info - - if explicit_part_id or not multipart: - result = extract_part(explicit_part_id or parts[0]) - else: - result = { - '_type': 'multi_video', - 'entries': [extract_part(part) for part in parts], - } - result.update(lecture_info) - - # Immediately return explicitly requested part or non event item - if explicit_part_id or lecture_type != 'evt': - return result - - playlist_entries.append(result) - - # It's probably a playlist - if not parts or lecture_type == 'evt': - playlist_webpage = self._download_webpage( - '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') - for _, video_url in re.findall( - r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] - playlist_entries.extend(entries) - - playlist = self.playlist_result(playlist_entries, lecture_id) - playlist.update(lecture_info) - return playlist diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py deleted file mode 100644 index 2e9cbf148..000000000 --- a/youtube_dl/extractor/viki.py +++ /dev/null @@ -1,433 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import hashlib -import hmac -import itertools -import json -import re -import time - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, - sanitized_Request, - std_headers, - try_get, -) - - -class VikiBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' - _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' - _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' - - _APP = '100005a' - _APP_VERSION = '6.0.0' - _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' - - _GEO_BYPASS = False - _NETRC_MACHINE = 'viki' - - _token = None - - _ERRORS = { - 'geo': 'Sorry, this content is not available in your region.', - 'upcoming': 'Sorry, this content is not yet available.', - 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', - } - - def _prepare_call(self, path, timestamp=None, post_data=None): - path += '?' if '?' 
not in path else '&' - if not timestamp: - timestamp = int(time.time()) - query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) - if self._token: - query += '&token=%s' % self._token - sig = hmac.new( - self._APP_SECRET.encode('ascii'), - query.encode('ascii'), - hashlib.sha1 - ).hexdigest() - url = self._API_URL_TEMPLATE % (query, sig) - return sanitized_Request( - url, json.dumps(post_data).encode('utf-8')) if post_data else url - - def _call_api(self, path, video_id, note, timestamp=None, post_data=None): - resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note, - headers={'x-viki-app-ver': self._APP_VERSION}) - - error = resp.get('error') - if error: - if error == 'invalid timestamp': - resp = self._download_json( - self._prepare_call(path, int(resp['current_timestamp']), post_data), - video_id, '%s (retry)' % note) - error = resp.get('error') - if error: - self._raise_error(resp['error']) - - return resp - - def _raise_error(self, error): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), - expected=True) - - def _check_errors(self, data): - for reason, status in (data.get('blocking') or {}).items(): - if status and reason in self._ERRORS: - message = self._ERRORS[reason] - if reason == 'geo': - self.raise_geo_restricted(msg=message) - elif reason == 'paywall': - self.raise_login_required(message) - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, message), expected=True) - - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_form = { - 'login_id': username, - 'password': password, - } - - login = self._call_api( - 'sessions.json', None, - 'Logging in', post_data=login_form) - - self._token = login.get('token') - if not self._token: - self.report_warning('Unable to get session token, login has probably failed') - - @staticmethod - def dict_selection(dict_obj, preferred_key, allow_fallback=True): - if preferred_key in dict_obj: - return dict_obj.get(preferred_key) - - if not allow_fallback: - return - - filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()])) - return filtered_dict[0] if filtered_dict else None - - -class VikiIE(VikiBaseIE): - IE_NAME = 'viki' - _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', - 'info_dict': { - 'id': '1023585v', - 'ext': 'mp4', - 'title': 'Heirs - Episode 14', - 'uploader': 'SBS Contents Hub', - 'timestamp': 1385047627, - 'upload_date': '20131121', - 'age_limit': 13, - 'duration': 3570, - 'episode_number': 14, - }, - 'params': { - 'format': 'bestvideo', - }, - 'skip': 'Blocked in the US', - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], - }, { - # clip - 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', - 'info_dict': { - 'id': '1067139v', - 'ext': 'mp4', - 'title': "'The Avengers: Age of Ultron' Press Conference", - 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', - 'duration': 352, - 'timestamp': 1430380829, - 'upload_date': '20150430', - 'uploader': 'Arirang TV', - 'like_count': int, - 'age_limit': 0, - }, - 'skip': 'Sorry. 
There was an error loading this video', - }, { - 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', - 'info_dict': { - 'id': '1048879v', - 'ext': 'mp4', - 'title': 'Ankhon Dekhi', - 'duration': 6512, - 'timestamp': 1408532356, - 'upload_date': '20140820', - 'uploader': 'Spuul', - 'like_count': int, - 'age_limit': 13, - }, - 'skip': 'Blocked in the US', - }, { - # episode - 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '0a53dc252e6e690feccd756861495a8c', - 'info_dict': { - 'id': '44699v', - 'ext': 'mp4', - 'title': 'Boys Over Flowers - Episode 1', - 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4172, - 'timestamp': 1270496524, - 'upload_date': '20100405', - 'uploader': 'group8', - 'like_count': int, - 'age_limit': 13, - 'episode_number': 1, - }, - 'params': { - 'format': 'bestvideo', - }, - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], - }, { - # youtube external - 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', - 'md5': '63f8600c1da6f01b7640eee7eca4f1da', - 'info_dict': { - 'id': '50562v', - 'ext': 'webm', - 'title': 'Poor Nastya [COMPLETE] - Episode 1', - 'description': '', - 'duration': 606, - 'timestamp': 1274949505, - 'upload_date': '20101213', - 'uploader': 'ad14065n', - 'uploader_id': 'ad14065n', - 'like_count': int, - 'age_limit': 13, - }, - 'skip': 'Page not found!', - }, { - 'url': 'http://www.viki.com/player/44699v', - 'only_matching': True, - }, { - # non-English description - 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '41faaba0de90483fb4848952af7c7d0d', - 'info_dict': { - 'id': '158036v', - 'ext': 'mp4', - 'uploader': 'I Planet Entertainment', - 'upload_date': '20111122', - 'timestamp': 1321985454, - 'description': 'md5:44b1e46619df3a072294645c770cef36', - 'title': 'Love In Magic', - 'age_limit': 13, - }, - 'params': { - 'format': 'bestvideo', - }, - 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - resp = self._download_json( - 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', headers={ - 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '3.0.0', - }) - video = resp['video'] - - self._check_errors(video) - - title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) - episode_number = int_or_none(video.get('number')) - if not title: - title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} - container_title = self.dict_selection(container_titles, 'en') - title = '%s - %s' % (container_title, title) - - description = self.dict_selection(video.get('descriptions', {}), 'en') - - like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) - - thumbnails = [] - for thumbnail_id, thumbnail in (video.get('images') or {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail.get('url'), - }) - - subtitles = {} - for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': subtitles_format, - 'url': self._prepare_call( - 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), - } for subtitles_format in ('srt', 'vtt')] - - result = { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': 
int_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('created_at')), - 'uploader': video.get('author'), - 'uploader_url': video.get('author_url'), - 'like_count': like_count, - 'age_limit': parse_age_limit(video.get('rating')), - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'episode_number': episode_number, - } - - formats = [] - - def add_format(format_id, format_dict, protocol='http'): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - return - format_url = format_dict.get('url') - if not format_url: - return - qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) - stream = qs.get('stream', [None])[0] - if stream: - format_url = base64.b64decode(stream).decode() - if format_id in ('m3u8', 'hls'): - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if '_drm/index_' in f['url']: - continue - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.append(f) - elif format_id in ('mpd', 'dash'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', - format_url) - if not mobj: - return - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)), - }) - - for format_id, format_dict in (resp.get('streams') or {}).items(): - add_format(format_id, format_dict) - if not formats: - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - - for format_id, stream_dict in streams.items(): - for protocol, format_dict in stream_dict.items(): - add_format(format_id, format_dict, protocol) - self._sort_formats(formats) - - result['formats'] = formats - return result - - -class VikiChannelIE(VikiBaseIE): - IE_NAME = 'viki:channel' - _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', - 'info_dict': { - 'id': '50c', - 'title': 'Boys Over Flowers', - 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', - }, - 'playlist_mincount': 71, - }, { - 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', - 'info_dict': { - 'id': '1354c', - 'title': 'Poor Nastya [COMPLETE]', - 'description': 'md5:05bf5471385aa8b21c18ad450e350525', - }, - 'playlist_count': 127, - 'skip': 'Page not found', - }, { - 'url': 'http://www.viki.com/news/24569c-showbiz-korea', - 'only_matching': True, - }, { - 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', - 'only_matching': True, - }, { - 'url': 'http://www.viki.com/artists/2141c-shinee', - 'only_matching': True, - }] - - _PER_PAGE = 25 - - def _real_extract(self, url): - channel_id = self._match_id(url) - - channel = self._call_api( - 'containers/%s.json' % channel_id, 
channel_id, - 'Downloading channel JSON') - - self._check_errors(channel) - - title = self.dict_selection(channel['titles'], 'en') - - description = self.dict_selection(channel['descriptions'], 'en') - - entries = [] - for video_type in ('episodes', 'clips', 'movies'): - for page_num in itertools.count(1): - page = self._call_api( - 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' - % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, - 'Downloading %s JSON page #%d' % (video_type, page_num)) - for video in page['response']: - video_id = video['id'] - entries.append(self.url_result( - 'https://www.viki.com/videos/%s' % video_id, 'Viki')) - if not page['pagination']['next']: - break - - return self.playlist_result(entries, channel_id, title, description) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py deleted file mode 100644 index 0b386f450..000000000 --- a/youtube_dl/extractor/vimeo.py +++ /dev/null @@ -1,1158 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import functools -import re -import itertools - -from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_HTTPError, - compat_str, - compat_urlparse, -) -from ..utils import ( - clean_html, - determine_ext, - ExtractorError, - get_element_by_class, - js_to_json, - int_or_none, - merge_dicts, - OnDemandPagedList, - parse_filesize, - parse_iso8601, - sanitized_Request, - smuggle_url, - std_headers, - str_or_none, - try_get, - unified_timestamp, - unsmuggle_url, - urlencode_postdata, - urljoin, - unescapeHTML, -) - - -class VimeoBaseInfoExtractor(InfoExtractor): - _NETRC_MACHINE = 'vimeo' - _LOGIN_REQUIRED = False - _LOGIN_URL = 'https://vimeo.com/log_in' - - def _login(self): - username, password = self._get_login_info() - if username is None: - if self._LOGIN_REQUIRED: - raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) - return - webpage = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = { - 'action': 'login', - 'email': username, - 'password': password, - 'service': 'vimeo', - 'token': token, - } - self._set_vimeo_cookie('vuid', vuid) - try: - self._download_webpage( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418: - raise ExtractorError( - 'Unable to log in: bad username or password', - expected=True) - raise ExtractorError('Unable to log in') - - def _get_video_password(self): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError( - 'This video is protected by a password, use the --video-password option', - expected=True) - return password - - def _verify_video_password(self, url, video_id, password, token, vuid): - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - url + '/password', video_id, 'Verifying the password', - 'Wrong password', data=urlencode_postdata({ - 'password': password, - 'token': token, - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': url, - }) - - def _extract_xsrft_and_vuid(self, webpage): - xsrft = self._search_regex( - r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', - webpage, 'login token', group='xsrft') - vuid = self._search_regex( - r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1', - webpage, 'vuid', group='vuid') - return xsrft, vuid - - def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs): - vimeo_config = self._search_regex( - r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', - webpage, 'vimeo config', *args, **compat_kwargs(kwargs)) - if vimeo_config: - return self._parse_json(vimeo_config, video_id) - - def _set_vimeo_cookie(self, name, value): - self._set_cookie('vimeo.com', name, value) - - def _vimeo_sort_formats(self, formats): - # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. This lead to wrong sorting. 
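The broken-bitrate caveat above is the whole reason _vimeo_sort_formats overrides the field preference. A minimal stand-in (not youtube-dl's real _sort_formats, which weighs many more fields) shows why sorting on trustworthy fields first and demoting tbr keeps the order stable even when kbps and bps values are mixed in one manifest; the sample format dicts are invented:

    # Formats are ordered worst-first, best-last (youtube-dl's convention).
    def sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr')):
        def sort_key(f):
            # Missing fields sort as the lowest possible value.
            return tuple(f.get(field) or 0 for field in field_preference)
        formats.sort(key=sort_key)

    fmts = [
        {'format_id': 'hls-720', 'height': 720, 'tbr': 2800000},  # bps leaked in
        {'format_id': 'hls-1080', 'height': 1080, 'tbr': 5000},   # plausible kbps
    ]
    sort_formats(fmts)
    assert fmts[-1]['format_id'] == 'hls-1080'  # best last despite the bogus tbr

Because height outranks tbr in the preference tuple, the mislabelled bitrate never gets a chance to reorder the list.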
- self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) - - def _parse_config(self, config, video_id): - video_data = config['video'] - video_title = video_data['title'] - live_event = video_data.get('live_event') or {} - is_live = live_event.get('status') == 'started' - request = config.get('request') or {} - - formats = [] - config_files = video_data.get('files') or request.get('files') or {} - for f in (config_files.get('progressive') or []): - video_url = f.get('url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'format_id': 'http-%s' % f.get('quality'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'fps': int_or_none(f.get('fps')), - 'tbr': int_or_none(f.get('bitrate')), - }) - - # TODO: fix handling of 308 status code returned for live archive manifest requests - sep_pattern = r'/sep/video/' - for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): - manifest_url = cdn_data.get('url') - if not manifest_url: - continue - format_id = '%s-%s' % (files_type, cdn_name) - sep_manifest_urls = [] - if re.search(sep_pattern, manifest_url): - for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): - sep_manifest_urls.append((format_id + suffix, re.sub( - sep_pattern, '/%s/' % repl, manifest_url))) - else: - sep_manifest_urls = [(format_id, manifest_url)] - for f_id, m_url in sep_manifest_urls: - if files_type == 'hls': - formats.extend(self._extract_m3u8_formats( - m_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id, - note='Downloading %s m3u8 information' % cdn_name, - fatal=False)) - elif files_type == 'dash': - if 'json=1' in m_url: - real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') - if real_m_url: - m_url = real_m_url - mpd_formats = self._extract_mpd_formats( - m_url.replace('/master.json', '/master.mpd'), video_id, f_id, - 'Downloading %s MPD information' % cdn_name, - fatal=False) - formats.extend(mpd_formats) - - live_archive = live_event.get('archive') or {} - live_archive_source_url = live_archive.get('source_url') - if live_archive_source_url and live_archive.get('status') == 'done': - formats.append({ - 'format_id': 'live-archive-source', - 'url': live_archive_source_url, - 'preference': 1, - }) - - for f in formats: - if f.get('vcodec') == 'none': - f['preference'] = -50 - elif f.get('acodec') == 'none': - f['preference'] = -40 - - subtitles = {} - for tt in (request.get('text_tracks') or []): - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] - - thumbnails = [] - if not is_live: - for key, thumb in (video_data.get('thumbs') or {}).items(): - thumbnails.append({ - 'id': key, - 'width': int_or_none(key), - 'url': thumb, - }) - thumbnail = video_data.get('thumbnail') - if thumbnail: - thumbnails.append({ - 'url': thumbnail, - }) - - owner = video_data.get('owner') or {} - video_uploader_url = owner.get('url') - - return { - 'id': str_or_none(video_data.get('id')) or video_id, - 'title': self._live_title(video_title) if is_live else video_title, - 'uploader': owner.get('name'), - 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, - 'uploader_url': video_uploader_url, - 'thumbnails': thumbnails, - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - } - - def 
_extract_original_format(self, url, video_id, unlisted_hash=None): - query = {'action': 'load_download_config'} - if unlisted_hash: - query['unlisted_hash'] = unlisted_hash - download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}) - if download_data: - source_file = download_data.get('source_file') - if isinstance(source_file, dict): - download_url = source_file.get('download_url') - if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): - source_name = source_file.get('public_name', 'Original') - if self._is_valid_url(download_url, video_id, '%s video' % source_name): - ext = (try_get( - source_file, lambda x: x['extension'], - compat_str) or determine_ext( - download_url, None) or 'mp4').lower() - return { - 'url': download_url, - 'ext': ext, - 'width': int_or_none(source_file.get('width')), - 'height': int_or_none(source_file.get('height')), - 'filesize': parse_filesize(source_file.get('size')), - 'format_id': source_name, - 'preference': 1, - } - - -class VimeoIE(VimeoBaseInfoExtractor): - """Information extractor for vimeo.com.""" - - # _VALID_URL matches Vimeo URLs - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - www| - player - ) - \. - )? - vimeo(?:pro)?\.com/ - (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? - (?: - (?: - play_redirect_hls| - moogaloop\.swf)\?clip_id= - )? - (?:videos?/)? - (?P<id>[0-9]+) - (?:/(?P<unlisted_hash>[\da-f]{10}))? - /?(?:[?&].*)?(?:[#].*)?$ - ''' - IE_NAME = 'vimeo' - _TESTS = [ - { - 'url': 'http://vimeo.com/56015672#at=0', - 'md5': '8879b6cc097e987f02484baf890129e5', - 'info_dict': { - 'id': '56015672', - 'ext': 'mp4', - 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - 'description': 'md5:2d3305bad981a06ff79f027f19865021', - 'timestamp': 1355990239, - 'upload_date': '20121220', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', - 'uploader_id': 'user7108434', - 'uploader': 'Filippo Valsorda', - 'duration': 10, - 'license': 'by-sa', - }, - 'params': { - 'format': 'best[protocol=https]', - }, - }, - { - 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', - 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', - 'note': 'Vimeo Pro video (#1197)', - 'info_dict': { - 'id': '68093876', - 'ext': 'mp4', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', - 'uploader_id': 'openstreetmapus', - 'uploader': 'OpenStreetMap US', - 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', - 'duration': 1595, - 'upload_date': '20130610', - 'timestamp': 1370893156, - 'license': 'by', - }, - 'params': { - 'format': 'best[protocol=https]', - }, - }, - { - 'url': 'http://player.vimeo.com/video/54469442', - 'md5': '619b811a4417aa4abe78dc653becf511', - 'note': 'Videos that embed the url in the player page', - 'info_dict': { - 'id': '54469442', - 'ext': 'mp4', - 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', - 'uploader': 'Business of Software', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware', - 'uploader_id': 'businessofsoftware', - 'duration': 3610, - 'description': None, - }, - 'params': { - 'format': 'best[protocol=https]', - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, - { - 'url': 'http://vimeo.com/68375962', - 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', - 'note': 
'Video protected with password', - 'info_dict': { - 'id': '68375962', - 'ext': 'mp4', - 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', - 'uploader_id': 'user18948128', - 'uploader': 'Jaime Marquínez Ferrándiz', - 'duration': 10, - 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', - }, - 'params': { - 'format': 'best[protocol=https]', - 'videopassword': 'youtube-dl', - }, - }, - { - 'url': 'http://vimeo.com/channels/keypeele/75629013', - 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'info_dict': { - 'id': '75629013', - 'ext': 'mp4', - 'title': 'Key & Peele: Terrorist Interrogation', - 'description': 'md5:8678b246399b070816b12313e8b4eb5c', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', - 'uploader_id': 'atencio', - 'uploader': 'Peter Atencio', - 'channel_id': 'keypeele', - 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele', - 'timestamp': 1380339469, - 'upload_date': '20130928', - 'duration': 187, - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, - { - 'url': 'http://vimeo.com/76979871', - 'note': 'Video with subtitles', - 'info_dict': { - 'id': '76979871', - 'ext': 'mp4', - 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, - 'upload_date': '20131015', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', - 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', - 'duration': 62, - 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], - }, - } - }, - { - # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ - 'url': 'https://player.vimeo.com/video/98044508', - 'note': 'The js code contains assignments to the same variable as the config', - 'info_dict': { - 'id': '98044508', - 'ext': 'mp4', - 'title': 'Pier Solar OUYA Official Trailer', - 'uploader': 'Tulio Gonçalves', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593', - 'uploader_id': 'user28849593', - }, - }, - { - # contains original format - 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', - 'info_dict': { - 'id': '33951933', - 'ext': 'mp4', - 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', - 'uploader': 'The DMCI', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', - 'uploader_id': 'dmci', - 'timestamp': 1324343742, - 'upload_date': '20111220', - 'description': 'md5:ae23671e82d05415868f7ad1aec21147', - }, - }, - { - # only available via https://vimeo.com/channels/tributes/6213729 and - # not via https://vimeo.com/6213729 - 'url': 'https://vimeo.com/channels/tributes/6213729', - 'info_dict': { - 'id': '6213729', - 'ext': 'mp4', - 'title': 'Vimeo Tribute: The Shining', - 'uploader': 'Casey Donahue', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', - 'uploader_id': 'caseydonahue', - 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes', - 'channel_id': 'tributes', - 'timestamp': 1250886430, - 'upload_date': '20090821', - 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, - { - # redirects to ondemand extractor and should be passed through it - # for successful extraction - 'url': 'https://vimeo.com/73445910', - 'info_dict': { - 'id': '73445910', - 'ext': 'mp4', - 
'title': 'The Reluctant Revolutionary', - 'uploader': '10Ft Films', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', - 'uploader_id': 'tenfootfilms', - 'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384', - 'upload_date': '20130830', - 'timestamp': 1377853339, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download JSON metadata'], - 'skip': 'this page is no longer available.', - }, - { - 'url': 'http://player.vimeo.com/video/68375962', - 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', - 'info_dict': { - 'id': '68375962', - 'ext': 'mp4', - 'title': 'youtube-dl password protected test video', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', - 'uploader_id': 'user18948128', - 'uploader': 'Jaime Marquínez Ferrándiz', - 'duration': 10, - }, - 'params': { - 'format': 'best[protocol=https]', - 'videopassword': 'youtube-dl', - }, - }, - { - 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', - 'only_matching': True, - }, - { - 'url': 'https://vimeo.com/109815029', - 'note': 'Video not completely processed, "failed" seed status', - 'only_matching': True, - }, - { - 'url': 'https://vimeo.com/groups/travelhd/videos/22439234', - 'only_matching': True, - }, - { - 'url': 'https://vimeo.com/album/2632481/video/79010983', - 'only_matching': True, - }, - { - # source file returns 403: Forbidden - 'url': 'https://vimeo.com/7809605', - 'only_matching': True, - }, - { - 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, - }, - { - # requires passing unlisted_hash(a52724358e) to load_download_config request - 'url': 'https://vimeo.com/392479337/a52724358e', - 'only_matching': True, - } - # https://gettingthingsdone.com/workflowmap/ - # vimeo embed with check-password page protected by Referer header - ] - - @staticmethod - def _smuggle_referrer(url, referrer_url): - return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) - - @staticmethod - def _extract_urls(url, webpage): - urls = [] - # Look for embedded (iframe) Vimeo player - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', - webpage): - urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) - PLAIN_EMBED_RE = ( - # Look for embedded (swf embed) Vimeo player - r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', - # Look more for non-standard embedded Vimeo player - r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', - ) - for embed_re in PLAIN_EMBED_RE: - for mobj in re.finditer(embed_re, webpage): - urls.append(mobj.group('url')) - return urls - - @staticmethod - def _extract_url(url, webpage): - urls = VimeoIE._extract_urls(url, webpage) - return urls[0] if urls else None - - def _verify_player_video_password(self, url, video_id, headers): - password = self._get_video_password() - data = urlencode_postdata({ - 'password': base64.b64encode(password.encode()), - }) - headers = merge_dicts(headers, { - 'Content-Type': 'application/x-www-form-urlencoded', - }) - checked = self._download_json( - url + '/check-password', video_id, - 'Verifying the password', data=data, headers=headers) - if checked is False: - raise ExtractorError('Wrong video password', expected=True) - return checked - - def _real_initialize(self): - self._login() - - def _extract_from_api(self, video_id, unlisted_hash=None): - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 
'XMLHttpRequest' - })['token'] - api_url = 'https://api.vimeo.com/videos/' + video_id - if unlisted_hash: - api_url += ':' + unlisted_hash - video = self._download_json( - api_url, video_id, headers={ - 'Authorization': 'jwt ' + token, - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - info = self._parse_config(self._download_json( - video['config_url'], video_id), video_id) - self._vimeo_sort_formats(info['formats']) - get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) - info.update({ - 'description': video.get('description'), - 'license': video.get('license'), - 'release_timestamp': get_timestamp('release'), - 'timestamp': get_timestamp('created'), - 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), - }) - connections = try_get( - video, lambda x: x['metadata']['connections'], dict) or {} - for k in ('comment', 'like'): - info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) - return info - - def _real_extract(self, url): - url, data = unsmuggle_url(url, {}) - headers = std_headers.copy() - if 'http_headers' in data: - headers.update(data['http_headers']) - if 'Referer' not in headers: - headers['Referer'] = url - - mobj = re.match(self._VALID_URL, url).groupdict() - video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') - if unlisted_hash: - return self._extract_from_api(video_id, unlisted_hash) - - orig_url = url - is_pro = 'vimeopro.com/' in url - if is_pro: - # some videos require portfolio_id to be present in player url - # https://github.com/ytdl-org/youtube-dl/issues/20070 - url = self._extract_url(url, self._download_webpage(url, video_id)) - if not url: - url = 'https://vimeo.com/' + video_id - elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): - url = 'https://vimeo.com/' + video_id - - try: - # Retrieve video webpage to extract further information - webpage, urlh = self._download_webpage_handle( - url, video_id, headers=headers) - redirect_url = urlh.geturl() - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - errmsg = ee.cause.read() - if b'Because of its privacy settings, this video cannot be played here' in errmsg: - raise ExtractorError( - 'Cannot download embed-only video without embedding ' - 'URL. 
Please call youtube-dl with the URL of the page ' - 'that embeds this video.', - expected=True) - raise - - if '//player.vimeo.com/video/' in url: - config = self._parse_json(self._search_regex( - r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) - if config.get('view') == 4: - config = self._verify_player_video_password( - redirect_url, video_id, headers) - info = self._parse_config(config, video_id) - self._vimeo_sort_formats(info['formats']) - return info - - if re.search(r'<form[^>]+?id="pw_form"', webpage): - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - webpage = self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) - - vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) - if vimeo_config: - seed_status = vimeo_config.get('seed_status') or {} - if seed_status.get('state') == 'failed': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, seed_status['title']), - expected=True) - - cc_license = None - timestamp = None - video_description = None - info_dict = {} - - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - if channel_id: - config_url = self._html_search_regex( - r'\bdata-config-url="([^"]+)"', webpage, 'config URL') - video_description = clean_html(get_element_by_class('description', webpage)) - info_dict.update({ - 'channel_id': channel_id, - 'channel_url': 'https://vimeo.com/channels/' + channel_id, - }) - else: - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config', default='{}'), video_id, fatal=False) - if not page_config: - return self._extract_from_api(video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - clip = page_config.get('clip') or {} - timestamp = clip.get('uploaded_on') - video_description = clean_html( - clip.get('description') or page_config.get('description_html_escaped')) - config = self._download_json(config_url, video_id) - video = config.get('video') or {} - vod = video.get('vod') or {} - - def is_rented(): - if '>You rented this title.<' in webpage: - return True - if try_get(config, lambda x: x['user']['purchased']): - return True - for purchase_option in (vod.get('purchase_options') or []): - if purchase_option.get('purchased'): - return True - label = purchase_option.get('label_string') - if label and (label.startswith('You rented this') or label.endswith(' remaining')): - return True - return False - - if is_rented() and vod.get('is_trailer'): - feature_id = vod.get('feature_id') - if feature_id and not data.get('force_feature_id', False): - return self.url_result(smuggle_url( - 'https://player.vimeo.com/player/%s' % feature_id, - {'force_feature_id': True}), 'Vimeo') - - if not video_description: - video_description = self._html_search_meta( - ['description', 'og:description', 'twitter:description'], - webpage, default=None) - if not video_description and is_pro: - orig_webpage = self._download_webpage( - orig_url, video_id, - note='Downloading webpage for description', - fatal=False) - if orig_webpage: - video_description = self._html_search_meta( - 'description', orig_webpage, default=None) - if not video_description: - self._downloader.report_warning('Cannot find video description') - - if not timestamp: - timestamp = self._search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, - 'timestamp', default=None) - - formats = [] - - 
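The Referer and force_feature_id hand-offs used throughout this extractor both ride on smuggle_url/unsmuggle_url from youtube_dl.utils. A simplified re-implementation of the round trip (the real helpers also merge data already smuggled into the URL; Python 3 stdlib is used here for brevity, and the example page URL is invented):

    import json
    from urllib.parse import quote, unquote

    def smuggle_url(url, data):
        # Stash extractor-private data in a fragment the target extractor strips.
        return url + '#__youtubedl_smuggle=' + quote(json.dumps(data))

    def unsmuggle_url(smug_url, default=None):
        if '#__youtubedl_smuggle=' not in smug_url:
            return smug_url, default
        url, _, payload = smug_url.partition('#__youtubedl_smuggle=')
        return url, json.loads(unquote(payload))

    embed = smuggle_url('https://player.vimeo.com/video/98044508',
                        {'http_headers': {'Referer': 'https://example.com/embedder'}})
    url, data = unsmuggle_url(embed, {})
    assert url == 'https://player.vimeo.com/video/98044508'
    assert data['http_headers']['Referer'] == 'https://example.com/embedder'

Since the payload lives in the fragment, it never reaches the server; only the receiving extractor sees it.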
source_format = self._extract_original_format( - 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) - if source_format: - formats.append(source_format) - - info_dict_config = self._parse_config(config, video_id) - formats.extend(info_dict_config['formats']) - self._vimeo_sort_formats(formats) - - json_ld = self._search_json_ld(webpage, video_id, default={}) - - if not cc_license: - cc_license = self._search_regex( - r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', - webpage, 'license', default=None, group='license') - - info_dict.update({ - 'formats': formats, - 'timestamp': unified_timestamp(timestamp), - 'description': video_description, - 'webpage_url': url, - 'license': cc_license, - }) - - return merge_dicts(info_dict, info_dict_config, json_ld) - - -class VimeoOndemandIE(VimeoIE): - IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)' - _TESTS = [{ - # ondemand video not available via https://vimeo.com/id - 'url': 'https://vimeo.com/ondemand/20704', - 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35', - 'info_dict': { - 'id': '105442900', - 'ext': 'mp4', - 'title': 'המעבדה - במאי יותם פלדמן', - 'uploader': 'גם סרטים', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', - 'uploader_id': 'gumfilms', - 'description': 'md5:4c027c965e439de4baab621e48b60791', - 'upload_date': '20140906', - 'timestamp': 1410032453, - }, - 'params': { - 'format': 'best[protocol=https]', - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, { - # requires Referer to be passed along with og:video:url - 'url': 'https://vimeo.com/ondemand/36938/126682985', - 'info_dict': { - 'id': '126584684', - 'ext': 'mp4', - 'title': 'Rävlock, rätt läte på rätt plats', - 'uploader': 'Lindroth & Norin', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin', - 'uploader_id': 'lindrothnorin', - 'description': 'md5:c3c46a90529612c8279fb6af803fc0df', - 'upload_date': '20150502', - 'timestamp': 1430586422, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, { - 'url': 'https://vimeo.com/ondemand/nazmaalik', - 'only_matching': True, - }, { - 'url': 'https://vimeo.com/ondemand/141692381', - 'only_matching': True, - }, { - 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832', - 'only_matching': True, - }] - - -class VimeoChannelIE(VimeoBaseInfoExtractor): - IE_NAME = 'vimeo:channel' - _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])' - _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' - _TITLE = None - _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' - _TESTS = [{ - 'url': 'https://vimeo.com/channels/tributes', - 'info_dict': { - 'id': 'tributes', - 'title': 'Vimeo Tributes', - }, - 'playlist_mincount': 25, - }] - _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s' - - def _page_url(self, base_url, pagenum): - return '%s/videos/page:%d/' % (base_url, pagenum) - - def _extract_list_title(self, webpage): - return self._TITLE or self._html_search_regex( - self._TITLE_RE, webpage, 'list title', fatal=False) - - def _title_and_entries(self, list_id, base_url): - for pagenum in itertools.count(1): - page_url = self._page_url(base_url, pagenum) - webpage = self._download_webpage( - page_url, list_id, - 'Downloading page %s' % pagenum) - - if pagenum == 1: - yield self._extract_list_title(webpage) - - # Try extracting href first since not all videos are available via - # short https://vimeo.com/id URL (e.g. 
https://vimeo.com/channels/tributes/6213729) - clips = re.findall( - r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage) - if clips: - for video_id, video_url, video_title in clips: - yield self.url_result( - compat_urlparse.urljoin(base_url, video_url), - VimeoIE.ie_key(), video_id=video_id, video_title=video_title) - # More relaxed fallback - else: - for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): - yield self.url_result( - 'https://vimeo.com/%s' % video_id, - VimeoIE.ie_key(), video_id=video_id) - - if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: - break - - def _extract_videos(self, list_id, base_url): - title_and_entries = self._title_and_entries(list_id, base_url) - list_title = next(title_and_entries) - return self.playlist_result(title_and_entries, list_id, list_title) - - def _real_extract(self, url): - channel_id = self._match_id(url) - return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) - - -class VimeoUserIE(VimeoChannelIE): - IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos|[#?]|$)' - _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' - _TESTS = [{ - 'url': 'https://vimeo.com/nkistudio/videos', - 'info_dict': { - 'title': 'Nki', - 'id': 'nkistudio', - }, - 'playlist_mincount': 66, - }] - _BASE_URL_TEMPL = 'https://vimeo.com/%s' - - -class VimeoAlbumIE(VimeoBaseInfoExtractor): - IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P<id>\d+)(?:$|[?#]|/(?!video))' - _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' - _TESTS = [{ - 'url': 'https://vimeo.com/album/2632481', - 'info_dict': { - 'id': '2632481', - 'title': 'Staff Favorites: November 2013', - }, - 'playlist_mincount': 13, - }, { - 'note': 'Password-protected album', - 'url': 'https://vimeo.com/album/3253534', - 'info_dict': { - 'title': 'test', - 'id': '3253534', - }, - 'playlist_count': 1, - 'params': { - 'videopassword': 'youtube-dl', - } - }] - _PAGE_SIZE = 100 - - def _fetch_page(self, album_id, authorization, hashed_pass, page): - api_page = page + 1 - query = { - 'fields': 'link,uri', - 'page': api_page, - 'per_page': self._PAGE_SIZE, - } - if hashed_pass: - query['_hashed_pass'] = hashed_pass - try: - videos = self._download_json( - 'https://api.vimeo.com/albums/%s/videos' % album_id, - album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorization, - })['data'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - return - for video in videos: - link = video.get('link') - if not link: - continue - uri = video.get('uri') - video_id = self._search_regex(r'/videos/(\d+)', uri, 'video_id', default=None) if uri else None - yield self.url_result(link, VimeoIE.ie_key(), video_id) - - def _real_extract(self, url): - album_id = self._match_id(url) - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', album_id, fatal=False) - if not viewer: - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})</script>', - webpage, 'bootstrap data'), album_id)['viewer'] - jwt = viewer['jwt'] - album = self._download_json( - 'https://api.vimeo.com/albums/' + album_id, - album_id, headers={'Authorization': 'jwt ' + jwt}, - query={'fields': 'description,name,privacy'}) - hashed_pass = None - if try_get(album, lambda x: x['privacy']['view']) == 'password': - 
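Both listing styles above reduce to the same lazy pagination: VimeoChannelIE walks page:N HTML until _MORE_PAGES_INDICATOR stops matching, while VimeoAlbumIE wraps a JSON API in OnDemandPagedList. A compressed sketch of the generator form, with an invented fake_fetch standing in for the network request:

    import itertools

    def fake_fetch(pagenum):
        # Stand-in for one page request; three pages, then exhaustion.
        pages = {1: ['6213729', '75629013'], 2: ['76979871', '54469442'], 3: ['33951933']}
        return pages.get(pagenum, [])

    def iter_entries():
        for pagenum in itertools.count(1):
            clip_ids = fake_fetch(pagenum)
            if not clip_ids:  # plays the role of _MORE_PAGES_INDICATOR
                return
            for video_id in clip_ids:
                yield 'https://vimeo.com/%s' % video_id

    assert len(list(iter_entries())) == 5

Because the generator yields as it goes, a caller that only wants the first few entries never triggers the later page fetches.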
password = self._downloader.params.get('videopassword') - if not password: - raise ExtractorError( - 'This album is protected by a password, use the --video-password option', - expected=True) - self._set_vimeo_cookie('vuid', viewer['vuid']) - try: - hashed_pass = self._download_json( - 'https://vimeo.com/showcase/%s/auth' % album_id, - album_id, 'Verifying the password', data=urlencode_postdata({ - 'password': password, - 'token': viewer['xsrft'], - }), headers={ - 'X-Requested-With': 'XMLHttpRequest', - })['hashed_pass'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - raise ExtractorError('Wrong password', expected=True) - raise - entries = OnDemandPagedList(functools.partial( - self._fetch_page, album_id, jwt, hashed_pass), self._PAGE_SIZE) - return self.playlist_result( - entries, album_id, album.get('name'), album.get('description')) - - -class VimeoGroupsIE(VimeoChannelIE): - IE_NAME = 'vimeo:group' - _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)' - _TESTS = [{ - 'url': 'https://vimeo.com/groups/kattykay', - 'info_dict': { - 'id': 'kattykay', - 'title': 'Katty Kay', - }, - 'playlist_mincount': 27, - }] - _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s' - - -class VimeoReviewIE(VimeoBaseInfoExtractor): - IE_NAME = 'vimeo:review' - IE_DESC = 'Review pages on vimeo' - _VALID_URL = r'(?P<url>https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)/[0-9a-f]{10})' - _TESTS = [{ - 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', - 'md5': 'c507a72f780cacc12b2248bb4006d253', - 'info_dict': { - 'id': '75524534', - 'ext': 'mp4', - 'title': "DICK HARDWICK 'Comedian'", - 'uploader': 'Richard Hardwick', - 'uploader_id': 'user21297594', - 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, { - 'note': 'video player needs Referer', - 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', - 'md5': '6295fdab8f4bf6a002d058b2c6dce276', - 'info_dict': { - 'id': '91613211', - 'ext': 'mp4', - 'title': 're:(?i)^Death by dogma versus assembling agile . 
Sander Hoogendoorn', - 'uploader': 'DevWeek Events', - 'duration': 2773, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader_id': 'user22258446', - }, - 'skip': 'video gone', - }, { - 'note': 'Password protected', - 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', - 'info_dict': { - 'id': '138823582', - 'ext': 'mp4', - 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1', - 'uploader': 'TMB', - 'uploader_id': 'user37284429', - }, - 'params': { - 'videopassword': 'holygrail', - }, - 'skip': 'video gone', - }] - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - page_url, video_id = re.match(self._VALID_URL, url).groups() - data = self._download_json( - page_url.replace('/review/', '/review/data/'), video_id) - if data.get('isLocked') is True: - video_password = self._get_video_password() - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id) - webpage = self._verify_video_password( - 'https://vimeo.com/' + video_id, video_id, - video_password, viewer['xsrft'], viewer['vuid']) - clip_page_config = self._parse_json(self._search_regex( - r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', - webpage, 'clip page config'), video_id) - config_url = clip_page_config['player']['config_url'] - clip_data = clip_page_config.get('clip') or {} - else: - clip_data = data['clipData'] - config_url = clip_data['configUrl'] - config = self._download_json(config_url, video_id) - info_dict = self._parse_config(config, video_id) - source_format = self._extract_original_format( - page_url + '/action', video_id) - if source_format: - info_dict['formats'].append(source_format) - self._vimeo_sort_formats(info_dict['formats']) - info_dict['description'] = clean_html(clip_data.get('description')) - return info_dict - - -class VimeoWatchLaterIE(VimeoChannelIE): - IE_NAME = 'vimeo:watchlater' - IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' - _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater' - _TITLE = 'Watch Later' - _LOGIN_REQUIRED = True - _TESTS = [{ - 'url': 'https://vimeo.com/watchlater', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def _page_url(self, base_url, pagenum): - url = '%s/page:%d/' % (base_url, pagenum) - request = sanitized_Request(url) - # Set the header to get a partial html page with the ids, - # the normal page doesn't contain them. 
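The comment above describes a common server-side branch: many endpoints return a trimmed HTML fragment instead of the full page when they see an XHR-style request. A minimal stdlib sketch of the same header trick (whether vimeo.com still honours it is not guaranteed; the request is only constructed, not sent):

    import urllib.request

    req = urllib.request.Request('https://vimeo.com/watchlater/page:1/')
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    # urllib.request.urlopen(req) would now yield the partial page that
    # carries the clip_<id> markers the normal page omits.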
- request.add_header('X-Requested-With', 'XMLHttpRequest') - return request - - def _real_extract(self, url): - return self._extract_videos('watchlater', 'https://vimeo.com/watchlater') - - -class VimeoLikesIE(VimeoChannelIE): - _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P<id>[^/]+)/likes/?(?:$|[?#]|sort:)' - IE_NAME = 'vimeo:likes' - IE_DESC = 'Vimeo user likes' - _TESTS = [{ - 'url': 'https://vimeo.com/user755559/likes/', - 'playlist_mincount': 293, - 'info_dict': { - 'id': 'user755559', - 'title': 'urza’s Likes', - }, - }, { - 'url': 'https://vimeo.com/stormlapse/likes', - 'only_matching': True, - }] - - def _page_url(self, base_url, pagenum): - return '%s/page:%d/' % (base_url, pagenum) - - def _real_extract(self, url): - user_id = self._match_id(url) - return self._extract_videos(user_id, 'https://vimeo.com/%s/likes' % user_id) - - -class VHXEmbedIE(VimeoBaseInfoExtractor): - IE_NAME = 'vhx:embed' - _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) - return unescapeHTML(mobj.group(1)) if mobj else None - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - config_url = self._parse_json(self._search_regex( - r'window\.OTTData\s*=\s*({.+})', webpage, - 'ott data'), video_id, js_to_json)['config_url'] - config = self._download_json(config_url, video_id) - info = self._parse_config(config, video_id) - info['id'] = video_id - self._vimeo_sort_formats(info['formats']) - return info diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py deleted file mode 100644 index 80b896b56..000000000 --- a/youtube_dl/extractor/vine.py +++ /dev/null @@ -1,154 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - int_or_none, - unified_timestamp, -) - - -class VineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)' - _TESTS = [{ - 'url': 'https://vine.co/v/b9KOOWX7HUx', - 'md5': '2f36fed6235b16da96ce9b4dc890940d', - 'info_dict': { - 'id': 'b9KOOWX7HUx', - 'ext': 'mp4', - 'title': 'Chicken.', - 'alt_title': 'Vine by Jack', - 'timestamp': 1368997951, - 'upload_date': '20130519', - 'uploader': 'Jack', - 'uploader_id': '76', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/v/e192BnZnZ9V', - 'info_dict': { - 'id': 'e192BnZnZ9V', - 'ext': 'mp4', - 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', - 'alt_title': 'Vine by Pimry_zaa', - 'timestamp': 1436057405, - 'upload_date': '20150705', - 'uploader': 'Pimry_zaa', - 'uploader_id': '1135760698325307392', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://vine.co/v/MYxVapFvz2z', - 'only_matching': True, - }, { - 'url': 'https://vine.co/v/bxVjBbZlPUH', - 'only_matching': True, - }, { - 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_json( - 'https://archive.vine.co/posts/%s.json' % video_id, video_id) - - def video_url(kind): - for url_suffix in ('Url', 'URL'): - format_url = data.get('video%s%s' % 
(kind, url_suffix)) - if format_url: - return format_url - - formats = [] - for quality, format_id in enumerate(('low', '', 'dash')): - format_url = video_url(format_id.capitalize()) - if not format_url: - continue - # DASH link returns plain mp4 - if format_id == 'dash' and determine_ext(format_url) == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id or 'standard', - 'quality': quality, - }) - self._sort_formats(formats) - - username = data.get('username') - - alt_title = 'Vine by %s' % username if username else None - - return { - 'id': video_id, - 'title': data.get('description') or alt_title or 'Vine video', - 'alt_title': alt_title, - 'thumbnail': data.get('thumbnailUrl'), - 'timestamp': unified_timestamp(data.get('created')), - 'uploader': username, - 'uploader_id': data.get('userIdStr'), - 'view_count': int_or_none(data.get('loops')), - 'like_count': int_or_none(data.get('likes')), - 'comment_count': int_or_none(data.get('comments')), - 'repost_count': int_or_none(data.get('reposts')), - 'formats': formats, - } - - -class VineUserIE(InfoExtractor): - IE_NAME = 'vine:user' - _VALID_URL = r'https?://vine\.co/(?P<u>u/)?(?P<user>[^/]+)' - _VINE_BASE_URL = 'https://vine.co/' - _TESTS = [{ - 'url': 'https://vine.co/itsruthb', - 'info_dict': { - 'id': 'itsruthb', - 'title': 'Ruth B', - 'description': '| Instagram/Twitter: itsruthb | still a lost boy from neverland', - }, - 'playlist_mincount': 611, - }, { - 'url': 'https://vine.co/u/942914934646415360', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if VineIE.suitable(url) else super(VineUserIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user = mobj.group('user') - u = mobj.group('u') - - profile_url = '%sapi/users/profiles/%s%s' % ( - self._VINE_BASE_URL, 'vanity/' if not u else '', user) - profile_data = self._download_json( - profile_url, user, note='Downloading user profile data') - - data = profile_data['data'] - user_id = data.get('userId') or data['userIdStr'] - profile = self._download_json( - 'https://archive.vine.co/profiles/%s.json' % user_id, user_id) - entries = [ - self.url_result( - 'https://vine.co/v/%s' % post_id, ie='Vine', video_id=post_id) - for post_id in profile['posts'] - if post_id and isinstance(post_id, compat_str)] - return self.playlist_result( - entries, user, profile.get('username'), profile.get('description')) diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py deleted file mode 100644 index 3bd37525b..000000000 --- a/youtube_dl/extractor/viu.py +++ /dev/null @@ -1,272 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_str, -) -from ..utils import ( - ExtractorError, - int_or_none, -) - - -class ViuBaseIE(InfoExtractor): - def _real_initialize(self): - viu_auth_res = self._request_webpage( - 'https://www.viu.com/api/apps/v2/authenticate', None, - 'Requesting Viu auth', query={ - 'acct': 'test', - 'appid': 'viu_desktop', - 'fmt': 'json', - 'iid': 'guest', - 'languageid': 'default', - 'platform': 'desktop', - 'userid': 'guest', - 'useridtype': 'guest', - 'ver': '1.0' - }, headers=self.geo_verification_headers()) - self._auth_token = viu_auth_res.info()['X-VIU-AUTH'] - - def _call_api(self, path, *args, **kwargs): - headers = self.geo_verification_headers() - 
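ViuBaseIE's flow here is a handshake-then-echo pattern: one authenticate call returns a token in the X-VIU-AUTH response header, and every later API call must send it back. A self-contained sketch with a toy in-memory transport (the payload shapes are simplified from the code above, not Viu's actual API contract):

    def fetch(url, headers):
        # Toy transport so the sketch runs offline; returns (resp_headers, body).
        if url.endswith('/authenticate'):
            return {'X-VIU-AUTH': 'token-123'}, {'status': 'success'}
        if headers.get('X-VIU-AUTH') != 'token-123':
            return {}, {'status': 'error', 'message': 'missing auth token'}
        return {}, {'status': 'success', 'response': {'item': []}}

    class Client:
        def __init__(self):
            # One handshake up front, as in ViuBaseIE._real_initialize().
            resp_headers, _ = fetch('https://www.viu.com/api/apps/v2/authenticate', {})
            self._auth_token = resp_headers['X-VIU-AUTH']

        def call_api(self, path):
            # Echo the token on every request, as in ViuBaseIE._call_api().
            _, body = fetch('https://www.viu.com/api/' + path,
                            {'X-VIU-AUTH': self._auth_token})
            if body.get('status') != 'success':
                raise RuntimeError('viu said: %s' % body.get('message'))
            return body['response']

    assert Client().call_api('clip/load') == {'item': []}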
headers.update({ - 'X-VIU-AUTH': self._auth_token - }) - headers.update(kwargs.get('headers', {})) - kwargs['headers'] = headers - response = self._download_json( - 'https://www.viu.com/api/' + path, *args, - **compat_kwargs(kwargs))['response'] - if response.get('status') != 'success': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, response['message']), expected=True) - return response - - -class ViuIE(ViuBaseIE): - _VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059', - 'info_dict': { - 'id': '1116705532', - 'ext': 'mp4', - 'title': 'Citizen Khan - Ep 1', - 'description': 'md5:d7ea1604f49e5ba79c212c551ce2110e', - }, - 'params': { - 'skip_download': 'm3u8 download', - }, - 'skip': 'Geo-restricted to India', - }, { - 'url': 'https://www.viu.com/en/media/1130599965', - 'info_dict': { - 'id': '1130599965', - 'ext': 'mp4', - 'title': 'Jealousy Incarnate - Episode 1', - 'description': 'md5:d3d82375cab969415d2720b6894361e9', - }, - 'params': { - 'skip_download': 'm3u8 download', - }, - 'skip': 'Geo-restricted to Indonesia', - }, { - 'url': 'https://india.viu.com/en/media/1126286865', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_data = self._call_api( - 'clip/load', video_id, 'Downloading video data', query={ - 'appid': 'viu_desktop', - 'fmt': 'json', - 'id': video_id - })['item'][0] - - title = video_data['title'] - - m3u8_url = None - url_path = video_data.get('urlpathd') or video_data.get('urlpath') - tdirforwhole = video_data.get('tdirforwhole') - # #EXT-X-BYTERANGE is not supported by native hls downloader - # and ffmpeg (#10955) - # hls_file = video_data.get('hlsfile') - hls_file = video_data.get('jwhlsfile') - if url_path and tdirforwhole and hls_file: - m3u8_url = '%s/%s/%s' % (url_path, tdirforwhole, hls_file) - else: - # m3u8_url = re.sub( - # r'(/hlsc_)[a-z]+(\d+\.m3u8)', - # r'\1whe\2', video_data['href']) - m3u8_url = video_data['href'] - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - self._sort_formats(formats) - - subtitles = {} - for key, value in video_data.items(): - mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) - if not mobj: - continue - subtitles.setdefault(mobj.group('lang'), []).append({ - 'url': value, - 'ext': mobj.group('ext') - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'series': video_data.get('moviealbumshowname'), - 'episode': title, - 'episode_number': int_or_none(video_data.get('episodeno')), - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, - 'subtitles': subtitles, - } - - -class ViuPlaylistIE(ViuBaseIE): - IE_NAME = 'viu:playlist' - _VALID_URL = r'https?://www\.viu\.com/[^/]+/listing/playlist-(?P<id>\d+)' - _TEST = { - 'url': 'https://www.viu.com/en/listing/playlist-22461380', - 'info_dict': { - 'id': '22461380', - 'title': 'The Good Wife', - }, - 'playlist_count': 16, - 'skip': 'Geo-restricted to Indonesia', - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - playlist_data = self._call_api( - 'container/load', playlist_id, - 'Downloading playlist info', query={ - 'appid': 'viu_desktop', - 'fmt': 'json', - 'id': 'playlist-' + playlist_id - })['container'] - - entries = [] - for item in playlist_data.get('item', []): - item_id = item.get('id') - if not item_id: - continue - item_id = compat_str(item_id) - 
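
[editor's note] A self-contained sketch of how the ViuIE subtitle loop above folds flat 'subtitle_<lang>_<ext>' keys into a language-to-tracks mapping; the payload below is a made-up example, not real API output.

    import re

    def collect_subtitles(video_data):
        subtitles = {}
        for key, value in video_data.items():
            mobj = re.match(r'subtitle_(?P<lang>[^_]+)_(?P<ext>vtt|srt)', key)
            if not mobj:
                continue
            # one language may carry several tracks (e.g. both vtt and srt)
            subtitles.setdefault(mobj.group('lang'), []).append(
                {'url': value, 'ext': mobj.group('ext')})
        return subtitles

    print(collect_subtitles({'subtitle_en_vtt': 'https://example.com/en.vtt',
                             'subtitle_th_srt': 'https://example.com/th.srt'}))
    # {'en': [{'url': ..., 'ext': 'vtt'}], 'th': [{'url': ..., 'ext': 'srt'}]}
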
entries.append(self.url_result( - 'viu:' + item_id, 'Viu', item_id)) - - return self.playlist_result( - entries, playlist_id, playlist_data.get('title')) - - -class ViuOTTIE(InfoExtractor): - IE_NAME = 'viu:ott' - _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/[a-z]{2}-[a-z]{2}/vod/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I', - 'info_dict': { - 'id': '3421', - 'ext': 'mp4', - 'title': 'A New Beginning', - 'description': 'md5:1e7486a619b6399b25ba6a41c0fe5b2c', - }, - 'params': { - 'skip_download': 'm3u8 download', - }, - 'skip': 'Geo-restricted to Singapore', - }, { - 'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90', - 'info_dict': { - 'id': '7123', - 'ext': 'mp4', - 'title': '這就是我的生活之道', - 'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f', - }, - 'params': { - 'skip_download': 'm3u8 download', - }, - 'skip': 'Geo-restricted to Hong Kong', - }] - - _AREA_ID = { - 'HK': 1, - 'SG': 2, - 'TH': 4, - 'PH': 5, - } - - def _real_extract(self, url): - country_code, video_id = re.match(self._VALID_URL, url).groups() - - query = { - 'r': 'vod/ajax-detail', - 'platform_flag_label': 'web', - 'product_id': video_id, - } - - area_id = self._AREA_ID.get(country_code.upper()) - if area_id: - query['area_id'] = area_id - - product_data = self._download_json( - 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, - 'Downloading video info', query=query)['data'] - - video_data = product_data.get('current_product') - if not video_data: - raise ExtractorError('This video is not available in your region.', expected=True) - - stream_data = self._download_json( - 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, - video_id, 'Downloading stream info', query={ - 'ccs_product_id': video_data['ccs_product_id'], - }, headers={ - 'Referer': url, - 'Origin': re.search(r'https?://[^/]+', url).group(0), - })['data']['stream'] - - stream_sizes = stream_data.get('size', {}) - formats = [] - for vid_format, stream_url in stream_data.get('url', {}).items(): - height = int_or_none(self._search_regex( - r's(\d+)p', vid_format, 'height', default=None)) - formats.append({ - 'format_id': vid_format, - 'url': stream_url, - 'height': height, - 'ext': 'mp4', - 'filesize': int_or_none(stream_sizes.get(vid_format)) - }) - self._sort_formats(formats) - - subtitles = {} - for sub in video_data.get('subtitle', []): - sub_url = sub.get('url') - if not sub_url: - continue - subtitles.setdefault(sub.get('name'), []).append({ - 'url': sub_url, - 'ext': 'srt', - }) - - title = video_data['synopsis'].strip() - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'series': product_data.get('series', {}).get('name'), - 'episode': title, - 'episode_number': int_or_none(video_data.get('number')), - 'duration': int_or_none(stream_data.get('duration')), - 'thumbnail': video_data.get('cover_image_url'), - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py deleted file mode 100644 index 6b3513ee0..000000000 --- a/youtube_dl/extractor/vk.py +++ /dev/null @@ -1,689 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import collections -import functools -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - clean_html, - ExtractorError, - get_element_by_class, - int_or_none, - OnDemandPagedList, - 
orderedSet, - str_or_none, - str_to_int, - unescapeHTML, - unified_timestamp, - url_or_none, - urlencode_postdata, -) -from .dailymotion import DailymotionIE -from .odnoklassniki import OdnoklassnikiIE -from .pladform import PladformIE -from .vimeo import VimeoIE -from .youtube import YoutubeIE - - -class VKBaseIE(InfoExtractor): - _NETRC_MACHINE = 'vk' - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_page, url_handle = self._download_webpage_handle( - 'https://vk.com', None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'email': username.encode('cp1251'), - 'pass': password.encode('cp1251'), - }) - - # vk serves two same remixlhk cookies in Set-Cookie header and expects - # first one to be actually set - self._apply_first_set_cookie_header(url_handle, 'remixlhk') - - login_page = self._download_webpage( - 'https://login.vk.com/?act=login', None, - note='Logging in', - data=urlencode_postdata(login_form)) - - if re.search(r'onLoginFailed', login_page): - raise ExtractorError( - 'Unable to login, incorrect username and/or password', expected=True) - - def _real_initialize(self): - self._login() - - def _download_payload(self, path, video_id, data, fatal=True): - data['al'] = 1 - code, payload = self._download_json( - 'https://vk.com/%s.php' % path, video_id, - data=urlencode_postdata(data), fatal=fatal, - headers={'X-Requested-With': 'XMLHttpRequest'})['payload'] - if code == '3': - self.raise_login_required() - elif code == '8': - raise ExtractorError(clean_html(payload[0][1:-1]), expected=True) - return payload - - -class VKIE(VKBaseIE): - IE_NAME = 'vk' - IE_DESC = 'VK' - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?:(?:m|new)\.)?vk\.com/video_| - (?:www\.)?daxab.com/ - ) - ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| - (?: - (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| - (?:www\.)?daxab.com/embed/ - ) - (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
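
[editor's note] The login code above notes that vk.com sends the same 'remixlhk' cookie twice in Set-Cookie and expects the first value to win, while cookie jars normally keep the last. A rough standalone illustration of picking the first occurrence; the header list is hypothetical sample data.

    from http.cookies import SimpleCookie

    def first_cookie_value(set_cookie_headers, name):
        # return the value from the first Set-Cookie header that defines `name`
        for header in set_cookie_headers:
            cookie = SimpleCookie()
            cookie.load(header)
            if name in cookie:
                return cookie[name].value

    headers = ['remixlhk=first; path=/', 'remixlhk=second; path=/']
    assert first_cookie_value(headers, 'remixlhk') == 'first'
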
- ) - ''' - _TESTS = [ - { - 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'md5': '7babad3b85ea2e91948005b1b8b0cb84', - 'info_dict': { - 'id': '-77521_162222515', - 'ext': 'mp4', - 'title': 'ProtivoGunz - Хуёвая песня', - 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', - 'uploader_id': '-77521', - 'duration': 195, - 'timestamp': 1329049880, - 'upload_date': '20120212', - }, - }, - { - 'url': 'http://vk.com/video205387401_165548505', - 'info_dict': { - 'id': '205387401_165548505', - 'ext': 'mp4', - 'title': 'No name', - 'uploader': 'Tom Cruise', - 'uploader_id': '205387401', - 'duration': 9, - 'timestamp': 1374364108, - 'upload_date': '20130720', - } - }, - { - 'note': 'Embedded video', - 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa', - 'md5': '7babad3b85ea2e91948005b1b8b0cb84', - 'info_dict': { - 'id': '-77521_162222515', - 'ext': 'mp4', - 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', - 'title': 'ProtivoGunz - Хуёвая песня', - 'duration': 195, - 'upload_date': '20120212', - 'timestamp': 1329049880, - 'uploader_id': '-77521', - }, - }, - { - # VIDEO NOW REMOVED - # please update if you find a video whose URL follows the same pattern - 'url': 'http://vk.com/video-8871596_164049491', - 'md5': 'a590bcaf3d543576c9bd162812387666', - 'note': 'Only available for registered users', - 'info_dict': { - 'id': '-8871596_164049491', - 'ext': 'mp4', - 'uploader': 'Триллеры', - 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', - 'duration': 8352, - 'upload_date': '20121218', - 'view_count': int, - }, - 'skip': 'Removed', - }, - { - 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', - 'info_dict': { - 'id': '-43215063_168067957', - 'ext': 'mp4', - 'uploader': 'Bro Mazter', - 'title': ' ', - 'duration': 7291, - 'upload_date': '20140328', - 'uploader_id': '223413403', - 'timestamp': 1396018030, - }, - 'skip': 'Requires vk account credentials', - }, - { - 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', - 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', - 'note': 'ivi.ru embed', - 'info_dict': { - 'id': '-43215063_169084319', - 'ext': 'mp4', - 'title': 'Книга Илая', - 'duration': 6771, - 'upload_date': '20140626', - 'view_count': int, - }, - 'skip': 'Removed', - }, - { - # video (removed?) only available with list id - 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', - 'md5': '091287af5402239a1051c37ec7b92913', - 'info_dict': { - 'id': '30481095_171201961', - 'ext': 'mp4', - 'title': 'ТюменцевВВ_09.07.2015', - 'uploader': 'Anton Ivanov', - 'duration': 109, - 'upload_date': '20150709', - 'view_count': int, - }, - 'skip': 'Removed', - }, - { - # youtube embed - 'url': 'https://vk.com/video276849682_170681728', - 'info_dict': { - 'id': 'V3K4mi0SYkc', - 'ext': 'mp4', - 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate", - 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', - 'duration': 178, - 'upload_date': '20130116', - 'uploader': "Children's Joy Foundation Inc.", - 'uploader_id': 'thecjf', - 'view_count': int, - }, - }, - { - # dailymotion embed - 'url': 'https://vk.com/video-37468416_456239855', - 'info_dict': { - 'id': 'k3lz2cmXyRuJQSjGHUv', - 'ext': 'mp4', - 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', - 'description': 'md5:424b8e88cc873217f520e582ba28bb36', - 'uploader': 'AniLibria.Tv', - 'upload_date': '20160914', - 'uploader_id': 'x1p5vl5', - 'timestamp': 1473877246, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # video key is extra_data not url\d+ - 'url': 'http://vk.com/video-110305615_171782105', - 'md5': 'e13fcda136f99764872e739d13fac1d1', - 'info_dict': { - 'id': '-110305615_171782105', - 'ext': 'mp4', - 'title': 'S-Dance, репетиции к The way show', - 'uploader': 'THE WAY SHOW | 17 апреля', - 'uploader_id': '-110305615', - 'timestamp': 1454859345, - 'upload_date': '20160207', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # finished live stream, postlive_mp4 - 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', - 'info_dict': { - 'id': '-387766_456242764', - 'ext': 'mp4', - 'title': 'ИгроМир 2016 День 1 — Игромания Утром', - 'uploader': 'Игромания', - 'duration': 5239, - # TODO: use act=show to extract view_count - # 'view_count': int, - 'upload_date': '20160929', - 'uploader_id': '-387766', - 'timestamp': 1475137527, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # live stream, hls and rtmp links, most likely already finished live - # stream by the time you are reading this comment - 'url': 'https://vk.com/video-140332_456239111', - 'only_matching': True, - }, - { - # removed video, just testing that we match the pattern - 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', - 'only_matching': True, - }, - { - # age restricted video, requires vk account credentials - 'url': 'https://vk.com/video205387401_164765225', - 'only_matching': True, - }, - { - # pladform embed - 'url': 'https://vk.com/video-76116461_171554880', - 'only_matching': True, - }, - { - 'url': 'http://new.vk.com/video205387401_165548505', - 'only_matching': True, - }, - { - # This video is no longer available, because its author has been blocked. - 'url': 'https://vk.com/video-10639516_456240611', - 'only_matching': True, - }, - { - # The video is not available in your region. - 'url': 'https://vk.com/video-51812607_171445436', - 'only_matching': True, - }] - - @staticmethod - def _extract_sibnet_urls(webpage): - # https://help.sibnet.ru/?sibnet_video_embed - return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1', - webpage)] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - - mv_data = {} - if video_id: - data = { - 'act': 'show_inline', - 'video': video_id, - } - # Some videos (removed?) 
can only be downloaded with list id specified - list_id = mobj.group('list_id') - if list_id: - data['list'] = list_id - - payload = self._download_payload('al_video', video_id, data) - info_page = payload[1] - opts = payload[-1] - mv_data = opts.get('mvData') or {} - player = opts.get('player') or {} - else: - video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - - info_page = self._download_webpage( - 'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) - - error_message = self._html_search_regex( - [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', - r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], - info_page, 'error message', default=None) - if error_message: - raise ExtractorError(error_message, expected=True) - - if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): - raise ExtractorError( - 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', - expected=True) - - ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' - - ERRORS = { - r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - ERROR_COPYRIGHT, - - r'>The video .*? was removed from public access by request of the copyright holder.<': - ERROR_COPYRIGHT, - - r'<!>Please log in or <': - 'Video %s is only available for registered users, ' - 'use --username and --password options to provide account credentials.', - - r'<!>Unknown error': - 'Video %s does not exist.', - - r'<!>Видео временно недоступно': - 'Video %s is temporarily unavailable.', - - r'<!>Access denied': - 'Access denied to video %s.', - - r'<!>Видеозапись недоступна, так как её автор был заблокирован.': - 'Video %s is no longer available, because its author has been blocked.', - - r'<!>This video is no longer available, because its author has been blocked.': - 'Video %s is no longer available, because its author has been blocked.', - - r'<!>This video is no longer available, because it has been deleted.': - 'Video %s is no longer available, because it has been deleted.', - - r'<!>The video .+? 
is not available in your region.': - 'Video %s is not available in your region.', - } - - for error_re, error_msg in ERRORS.items(): - if re.search(error_re, info_page): - raise ExtractorError(error_msg % video_id, expected=True) - - player = self._parse_json(self._search_regex( - r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', - info_page, 'player params'), video_id) - - youtube_url = YoutubeIE._extract_url(info_page) - if youtube_url: - return self.url_result(youtube_url, YoutubeIE.ie_key()) - - vimeo_url = VimeoIE._extract_url(url, info_page) - if vimeo_url is not None: - return self.url_result(vimeo_url, VimeoIE.ie_key()) - - pladform_url = PladformIE._extract_url(info_page) - if pladform_url: - return self.url_result(pladform_url, PladformIE.ie_key()) - - m_rutube = re.search( - r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) - if m_rutube is not None: - rutube_url = self._proto_relative_url( - m_rutube.group(1).replace('\\', '')) - return self.url_result(rutube_url) - - dailymotion_urls = DailymotionIE._extract_urls(info_page) - if dailymotion_urls: - return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) - - odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page) - if odnoklassniki_url: - return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - - sibnet_urls = self._extract_sibnet_urls(info_page) - if sibnet_urls: - return self.url_result(sibnet_urls[0]) - - m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) - if m_opts: - m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) - if m_opts_url: - opts_url = m_opts_url.group(1) - if opts_url.startswith('//'): - opts_url = 'http:' + opts_url - return self.url_result(opts_url) - - data = player['params'][0] - title = unescapeHTML(data['md_title']) - - # 2 = live - # 3 = post live (finished live) - is_live = data.get('live') == 2 - if is_live: - title = self._live_title(title) - - timestamp = unified_timestamp(self._html_search_regex( - r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, - 'upload date', default=None)) or int_or_none(data.get('date')) - - view_count = str_to_int(self._search_regex( - r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', - info_page, 'view count', default=None)) - - formats = [] - for format_id, format_url in data.items(): - format_url = url_or_none(format_url) - if not format_url or not format_url.startswith(('http', '//', 'rtmp')): - continue - if (format_id.startswith(('url', 'cache')) - or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): - height = int_or_none(self._search_regex( - r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) - formats.append({ - 'format_id': format_id, - 'url': format_url, - 'height': height, - }) - elif format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False, live=is_live)) - elif format_id == 'rtmp': - formats.append({ - 'format_id': format_id, - 'url': format_url, - 'ext': 'flv', - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'thumbnail': data.get('jpg'), - 'uploader': data.get('md_author'), - 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), - 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), - 'timestamp': timestamp, - 'view_count': view_count, - 'like_count': int_or_none(mv_data.get('likes')), - 'comment_count': int_or_none(mv_data.get('commcount')), - 
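
[editor's note] A standalone sketch of the format-key parsing in VKIE._real_extract above: player-data keys like 'url720' or 'cache480' encode the height, while 'hls'/'rtmp' name a protocol instead. The dict below is an invented stand-in for real player data.

    import re

    def vk_mp4_formats(data):
        formats = []
        for format_id, format_url in data.items():
            m = re.match(r'(?:url|cache)(\d+)', format_id)
            if not m:
                continue  # hls/rtmp/extra_data entries are handled separately
            formats.append({'format_id': format_id,
                            'url': format_url,
                            'height': int(m.group(1))})
        return formats

    print(vk_mp4_formats({'url240': 'https://example.com/240.mp4',
                          'cache720': 'https://example.com/720.mp4',
                          'hls': 'https://example.com/master.m3u8'}))
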
'is_live': is_live, - } - - -class VKUserVideosIE(VKBaseIE): - IE_NAME = 'vk:uservideos' - IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)' - _TEMPLATE_URL = 'https://vk.com/videos' - _TESTS = [{ - 'url': 'https://vk.com/videos-767561', - 'info_dict': { - 'id': '-767561_all', - }, - 'playlist_mincount': 1150, - }, { - 'url': 'https://vk.com/videos-767561?section=uploaded', - 'info_dict': { - 'id': '-767561_uploaded', - }, - 'playlist_mincount': 425, - }, { - 'url': 'http://vk.com/videos205387401', - 'only_matching': True, - }, { - 'url': 'http://vk.com/videos-77521', - 'only_matching': True, - }, { - 'url': 'http://vk.com/videos-97664626?section=all', - 'only_matching': True, - }, { - 'url': 'http://m.vk.com/videos205387401', - 'only_matching': True, - }, { - 'url': 'http://new.vk.com/videos205387401', - 'only_matching': True, - }] - _PAGE_SIZE = 1000 - _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) - - def _fetch_page(self, page_id, section, page): - l = self._download_payload('al_video', page_id, { - 'act': 'load_videos_silent', - 'offset': page * self._PAGE_SIZE, - 'oid': page_id, - 'section': section, - })[0][section]['list'] - - for video in l: - v = self._VIDEO._make(video[:2]) - video_id = '%d_%d' % (v.owner_id, v.id) - yield self.url_result( - 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id) - - def _real_extract(self, url): - page_id, section = re.match(self._VALID_URL, url).groups() - if not section: - section = 'all' - - entries = OnDemandPagedList( - functools.partial(self._fetch_page, page_id, section), - self._PAGE_SIZE) - - return self.playlist_result(entries, '%s_%s' % (page_id, section)) - - -class VKWallPostIE(VKBaseIE): - IE_NAME = 'vk:wallpost' - _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))' - _TESTS = [{ - # public page URL, audio playlist - 'url': 'https://vk.com/bs.official?w=wall-23538238_35', - 'info_dict': { - 'id': '-23538238_35', - 'title': 'Black Shadow - Wall post -23538238_35', - 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', - }, - 'playlist': [{ - 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', - 'info_dict': { - 'id': '135220665_111806521', - 'ext': 'mp4', - 'title': 'Black Shadow - Слепое Верование', - 'duration': 370, - 'uploader': 'Black Shadow', - 'artist': 'Black Shadow', - 'track': 'Слепое Верование', - }, - }, { - 'md5': '4cc7e804579122b17ea95af7834c9233', - 'info_dict': { - 'id': '135220665_111802303', - 'ext': 'mp4', - 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', - 'duration': 423, - 'uploader': 'Black Shadow', - 'artist': 'Black Shadow', - 'track': 'Война - Негасимое Бездны Пламя!', - }, - }], - 'params': { - 'skip_download': True, - 'usenetrc': True, - }, - 'skip': 'Requires vk account credentials', - }, { - # single YouTube embed, no leading - - 'url': 'https://vk.com/wall85155021_6319', - 'info_dict': { - 'id': '85155021_6319', - 'title': 'Сергей Горбунов - Wall post 85155021_6319', - }, - 'playlist_count': 1, - 'params': { - 'usenetrc': True, - }, - 'skip': 'Requires vk account credentials', - }, { - # wall page URL - 'url': 'https://vk.com/wall-23538238_35', - 'only_matching': True, - }, { - # mobile wall page URL - 'url': 'https://m.vk.com/wall-23538238_35', - 'only_matching': True, - }] - _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' - _AUDIO = collections.namedtuple('Audio', ['id', 'owner_id', 
'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads']) - - def _decode(self, enc): - dec = '' - e = n = 0 - for c in enc: - r = self._BASE64_CHARS.index(c) - cond = n % 4 - e = 64 * e + r if cond else r - n += 1 - if cond: - dec += chr(255 & e >> (-2 * n & 6)) - return dec - - def _unmask_url(self, mask_url, vk_id): - if 'audio_api_unavailable' in mask_url: - extra = mask_url.split('?extra=')[1].split('#') - func, base = self._decode(extra[1]).split(chr(11)) - mask_url = list(self._decode(extra[0])) - url_len = len(mask_url) - indexes = [None] * url_len - index = int(base) ^ vk_id - for n in range(url_len - 1, -1, -1): - index = (url_len * (n + 1) ^ index + n) % url_len - indexes[n] = index - for n in range(1, url_len): - c = mask_url[n] - index = indexes[url_len - 1 - n] - mask_url[n] = mask_url[index] - mask_url[index] = c - mask_url = ''.join(mask_url) - return mask_url - - def _real_extract(self, url): - post_id = self._match_id(url) - - webpage = self._download_payload('wkview', post_id, { - 'act': 'show', - 'w': 'wall' + post_id, - })[1] - - description = clean_html(get_element_by_class('wall_post_text', webpage)) - uploader = clean_html(get_element_by_class('author', webpage)) - - entries = [] - - for audio in re.findall(r'data-audio="([^"]+)', webpage): - audio = self._parse_json(unescapeHTML(audio), post_id) - a = self._AUDIO._make(audio[:16]) - if not a.url: - continue - title = unescapeHTML(a.title) - performer = unescapeHTML(a.performer) - entries.append({ - 'id': '%s_%s' % (a.owner_id, a.id), - 'url': self._unmask_url(a.url, a.ads['vk_id']), - 'title': '%s - %s' % (performer, title) if performer else title, - 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, - 'duration': int_or_none(a.duration), - 'uploader': uploader, - 'artist': performer, - 'track': title, - 'ext': 'mp4', - 'protocol': 'm3u8', - }) - - for video in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) - - title = 'Wall post %s' % post_id - - return self.playlist_result( - orderedSet(entries), post_id, - '%s - %s' % (uploader, title) if uploader else title, - description) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py deleted file mode 100644 index 42da34d44..000000000 --- a/youtube_dl/extractor/vlive.py +++ /dev/null @@ -1,328 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools -import json - -from .naver import NaverBaseIE -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - ExtractorError, - int_or_none, - merge_dicts, - str_or_none, - strip_or_none, - try_get, - urlencode_postdata, -) - - -class VLiveBaseIE(NaverBaseIE): - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' - - -class VLiveIE(VLiveBaseIE): - IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' - _NETRC_MACHINE = 'vlive' - _TESTS = [{ - 'url': 'http://www.vlive.tv/video/1326', - 'md5': 'cc7314812855ce56de70a06a27314983', - 'info_dict': { - 'id': '1326', - 'ext': 'mp4', - 'title': "Girl's Day's Broadcast", - 'creator': "Girl's Day", - 'view_count': int, - 'uploader_id': 'muploader_a', - }, - }, { - 'url': 'http://www.vlive.tv/video/16937', - 'info_dict': { - 'id': '16937', - 'ext': 'mp4', - 'title': '첸백시 걍방', - 'creator': 'EXO', - 'view_count': 
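
[editor's note] A commented, standalone copy of the VKWallPostIE._decode helper above: it is ordinary streaming base64 decoding, except over VK's shuffled alphabet (lower case comes first, and 'O' and '0' swap positions). Functionally identical to the deleted code; only the comments are added.

    VK_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='

    def vk_decode(enc):
        dec = ''
        e = n = 0
        for c in enc:
            r = VK_BASE64_CHARS.index(c)     # 6-bit value of this character
            cond = n % 4
            e = 64 * e + r if cond else r    # restart the accumulator every 4 chars
            n += 1
            if cond:
                # emit one byte; -2 * n & 6 cycles through shifts 4, 2, 0 for
                # the 2nd, 3rd and 4th character of each 4-character group
                dec += chr(255 & e >> (-2 * n & 6))
        return dec
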
int, - 'subtitles': 'mincount:12', - 'uploader_id': 'muploader_j', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.vlive.tv/video/129100', - 'md5': 'ca2569453b79d66e5b919e5d308bff6b', - 'info_dict': { - 'id': '129100', - 'ext': 'mp4', - 'title': '[V LIVE] [BTS+] Run BTS! 2019 - EP.71 :: Behind the scene', - 'creator': 'BTS+', - 'view_count': int, - 'subtitles': 'mincount:10', - }, - 'skip': 'This video is only available for CH+ subscribers', - }, { - 'url': 'https://www.vlive.tv/embed/1326', - 'only_matching': True, - }, { - # works only with gcc=KR - 'url': 'https://www.vlive.tv/video/225019', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def _login(self): - email, password = self._get_login_info() - if None in (email, password): - return - - def is_logged_in(): - login_info = self._download_json( - 'https://www.vlive.tv/auth/loginInfo', None, - note='Downloading login info', - headers={'Referer': 'https://www.vlive.tv/home'}) - return try_get( - login_info, lambda x: x['message']['login'], bool) or False - - LOGIN_URL = 'https://www.vlive.tv/auth/email/login' - self._request_webpage( - LOGIN_URL, None, note='Downloading login cookies') - - self._download_webpage( - LOGIN_URL, None, note='Logging in', - data=urlencode_postdata({'email': email, 'pwd': password}), - headers={ - 'Referer': LOGIN_URL, - 'Content-Type': 'application/x-www-form-urlencoded' - }) - - if not is_logged_in(): - raise ExtractorError('Unable to log in', expected=True) - - def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} - if fields: - query['fields'] = fields - try: - return self._download_json( - 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, - 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], - headers={'Referer': 'https://www.vlive.tv/'}, query=query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) - raise - - def _real_extract(self, url): - video_id = self._match_id(url) - - post = self._call_api( - 'post/v1.0/officialVideoPost-%s', video_id, - 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') - - video = post['officialVideo'] - - def get_common_fields(): - channel = post.get('channel') or {} - return { - 'title': video.get('title'), - 'creator': post.get('author', {}).get('nickname'), - 'channel': channel.get('channelName'), - 'channel_id': channel.get('channelCode'), - 'duration': int_or_none(video.get('playTime')), - 'view_count': int_or_none(video.get('playCount')), - 'like_count': int_or_none(video.get('likeCount')), - 'comment_count': int_or_none(video.get('commentCount')), - } - - video_type = video.get('type') - if video_type == 'VOD': - inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] - vod_id = video['vodId'] - return merge_dicts( - get_common_fields(), - self._extract_video_info(video_id, vod_id, inkey)) - elif video_type == 'LIVE': - status = video.get('status') - if status == 'ON_AIR': - stream_url = self._call_api( - 'old/v3/live/%s/playInfo', - video_id)['result']['adaptiveStreamUrl'] - formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') - self._sort_formats(formats) - info = get_common_fields() - info.update({ - 'title': 
self._live_title(video['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - elif status == 'ENDED': - raise ExtractorError( - 'Uploading for replay. Please wait...', expected=True) - elif status == 'RESERVED': - raise ExtractorError('Coming soon!', expected=True) - elif video.get('exposeStatus') == 'CANCEL': - raise ExtractorError( - 'We are sorry, but the live broadcast has been canceled.', - expected=True) - else: - raise ExtractorError('Unknown status ' + status) - - -class VLivePostIE(VLiveIE): - IE_NAME = 'vlive:post' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)' - _TESTS = [{ - # uploadType = SOS - 'url': 'https://www.vlive.tv/post/1-20088044', - 'info_dict': { - 'id': '1-20088044', - 'title': 'Hola estrellitas la tierra les dice hola (si era así no?) Ha...', - 'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407', - }, - 'playlist_count': 3, - }, { - # uploadType = V - 'url': 'https://www.vlive.tv/post/1-20087926', - 'info_dict': { - 'id': '1-20087926', - 'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭', - }, - 'playlist_count': 1, - }] - _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' - _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' - _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' - - def _real_extract(self, url): - post_id = self._match_id(url) - - post = self._call_api( - 'post/v1.0/post-%s', post_id, - 'attachments{video},officialVideo{videoSeq},plainBody,title') - - video_seq = str_or_none(try_get( - post, lambda x: x['officialVideo']['videoSeq'])) - if video_seq: - return self.url_result( - 'http://www.vlive.tv/video/' + video_seq, - VLiveIE.ie_key(), video_seq) - - title = post['title'] - entries = [] - for idx, video in enumerate(post['attachments']['video'].values()): - video_id = video.get('videoId') - if not video_id: - continue - upload_type = video.get('uploadType') - upload_info = video.get('uploadInfo') or {} - entry = None - if upload_type == 'SOS': - download = self._call_api( - self._SOS_TMPL, video_id)['videoUrl']['download'] - formats = [] - for f_id, f_url in download.items(): - formats.append({ - 'format_id': f_id, - 'url': f_url, - 'height': int_or_none(f_id[:-1]), - }) - self._sort_formats(formats) - entry = { - 'formats': formats, - 'id': video_id, - 'thumbnail': upload_info.get('imageUrl'), - } - elif upload_type == 'V': - vod_id = upload_info.get('videoId') - if not vod_id: - continue - inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey'] - entry = self._extract_video_info(video_id, vod_id, inkey) - if entry: - entry['title'] = '%s_part%s' % (title, idx) - entries.append(entry) - return self.playlist_result( - entries, post_id, title, strip_or_none(post.get('plainBody'))) - - -class VLiveChannelIE(VLiveBaseIE): - IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' - _TESTS = [{ - 'url': 'http://channels.vlive.tv/FCD4B', - 'info_dict': { - 'id': 'FCD4B', - 'title': 'MAMAMOO', - }, - 'playlist_mincount': 110 - }, { - 'url': 'https://www.vlive.tv/channel/FCD4B', - 'only_matching': True, - }] - - def _call_api(self, path, channel_key_suffix, channel_value, note, query): - q = { - 'app_id': self._APP_ID, - 'channel' + channel_key_suffix: channel_value, - } - q.update(query) - return self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, - channel_value, note='Downloading ' + note, query=q)['result'] - - def _real_extract(self, url): - channel_code = self._match_id(url) - - channel_seq = 
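
[editor's note] A small sketch of the SOS-attachment handling in VLivePostIE above: the download map is keyed by quality labels like '480p', so the height is everything but the trailing letter. The mapping below is invented sample data, not real API output.

    def sos_formats(download):
        formats = []
        for f_id, f_url in download.items():
            try:
                height = int(f_id[:-1])  # '720p' -> 720
            except ValueError:
                height = None            # tolerate unexpected labels
            formats.append({'format_id': f_id, 'url': f_url, 'height': height})
        return formats

    print(sos_formats({'480p': 'https://example.com/480.mp4',
                       '1080p': 'https://example.com/1080.mp4'}))
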
self._call_api( - 'decodeChannelCode', 'Code', channel_code, - 'decode channel code', {})['channelSeq'] - - channel_name = None - entries = [] - - for page_num in itertools.count(1): - video_list = self._call_api( - 'getChannelVideoList', 'Seq', channel_seq, - 'channel list page #%d' % page_num, { - # Large values of maxNumOfRows (~300 or above) may cause - # empty responses (see [1]), e.g. this happens for [2] that - # has more than 300 videos. - # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 - # 2. http://channels.vlive.tv/EDBF. - 'maxNumOfRows': 100, - 'pageNo': page_num - } - ) - - if not channel_name: - channel_name = try_get( - video_list, - lambda x: x['channelInfo']['channelName'], - compat_str) - - videos = try_get( - video_list, lambda x: x['videoList'], list) - if not videos: - break - - for video in videos: - video_id = video.get('videoSeq') - if not video_id: - continue - video_id = compat_str(video_id) - entries.append( - self.url_result( - 'http://www.vlive.tv/video/%s' % video_id, - ie=VLiveIE.ie_key(), video_id=video_id)) - - return self.playlist_result( - entries, channel_code, channel_name) diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py deleted file mode 100644 index 751b21ee5..000000000 --- a/youtube_dl/extractor/voot.py +++ /dev/null @@ -1,100 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - unified_timestamp, -) - - -class VootIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)' - _GEO_COUNTRIES = ['IN'] - _TESTS = [{ - 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', - 'info_dict': { - 'id': '0_8ledb18o', - 'ext': 'mp4', - 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', - 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', - 'timestamp': 1472162937, - 'upload_date': '20160825', - 'duration': 1146, - 'series': 'Ishq Ka Rang Safed', - 'season_number': 1, - 'episode': 'Is this the end of Kamini?', - 'episode_number': 340, - 'view_count': int, - 'like_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', - 'only_matching': True, - }, { - 'url': 'https://www.voot.com/movies/pandavas-5/424627', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - media_info = self._download_json( - 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, - query={ - 'platform': 'Web', - 'pId': 2, - 'mediaId': video_id, - }) - - status_code = try_get(media_info, lambda x: x['status']['code'], int) - if status_code != 0: - raise ExtractorError(media_info['status']['message'], expected=True) - - media = media_info['assets'] - - entry_id = media['EntryId'] - title = media['MediaName'] - formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, - video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) - - description, series, season_number, episode, episode_number = [None] * 5 - - for meta in try_get(media, lambda x: x['Metas'], list) or []: - key, value = meta.get('Key'), meta.get('Value') - if not key or not value: - continue - if key == 'ContentSynopsis': - description = value - elif key == 'RefSeriesTitle': - 
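
[editor's note] A self-contained sketch of the pagination loop in VLiveChannelIE above. The page size stays at 100 because, per the comment in the deleted code, maxNumOfRows values around 300 or above made the real API return empty lists. fetch_page is a hypothetical stand-in for the channelplus API call.

    import itertools

    def iter_channel_videos(fetch_page, page_size=100):
        for page_no in itertools.count(1):
            videos = fetch_page(page_no, page_size) or []
            if not videos:
                break            # an empty page terminates the listing
            for video in videos:
                yield video

    # example with two fake pages of IDs:
    pages = {1: ['a', 'b'], 2: ['c']}
    print(list(iter_channel_videos(lambda n, size: pages.get(n, []))))
    # ['a', 'b', 'c']
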
series = value - elif key == 'RefSeriesSeason': - season_number = int_or_none(value) - elif key == 'EpisodeMainTitle': - episode = value - elif key == 'EpisodeNo': - episode_number = int_or_none(value) - - return { - 'extractor_key': 'Kaltura', - 'id': entry_id, - 'title': title, - 'description': description, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'timestamp': unified_timestamp(media.get('CreationDate')), - 'duration': int_or_none(media.get('Duration')), - 'view_count': int_or_none(media.get('ViewCounter')), - 'like_count': int_or_none(media.get('like_counter')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py deleted file mode 100644 index 422025267..000000000 --- a/youtube_dl/extractor/vrt.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - float_or_none, - get_element_by_class, - strip_or_none, - unified_timestamp, -) - - -class VRTIE(InfoExtractor): - IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' - _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' - _TESTS = [{ - 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', - 'md5': 'e1663accf5cf13f375f3cd0d10476669', - 'info_dict': { - 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', - 'ext': 'mp4', - 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', - 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', - 'timestamp': 1557924660, - 'upload_date': '20190515', - 'duration': 31.2, - }, - }, { - 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', - 'md5': '910bba927566e9ab992278f647eb4b75', - 'info_dict': { - 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', - 'ext': 'mp4', - 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', - 'timestamp': 1557923760, - 'upload_date': '20190515', - 'duration': 115.17, - }, - }, { - 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', - 'only_matching': True, - }, { - 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', - 'only_matching': True, - }] - _CLIENT_MAP = { - 'vrt.be/vrtnws': 'vrtnieuws', - 'sporza.be': 'sporza', - } - - def _real_extract(self, url): - site, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - attrs = extract_attributes(self._search_regex( - r'(<[^>]+class="vrtvideo"[^>]*>)', webpage, 'vrt video')) - - asset_id = attrs['data-videoid'] - publication_id = attrs.get('data-publicationid') - if publication_id: - asset_id = publication_id + '$' + asset_id - client = attrs.get('data-client') or self._CLIENT_MAP[site] - - title = strip_or_none(get_element_by_class( - 'vrt-title', webpage) or self._html_search_meta( - ['og:title', 'twitter:title', 'name'], webpage)) - description = self._html_search_meta( - ['og:description', 'twitter:description', 'description'], webpage) - if description == '…': - description = None - timestamp = unified_timestamp(self._html_search_meta( - 'article:published_time', webpage)) - - return { - '_type': 
'url_transparent', - 'id': asset_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': attrs.get('data-posterimage'), - 'timestamp': timestamp, - 'duration': float_or_none(attrs.get('data-duration'), 1000), - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), - 'ie_key': 'Canvas', - } diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py deleted file mode 100644 index 6e51469b0..000000000 --- a/youtube_dl/extractor/vrv.py +++ /dev/null @@ -1,277 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import json -import hashlib -import hmac -import random -import string -import time - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_urlencode, - compat_urllib_parse, -) -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, -) - - -class VRVBaseIE(InfoExtractor): - _API_DOMAIN = None - _API_PARAMS = {} - _CMS_SIGNING = {} - _TOKEN = None - _TOKEN_SECRET = '' - - def _call_api(self, path, video_id, note, data=None): - # https://tools.ietf.org/html/rfc5849#section-3 - base_url = self._API_DOMAIN + '/core/' + path - query = [ - ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), - ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), - ('oauth_signature_method', 'HMAC-SHA1'), - ('oauth_timestamp', int(time.time())), - ] - if self._TOKEN: - query.append(('oauth_token', self._TOKEN)) - encoded_query = compat_urllib_parse_urlencode(query) - headers = self.geo_verification_headers() - if data: - data = json.dumps(data).encode() - headers['Content-Type'] = 'application/json' - base_string = '&'.join([ - 'POST' if data else 'GET', - compat_urllib_parse.quote(base_url, ''), - compat_urllib_parse.quote(encoded_query, '')]) - oauth_signature = base64.b64encode(hmac.new( - (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), - base_string.encode(), hashlib.sha1).digest()).decode() - encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') - try: - return self._download_json( - '?'.join([base_url, encoded_query]), video_id, - note='Downloading %s JSON metadata' % note, headers=headers, data=data) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) - raise - - def _call_cms(self, path, video_id, note): - if not self._CMS_SIGNING: - index = self._call_api('index', video_id, 'CMS Signing') - self._CMS_SIGNING = index.get('cms_signing') or {} - if not self._CMS_SIGNING: - for signing_policy in index.get('signing_policies', []): - signing_path = signing_policy.get('path') - if signing_path and signing_path.startswith('/cms/'): - name, value = signing_policy.get('name'), signing_policy.get('value') - if name and value: - self._CMS_SIGNING[name] = value - return self._download_json( - self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, - note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) - - def _get_cms_resource(self, resource_key, video_id): - return self._call_api( - 'cms_resource', video_id, 'resource path', data={ - 'resource_key': resource_key, - })['__links__']['cms_resource']['href'] - - def _real_initialize(self): - webpage = self._download_webpage( - 'https://vrv.co/', None, headers=self.geo_verification_headers()) - self._API_PARAMS = 
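
[editor's note] A distilled sketch of the RFC 5849 request signing done in VRVBaseIE._call_api above: percent-encode the method, base URL and query into a signature base string, then HMAC-SHA1 it with "<consumer secret>&<token secret>". All key and secret values here are placeholders, and real OAuth additionally requires the parameters to be normalized and sorted.

    import base64
    import hashlib
    import hmac
    from urllib.parse import quote, urlencode

    def oauth1_signature(method, base_url, params, consumer_secret, token_secret=''):
        base_string = '&'.join([
            method.upper(),
            quote(base_url, ''),            # safe='' percent-encodes everything
            quote(urlencode(params), ''),
        ])
        key = (consumer_secret + '&' + token_secret).encode('ascii')
        digest = hmac.new(key, base_string.encode(), hashlib.sha1).digest()
        return base64.b64encode(digest).decode()

    sig = oauth1_signature('GET', 'https://api.example.com/core/index',
                           [('oauth_consumer_key', 'key'),
                            ('oauth_nonce', 'abc'),
                            ('oauth_signature_method', 'HMAC-SHA1'),
                            ('oauth_timestamp', 1234567890)],
                           'consumer-secret')
    print(sig)
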
self._parse_json(self._search_regex( - [ - r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)', - r'window\.__APP_CONFIG__\s*=\s*({.+})' - ], webpage, 'app config'), None)['cxApiParams'] - self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - - -class VRVIE(VRVBaseIE): - IE_NAME = 'vrv' - _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' - _TESTS = [{ - 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', - 'info_dict': { - 'id': 'GR9PNZ396', - 'ext': 'mp4', - 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', - 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', - 'uploader_id': 'seeso', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # movie listing - 'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT', - 'info_dict': { - 'id': 'G6NQXZ1J6', - 'title': 'Lily C.A.T', - 'description': 'md5:988b031e7809a6aeb60968be4af7db07', - }, - 'playlist_count': 2, - }] - _NETRC_MACHINE = 'vrv' - - def _real_initialize(self): - super(VRVIE, self)._real_initialize() - - email, password = self._get_login_info() - if email is None: - return - - token_credentials = self._call_api( - 'authenticate/by:credentials', None, 'Token Credentials', data={ - 'email': email, - 'password': password, - }) - self._TOKEN = token_credentials['oauth_token'] - self._TOKEN_SECRET = token_credentials['oauth_token_secret'] - - def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): - if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): - return [] - stream_id_list = [] - if audio_lang: - stream_id_list.append('audio-%s' % audio_lang) - if hardsub_lang: - stream_id_list.append('hardsub-%s' % hardsub_lang) - format_id = stream_format - if stream_id_list: - format_id += '-' + '-'.join(stream_id_list) - if 'hls' in stream_format: - adaptive_formats = self._extract_m3u8_formats( - url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_format == 'dash': - adaptive_formats = self._extract_mpd_formats( - url, video_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - if audio_lang: - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_lang - return adaptive_formats - - def _real_extract(self, url): - video_id = self._match_id(url) - - object_data = self._call_cms(self._get_cms_resource( - 'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0] - resource_path = object_data['__links__']['resource']['href'] - video_data = self._call_cms(resource_path, video_id, 'video') - title = video_data['title'] - description = video_data.get('description') - - if video_data.get('__class__') == 'movie_listing': - items = self._call_cms( - video_data['__links__']['movie_listing/movies']['href'], - video_id, 'movie listing').get('items') or [] - if len(items) != 1: - entries = [] - for item in items: - item_id = item.get('id') - if not item_id: - continue - entries.append(self.url_result( - 'https://vrv.co/watch/' + item_id, - self.ie_key(), item_id, item.get('title'))) - return self.playlist_result(entries, video_id, title, description) - video_data = items[0] - - streams_path = video_data['__links__'].get('streams', {}).get('href') - if not streams_path: - self.raise_login_required() - streams_json = self._call_cms(streams_path, video_id, 'streams') - - audio_locale = streams_json.get('audio_locale') - formats = [] - for stream_type, 
streams in streams_json.get('streams', {}).items(): - if stream_type in ('adaptive_hls', 'adaptive_dash'): - for stream in streams.values(): - formats.extend(self._extract_vrv_formats( - stream.get('url'), video_id, stream_type.split('_')[1], - audio_locale, stream.get('hardsub_locale'))) - self._sort_formats(formats) - - subtitles = {} - for k in ('captions', 'subtitles'): - for subtitle in streams_json.get(k, {}).values(): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ - 'url': subtitle_url, - 'ext': subtitle.get('format', 'ass'), - }) - - thumbnails = [] - for thumbnail in video_data.get('images', {}).get('thumbnails', []): - thumbnail_url = thumbnail.get('source') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'description': description, - 'duration': float_or_none(video_data.get('duration_ms'), 1000), - 'uploader_id': video_data.get('channel_id'), - 'series': video_data.get('series_title'), - 'season': video_data.get('season_title'), - 'season_number': int_or_none(video_data.get('season_number')), - 'season_id': video_data.get('season_id'), - 'episode': title, - 'episode_number': int_or_none(video_data.get('episode_number')), - 'episode_id': video_data.get('production_episode_id'), - } - - -class VRVSeriesIE(VRVBaseIE): - IE_NAME = 'vrv:series' - _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)' - _TEST = { - 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider', - 'info_dict': { - 'id': 'G68VXG3G6', - }, - 'playlist_mincount': 11, - } - - def _real_extract(self, url): - series_id = self._match_id(url) - - seasons_path = self._get_cms_resource( - 'cms:/seasons?series_id=' + series_id, series_id) - seasons_data = self._call_cms(seasons_path, series_id, 'seasons') - - entries = [] - for season in seasons_data.get('items', []): - episodes_path = season['__links__']['season/episodes']['href'] - episodes = self._call_cms(episodes_path, series_id, 'episodes') - for episode in episodes.get('items', []): - episode_id = episode['id'] - entries.append(self.url_result( - 'https://vrv.co/watch/' + episode_id, - 'VRV', episode_id, episode.get('title'))) - - return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py deleted file mode 100644 index 8ce3a6b81..000000000 --- a/youtube_dl/extractor/vube.py +++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - int_or_none, - ExtractorError, -) - - -class VubeIE(InfoExtractor): - IE_NAME = 'vube' - IE_DESC = 'Vube.com' - _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' - - _TESTS = [ - { - 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s', - 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42', - 'info_dict': { - 'id': 'Y8NUZ69Tf7', - 'ext': 'mp4', - 'title': 'Best Drummer Ever [HD]', - 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', - 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'William', - 'timestamp': 1406876915, - 'upload_date': '20140801', - 'duration': 258.051, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': 
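
[editor's note] A compact sketch of the stream traversal in VRVIE._real_extract above: the 'streams' object maps 'adaptive_hls'/'adaptive_dash' to dicts keyed by hardsub locale, and each entry is fetched separately. The JSON below is invented sample data.

    def iter_streams(streams_json):
        for stream_type, streams in streams_json.get('streams', {}).items():
            if stream_type not in ('adaptive_hls', 'adaptive_dash'):
                continue
            protocol = stream_type.split('_')[1]   # 'hls' or 'dash'
            for stream in streams.values():
                if stream.get('url'):
                    yield protocol, stream['url'], stream.get('hardsub_locale')

    sample = {'streams': {'adaptive_hls': {
        '': {'url': 'https://example.com/raw.m3u8', 'hardsub_locale': None},
        'en-US': {'url': 'https://example.com/en.m3u8', 'hardsub_locale': 'en-US'},
    }}}
    print(list(iter_streams(sample)))
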
['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], - }, - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', - 'md5': 'db7aba89d4603dadd627e9d1973946fe', - 'info_dict': { - 'id': 'YL2qNPkqon', - 'ext': 'mp4', - 'title': 'Chiara Grispo - Price Tag by Jessie J', - 'description': 'md5:8ea652a1f36818352428cb5134933313', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', - 'uploader': 'Chiara.Grispo', - 'timestamp': 1388743358, - 'upload_date': '20140103', - 'duration': 170.56, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], - }, - 'skip': 'Removed due to DMCA', - }, - { - 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', - 'md5': '5d4a52492d76f72712117ce6b0d98d08', - 'info_dict': { - 'id': 'UeBhTudbfS', - 'ext': 'mp4', - 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', - 'description': 'md5:40bcacb97796339f1690642c21d56f4a', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', - 'uploader': 'Seraina', - 'timestamp': 1396492438, - 'upload_date': '20140403', - 'duration': 240.107, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['seraina', 'jessica', 'krewella', 'alive'], - }, - 'skip': 'Removed due to DMCA', - }, { - 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', - 'md5': '0584fc13b50f887127d9d1007589d27f', - 'info_dict': { - 'id': '0nmsMY5vEq', - 'ext': 'mp4', - 'title': 'Frozen - Let It Go Cover by Siren Gene', - 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', - 'uploader': 'Siren', - 'timestamp': 1395448018, - 'upload_date': '20140322', - 'duration': 221.788, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], - }, - 'skip': 'Removed due to DMCA', - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - video = self._download_json( - 'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON') - - public_id = video['public_id'] - - formats = [] - - for media in video['media'].get('video', []) + video['media'].get('audio', []): - if media['transcoding_status'] != 'processed': - continue - fmt = { - 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id), - 'abr': int(media['audio_bitrate']), - 'format_id': compat_str(media['media_resolution_id']), - } - vbr = int(media['video_bitrate']) - if vbr: - fmt.update({ - 'vbr': vbr, - 'height': int(media['height']), - }) - formats.append(fmt) - - self._sort_formats(formats) - - if not formats and video.get('vst') == 'dmca': - raise ExtractorError( - 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.', - expected=True) - - title = video['title'] - description = video.get('description') - thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') - uploader = video.get('user_alias') or 
video.get('channel') - timestamp = int_or_none(video.get('upload_time')) - duration = video['duration'] - view_count = video.get('raw_view_count') - like_count = video.get('total_likes') - dislike_count = video.get('total_hates') - - comments = video.get('comments') - comment_count = None - if comments is None: - comment_data = self._download_json( - 'http://vube.com/api/video/%s/comment' % video_id, - video_id, 'Downloading video comment JSON', fatal=False) - if comment_data is not None: - comment_count = int_or_none(comment_data.get('total')) - else: - comment_count = len(comments) - - categories = [tag['text'] for tag in video['tags']] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'comment_count': comment_count, - 'categories': categories, - } diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py deleted file mode 100644 index bc196f8a0..000000000 --- a/youtube_dl/extractor/vvvvid.py +++ /dev/null @@ -1,284 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..utils import ( - ExtractorError, - int_or_none, - str_or_none, -) - - -class VVVVIDIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' - _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE - _TESTS = [{ - # video_type == 'video/vvvvid' - 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', - 'md5': 'b8d3cecc2e981adc3835adf07f6df91b', - 'info_dict': { - 'id': '489048', - 'ext': 'mp4', - 'title': 'Ping Pong', - 'duration': 239, - 'series': '"Perché dovrei guardarlo?" 
di Dario Moccia', - 'season_id': '437', - 'episode': 'Ping Pong', - 'episode_number': 1, - 'episode_id': '3334', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # video_type == 'video/rcs' - 'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01', - 'md5': '33e0edfba720ad73a8782157fdebc648', - 'info_dict': { - 'id': '482493', - 'ext': 'mp4', - 'title': 'Episodio 01', - }, - 'params': { - 'skip_download': True, - }, - }, { - # video_type == 'video/youtube' - 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', - 'md5': '33e0edfba720ad73a8782157fdebc648', - 'info_dict': { - 'id': 'RzmFKUDOUgw', - 'ext': 'mp4', - 'title': 'Trailer', - 'upload_date': '20150906', - 'description': 'md5:a5e802558d35247fee285875328c0b80', - 'uploader_id': 'BandaiVisual', - 'uploader': 'BANDAI NAMCO Arts Channel', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', - 'only_matching': True - }] - _conn_id = None - - def _real_initialize(self): - self._conn_id = self._download_json( - 'https://www.vvvvid.it/user/login', - None, headers=self.geo_verification_headers())['data']['conn_id'] - - def _download_info(self, show_id, path, video_id, fatal=True, query=None): - q = { - 'conn_id': self._conn_id, - } - if query: - q.update(query) - response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), - video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) - if not (response or fatal): - return - if response.get('result') == 'error': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, response['message']), expected=True) - return response['data'] - - def _extract_common_video_info(self, video_data): - return { - 'thumbnail': video_data.get('thumbnail'), - 'episode_id': str_or_none(video_data.get('id')), - } - - def _real_extract(self, url): - show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() - - response = self._download_info( - show_id, 'season/%s' % season_id, - video_id, query={'video_id': video_id}) - - vid = int(video_id) - video_data = list(filter( - lambda episode: episode.get('video_id') == vid, response))[0] - title = video_data['title'] - formats = [] - - # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js - def ds(h): - g = "MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij" - - def f(m): - l = [] - o = 0 - b = False - m_len = len(m) - while ((not b) and o < m_len): - n = m[o] << 2 - o += 1 - k = -1 - j = -1 - if o < m_len: - n += m[o] >> 4 - o += 1 - if o < m_len: - k = (m[o - 1] << 4) & 255 - k += m[o] >> 2 - o += 1 - if o < m_len: - j = (m[o - 1] << 6) & 255 - j += m[o] - o += 1 - else: - b = True - else: - b = True - else: - b = True - l.append(n) - if k != -1: - l.append(k) - if j != -1: - l.append(j) - return l - - c = [] - for e in h: - c.append(g.index(e)) - - c_len = len(c) - for e in range(c_len * 2 - 1, -1, -1): - a = c[e % c_len] ^ c[(e + 1) % c_len] - c[e % c_len] = a - - c = f(c) - d = '' - for e in c: - d += chr(e) - - return d - - info = {} - - def metadata_from_url(r_url): - if not info and r_url: - mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url) - if mobj: - info['episode_number'] = int(mobj.group(2)) - season_number = mobj.group(1) - if season_number: - info['season_number'] = int(season_number) - - video_type = 
video_data.get('video_type') - is_youtube = False - for quality in ('', '_sd'): - embed_code = video_data.get('embed_info' + quality) - if not embed_code: - continue - embed_code = ds(embed_code) - if video_type == 'video/kenc': - embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') - kenc = self._download_json( - 'https://www.vvvvid.it/kenc', video_id, query={ - 'action': 'kt', - 'conn_id': self._conn_id, - 'url': embed_code, - }, fatal=False) or {} - kenc_message = kenc.get('message') - if kenc_message: - embed_code += '?' + ds(kenc_message) - formats.extend(self._extract_m3u8_formats( - embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) - elif video_type == 'video/rcs': - formats.extend(self._extract_akamai_formats(embed_code, video_id)) - elif video_type == 'video/youtube': - info.update({ - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'url': embed_code, - }) - is_youtube = True - break - else: - formats.extend(self._extract_wowza_formats( - 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) - metadata_from_url(embed_code) - - if not is_youtube: - self._sort_formats(formats) - info['formats'] = formats - - metadata_from_url(video_data.get('thumbnail')) - info.update(self._extract_common_video_info(video_data)) - info.update({ - 'id': video_id, - 'title': title, - 'duration': int_or_none(video_data.get('length')), - 'series': video_data.get('show_title'), - 'season_id': season_id, - 'episode': title, - 'view_count': int_or_none(video_data.get('views')), - 'like_count': int_or_none(video_data.get('video_likes')), - 'repost_count': int_or_none(video_data.get('video_shares')), - }) - return info - - -class VVVVIDShowIE(VVVVIDIE): - _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)(?:/(?P<show_title>[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.vvvvid.it/show/156/psyco-pass', - 'info_dict': { - 'id': '156', - 'title': 'Psycho-Pass', - 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', - }, - 'playlist_count': 46, - }, { - 'url': 'https://www.vvvvid.it/show/156', - 'only_matching': True, - }] - - def _real_extract(self, url): - base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() - - seasons = self._download_info( - show_id, 'seasons/', show_title) - - show_info = self._download_info( - show_id, 'info/', show_title, fatal=False) - - if not show_title: - base_url += "/title" - - entries = [] - for season in (seasons or []): - episodes = season.get('episodes') or [] - playlist_title = season.get('name') or show_info.get('title') - for episode in episodes: - if episode.get('playable') is False: - continue - season_id = str_or_none(episode.get('season_id')) - video_id = str_or_none(episode.get('video_id')) - if not (season_id and video_id): - continue - info = self._extract_common_video_info(episode) - info.update({ - '_type': 'url_transparent', - 'ie_key': VVVVIDIE.ie_key(), - 'url': '/'.join([base_url, season_id, video_id]), - 'title': episode.get('title'), - 'description': episode.get('description'), - 'season_id': season_id, - 'playlist_title': playlist_title, - }) - entries.append(info) - - return self.playlist_result( - entries, show_id, show_info.get('title'), show_info.get('description')) diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py deleted file mode 100644 index b7d02fca3..000000000 --- a/youtube_dl/extractor/vzaar.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding: utf-8 -from __future__ 
import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - float_or_none, - unified_timestamp, - url_or_none, -) - - -class VzaarIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)' - _TESTS = [{ - # HTTP and HLS - 'url': 'https://vzaar.com/videos/1152805', - 'md5': 'bde5ddfeb104a6c56a93a06b04901dbf', - 'info_dict': { - 'id': '1152805', - 'ext': 'mp4', - 'title': 'sample video (public)', - }, - }, { - 'url': 'https://view.vzaar.com/27272/player', - 'md5': '3b50012ac9bbce7f445550d54e0508f2', - 'info_dict': { - 'id': '27272', - 'ext': 'mp3', - 'title': 'MP3', - }, - }, { - # hlsAes = true - 'url': 'https://view.vzaar.com/11379930/player', - 'info_dict': { - 'id': '11379930', - 'ext': 'mp4', - 'title': 'Videoaula', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # with null videoTitle - 'url': 'https://view.vzaar.com/20313539/download', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - video_data = self._download_json( - 'http://view.vzaar.com/v2/%s/video' % video_id, video_id) - - title = video_data.get('videoTitle') or video_id - - formats = [] - - source_url = url_or_none(video_data.get('sourceUrl')) - if source_url: - f = { - 'url': source_url, - 'format_id': 'http', - 'preference': 1, - } - if 'audio' in source_url: - f.update({ - 'vcodec': 'none', - 'ext': 'mp3', - }) - else: - f.update({ - 'width': int_or_none(video_data.get('width')), - 'height': int_or_none(video_data.get('height')), - 'ext': 'mp4', - 'fps': float_or_none(video_data.get('fps')), - }) - formats.append(f) - - video_guid = video_data.get('guid') - usp = video_data.get('usp') - if video_data.get('uspEnabled') and isinstance(video_guid, compat_str) and isinstance(usp, dict): - hls_aes = video_data.get('hlsAes') - qs = '&'.join('%s=%s' % (k, v) for k, v in usp.items()) - url_templ = 'http://%%s.vzaar.com/v5/usp%s/%s/%s.ism%%s?' 
% ('aes' if hls_aes else '', video_guid, video_id) - m3u8_formats = self._extract_m3u8_formats( - url_templ % ('fable', '/.m3u8') + qs, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - if hls_aes: - for f in m3u8_formats: - f['_decryption_key_url'] = url_templ % ('goose', '') + qs - formats.extend(m3u8_formats) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': self._proto_relative_url(video_data.get('poster')), - 'duration': float_or_none(video_data.get('videoDuration')), - 'timestamp': unified_timestamp(video_data.get('ts')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/wakanim.py b/youtube_dl/extractor/wakanim.py deleted file mode 100644 index f9a2395d9..000000000 --- a/youtube_dl/extractor/wakanim.py +++ /dev/null @@ -1,66 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - merge_dicts, - urljoin, -) - - -class WakanimIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu', - 'info_dict': { - 'id': '2997', - 'ext': 'mp4', - 'title': 'Episode 02', - 'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d', - 'series': 'The Asterisk War (OmU.)', - 'season_number': 1, - 'episode': 'Episode 02', - 'episode_number': 2, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - # DRM Protected - 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - m3u8_url = urljoin(url, self._search_regex( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url', - group='url')) - # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls - encryption = self._search_regex( - r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', - m3u8_url, 'encryption', default=None) - if encryption and encryption in ('cenc', 'cbcs-aapl'): - raise ExtractorError('This video is DRM protected.', expected=True) - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - - info = self._search_json_ld(webpage, video_id, default={}) - - title = self._search_regex( - (r'<h1[^>]+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, group='title') - - return merge_dicts(info, { - 'id': video_id, - 'title': title, - 'formats': formats, - }) diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py deleted file mode 100644 index cbb548672..000000000 --- a/youtube_dl/extractor/walla.py +++ /dev/null @@ -1,86 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - xpath_text, - int_or_none, -) - - -class WallaIE(InfoExtractor): - _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' - _TEST = { - 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', - 'info_dict': { - 'id': '2642630', - 'display_id': 'one-direction-all-for-one', - 'ext': 'flv', - 'title': 'וואן דיירקשן: ההיסטריה', - 
'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3600, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - - _SUBTITLE_LANGS = { - 'עברית': 'heb', - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - video = self._download_xml( - 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id, - display_id) - - item = video.find('./items/item') - - title = xpath_text(item, './title', 'title') - description = xpath_text(item, './synopsis', 'description') - thumbnail = xpath_text(item, './preview_pic', 'thumbnail') - duration = int_or_none(xpath_text(item, './duration', 'duration')) - - subtitles = {} - for subtitle in item.findall('./subtitles/subtitle'): - lang = xpath_text(subtitle, './title') - subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ - 'ext': 'srt', - 'url': xpath_text(subtitle, './src'), - }] - - formats = [] - for quality in item.findall('./qualities/quality'): - format_id = xpath_text(quality, './title') - fmt = { - 'url': 'rtmp://wafla.walla.co.il/vod', - 'play_path': xpath_text(quality, './src'), - 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', - 'page_url': url, - 'ext': 'flv', - 'format_id': xpath_text(quality, './title'), - } - m = re.search(r'^(?P<height>\d+)[Pp]', format_id) - if m: - fmt['height'] = int(m.group('height')) - formats.append(fmt) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py deleted file mode 100644 index f1bccc2d6..000000000 --- a/youtube_dl/extractor/wat.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - unified_strdate, -) - - -class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)' - IE_NAME = 'wat.tv' - _TESTS = [ - { - 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'info_dict': { - 'id': '11713067', - 'ext': 'mp4', - 'title': 'Soupe de figues à l\'orange et aux épices', - 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', - 'upload_date': '20140819', - 'duration': 120, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - 'skip': 'This content is no longer available', - }, - { - 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'b16574df2c3cd1a36ca0098f2a791925', - 'info_dict': { - 'id': '11713075', - 'ext': 'mp4', - 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'upload_date': '20140816', - }, - 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], - 'skip': 'This content is no longer available', - }, - ] - _GEO_BYPASS = False - - def _real_extract(self, url): - video_id = self._match_id(url) - video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) - - # 'contentv4' is used in the website, but it also returns the related - # videos, we don't need them - # video_data = 
self._download_json( - # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) - video_data = self._download_json( - 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, - video_id, query={'context': 'MYTF1'}) - video_info = video_data['media'] - - error_desc = video_info.get('error_desc') - if error_desc: - if video_info.get('error_code') == 'GEOBLOCKED': - self.raise_geo_restricted(error_desc, video_info.get('geoList')) - raise ExtractorError(error_desc, expected=True) - - title = video_info['title'] - - formats = [] - - def extract_formats(manifest_urls): - for f, f_url in manifest_urls.items(): - if not f_url: - continue - if f in ('dash', 'mpd'): - formats.extend(self._extract_mpd_formats( - f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), - video_id, mpd_id='dash', fatal=False)) - elif f == 'hls': - formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - - delivery = video_data.get('delivery') or {} - extract_formats({delivery.get('format'): delivery.get('url')}) - if not formats: - if delivery.get('drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) - if manifest_urls: - extract_formats(manifest_urls) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': video_info.get('preview'), - 'upload_date': unified_strdate(try_get( - video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), - 'duration': int_or_none(video_info.get('duration')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py deleted file mode 100644 index 5a4e46e73..000000000 --- a/youtube_dl/extractor/watchbox.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - js_to_json, - strip_or_none, - try_get, - unescapeHTML, - unified_timestamp, -) - - -class WatchBoxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)' - _TESTS = [{ - # film - 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html', - 'info_dict': { - 'id': '341368', - 'ext': 'mp4', - 'title': 'Free Jimmy', - 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 4890, - 'age_limit': 16, - 'release_year': 2009, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - # episode - 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html', - 'info_dict': { - 'id': '328286', - 'ext': 'mp4', - 'title': 'S01 E01 - Date in der Hölle', - 'description': 'md5:2f31c74a8186899f33cb5114491dae2b', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1291, - 'age_limit': 12, - 'release_year': 2010, - 'series': 'Ugly Americans', - 'season_number': 1, - 'episode': 'Date in der Hölle', - 'episode_number': 1, - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], - }, { - 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = 
re.match(self._VALID_URL, url) - kind, video_id = mobj.group('kind', 'id') - - webpage = self._download_webpage(url, video_id) - - player_config = self._parse_json( - self._search_regex( - r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage, - 'player config', default='{}', group='data'), - video_id, transform_source=unescapeHTML, fatal=False) - - if not player_config: - player_config = self._parse_json( - self._search_regex( - r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', - default='{}'), - video_id, transform_source=js_to_json, fatal=False) or {} - - source = player_config.get('source') or {} - - video_id = compat_str(source.get('videoId') or video_id) - - devapi = self._download_json( - 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={ - 'format': 'json', - 'apikey': 'hbbtv', - }, fatal=False) - - item = try_get(devapi, lambda x: x['items'][0], dict) or {} - - title = item.get('title') or try_get( - item, lambda x: x['movie']['headline_movie'], - compat_str) or source['title'] - - formats = [] - hls_url = item.get('media_videourl_hls') or source.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - dash_url = item.get('media_videourl_wv') or source.get('dash') - if dash_url: - formats.extend(self._extract_mpd_formats( - dash_url, video_id, mpd_id='dash', fatal=False)) - mp4_url = item.get('media_videourl') - if mp4_url: - formats.append({ - 'url': mp4_url, - 'format_id': 'mp4', - 'width': int_or_none(item.get('width')), - 'height': int_or_none(item.get('height')), - 'tbr': int_or_none(item.get('bitrate')), - }) - self._sort_formats(formats) - - description = strip_or_none(item.get('descr')) - thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') - duration = int_or_none(item.get('media_length') or source.get('length')) - timestamp = unified_timestamp(item.get('pubDate')) - view_count = int_or_none(item.get('media_views')) - age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk'])) - release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year'])) - - info = { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'view_count': view_count, - 'age_limit': age_limit, - 'release_year': release_year, - 'formats': formats, - } - - if kind.lower() == 'serien': - series = try_get( - item, lambda x: x['special']['title'], - compat_str) or source.get('format') - season_number = int_or_none(self._search_regex( - r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number', - default=None) or self._search_regex( - r'/staffel-(\d+)/', url, 'season number', default=None)) - episode = source.get('title') - episode_number = int_or_none(self._search_regex( - r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number', - default=None)) - info.update({ - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - }) - - return info diff --git a/youtube_dl/extractor/watchindianporn.py b/youtube_dl/extractor/watchindianporn.py deleted file mode 100644 index fadc539ee..000000000 --- a/youtube_dl/extractor/watchindianporn.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import parse_duration - - -class WatchIndianPornIE(InfoExtractor): - IE_DESC = 'Watch Indian Porn' - 
_VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TEST = { - 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html', - 'md5': '249589a164dde236ec65832bfce17440', - 'info_dict': { - 'id': 'RZa2avywNPa', - 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', - 'ext': 'mp4', - 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 226, - 'view_count': int, - 'categories': list, - 'age_limit': 18, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - - title = self._html_search_regex(( - r'<title>(.+?)\s*-\s*Indian\s+Porn</title>', - r'<h4>(.+?)</h4>' - ), webpage, 'title') - - duration = parse_duration(self._search_regex( - r'Time:\s*<strong>\s*(.+?)\s*</strong>', - webpage, 'duration', fatal=False)) - - view_count = int(self._search_regex( - r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>', - webpage, 'view count', fatal=False)) - - categories = re.findall( - r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>', - webpage) - - info_dict.update({ - 'id': video_id, - 'display_id': display_id, - 'http_headers': { - 'Referer': url, - }, - 'title': title, - 'duration': duration, - 'view_count': view_count, - 'categories': categories, - 'age_limit': 18, - }) - - return info_dict diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py deleted file mode 100644 index 2903d189e..000000000 --- a/youtube_dl/extractor/wdr.py +++ /dev/null @@ -1,347 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) -from ..utils import ( - determine_ext, - ExtractorError, - js_to_json, - strip_jsonp, - try_get, - unified_strdate, - update_url_query, - urlhandle_detect_ext, - url_or_none, -) - - -class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' - _GEO_COUNTRIES = ['DE'] - _TEST = { - 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', - 'info_dict': { - 'id': 'mdb-1557833', - 'ext': 'mp4', - 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', - 'upload_date': '20180112', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - metadata = self._download_json( - url, video_id, transform_source=strip_jsonp) - - is_live = metadata.get('mediaType') == 'live' - - tracker_data = metadata['trackerData'] - title = tracker_data['trackerClipTitle'] - - media_resource = metadata['mediaResource'] - - formats = [] - - # check if the metadata contains a direct URL to a file - for kind, media in media_resource.items(): - if not isinstance(media, dict): - continue - if kind not in ('dflt', 'alt'): - continue - - for tag_name, medium_url in media.items(): - if tag_name not in ('videoURL', 'audioURL'): - continue - - ext = determine_ext(medium_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - medium_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls')) - elif ext == 'f4m': - manifest_url = update_url_query( - medium_url, {'hdcore': '3.2.0', 'plugin': 
'aasp-3.2.0.77.18'}) - formats.extend(self._extract_f4m_formats( - manifest_url, video_id, f4m_id='hds', fatal=False)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - medium_url, 'stream', fatal=False)) - else: - a_format = { - 'url': medium_url - } - if ext == 'unknown_video': - urlh = self._request_webpage( - medium_url, video_id, note='Determining extension') - ext = urlhandle_detect_ext(urlh) - a_format['ext'] = ext - formats.append(a_format) - - self._sort_formats(formats) - - subtitles = {} - caption_url = media_resource.get('captionURL') - if caption_url: - subtitles['de'] = [{ - 'url': caption_url, - 'ext': 'ttml', - }] - captions_hash = media_resource.get('captionsHash') - if isinstance(captions_hash, dict): - for ext, format_url in captions_hash.items(): - format_url = url_or_none(format_url) - if not format_url: - continue - subtitles.setdefault('de', []).append({ - 'url': format_url, - 'ext': determine_ext(format_url, None) or ext, - }) - - return { - 'id': tracker_data.get('trackerClipId', video_id), - 'title': self._live_title(title) if is_live else title, - 'alt_title': tracker_data.get('trackerClipSubcategory'), - 'formats': formats, - 'subtitles': subtitles, - 'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')), - 'is_live': is_live, - } - - -class WDRPageIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' - _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL - - _TESTS = [ - { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', - # HDS download, MD5 is unstable - 'info_dict': { - 'id': 'mdb-1058683', - 'ext': 'flv', - 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', - 'title': 'Geheimnis Aachener Dom', - 'alt_title': 'Doku am Freitag', - 'upload_date': '20160304', - 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', - 'is_live': False, - 'subtitles': {'de': [{ - 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml', - 'ext': 'ttml', - }]}, - }, - 'skip': 'HTTP Error 404: Not Found', - }, - { - 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', - 'md5': 'f4c1f96d01cf285240f53ea4309663d8', - 'info_dict': { - 'id': 'mdb-1072000', - 'ext': 'mp3', - 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', - 'title': 'Schriftstellerin Juli Zeh', - 'alt_title': 'WDR 3 Gespräch am Samstag', - 'upload_date': '20160312', - 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', - 'is_live': False, - 'subtitles': {} - }, - 'skip': 'HTTP Error 404: Not Found', - }, - { - 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', - 'info_dict': { - 'id': 'mdb-1406149', - 'ext': 'mp4', - 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'alt_title': 'WDR Fernsehen Live', - 'upload_date': '20150101', - 'is_live': True, - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 7, - 'info_dict': { - 'id': 'aktuelle-stunde-120', - }, - }, - { - 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', - 'info_dict': { - 'id': 'mdb-1552552', - 'ext': 'mp4', - 
'upload_date': 're:^[0-9]{8}$', - 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', - }, - 'skip': 'The id changes from week to week because of the new episode' - }, - { - 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', - 'md5': '803138901f6368ee497b4d195bb164f2', - 'info_dict': { - 'id': 'mdb-186083', - 'ext': 'mp4', - 'upload_date': '20130919', - 'title': 'Sachgeschichte - Achterbahn ', - }, - }, - { - 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', - # Live stream, MD5 unstable - 'info_dict': { - 'id': 'mdb-869971', - 'ext': 'mp4', - 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'upload_date': '20160101', - }, - 'params': { - 'skip_download': True, # m3u8 download - } - }, - { - 'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html', - 'info_dict': { - 'id': 'mdb-1556012', - 'ext': 'mp4', - 'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"', - 'upload_date': '20180111', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', - 'only_matching': True, - }, - { - 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - - entries = [] - - # Article with several videos - - # for wdr.de the data-extension is in a tag with the class "mediaLink" - # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" - # for wdrmaus, in a tag with the class "videoButton" (previously a link - # to the page in a multiline "videoLink"-tag) - for mobj in re.finditer( - r'''(?sx)class= - (?: - (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| - (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* - )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 - ''', webpage): - media_link_obj = self._parse_json( - mobj.group('data'), display_id, transform_source=js_to_json, - fatal=False) - if not media_link_obj: - continue - jsonp_url = try_get( - media_link_obj, lambda x: x['mediaObj']['url'], compat_str) - if jsonp_url: - entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) - - # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) - if not entries: - entries = [ - self.url_result( - compat_urlparse.urljoin(url, mobj.group('href')), - ie=WDRPageIE.ie_key()) - for mobj in re.finditer( - r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=', - webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) - ] - - return self.playlist_result(entries, playlist_id=display_id) - - -class WDRElefantIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)' - _TEST = { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', - 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', - 'ext': 'mp4', - 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - # Table of Contents seems to always be at this address, so fetch it directly. - # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. 
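        # A minimal sketch of that lookup, assuming (purely for illustration,
        # not a documented shape) that tableOfContentsJS.php5 returns a JSON
        # object mapping URL fragments to entries carrying an 'xmlPath' key:
        #
        #     toc = {'folge_ostern_2015': {'xmlPath': 'data/folge_ostern_2015.xml'}}
        #     fragment = 'folge_ostern_2015'       # the part of the page URL after '#'
        #     xml_path = toc[fragment]['xmlPath']  # resolved relative to the site root
        #
        # The code below performs this lookup against the live response and
        # raises when the fragment is missing.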
- table_of_contents = self._download_json( - 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', - display_id) - if display_id not in table_of_contents: - raise ExtractorError( - 'No entry in site\'s table of contents for this URL. ' - 'Is the fragment part of the URL (after the #) correct?', - expected=True) - xml_metadata_path = table_of_contents[display_id]['xmlPath'] - xml_metadata = self._download_xml( - 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, - display_id) - zmdb_url_element = xml_metadata.find('./movie/zmdb_url') - if zmdb_url_element is None: - raise ExtractorError( - '%s is not a video' % display_id, expected=True) - return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key()) - - -class WDRMobileIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://mobile-ondemand\.wdr\.de/ - .*?/fsk(?P<age_limit>[0-9]+) - /[0-9]+/[0-9]+/ - (?P<id>[0-9]+)_(?P<title>[0-9]+)''' - IE_NAME = 'wdr:mobile' - _TEST = { - 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', - 'info_dict': { - 'title': '4283021', - 'id': '421735', - 'ext': 'mp4', - 'age_limit': 0, - }, - 'skip': 'Problems with loading data.' - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - return { - 'id': mobj.group('id'), - 'title': mobj.group('title'), - 'age_limit': int(mobj.group('age_limit')), - 'url': url, - 'http_headers': { - 'User-Agent': 'mobile', - }, - } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py deleted file mode 100644 index ae32a0a68..000000000 --- a/youtube_dl/extractor/wistia.py +++ /dev/null @@ -1,199 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - try_get, - unescapeHTML, -) - - -class WistiaBaseIE(InfoExtractor): - _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})' - _VALID_URL_BASE = r'https?://(?:fast\.)?wistia\.(?:net|com)/embed/' - _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' - - def _download_embed_config(self, config_type, config_id, referer): - base_url = self._EMBED_BASE_URL + '%ss/%s' % (config_type, config_id) - embed_config = self._download_json( - base_url + '.json', config_id, headers={ - 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. 
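                # A hedged illustration of the fallback above (example values
                # assumed, not taken from Wistia's API): a 'wistia:' pseudo-URL
                # is not an http(s) referer, so the embed base URL is sent
                # instead; a real page URL passes through unchanged.
                #
                #     referer = 'wistia:807fafadvk'      -> send base_url
                #     referer = 'https://example.com/p'  -> send referer as-is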
- }) - - if isinstance(embed_config, dict) and embed_config.get('error'): - raise ExtractorError( - 'Error while getting the playlist', expected=True) - - return embed_config - - def _extract_media(self, embed_config): - data = embed_config['media'] - video_id = data['hashedId'] - title = data['name'] - - formats = [] - thumbnails = [] - for a in data['assets']: - aurl = a.get('url') - if not aurl: - continue - astatus = a.get('status') - atype = a.get('type') - if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'): - continue - elif atype in ('still', 'still_image'): - thumbnails.append({ - 'url': aurl, - 'width': int_or_none(a.get('width')), - 'height': int_or_none(a.get('height')), - 'filesize': int_or_none(a.get('size')), - }) - else: - aext = a.get('ext') - display_name = a.get('display_name') - format_id = atype - if atype and atype.endswith('_video') and display_name: - format_id = '%s-%s' % (atype[:-6], display_name) - f = { - 'format_id': format_id, - 'url': aurl, - 'tbr': int_or_none(a.get('bitrate')) or None, - 'preference': 1 if atype == 'original' else None, - } - if display_name == 'Audio': - f.update({ - 'vcodec': 'none', - }) - else: - f.update({ - 'width': int_or_none(a.get('width')), - 'height': int_or_none(a.get('height')), - 'vcodec': a.get('codec'), - }) - if a.get('container') == 'm3u8' or aext == 'm3u8': - ts_f = f.copy() - ts_f.update({ - 'ext': 'ts', - 'format_id': f['format_id'].replace('hls-', 'ts-'), - 'url': f['url'].replace('.bin', '.ts'), - }) - formats.append(ts_f) - f.update({ - 'ext': 'mp4', - 'protocol': 'm3u8_native', - }) - else: - f.update({ - 'container': a.get('container'), - 'ext': aext, - 'filesize': int_or_none(a.get('size')), - }) - formats.append(f) - - self._sort_formats(formats) - - subtitles = {} - for caption in data.get('captions', []): - language = caption.get('language') - if not language: - continue - subtitles[language] = [{ - 'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language, - }] - - return { - 'id': video_id, - 'title': title, - 'description': data.get('seoDescription'), - 'formats': formats, - 'thumbnails': thumbnails, - 'duration': float_or_none(data.get('duration')), - 'timestamp': int_or_none(data.get('createdAt')), - 'subtitles': subtitles, - } - - -class WistiaIE(WistiaBaseIE): - _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) - - _TESTS = [{ - # with hls video - 'url': 'wistia:807fafadvk', - 'md5': 'daff0f3687a41d9a71b40e0e8c2610fe', - 'info_dict': { - 'id': '807fafadvk', - 'ext': 'mp4', - 'title': 'Drip Brennan Dunn Workshop', - 'description': 'a JV Webinars video', - 'upload_date': '20160518', - 'timestamp': 1463607249, - 'duration': 4987.11, - }, - }, { - 'url': 'wistia:sh7fpupwlt', - 'only_matching': True, - }, { - 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', - 'only_matching': True, - }, { - 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', - 'only_matching': True, - }, { - 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', - 'only_matching': True, - }] - - # https://wistia.com/support/embed-and-share/video-on-your-website - @staticmethod - def _extract_url(webpage): - urls = WistiaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): - urls = [] - for match in re.finditer( - 
r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): - urls.append(unescapeHTML(match.group('url'))) - for match in re.finditer( - r'''(?sx) - <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 - ''', webpage): - urls.append('wistia:%s' % match.group('id')) - for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): - urls.append('wistia:%s' % match.group('id')) - return urls - - def _real_extract(self, url): - video_id = self._match_id(url) - embed_config = self._download_embed_config('media', video_id, url) - return self._extract_media(embed_config) - - -class WistiaPlaylistIE(WistiaBaseIE): - _VALID_URL = r'%splaylists/%s' % (WistiaIE._VALID_URL_BASE, WistiaIE._VALID_ID_REGEX) - - _TEST = { - 'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc', - 'info_dict': { - 'id': 'aodt9etokc', - }, - 'playlist_count': 3, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - playlist = self._download_embed_config('playlist', playlist_id, url) - - entries = [] - for media in (try_get(playlist, lambda x: x[0]['medias']) or []): - embed_config = media.get('embed_config') - if not embed_config: - continue - entries.append(self._extract_media(embed_config)) - - return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py deleted file mode 100644 index 25f487e1e..000000000 --- a/youtube_dl/extractor/xboxclips.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) -from ..utils import ( - int_or_none, - month_by_abbreviation, - parse_filesize, -) - - -class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' - _TESTS = [{ - 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', - 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', - 'info_dict': { - 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', - 'ext': 'mp4', - 'title': 'iAbdulElah playing Titanfall', - 'filesize_approx': 26800000, - 'upload_date': '20140807', - 'duration': 56, - } - }, { - 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - if '/video.php' in url: - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) - - webpage = self._download_webpage(url, video_id) - info = self._parse_html5_media_entries(url, webpage, video_id)[0] - - title = self._html_search_meta(['og:title', 'twitter:title'], webpage) - upload_date = None - mobj = re.search( - r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', - webpage) - if mobj: - upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) - filesize = parse_filesize(self._html_search_regex( - r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) - duration = int_or_none(self._html_search_regex( - r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False)) - view_count = 
int_or_none(self._html_search_regex( - r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - - info.update({ - 'id': video_id, - 'title': title, - 'upload_date': upload_date, - 'filesize_approx': filesize, - 'duration': duration, - 'view_count': view_count, - }) - return info diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py deleted file mode 100644 index df9efa9fa..000000000 --- a/youtube_dl/extractor/xfileshare.py +++ /dev/null @@ -1,201 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_chr -from ..utils import ( - decode_packed_codes, - determine_ext, - ExtractorError, - int_or_none, - js_to_json, - urlencode_postdata, -) - - -# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 -def aa_decode(aa_code): - symbol_table = [ - ('7', '((゚ー゚) + (o^_^o))'), - ('6', '((o^_^o) +(o^_^o))'), - ('5', '((゚ー゚) + (゚Θ゚))'), - ('2', '((o^_^o) - (゚Θ゚))'), - ('4', '(゚ー゚)'), - ('3', '(o^_^o)'), - ('1', '(゚Θ゚)'), - ('0', '(c^_^o)'), - ] - delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aa_char in aa_code.split(delim): - for val, pat in symbol_table: - aa_char = aa_char.replace(pat, val) - aa_char = aa_char.replace('+ ', '') - m = re.match(r'^\d+', aa_char) - if m: - ret += compat_chr(int(m.group(0), 8)) - else: - m = re.match(r'^u([\da-f]+)', aa_char) - if m: - ret += compat_chr(int(m.group(1), 16)) - return ret - - -class XFileShareIE(InfoExtractor): - _SITES = ( - (r'aparat\.cam', 'Aparat'), - (r'clipwatching\.com', 'ClipWatching'), - (r'gounlimited\.to', 'GoUnlimited'), - (r'govid\.me', 'GoVid'), - (r'holavid\.com', 'HolaVid'), - (r'streamty\.com', 'Streamty'), - (r'thevideobee\.to', 'TheVideoBee'), - (r'uqload\.com', 'Uqload'), - (r'vidbom\.com', 'VidBom'), - (r'vidlo\.us', 'vidlo'), - (r'vidlocker\.xyz', 'VidLocker'), - (r'vidshare\.tv', 'VidShare'), - (r'vup\.to', 'VUp'), - (r'wolfstream\.tv', 'WolfStream'), - (r'xvideosharing\.com', 'XVideoSharing'), - ) - - IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) - _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - % '|'.join(site for site in list(zip(*_SITES))[0])) - - _FILE_NOT_FOUND_REGEXES = ( - r'>(?:404 - )?File Not Found<', - r'>The file was removed by administrator<', - ) - - _TESTS = [{ - 'url': 'http://xvideosharing.com/fq65f94nd2ve', - 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', - 'info_dict': { - 'id': 'fq65f94nd2ve', - 'ext': 'mp4', - 'title': 'sample', - 'thumbnail': r're:http://.*\.jpg', - }, - }, { - 'url': 'https://aparat.cam/n4d6dh0wvlpr', - 'only_matching': True, - }, { - 'url': 'https://wolfstream.tv/nthme29v9u2x', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' - % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), - webpage)] - - def _real_extract(self, url): - host, video_id = re.match(self._VALID_URL, url).groups() - - url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) - webpage = self._download_webpage(url, video_id) - - if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - fields = self._hidden_inputs(webpage) - - if fields.get('op') == 'download1': - countdown = int_or_none(self._search_regex( - 
r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>', - webpage, 'countdown', default=None)) - if countdown: - self._sleep(countdown, video_id) - - webpage = self._download_webpage( - url, video_id, 'Downloading video page', - data=urlencode_postdata(fields), headers={ - 'Referer': url, - 'Content-type': 'application/x-www-form-urlencoded', - }) - - title = (self._search_regex( - (r'style="z-index: [0-9]+;">([^<]+)</span>', - r'<td nowrap>([^<]+)</td>', - r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+)[ <]', - r'<h2 class="video-page-head">([^<]+)</h2>', - r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to - r'title\s*:\s*"([^"]+)"'), # govid.me - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or video_id).strip() - - for regex, func in ( - (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), - (r'(゚.+)', aa_decode)): - obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) - if obf_code: - webpage = webpage.replace(obf_code, func(obf_code)) - - formats = [] - - jwplayer_data = self._search_regex( - [ - r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', - r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', - ], webpage, - 'jwplayer data', default=None) - if jwplayer_data: - jwplayer_data = self._parse_json( - jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) - if jwplayer_data: - formats = self._parse_jwplayer_data( - jwplayer_data, video_id, False, - m3u8_id='hls', mpd_id='dash')['formats'] - - if not formats: - urls = [] - for regex in ( - r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', - r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)', - r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): - for mobj in re.finditer(regex, webpage): - video_url = mobj.group('url') - if video_url not in urls: - urls.append(video_url) - - sources = self._search_regex( - r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) - if sources: - urls.extend(self._parse_json(sources, video_id)) - - formats = [] - for video_url in urls: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': video_url, - 'format_id': 'sd', - }) - self._sort_formats(formats) - - thumbnail = self._search_regex( - [ - r'<video[^>]+poster="([^"]+)"', - r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', - ], webpage, 'thumbnail', default=None) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py deleted file mode 100644 index f73b9778f..000000000 --- a/youtube_dl/extractor/xhamster.py +++ /dev/null @@ -1,450 +0,0 @@ -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - clean_html, - determine_ext, - dict_get, - extract_attributes, - ExtractorError, - float_or_none, - int_or_none, - parse_duration, - str_or_none, - try_get, - unified_strdate, - url_or_none, - urljoin, -) - - -class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' - _VALID_URL = r'''(?x) - https?:// - (?:.+?\.)?%s/ - (?: - 
movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html| - videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+) - ) - ''' % _DOMAINS - _TESTS = [{ - 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'md5': '98b4687efb1ffd331c4197854dc09e8f', - 'info_dict': { - 'id': '1509445', - 'display_id': 'femaleagent-shy-beauty-takes-the-bait', - 'ext': 'mp4', - 'title': 'FemaleAgent Shy beauty takes the bait', - 'timestamp': 1350194821, - 'upload_date': '20121014', - 'uploader': 'Ruseful2011', - 'duration': 893, - 'age_limit': 18, - }, - }, { - 'url': 'https://xhamster.com/videos/britney-spears-sexy-booty-2221348?hd=', - 'info_dict': { - 'id': '2221348', - 'display_id': 'britney-spears-sexy-booty', - 'ext': 'mp4', - 'title': 'Britney Spears Sexy Booty', - 'timestamp': 1379123460, - 'upload_date': '20130914', - 'uploader': 'jojo747400', - 'duration': 200, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - # empty seo, unavailable via new URL schema - 'url': 'http://xhamster.com/movies/5667973/.html', - 'info_dict': { - 'id': '5667973', - 'ext': 'mp4', - 'title': '....', - 'timestamp': 1454948101, - 'upload_date': '20160208', - 'uploader': 'parejafree', - 'duration': 72, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - # mobile site - 'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111', - 'only_matching': True, - }, { - 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', - 'only_matching': True, - }, { - # This video is visible for marcoalfa123456's friends only - 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', - 'only_matching': True, - }, { - # new URL schema - 'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821', - 'only_matching': True, - }, { - 'url': 'https://xhamster.one/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'only_matching': True, - }, { - 'url': 'https://xhamster.desi/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'only_matching': True, - }, { - 'url': 'https://xhamster2.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'only_matching': True, - }, { - 'url': 'https://xhamster11.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'only_matching': True, - }, { - 'url': 'https://xhamster26.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'only_matching': True, - }, { - 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - 'only_matching': True, - }, { - 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', - 'only_matching': True, - }, { - 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_2') - display_id = mobj.group('display_id') or mobj.group('display_id_2') - - desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) - webpage, urlh = self._download_webpage_handle(desktop_url, video_id) - - error = self._html_search_regex( - r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', - webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - - age_limit = self._rta_search(webpage) - - def get_height(s): - return int_or_none(self._search_regex( - r'^(\d+)[pP]', s, 'height', default=None)) - - initials = self._parse_json( - self._search_regex( - 
(r'window\.initials\s*=\s*({.+?})\s*;\s*</script>', - r'window\.initials\s*=\s*({.+?})\s*;'), webpage, 'initials', - default='{}'), - video_id, fatal=False) - if initials: - video = initials['videoModel'] - title = video['title'] - formats = [] - format_urls = set() - format_sizes = {} - sources = try_get(video, lambda x: x['sources'], dict) or {} - for format_id, formats_dict in sources.items(): - if not isinstance(formats_dict, dict): - continue - download_sources = try_get(sources, lambda x: x['download'], dict) or {} - for quality, format_dict in download_sources.items(): - if not isinstance(format_dict, dict): - continue - format_sizes[quality] = float_or_none(format_dict.get('size')) - for quality, format_item in formats_dict.items(): - if format_id == 'download': - # Download link takes some time to be generated, - # skipping for now - continue - format_url = format_item - format_url = url_or_none(format_url) - if not format_url or format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'format_id': '%s-%s' % (format_id, quality), - 'url': format_url, - 'ext': determine_ext(format_url, 'mp4'), - 'height': get_height(quality), - 'filesize': format_sizes.get(quality), - 'http_headers': { - 'Referer': urlh.geturl(), - }, - }) - xplayer_sources = try_get( - initials, lambda x: x['xplayerSettings']['sources'], dict) - if xplayer_sources: - hls_sources = xplayer_sources.get('hls') - if isinstance(hls_sources, dict): - for hls_format_key in ('url', 'fallback'): - hls_url = hls_sources.get(hls_format_key) - if not hls_url: - continue - hls_url = urljoin(url, hls_url) - if not hls_url or hls_url in format_urls: - continue - format_urls.add(hls_url) - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - standard_sources = xplayer_sources.get('standard') - if isinstance(standard_sources, dict): - for format_id, formats_list in standard_sources.items(): - if not isinstance(formats_list, list): - continue - for standard_format in formats_list: - if not isinstance(standard_format, dict): - continue - for standard_format_key in ('url', 'fallback'): - standard_url = standard_format.get(standard_format_key) - if not standard_url: - continue - standard_url = urljoin(url, standard_url) - if not standard_url or standard_url in format_urls: - continue - format_urls.add(standard_url) - ext = determine_ext(standard_url, 'mp4') - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - standard_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - quality = (str_or_none(standard_format.get('quality')) - or str_or_none(standard_format.get('label')) - or '') - formats.append({ - 'format_id': '%s-%s' % (format_id, quality), - 'url': standard_url, - 'ext': ext, - 'height': get_height(quality), - 'filesize': format_sizes.get(quality), - 'http_headers': { - 'Referer': standard_url, - }, - }) - self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) - - categories_list = video.get('categories') - if isinstance(categories_list, list): - categories = [] - for c in categories_list: - if not isinstance(c, dict): - continue - c_name = c.get('name') - if isinstance(c_name, compat_str): - categories.append(c_name) - else: - categories = None - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video.get('description'), - 'timestamp': int_or_none(video.get('created')), - 'uploader': 
try_get(
- video, lambda x: x['author']['name'], compat_str),
- 'thumbnail': video.get('thumbURL'),
- 'duration': int_or_none(video.get('duration')),
- 'view_count': int_or_none(video.get('views')),
- 'like_count': int_or_none(try_get(
- video, lambda x: x['rating']['likes'], int)),
- 'dislike_count': int_or_none(try_get(
- video, lambda x: x['rating']['dislikes'], int)),
- 'comment_count': int_or_none(video.get('comments')),
- 'age_limit': age_limit,
- 'categories': categories,
- 'formats': formats,
- }
-
- # Old layout fallback
-
- title = self._html_search_regex(
- [r'<h1[^>]*>([^<]+)</h1>',
- r'<meta[^>]+itemprop=".*?caption.*?"[^>]+content="(.+?)"',
- r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
- webpage, 'title')
-
- formats = []
- format_urls = set()
-
- sources = self._parse_json(
- self._search_regex(
- r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
- default='{}'),
- video_id, fatal=False)
- for format_id, format_url in sources.items():
- format_url = url_or_none(format_url)
- if not format_url:
- continue
- if format_url in format_urls:
- continue
- format_urls.add(format_url)
- formats.append({
- 'format_id': format_id,
- 'url': format_url,
- 'height': get_height(format_id),
- })
-
- video_url = self._search_regex(
- [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
- r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
- r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
- webpage, 'video url', group='mp4', default=None)
- if video_url and video_url not in format_urls:
- formats.append({
- 'url': video_url,
- })
-
- self._sort_formats(formats)
-
- # Only a few videos have a description
- mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
- description = mobj.group(1) if mobj else None
-
- upload_date = unified_strdate(self._search_regex(
- r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}',
- webpage, 'upload date', fatal=False))
-
- uploader = self._html_search_regex(
- r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)',
- webpage, 'uploader', default='anonymous')
-
- thumbnail = self._search_regex(
- [r'''["']thumbUrl["']\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''',
- r'''<video[^>]+"poster"=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
- webpage, 'thumbnail', fatal=False, group='thumbnail')
-
- duration = parse_duration(self._search_regex(
- [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
- r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
- 'duration', fatal=False))
-
- view_count = int_or_none(self._search_regex(
- r'content=["\']User(?:View|Play)s:(\d+)',
- webpage, 'view count', fatal=False))
-
- mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage)
- (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
-
- mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
- comment_count = mobj.group('commentcount') if mobj else 0
-
- categories_html = self._search_regex(
- r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
- 'categories', default=None)
- categories = [clean_html(category) for category in re.findall(
- r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': description,
- 'upload_date': upload_date,
- 'uploader': uploader,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'view_count': 
view_count, - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), - 'comment_count': int_or_none(comment_count), - 'age_limit': age_limit, - 'categories': categories, - 'formats': formats, - } - - -class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS - _TEST = { - 'url': 'http://xhamster.com/xembed.php?video=3328539', - 'info_dict': { - 'id': '3328539', - 'ext': 'mp4', - 'title': 'Pen Masturbation', - 'timestamp': 1406581861, - 'upload_date': '20140728', - 'uploader': 'ManyakisArt', - 'duration': 5, - 'age_limit': 18, - } - } - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1', - webpage)] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'href="(https?://xhamster\.com/(?:movies/{0}/[^"]*\.html|videos/[^/]*-{0})[^"]*)"'.format(video_id), - webpage, 'xhamster url', default=None) - - if not video_url: - vars = self._parse_json( - self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), - video_id) - video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) - - return self.url_result(video_url, 'XHamster') - - -class XHamsterUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)' % XHamsterIE._DOMAINS - _TESTS = [{ - # Paginated user profile - 'url': 'https://xhamster.com/users/netvideogirls/videos', - 'info_dict': { - 'id': 'netvideogirls', - }, - 'playlist_mincount': 267, - }, { - # Non-paginated user profile - 'url': 'https://xhamster.com/users/firatkaan/videos', - 'info_dict': { - 'id': 'firatkaan', - }, - 'playlist_mincount': 1, - }] - - def _entries(self, user_id): - next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id - for pagenum in itertools.count(1): - page = self._download_webpage( - next_page_url, user_id, 'Downloading page %s' % pagenum) - for video_tag in re.findall( - r'(<a[^>]+class=["\'].*?\bvideo-thumb__image-container[^>]+>)', - page): - video = extract_attributes(video_tag) - video_url = url_or_none(video.get('href')) - if not video_url or not XHamsterIE.suitable(video_url): - continue - video_id = XHamsterIE._match_id(video_url) - yield self.url_result( - video_url, ie=XHamsterIE.ie_key(), video_id=video_id) - mobj = re.search(r'<a[^>]+data-page=["\']next[^>]+>', page) - if not mobj: - break - next_page = extract_attributes(mobj.group(0)) - next_page_url = url_or_none(next_page.get('href')) - if not next_page_url: - break - - def _real_extract(self, url): - user_id = self._match_id(url) - return self.playlist_result(self._entries(user_id), user_id) diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py deleted file mode 100644 index a912e54b8..000000000 --- a/youtube_dl/extractor/ximalaya.py +++ /dev/null @@ -1,233 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor - - -class XimalayaBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['CN'] - - -class XimalayaIE(XimalayaBaseIE): - IE_NAME = 'ximalaya' - IE_DESC = '喜马拉雅FM' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)' - _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' - _TESTS = [ - { - 'url': 
'http://www.ximalaya.com/61425525/sound/47740352/',
- 'info_dict': {
- 'id': '47740352',
- 'ext': 'm4a',
- 'uploader': '小彬彬爱听书',
- 'uploader_id': 61425525,
- 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
- 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
- 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
- 'thumbnails': [
- {
- 'name': 'cover_url',
- 'url': r're:^https?://.*\.jpg$',
- },
- {
- 'name': 'cover_url_142',
- 'url': r're:^https?://.*\.jpg$',
- 'width': 180,
- 'height': 180
- }
- ],
- 'categories': ['renwen', '人文'],
- 'duration': 93,
- 'view_count': int,
- 'like_count': int,
- }
- },
- {
- 'url': 'http://m.ximalaya.com/61425525/sound/47740352/',
- 'info_dict': {
- 'id': '47740352',
- 'ext': 'm4a',
- 'uploader': '小彬彬爱听书',
- 'uploader_id': 61425525,
- 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
- 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
- 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
- 'thumbnails': [
- {
- 'name': 'cover_url',
- 'url': r're:^https?://.*\.jpg$',
- },
- {
- 'name': 'cover_url_142',
- 'url': r're:^https?://.*\.jpg$',
- 'width': 180,
- 'height': 180
- }
- ],
- 'categories': ['renwen', '人文'],
- 'duration': 93,
- 'view_count': int,
- 'like_count': int,
- }
- },
- {
- 'url': 'https://www.ximalaya.com/11045267/sound/15705996/',
- 'info_dict': {
- 'id': '15705996',
- 'ext': 'm4a',
- 'uploader': '李延隆老师',
- 'uploader_id': 11045267,
- 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/',
- 'title': 'Lesson 1 Excuse me!',
- 'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n"
- "听录音,然后回答问题,这是谁的手袋?",
- 'thumbnails': [
- {
- 'name': 'cover_url',
- 'url': r're:^https?://.*\.jpg$',
- },
- {
- 'name': 'cover_url_142',
- 'url': r're:^https?://.*\.jpg$',
- 'width': 180,
- 'height': 180
- }
- ],
- 'categories': ['train', '外语'],
- 'duration': 40,
- 'view_count': int,
- 'like_count': int,
- }
- },
- ]
-
- def _real_extract(self, url):
-
- is_m = 'm.ximalaya' in url
- scheme = 'https' if url.startswith('https') else 'http'
-
- audio_id = self._match_id(url)
- webpage = self._download_webpage(url, audio_id,
- note='Download sound page for %s' % audio_id,
- errnote='Unable to get sound page')
-
- audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
- audio_info = self._download_json(audio_info_file, audio_id,
- 'Downloading info json %s' % audio_info_file,
- 'Unable to download info file')
-
- formats = []
- for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')):
- if audio_info.get(k):
- formats.append({
- 'format_id': bps,
- 'url': audio_info[k],
- })
-
- thumbnails = []
- for k in audio_info.keys():
- # cover pic keys like 'cover_url', 'cover_url_142'
- if k.startswith('cover_url'):
- thumbnail = {'name': k, 'url': audio_info[k]}
- if k == 'cover_url_142':
- thumbnail['width'] = 180
- thumbnail['height'] = 180
- thumbnails.append(thumbnail)
-
- audio_uploader_id = audio_info.get('uid')
-
- if is_m:
- audio_description = self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
- webpage, 'audio_description', fatal=False)
- else:
- audio_description = self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
- webpage, 'audio_description', fatal=False)
-
- if not audio_description:
- audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
- audio_description = self._download_webpage(audio_description_file, audio_id,
- note='Downloading description file %s' % audio_description_file,
- errnote='Unable to download description file',
- fatal=False)
- audio_description = audio_description.strip() if audio_description else None
-
- return {
- 'id': audio_id,
- 'uploader': audio_info.get('nickname'),
- 'uploader_id': audio_uploader_id,
- 'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
- 'title': audio_info['title'],
- 'thumbnails': thumbnails,
- 'description': audio_description,
- 'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
- 'duration': audio_info.get('duration'),
- 'view_count': audio_info.get('play_count'),
- 'like_count': audio_info.get('favorites_count'),
- 'formats': formats,
- }
-
-
-class XimalayaAlbumIE(XimalayaBaseIE):
- IE_NAME = 'ximalaya:album'
- IE_DESC = '喜马拉雅FM 专辑'
- _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
- _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
- _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
- _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
- _TESTS = [{
- 'url': 'http://www.ximalaya.com/61425525/album/5534601/',
- 'info_dict': {
- 'title': '唐诗三百首(含赏析)',
- 'id': '5534601',
- },
- 'playlist_count': 312,
- }, {
- 'url': 'http://m.ximalaya.com/61425525/album/5534601',
- 'info_dict': {
- 'title': '唐诗三百首(含赏析)',
- 'id': '5534601',
- },
- 'playlist_count': 312,
- },
- ]
-
- def _real_extract(self, url):
- self.scheme = scheme = 'https' if url.startswith('https') else 'http'
-
- mobj = re.match(self._VALID_URL, url)
- uid, playlist_id = mobj.group('uid'), mobj.group('id')
-
- webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id,
- note='Download album page for %s' % playlist_id,
- errnote='Unable to get album info')
-
- title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
- webpage, 'title', fatal=False)
-
- return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
-
- def _entries(self, page, playlist_id, uid):
- html = page
- for page_num in itertools.count(1):
- for entry in self._process_page(html, uid):
- yield entry
-
- next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3',
- html, 'list_next_url', default=None, group='more')
- if not next_url:
- break
-
- next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url)
- html = self._download_webpage(next_full_url, playlist_id)
-
- def _process_page(self, html, uid):
- find_from = html.index('album_soundlist')
- for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
- yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')),
- XimalayaIE.ie_key(),
- mobj.group('id'),
- mobj.group('title'))
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
deleted file mode 100644
index ac1ccc404..000000000
--- a/youtube_dl/extractor/xnxx.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- determine_ext,
- int_or_none,
- NO_DEFAULT,
- str_to_int,
-)
-
-
-class XNXXIE(InfoExtractor):
- _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
- _TESTS = [{
- 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
- 'md5': '7583e96c15c0f21e9da3453d9920fbba',
- 'info_dict': {
- 'id': '55awb78',
- 'ext': 'mp4',
- 'title': 'Skyrim Test Video',
'thumbnail': r're:^https?://.*\.jpg', - 'duration': 469, - 'view_count': int, - 'age_limit': 18, - }, - }, { - 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', - 'only_matching': True, - }, { - 'url': 'http://www.xnxx.com/video-55awb78/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - def get(meta, default=NO_DEFAULT, fatal=True): - return self._search_regex( - r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % meta, - webpage, meta, default=default, fatal=fatal, group='value') - - title = self._og_search_title( - webpage, default=None) or get('VideoTitle') - - formats = [] - for mobj in re.finditer( - r'setVideo(?:Url(?P<id>Low|High)|HLS)\s*\(\s*(?P<q>["\'])(?P<url>(?:https?:)?//.+?)(?P=q)', webpage): - format_url = mobj.group('url') - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=1, m3u8_id='hls', fatal=False)) - else: - format_id = mobj.group('id') - if format_id: - format_id = format_id.lower() - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'quality': -1 if format_id == 'low' else 0, - }) - self._sort_formats(formats) - - thumbnail = self._og_search_thumbnail(webpage, default=None) or get( - 'ThumbUrl', fatal=False) or get('ThumbUrl169', fatal=False) - duration = int_or_none(self._og_search_property('duration', webpage)) - view_count = str_to_int(self._search_regex( - r'id=["\']nb-views-number[^>]+>([\d,.]+)', webpage, 'view count', - default=None)) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'age_limit': 18, - 'formats': formats, - } diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py deleted file mode 100644 index 76c91bd92..000000000 --- a/youtube_dl/extractor/xstream.py +++ /dev/null @@ -1,119 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, - xpath_with_ns, - xpath_text, - find_xpath_attr, -) - - -class XstreamIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - xstream:| - https?://frontend\.xstream\.(?:dk|net)/ - ) - (?P<partner_id>[^/]+) - (?: - :| - /feed/video/\?.*?\bid= - ) - (?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588', - 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', - 'info_dict': { - 'id': '86588', - 'ext': 'mov', - 'title': 'Otto Wollertsen', - 'description': 'Vestlendingen Otto Fredrik Wollertsen', - 'timestamp': 1430473209, - 'upload_date': '20150501', - }, - }, { - 'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039', - 'only_matching': True, - }] - - def _extract_video_info(self, partner_id, video_id): - data = self._download_xml( - 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' - % (partner_id, video_id), - video_id) - - NS_MAP = { - 'atom': 'http://www.w3.org/2005/Atom', - 'xt': 'http://xstream.dk/', - 'media': 'http://search.yahoo.com/mrss/', - } - - entry = data.find(xpath_with_ns('./atom:entry', NS_MAP)) - - title = xpath_text( - entry, xpath_with_ns('./atom:title', NS_MAP), 'title') - description = xpath_text( - entry, xpath_with_ns('./atom:summary', NS_MAP), 'description') - timestamp = parse_iso8601(xpath_text( - entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date')) 
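- # Each <media:content> entry below is either an RTMP or a progressive HTTP
- # URL. The RTMP case is split into connection URL, app and play path for
- # RTMP-capable downloaders, e.g. (illustrative URL, not from a real feed):
- # 'rtmp://host/vod/some/path' -> url 'rtmp://host/vod', app 'vod',
- # play_path 'mp4:some/path'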
- - formats = [] - media_group = entry.find(xpath_with_ns('./media:group', NS_MAP)) - for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)): - media_url = media_content.get('url') - if not media_url: - continue - tbr = int_or_none(media_content.get('bitrate')) - mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url) - if mobj: - formats.append({ - 'url': mobj.group('url'), - 'play_path': 'mp4:%s' % mobj.group('playpath'), - 'app': mobj.group('app'), - 'ext': 'flv', - 'tbr': tbr, - 'format_id': 'rtmp-%d' % tbr, - }) - else: - formats.append({ - 'url': media_url, - 'tbr': tbr, - }) - self._sort_formats(formats) - - link = find_xpath_attr( - entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') - if link is not None: - formats.append({ - 'url': link.get('href'), - 'format_id': link.get('rel'), - 'preference': 1, - }) - - thumbnails = [{ - 'url': splash.get('url'), - 'width': int_or_none(splash.get('width')), - 'height': int_or_none(splash.get('height')), - } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - - return self._extract_video_info(partner_id, video_id) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py deleted file mode 100644 index 7246409e3..000000000 --- a/youtube_dl/extractor/xtube.py +++ /dev/null @@ -1,233 +0,0 @@ -from __future__ import unicode_literals - -import itertools -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - js_to_json, - orderedSet, - parse_duration, - sanitized_Request, - str_to_int, - url_or_none, -) - - -class XTubeIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - xtube:| - https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-) - ) - (?P<id>[^/?&#]+) - ''' - - _TESTS = [{ - # old URL schema - 'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_', - 'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab', - 'info_dict': { - 'id': 'kVTUy_G222_', - 'ext': 'mp4', - 'title': 'strange erotica', - 'description': 'contains:an ET kind of thing', - 'uploader': 'greenshowers', - 'duration': 450, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - } - }, { - # FLV videos with duplicated formats - 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', - 'md5': 'a406963eb349dd43692ec54631efd88b', - 'info_dict': { - 'id': '9299752', - 'display_id': 'A-Super-Run-Part-1-YT', - 'ext': 'flv', - 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', - 'uploader': 'tshirtguy59', - 'duration': 579, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - }, - }, { - # new URL schema - 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', - 'only_matching': True, - }, { - 'url': 'xtube:625837', - 'only_matching': True, - }, { - 'url': 'xtube:kVTUy_G222_', - 'only_matching': True, - }, { - 'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - if not 
display_id: - display_id = video_id - - if video_id.isdigit() and len(video_id) < 11: - url_pattern = 'http://www.xtube.com/video-watch/-%s' - else: - url_pattern = 'http://www.xtube.com/watch.php?v=%s' - - webpage = self._download_webpage( - url_pattern % video_id, display_id, headers={ - 'Cookie': 'age_verified=1; cookiesAccepted=1', - }) - - title, thumbnail, duration, sources, media_definition = [None] * 5 - - config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', - default='{}'), video_id, transform_source=js_to_json, fatal=False) - if config: - config = config.get('mainRoll') - if isinstance(config, dict): - title = config.get('title') - thumbnail = config.get('poster') - duration = int_or_none(config.get('duration')) - sources = config.get('sources') or config.get('format') - media_definition = config.get('mediaDefinition') - - if not isinstance(sources, dict) and not media_definition: - sources = self._parse_json(self._search_regex( - r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', - webpage, 'sources', group='sources'), video_id, - transform_source=js_to_json) - - formats = [] - format_urls = set() - - if isinstance(sources, dict): - for format_id, format_url in sources.items(): - format_url = url_or_none(format_url) - if not format_url: - continue - if format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) - - if isinstance(media_definition, list): - for media in media_definition: - video_url = url_or_none(media.get('videoUrl')) - if not video_url: - continue - if video_url in format_urls: - continue - format_urls.add(video_url) - format_id = media.get('format') - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif format_id == 'mp4': - height = int_or_none(media.get('quality')) - formats.append({ - 'url': video_url, - 'format_id': '%s-%d' % (format_id, height) if height else format_id, - 'height': height, - }) - - self._remove_duplicate_formats(formats) - self._sort_formats(formats) - - if not title: - title = self._search_regex( - (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), - webpage, 'title', group='title') - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, default=None) or self._search_regex( - r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) - uploader = self._search_regex( - (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', - r'<span[^>]+class="nickname"[^>]*>([^<]+)'), - webpage, 'uploader', fatal=False) - if not duration: - duration = parse_duration(self._search_regex( - r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', - webpage, 'duration', fatal=False)) - view_count = str_to_int(self._search_regex( - (r'["\']viewsCount["\'][^>]*>(\d+)\s+views', - r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'), - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r'>Comments? 
\(([\d,\.]+)\)<', - webpage, 'comment count', fatal=False)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } - - -class XTubeUserIE(InfoExtractor): - IE_DESC = 'XTube user profile' - _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)' - _TEST = { - 'url': 'http://www.xtube.com/profile/greenshowers-4056496', - 'info_dict': { - 'id': 'greenshowers-4056496', - 'age_limit': 18, - }, - 'playlist_mincount': 154, - } - - def _real_extract(self, url): - user_id = self._match_id(url) - - entries = [] - for pagenum in itertools.count(1): - request = sanitized_Request( - 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum), - headers={ - 'Cookie': 'popunder=4', - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': url, - }) - - page = self._download_json( - request, user_id, 'Downloading videos JSON page %d' % pagenum) - - html = page.get('html') - if not html: - break - - for video_id in orderedSet([video_id for _, video_id in re.findall( - r'data-plid=(["\'])(.+?)\1', html)]): - entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key())) - - page_count = int_or_none(page.get('pageCount')) - if not page_count or pagenum == page_count: - break - - playlist = self.playlist_result(entries, user_id) - playlist['age_limit'] = 18 - return playlist diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py deleted file mode 100644 index e34ebe3a6..000000000 --- a/youtube_dl/extractor/xxxymovies.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - int_or_none, -) - - -class XXXYMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xxxymovies\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)' - _TEST = { - 'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/', - 'md5': '810b1bdbbffff89dd13bdb369fe7be4b', - 'info_dict': { - 'id': '138669', - 'display_id': 'ecstatic-orgasm-sofcore', - 'ext': 'mp4', - 'title': 'Ecstatic Orgasm Sofcore', - 'duration': 931, - 'categories': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'age_limit': 18, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - video_url = self._search_regex( - r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') - - title = self._html_search_regex( - [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<', - r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'], - webpage, 'title') - - thumbnail = self._search_regex( - r"preview_url\s*:\s*'([^']+)'", - webpage, 'thumbnail', fatal=False) - - categories = self._html_search_meta( - 'keywords', webpage, 'categories', default='').split(',') - - duration = parse_duration(self._search_regex( - r'<span>Duration:</span>\s*(\d+:\d+)', - webpage, 'duration', fatal=False)) - - view_count = int_or_none(self._html_search_regex( - r'<div class="video_views">\s*(\d+)', - webpage, 'view count', fatal=False)) - like_count = int_or_none(self._search_regex( - r'>\s*Likes? 
<b>\((\d+)\)', - webpage, 'like count', fatal=False)) - dislike_count = int_or_none(self._search_regex( - r'>\s*Dislike <b>\((\d+)\)</b>', - webpage, 'dislike count', fatal=False)) - - age_limit = self._rta_search(webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'categories': categories, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'age_limit': age_limit, - } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py deleted file mode 100644 index a17b10d6e..000000000 --- a/youtube_dl/extractor/yahoo.py +++ /dev/null @@ -1,569 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import itertools -import re - -from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - mimetype2ext, - parse_iso8601, - smuggle_url, - try_get, - url_or_none, -) - -from .brightcove import BrightcoveNewIE - - -class YahooIE(InfoExtractor): - IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' - _TESTS = [{ - 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'info_dict': { - 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', - 'ext': 'mp4', - 'title': 'Julian Smith & Travis Legg Watch Julian Smith', - 'description': 'Julian and Travis watch Julian Smith', - 'duration': 6863, - 'timestamp': 1369812016, - 'upload_date': '20130529', - }, - }, { - 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '7993e572fac98e044588d0b5260f4352', - 'info_dict': { - 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', - 'ext': 'mp4', - 'title': "Yahoo Saves 'Community'", - 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', - 'duration': 170, - 'timestamp': 1406838636, - 'upload_date': '20140731', - }, - }, { - 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '71298482f7c64cbb7fa064e4553ff1c1', - 'info_dict': { - 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'webm', - 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', - 'description': 'md5:f66c890e1490f4910a9953c941dee944', - 'duration': 97, - 'timestamp': 1414489862, - 'upload_date': '20141028', - } - }, { - 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', - 'md5': '88e209b417f173d86186bef6e4d1f160', - 'info_dict': { - 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', - 'ext': 'mp4', - 'title': 'China Moses Is Crazy About the Blues', - 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', - 'duration': 128, - 'timestamp': 1385722202, - 'upload_date': '20131129', - } - }, { - 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': '2a9752f74cb898af5d1083ea9f661b58', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': '\'True Story\' Trailer', - 'description': 'True Story', - 'duration': 150, - 'timestamp': 1418919206, - 'upload_date': '20141218', - }, - }, { - 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', - 'only_matching': True, - }, { - 'note': 'NBC Sports embeds', - 'url': 
'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', - 'info_dict': { - 'id': '9CsDKds0kvHI', - 'ext': 'flv', - 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', - 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', - 'upload_date': '20150313', - 'uploader': 'NBCU-SPORTS', - 'timestamp': 1426270238, - }, - }, { - 'url': 'https://tw.news.yahoo.com/-100120367.html', - 'only_matching': True, - }, { - # Query result is embedded in webpage, but explicit request to video API fails with geo restriction - 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', - 'info_dict': { - 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', - 'ext': 'mp4', - 'title': 'Communitary - Community Episode 1: Ladders', - 'description': 'md5:8fc39608213295748e1e289807838c97', - 'duration': 1646, - 'timestamp': 1440436550, - 'upload_date': '20150824', - 'series': 'Communitary', - 'season_number': 6, - 'episode_number': 1, - }, - }, { - # ytwnews://cavideo/ - 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', - 'info_dict': { - 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', - 'ext': 'mp4', - 'title': '單車天使 - 中文版預', - 'description': '中文版預', - 'timestamp': 1476696196, - 'upload_date': '20161017', - }, - 'params': { - 'skip_download': True, - }, - }, { - # Contains both a Yahoo hosted video and multiple Youtube embeds - 'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html', - 'info_dict': { - 'id': '46c5d95a-528f-3d03-b732-732fcadd51de', - 'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead', - 'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.', - }, - 'playlist': [{ - 'info_dict': { - 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6', - 'ext': 'mp4', - 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs', - 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor Swift which Sia hit was almost hers.', - 'timestamp': 1572406500, - 'upload_date': '20191030', - }, - }, { - 'info_dict': { - 'id': '352CFDOQrKg', - 'ext': 'mp4', - 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019', - 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11', - 'uploader': 'The Voice', - 'uploader_id': 'NBCTheVoice', - 'upload_date': '20191029', - }, - }], - 'params': { - 'playlistend': 2, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', - 'only_matching': True, - }, { - 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', - 'only_matching': True, - }, { - 'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html', - 'only_matching': True, - }] - - def _extract_yahoo_video(self, video_id, country): - video = self._download_json( - 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), - video_id, 'Downloading video JSON metadata')[0] - title = video['title'] - - if country == 'malaysia': - country = 'my' - - is_live = video.get('live_state') == 'live' - fmts = ('m3u8',) if is_live else ('webm', 
'mp4') - - urls = [] - formats = [] - subtitles = {} - for fmt in fmts: - media_obj = self._download_json( - 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id, - video_id, 'Downloading %s JSON metadata' % fmt, - headers=self.geo_verification_headers(), query={ - 'format': fmt, - 'region': country.upper(), - })['query']['results']['mediaObj'][0] - msg = media_obj.get('status', {}).get('msg') - - for s in media_obj.get('streams', []): - host = s.get('host') - path = s.get('path') - if not host or not path: - continue - s_url = host + path - if s.get('format') == 'm3u8': - formats.extend(self._extract_m3u8_formats( - s_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - continue - tbr = int_or_none(s.get('bitrate')) - formats.append({ - 'url': s_url, - 'format_id': fmt + ('-%d' % tbr if tbr else ''), - 'width': int_or_none(s.get('width')), - 'height': int_or_none(s.get('height')), - 'tbr': tbr, - 'fps': int_or_none(s.get('framerate')), - }) - - for cc in media_obj.get('closedcaptions', []): - cc_url = cc.get('url') - if not cc_url or cc_url in urls: - continue - urls.append(cc_url) - subtitles.setdefault(cc.get('lang') or 'en-US', []).append({ - 'url': cc_url, - 'ext': mimetype2ext(cc.get('content_type')), - }) - - streaming_url = video.get('streaming_url') - if streaming_url and not is_live: - formats.extend(self._extract_m3u8_formats( - streaming_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - - if not formats and msg == 'geo restricted': - self.raise_geo_restricted() - - self._sort_formats(formats) - - thumbnails = [] - for thumb in video.get('thumbnails', []): - thumb_url = thumb.get('url') - if not thumb_url: - continue - thumbnails.append({ - 'id': thumb.get('tag'), - 'url': thumb.get('url'), - 'width': int_or_none(thumb.get('width')), - 'height': int_or_none(thumb.get('height')), - }) - - series_info = video.get('series_info') or {} - - return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': clean_html(video.get('description')), - 'timestamp': parse_iso8601(video.get('publish_time')), - 'subtitles': subtitles, - 'duration': int_or_none(video.get('duration')), - 'view_count': int_or_none(video.get('view_count')), - 'is_live': is_live, - 'series': video.get('show_name'), - 'season_number': int_or_none(series_info.get('season_number')), - 'episode_number': int_or_none(series_info.get('episode_number')), - } - - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - - item = self._download_json( - 'https://%s.yahoo.com/caas/content/article' % country, display_id, - 'Downloading content JSON metadata', query={ - 'url': url - })['items'][0]['data']['partnerData'] - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in (item.get('body') or []): - if e.get('type') == 'videoIframe': - iframe_url = e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - info = self._extract_yahoo_video(item['uuid'], country) - info['display_id'] = display_id - return info - - -class YahooSearchIE(SearchInfoExtractor): 
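- # Results are fetched from video.search.yahoo.com 30 per page via the 'b='
- # offset parameter (see _get_n_results below); _MAX_RESULTS caps the paging
- # loop at 1000 entries.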
- IE_DESC = 'Yahoo screen search' - _MAX_RESULTS = 1000 - IE_NAME = 'screen.yahoo:search' - _SEARCH_KEY = 'yvsearch' - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - entries = [] - for pagenum in itertools.count(0): - result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) - info = self._download_json(result_url, query, - note='Downloading results page ' + str(pagenum + 1)) - m = info['m'] - results = info['results'] - - for (i, r) in enumerate(results): - if (pagenum * 30) + i >= n: - break - mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r) - e = self.url_result('http://' + mobj.group('url'), 'Yahoo') - entries.append(e) - if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)): - break - - return { - '_type': 'playlist', - 'id': query, - 'entries': entries, - } - - -class YahooGyaOPlayerIE(InfoExtractor): - IE_NAME = 'yahoo:gyao:player' - _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode/[^/]+)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', - 'info_dict': { - 'id': '5993125228001', - 'ext': 'mp4', - 'title': 'フューリー 【字幕版】', - 'description': 'md5:21e691c798a15330eda4db17a8fe45a5', - 'uploader_id': '4235717419001', - 'upload_date': '20190124', - 'timestamp': 1548294365, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - video_id = self._match_id(url).replace('/', ':') - headers = self.geo_verification_headers() - headers['Accept'] = 'application/json' - resp = self._download_json( - 'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id, query={ - 'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-', - 'query': '''{ - content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) { - video { - delivery { - id - } - title - } - } -}''' % video_id, - }, headers=headers) - content = resp['data']['content'] - if not content: - msg = resp['errors'][0]['message'] - if msg == 'not in japan': - self.raise_geo_restricted(countries=['JP']) - raise ExtractorError(msg) - video = content['video'] - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': video['title'], - 'url': smuggle_url( - 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'], - {'geo_countries': ['JP']}), - 'ie_key': BrightcoveNewIE.ie_key(), - } - - -class YahooGyaOIE(InfoExtractor): - IE_NAME = 'yahoo:gyao' - _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', - 'info_dict': { - 'id': '00449:v03102', - }, - 'playlist_count': 2, - }, { - 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', - 'only_matching': True, - }, { - 'url': 
'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf', - 'only_matching': True, - }] - - def _real_extract(self, url): - program_id = self._match_id(url).replace('/', ':') - videos = self._download_json( - 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos'] - entries = [] - for video in videos: - video_id = video.get('id') - if not video_id: - continue - entries.append(self.url_result( - 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), - YahooGyaOPlayerIE.ie_key(), video_id)) - return self.playlist_result(entries, program_id) - - -class YahooJapanNewsIE(InfoExtractor): - IE_NAME = 'yahoo:japannews' - IE_DESC = 'Yahoo! Japan News' - _VALID_URL = r'https?://(?P<host>(?:news|headlines)\.yahoo\.co\.jp)[^\d]*(?P<id>\d[\d-]*\d)?' - _GEO_COUNTRIES = ['JP'] - _TESTS = [{ - 'url': 'https://headlines.yahoo.co.jp/videonews/ann?a=20190716-00000071-ann-int', - 'info_dict': { - 'id': '1736242', - 'ext': 'mp4', - 'title': 'ムン大統領が対日批判を強化“現金化”効果は?(テレビ朝日系(ANN)) - Yahoo!ニュース', - 'description': '韓国の元徴用工らを巡る裁判の原告が弁護士が差し押さえた三菱重工業の資産を売却して - Yahoo!ニュース(テレビ朝日系(ANN))', - 'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$', - }, - 'params': { - 'skip_download': True, - }, - }, { - # geo restricted - 'url': 'https://headlines.yahoo.co.jp/hl?a=20190721-00000001-oxv-l04', - 'only_matching': True, - }, { - 'url': 'https://headlines.yahoo.co.jp/videonews/', - 'only_matching': True, - }, { - 'url': 'https://news.yahoo.co.jp', - 'only_matching': True, - }, { - 'url': 'https://news.yahoo.co.jp/byline/hashimotojunji/20190628-00131977/', - 'only_matching': True, - }, { - 'url': 'https://news.yahoo.co.jp/feature/1356', - 'only_matching': True - }] - - def _extract_formats(self, json_data, content_id): - formats = [] - - video_data = try_get( - json_data, - lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], - list) - for vid in video_data or []: - delivery = vid.get('delivery') - url = url_or_none(vid.get('Url')) - if not delivery or not url: - continue - elif delivery == 'hls': - formats.extend( - self._extract_m3u8_formats( - url, content_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': url, - 'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')), - 'height': int_or_none(vid.get('height')), - 'width': int_or_none(vid.get('width')), - 'tbr': int_or_none(vid.get('bitrate')), - }) - self._remove_duplicate_formats(formats) - self._sort_formats(formats) - - return formats - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - display_id = mobj.group('id') or host - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title', default=None - ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title') - - if display_id == host: - # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) 
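- # Each playlist id scraped below is rewritten into a Brightcove playlist
- # URL and delegated to BrightcoveNew, smuggled with geo_countries=['JP']
- # so the geo-restriction bypass targets Japan.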
- stream_plists = re.findall(r'plist=(\d+)', webpage) or re.findall(r'plist["\']:\s*["\']([^"\']+)', webpage) - entries = [ - self.url_result( - smuggle_url( - 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=%s' % plist_id, - {'geo_countries': ['JP']}), - ie='BrightcoveNew', video_id=plist_id) - for plist_id in stream_plists] - return self.playlist_result(entries, playlist_title=title) - - # Article page - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], - webpage, 'description', default=None) - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'twitter:image', webpage, 'thumbnail', default=None) - space_id = self._search_regex([ - r'<script[^>]+class=["\']yvpub-player["\'][^>]+spaceid=([^&"\']+)', - r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)', - r'<!--\s+SpaceID=(\d+)' - ], webpage, 'spaceid') - - content_id = self._search_regex( - r'<script[^>]+class=["\']yvpub-player["\'][^>]+contentid=(?P<contentid>[^&"\']+)', - webpage, 'contentid', group='contentid') - - json_data = self._download_json( - 'https://feapi-yvpub.yahooapis.jp/v1/content/%s' % content_id, - content_id, - query={ - 'appid': 'dj0zaiZpPVZMTVFJR0FwZWpiMyZzPWNvbnN1bWVyc2VjcmV0Jng9YjU-', - 'output': 'json', - 'space_id': space_id, - 'domain': host, - 'ak': hashlib.md5('_'.join((space_id, host)).encode()).hexdigest(), - 'device_type': '1100', - }) - formats = self._extract_formats(json_data, content_id) - - return { - 'id': content_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py deleted file mode 100644 index 6fcd8ee7e..000000000 --- a/youtube_dl/extractor/yandexdisk.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - mimetype2ext, - try_get, - urljoin, -) - - -class YandexDiskIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?P<domain> - yadi\.sk| - disk\.yandex\. 
- (?: - az| - by| - co(?:m(?:\.(?:am|ge|tr))?|\.il)| - ee| - fr| - k[gz]| - l[tv]| - md| - t[jm]| - u[az]| - ru - ) - )/(?:[di]/|public.*?\bhash=)(?P<id>[^/?#&]+)''' - - _TESTS = [{ - 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', - 'md5': 'a4a8d52958c8fddcf9845935070402ae', - 'info_dict': { - 'id': 'VdOeDou8eZs6Y', - 'ext': 'mp4', - 'title': '4.mp4', - 'duration': 168.6, - 'uploader': 'y.botova', - 'uploader_id': '300043621', - 'view_count': int, - }, - 'expected_warnings': ['Unable to download JSON metadata'], - }, { - 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', - 'only_matching': True, - }, { - 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, video_id) - store = self._parse_json(self._search_regex( - r'<script[^>]+id="store-prefetch"[^>]*>\s*({.+?})\s*</script>', - webpage, 'store'), video_id) - resource = store['resources'][store['rootResourceId']] - - title = resource['name'] - meta = resource.get('meta') or {} - - public_url = meta.get('short_url') - if public_url: - video_id = self._match_id(public_url) - - source_url = (self._download_json( - 'https://cloud-api.yandex.net/v1/disk/public/resources/download', - video_id, query={'public_key': url}, fatal=False) or {}).get('href') - video_streams = resource.get('videoStreams') or {} - video_hash = resource.get('hash') or url - environment = store.get('environment') or {} - sk = environment.get('sk') - yandexuid = environment.get('yandexuid') - if sk and yandexuid and not (source_url and video_streams): - self._set_cookie(domain, 'yandexuid', yandexuid) - - def call_api(action): - return (self._download_json( - urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ - 'hash': video_hash, - 'sk': sk, - }).encode(), headers={ - 'Content-Type': 'text/plain', - }, fatal=False) or {}).get('data') or {} - if not source_url: - # TODO: figure out how to detect if download limit has - # been reached and then avoid unnecessary source format - # extraction requests - source_url = call_api('download-url').get('url') - if not video_streams: - video_streams = call_api('get-video-streams') - - formats = [] - if source_url: - formats.append({ - 'url': source_url, - 'format_id': 'source', - 'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'), - 'quality': 1, - 'filesize': int_or_none(meta.get('size')) - }) - - for video in (video_streams.get('videos') or []): - format_url = video.get('url') - if not format_url: - continue - if video.get('dimension') == 'adaptive': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - size = video.get('size') or {} - height = int_or_none(size.get('height')) - format_id = 'hls' - if height: - format_id += '-%dp' % height - formats.append({ - 'ext': 'mp4', - 'format_id': format_id, - 'height': height, - 'protocol': 'm3u8_native', - 'url': format_url, - 'width': int_or_none(size.get('width')), - }) - self._sort_formats(formats) - - uid = resource.get('uid') - display_name = try_get(store, lambda x: x['users'][uid]['displayName']) - - return { - 'id': video_id, - 'title': title, - 'duration': float_or_none(video_streams.get('duration'), 1000), - 'uploader': display_name, - 'uploader_id': uid, - 'view_count': int_or_none(meta.get('views_counter')), - 'formats': formats, - } diff --git 
a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py deleted file mode 100644 index 84969f8e1..000000000 --- a/youtube_dl/extractor/yandexmusic.py +++ /dev/null @@ -1,459 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import itertools -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - float_or_none, - try_get, -) - - -class YandexMusicBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by|com)' - - @staticmethod - def _handle_error(response): - if isinstance(response, dict): - error = response.get('error') - if error: - raise ExtractorError(error, expected=True) - if response.get('type') == 'captcha' or 'captcha' in response: - YandexMusicBaseIE._raise_captcha() - - @staticmethod - def _raise_captcha(): - raise ExtractorError( - 'YandexMusic has considered youtube-dl requests automated and ' - 'asks you to solve a CAPTCHA. You can either wait for some ' - 'time until unblocked and optionally use --sleep-interval ' - 'in future or alternatively you can go to https://music.yandex.ru/ ' - 'solve CAPTCHA, then export cookies and pass cookie file to ' - 'youtube-dl with --cookies', - expected=True) - - def _download_webpage_handle(self, *args, **kwargs): - webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs) - if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: - self._raise_captcha() - return webpage - - def _download_json(self, *args, **kwargs): - response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) - self._handle_error(response) - return response - - def _call_api(self, ep, tld, url, item_id, note, query): - return self._download_json( - 'https://music.yandex.%s/handlers/%s.jsx' % (tld, ep), - item_id, note, - fatal=False, - headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - 'X-Retpath-Y': url, - }, - query=query) - - -class YandexMusicTrackIE(YandexMusicBaseIE): - IE_NAME = 'yandexmusic:track' - IE_DESC = 'Яндекс.Музыка - Трек' - _VALID_URL = r'%s/album/(?P<album_id>\d+)/track/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE - - _TESTS = [{ - 'url': 'http://music.yandex.ru/album/540508/track/4878838', - 'md5': 'dec8b661f12027ceaba33318787fff76', - 'info_dict': { - 'id': '4878838', - 'ext': 'mp3', - 'title': 'md5:c63e19341fdbe84e43425a30bc777856', - 'filesize': int, - 'duration': 193.04, - 'track': 'md5:210508c6ffdfd67a493a6c378f22c3ff', - 'album': 'md5:cd04fb13c4efeafdfa0a6a6aca36d01a', - 'album_artist': 'md5:5f54c35462c07952df33d97cfb5fc200', - 'artist': 'md5:e6fd86621825f14dc0b25db3acd68160', - 'release_year': 2009, - }, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }, { - # multiple disks - 'url': 'http://music.yandex.ru/album/3840501/track/705105', - 'md5': '82a54e9e787301dd45aba093cf6e58c0', - 'info_dict': { - 'id': '705105', - 'ext': 'mp3', - 'title': 'md5:f86d4a9188279860a83000277024c1a6', - 'filesize': int, - 'duration': 239.27, - 'track': 'md5:40f887f0666ba1aa10b835aca44807d1', - 'album': 'md5:624f5224b14f5c88a8e812fd7fbf1873', - 'album_artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', - 'artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', - 'release_year': 2016, - 'genre': 'pop', - 'disc_number': 2, - 'track_number': 9, - }, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }, { - 'url': 'http://music.yandex.com/album/540508/track/4878838', - 'only_matching': True, - 
}] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id') - - track = self._call_api( - 'track', tld, url, track_id, 'Downloading track JSON', - {'track': '%s:%s' % (track_id, album_id)})['track'] - track_title = track['title'] - - download_data = self._download_json( - 'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id), - track_id, 'Downloading track location url JSON', - headers={'X-Retpath-Y': url}) - - fd_data = self._download_json( - download_data['src'], track_id, - 'Downloading track location JSON', - query={'format': 'json'}) - key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest() - f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id']) - - thumbnail = None - cover_uri = track.get('albums', [{}])[0].get('coverUri') - if cover_uri: - thumbnail = cover_uri.replace('%%', 'orig') - if not thumbnail.startswith('http'): - thumbnail = 'http://' + thumbnail - - track_info = { - 'id': track_id, - 'ext': 'mp3', - 'url': f_url, - 'filesize': int_or_none(track.get('fileSize')), - 'duration': float_or_none(track.get('durationMs'), 1000), - 'thumbnail': thumbnail, - 'track': track_title, - 'acodec': download_data.get('codec'), - 'abr': int_or_none(download_data.get('bitrate')), - } - - def extract_artist_name(artist): - decomposed = artist.get('decomposed') - if not isinstance(decomposed, list): - return artist['name'] - parts = [artist['name']] - for element in decomposed: - if isinstance(element, dict) and element.get('name'): - parts.append(element['name']) - elif isinstance(element, compat_str): - parts.append(element) - return ''.join(parts) - - def extract_artist(artist_list): - if artist_list and isinstance(artist_list, list): - artists_names = [extract_artist_name(a) for a in artist_list if a.get('name')] - if artists_names: - return ', '.join(artists_names) - - albums = track.get('albums') - if albums and isinstance(albums, list): - album = albums[0] - if isinstance(album, dict): - year = album.get('year') - disc_number = int_or_none(try_get( - album, lambda x: x['trackPosition']['volume'])) - track_number = int_or_none(try_get( - album, lambda x: x['trackPosition']['index'])) - track_info.update({ - 'album': album.get('title'), - 'album_artist': extract_artist(album.get('artists')), - 'release_year': int_or_none(year), - 'genre': album.get('genre'), - 'disc_number': disc_number, - 'track_number': track_number, - }) - - track_artist = extract_artist(track.get('artists')) - if track_artist: - track_info.update({ - 'artist': track_artist, - 'title': '%s - %s' % (track_artist, track_title), - }) - else: - track_info['title'] = track_title - - return track_info - - -class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): - def _extract_tracks(self, source, item_id, url, tld): - tracks = source['tracks'] - track_ids = [compat_str(track_id) for track_id in source['trackIds']] - - # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, - # missing tracks should be retrieved manually. 
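- # e.g. the 437-track test playlist ships at most 150 tracks inline,
- # leaving 287 ids to fetch via track-entries in chunks of 250 (one full
- # chunk of 250 plus a final chunk of 37).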
- if len(tracks) < len(track_ids): - present_track_ids = set([ - compat_str(track['id']) - for track in tracks if track.get('id')]) - missing_track_ids = [ - track_id for track_id in track_ids - if track_id not in present_track_ids] - # Request missing tracks in chunks to avoid exceeding max HTTP header size, - # see https://github.com/ytdl-org/youtube-dl/issues/27355 - _TRACKS_PER_CHUNK = 250 - for chunk_num in itertools.count(0): - start = chunk_num * _TRACKS_PER_CHUNK - end = start + _TRACKS_PER_CHUNK - missing_track_ids_req = missing_track_ids[start:end] - assert missing_track_ids_req - missing_tracks = self._call_api( - 'track-entries', tld, url, item_id, - 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), { - 'entries': ','.join(missing_track_ids_req), - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - 'strict': 'true', - }) - if missing_tracks: - tracks.extend(missing_tracks) - if end >= len(missing_track_ids): - break - - return tracks - - def _build_playlist(self, tracks): - entries = [] - for track in tracks: - track_id = track.get('id') or track.get('realId') - if not track_id: - continue - albums = track.get('albums') - if not albums or not isinstance(albums, list): - continue - album = albums[0] - if not isinstance(album, dict): - continue - album_id = album.get('id') - if not album_id: - continue - entries.append(self.url_result( - 'http://music.yandex.ru/album/%s/track/%s' % (album_id, track_id), - ie=YandexMusicTrackIE.ie_key(), video_id=track_id)) - return entries - - -class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): - IE_NAME = 'yandexmusic:album' - IE_DESC = 'Яндекс.Музыка - Альбом' - _VALID_URL = r'%s/album/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE - - _TESTS = [{ - 'url': 'http://music.yandex.ru/album/540508', - 'info_dict': { - 'id': '540508', - 'title': 'md5:7ed1c3567f28d14be9f61179116f5571', - }, - 'playlist_count': 50, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }, { - 'url': 'https://music.yandex.ru/album/3840501', - 'info_dict': { - 'id': '3840501', - 'title': 'md5:36733472cdaa7dcb1fd9473f7da8e50f', - }, - 'playlist_count': 33, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }, { - # empty artists - 'url': 'https://music.yandex.ru/album/9091882', - 'info_dict': { - 'id': '9091882', - 'title': 'ТЕД на русском', - }, - 'playlist_count': 187, - }] - - @classmethod - def suitable(cls, url): - return False if YandexMusicTrackIE.suitable(url) else super(YandexMusicAlbumIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tld = mobj.group('tld') - album_id = mobj.group('id') - - album = self._call_api( - 'album', tld, url, album_id, 'Downloading album JSON', - {'album': album_id}) - - entries = self._build_playlist([track for volume in album['volumes'] for track in volume]) - - title = album['title'] - artist = try_get(album, lambda x: x['artists'][0]['name'], compat_str) - if artist: - title = '%s - %s' % (artist, title) - year = album.get('year') - if year: - title += ' (%s)' % year - - return self.playlist_result(entries, compat_str(album['id']), title) - - -class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): - IE_NAME = 'yandexmusic:playlist' - IE_DESC = 'Яндекс.Музыка - Плейлист' - _VALID_URL = r'%s/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE - - _TESTS = [{ - 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', - 'info_dict': { - 'id': '1245', - 'title': 
'md5:841559b3fe2b998eca88d0d2e22a3097', - 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', - }, - 'playlist_count': 5, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }, { - 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', - 'only_matching': True, - }, { - # playlist exceeding the limit of 150 tracks (see - # https://github.com/ytdl-org/youtube-dl/issues/6666) - 'url': 'https://music.yandex.ru/users/mesiaz/playlists/1364', - 'info_dict': { - 'id': '1364', - 'title': 'md5:b3b400f997d3f878a13ae0699653f7db', - }, - 'playlist_mincount': 437, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tld = mobj.group('tld') - user = mobj.group('user') - playlist_id = mobj.group('id') - - playlist = self._call_api( - 'playlist', tld, url, playlist_id, 'Downloading playlist JSON', { - 'owner': user, - 'kinds': playlist_id, - 'light': 'true', - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - })['playlist'] - - tracks = self._extract_tracks(playlist, playlist_id, url, tld) - - return self.playlist_result( - self._build_playlist(tracks), - compat_str(playlist_id), - playlist.get('title'), playlist.get('description')) - - -class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE): - def _call_artist(self, tld, url, artist_id): - return self._call_api( - 'artist', tld, url, artist_id, - 'Downloading artist %s JSON' % self._ARTIST_WHAT, { - 'artist': artist_id, - 'what': self._ARTIST_WHAT, - 'sort': self._ARTIST_SORT or '', - 'dir': '', - 'period': '', - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - }) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tld = mobj.group('tld') - artist_id = mobj.group('id') - data = self._call_artist(tld, url, artist_id) - tracks = self._extract_tracks(data, artist_id, url, tld) - title = try_get(data, lambda x: x['artist']['name'], compat_str) - return self.playlist_result( - self._build_playlist(tracks), artist_id, title) - - -class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE): - IE_NAME = 'yandexmusic:artist:tracks' - IE_DESC = 'Яндекс.Музыка - Артист - Треки' - _VALID_URL = r'%s/artist/(?P<id>\d+)/tracks' % YandexMusicBaseIE._VALID_URL_BASE - - _TESTS = [{ - 'url': 'https://music.yandex.ru/artist/617526/tracks', - 'info_dict': { - 'id': '617526', - 'title': 'md5:131aef29d45fd5a965ca613e708c040b', - }, - 'playlist_count': 507, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }] - - _ARTIST_SORT = '' - _ARTIST_WHAT = 'tracks' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tld = mobj.group('tld') - artist_id = mobj.group('id') - data = self._call_artist(tld, url, artist_id) - tracks = self._extract_tracks(data, artist_id, url, tld) - artist = try_get(data, lambda x: x['artist']['name'], compat_str) - title = '%s - %s' % (artist or artist_id, 'Треки') - return self.playlist_result( - self._build_playlist(tracks), artist_id, title) - - -class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE): - IE_NAME = 'yandexmusic:artist:albums' - IE_DESC = 'Яндекс.Музыка - Артист - Альбомы' - _VALID_URL = r'%s/artist/(?P<id>\d+)/albums' % YandexMusicBaseIE._VALID_URL_BASE - - _TESTS = [{ - 'url': 'https://music.yandex.ru/artist/617526/albums', - 'info_dict': { - 'id': '617526', - 'title': 'md5:55dc58d5c85699b7fb41ee926700236c', - }, - 'playlist_count': 8, - # 'skip': 'Travis CI servers blocked by YandexMusic', - }] - - _ARTIST_SORT 
= 'year' - _ARTIST_WHAT = 'albums' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tld = mobj.group('tld') - artist_id = mobj.group('id') - data = self._call_artist(tld, url, artist_id) - entries = [] - for album in data['albums']: - if not isinstance(album, dict): - continue - album_id = album.get('id') - if not album_id: - continue - entries.append(self.url_result( - 'http://music.yandex.ru/album/%s' % album_id, - ie=YandexMusicAlbumIE.ie_key(), video_id=album_id)) - artist = try_get(data, lambda x: x['artist']['name'], compat_str) - title = '%s - %s' % (artist or artist_id, 'Альбомы') - return self.playlist_result(entries, artist_id, title) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py deleted file mode 100644 index 6a166ec9b..000000000 --- a/youtube_dl/extractor/yandexvideo.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - try_get, - url_or_none, -) - - -class YandexVideoIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=| - frontend\.vh\.yandex\.ru/player/ - ) - (?P<id>(?:[\da-f]{32}|[\w-]{12})) - ''' - _TESTS = [{ - 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', - 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', - 'info_dict': { - 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', - 'ext': 'mp4', - 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', - 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', - 'thumbnail': r're:^https?://', - 'timestamp': 1549972939, - 'duration': 5575, - 'age_limit': 18, - 'upload_date': '20190212', - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - }, { - 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', - 'only_matching': True, - }, { - 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', - 'only_matching': True, - }, { - 'url': 'https://frontend.vh.yandex.ru/player/4dbb262b4fe5cf15a215de4f34eee34d?from=morda', - 'only_matching': True, - }, { - # vod-episode, series episode - 'url': 'https://yandex.ru/portal/video?stream_id=45b11db6e4b68797919c93751a938cee', - 'only_matching': True, - }, { - # episode, sports - 'url': 'https://yandex.ru/?stream_channel=1538487871&stream_id=4132a07f71fb0396be93d74b3477131d', - 'only_matching': True, - }, { - # DASH with DRM - 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', - 'only_matching': True, - }, { - 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - player = try_get((self._download_json( - 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ - player(content_id: "%s") { - computed_title - content_url - description - dislikes - duration - likes - program_title - release_date - release_date_ut - release_year - restriction_age - season - start_time - streams - thumbnail - title - views_count - } -}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) - if not player or player.get('error'): - player = self._download_json( - 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, - video_id, query={ - 'stream_options': 'hires', - 'disable_trackings': 1, - }) - content = 
player['content'] - - title = content.get('title') or content['computed_title'] - - formats = [] - streams = content.get('streams') or [] - streams.append({'url': content.get('content_url')}) - for stream in streams: - content_url = url_or_none(stream.get('url')) - if not content_url: - continue - ext = determine_ext(content_url) - if ext == 'ismc': - continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - content_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - content_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({'url': content_url}) - - self._sort_formats(formats) - - timestamp = (int_or_none(content.get('release_date')) - or int_or_none(content.get('release_date_ut')) - or int_or_none(content.get('start_time'))) - season = content.get('season') or {} - - return { - 'id': video_id, - 'title': title, - 'description': content.get('description'), - 'thumbnail': content.get('thumbnail'), - 'timestamp': timestamp, - 'duration': int_or_none(content.get('duration')), - 'series': content.get('program_title'), - 'age_limit': int_or_none(content.get('restriction_age')), - 'view_count': int_or_none(content.get('views_count')), - 'like_count': int_or_none(content.get('likes')), - 'dislike_count': int_or_none(content.get('dislikes')), - 'season_number': int_or_none(season.get('season_number')), - 'season_id': season.get('id'), - 'release_year': int_or_none(content.get('release_year')), - 'formats': formats, - } diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py deleted file mode 100644 index 88aabd272..000000000 --- a/youtube_dl/extractor/youjizz.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - url_or_none, -) - - -class YouJizzIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))' - _TESTS = [{ - 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4', - 'info_dict': { - 'id': '2189178', - 'ext': 'mp4', - 'title': 'Zeichentrick 1', - 'age_limit': 18, - 'duration': 2874, - } - }, { - 'url': 'http://www.youjizz.com/videos/-2189178.html', - 'only_matching': True, - }, { - 'url': 'https://www.youjizz.com/videos/embed/31991001', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('embed_id') - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') - - formats = [] - - encodings = self._parse_json( - self._search_regex( - r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', - default='[]'), - video_id, fatal=False) - for encoding in encodings: - if not isinstance(encoding, dict): - continue - format_url = url_or_none(encoding.get('filename')) - if not format_url: - continue - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - format_id = encoding.get('name') or encoding.get('quality') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': height, 
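                    # [editor's note] A hypothetical encoding entry such as
                    # {'name': '720p', 'filename': 'https://example.com/720p.mp4'}
                    # ends up here as format_id '720p' with height 720, the
                    # leading digits being parsed by the ^(\d+)[pP] regex
                    # just above.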
- }) - - if formats: - info_dict = { - 'formats': formats, - } - else: - # YouJizz's HTML5 player has invalid HTML - webpage = webpage.replace('"controls', '" controls') - info_dict = self._parse_html5_media_entries( - url, webpage, video_id)[0] - - duration = parse_duration(self._search_regex( - r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration', - default=None)) - uploader = self._search_regex( - r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader', - default=None) - - info_dict.update({ - 'id': video_id, - 'title': title, - 'age_limit': self._rta_search(webpage), - 'duration': duration, - 'uploader': uploader, - }) - - return info_dict diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py deleted file mode 100644 index 880c89687..000000000 --- a/youtube_dl/extractor/youku.py +++ /dev/null @@ -1,309 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import random -import re -import string -import time - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - get_element_by_class, - js_to_json, - str_or_none, - strip_jsonp, -) - - -class YoukuIE(InfoExtractor): - IE_NAME = 'youku' - IE_DESC = '优酷' - _VALID_URL = r'''(?x) - (?: - https?://( - (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| - video\.tudou\.com/v/)| - youku:) - (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) - ''' - - _TESTS = [{ - # MD5 is unstable - 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', - 'info_dict': { - 'id': 'XMTc1ODE5Njcy', - 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', - 'ext': 'mp4', - 'duration': 74.73, - 'thumbnail': r're:^https?://.*', - 'uploader': '。躲猫猫、', - 'uploader_id': '36017967', - 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', - 'tags': list, - } - }, { - 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', - 'only_matching': True, - }, { - 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', - 'info_dict': { - 'id': 'XODgxNjg1Mzk2', - 'ext': 'mp4', - 'title': '武媚娘传奇 85', - 'duration': 1999.61, - 'thumbnail': r're:^https?://.*', - 'uploader': '疯狂豆花', - 'uploader_id': '62583473', - 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', - 'tags': list, - }, - }, { - 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', - 'info_dict': { - 'id': 'XMTI1OTczNDM5Mg', - 'ext': 'mp4', - 'title': '花千骨 04', - 'duration': 2363, - 'thumbnail': r're:^https?://.*', - 'uploader': '放剧场-花千骨', - 'uploader_id': '772849359', - 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', - 'tags': list, - }, - }, { - 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', - 'note': 'Video protected with password', - 'info_dict': { - 'id': 'XNjA1NzA2Njgw', - 'ext': 'mp4', - 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', - 'duration': 7264.5, - 'thumbnail': r're:^https?://.*', - 'uploader': 'FoxJin1006', - 'uploader_id': '322014285', - 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', - 'tags': list, - }, - 'params': { - 'videopassword': '100600', - }, - }, { - # /play/get.json contains streams with "channel_type":"tail" - 'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html', - 'info_dict': { - 'id': 'XOTUxMzg4NDMy', - 'ext': 'mp4', - 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', - 'duration': 702.08, - 'thumbnail': r're:^https?://.*', - 'uploader': '明月庄主moon', - 'uploader_id': '38465621', - 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', - 'tags': list, - }, - }, { - 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', - 'info_dict': { - 'id': 'XMjIyNzAzMTQ4NA', - 'ext': 
'mp4', - 'title': '卡马乔国足开大脚长传冲吊集锦', - 'duration': 289, - 'thumbnail': r're:^https?://.*', - 'uploader': '阿卜杜拉之星', - 'uploader_id': '2382249', - 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', - 'tags': list, - }, - }, { - 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', - 'only_matching': True, - }] - - @staticmethod - def get_ysuid(): - return '%d%s' % (int(time.time()), ''.join([ - random.choice(string.ascii_letters) for i in range(3)])) - - def get_format_name(self, fm): - _dict = { - '3gp': 'h6', - '3gphd': 'h5', - 'flv': 'h4', - 'flvhd': 'h4', - 'mp4': 'h3', - 'mp4hd': 'h3', - 'mp4hd2': 'h4', - 'mp4hd3': 'h4', - 'hd2': 'h2', - 'hd3': 'h1', - } - return _dict.get(fm) - - def _real_extract(self, url): - video_id = self._match_id(url) - - self._set_cookie('youku.com', '__ysuid', self.get_ysuid()) - self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') - - _, urlh = self._download_webpage_handle( - 'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info') - # The etag header is '"foobar"'; let's remove the double quotes - cna = urlh.headers['etag'][1:-1] - - # request basic data - basic_data_params = { - 'vid': video_id, - 'ccode': '0532', - 'client_ip': '192.168.1.1', - 'utid': cna, - 'client_ts': time.time() / 1000, - } - - video_password = self._downloader.params.get('videopassword') - if video_password: - basic_data_params['password'] = video_password - - headers = { - 'Referer': url, - } - headers.update(self.geo_verification_headers()) - data = self._download_json( - 'https://ups.youku.com/ups/get.json', video_id, - 'Downloading JSON metadata', - query=basic_data_params, headers=headers)['data'] - - error = data.get('error') - if error: - error_note = error.get('note') - if error_note is not None and '因版权原因无法观看此视频' in error_note: - raise ExtractorError( - 'Youku said: Sorry, this video is available in China only', expected=True) - elif error_note and '该视频被设为私密' in error_note: - raise ExtractorError( - 'Youku said: Sorry, this video is private', expected=True) - else: - msg = 'Youku server reported error %i' % error.get('code') - if error_note is not None: - msg += ': ' + error_note - raise ExtractorError(msg) - - # get video title - video_data = data['video'] - title = video_data['title'] - - formats = [{ - 'url': stream['m3u8_url'], - 'format_id': self.get_format_name(stream.get('stream_type')), - 'ext': 'mp4', - 'protocol': 'm3u8_native', - 'filesize': int(stream.get('size')), - 'width': stream.get('width'), - 'height': stream.get('height'), - } for stream in data['stream'] if stream.get('channel_type') != 'tail'] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'duration': video_data.get('seconds'), - 'thumbnail': video_data.get('logo'), - 'uploader': video_data.get('username'), - 'uploader_id': str_or_none(video_data.get('userid')), - 'uploader_url': data.get('uploader', {}).get('homepage'), - 'tags': video_data.get('tags'), - } - - -class YoukuShowIE(InfoExtractor): - _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' - IE_NAME = 'youku:show' - - _TESTS = [{ - 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', - 'info_dict': { - 'id': 'zc7c670be07ff11e48b3f', - 'title': '花千骨 DVD版', - 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', - }, - 'playlist_count': 50, - }, { - # Episode number not starting from 1 - 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', - 'info_dict': { - 'id': 'zefbfbd70efbfbd780bef', - 'title': '超级飞侠3', - 
'description': 'md5:275715156abebe5ccc2a1992e9d56b98', - }, - 'playlist_count': 24, - }, { - # Ongoing playlist. The initial page is the last one - 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', - 'only_matching': True, - }, { - # No data-id value. - 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', - 'only_matching': True, - }, { - # Wrong number of reload_id. - 'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html', - 'only_matching': True, - }] - - def _extract_entries(self, playlist_data_url, show_id, note, query): - query['callback'] = 'cb' - playlist_data = self._download_json( - playlist_data_url, show_id, query=query, note=note, - transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html') - if playlist_data is None: - return [None, None] - drama_list = (get_element_by_class('p-drama-grid', playlist_data) - or get_element_by_class('p-drama-half-row', playlist_data)) - if drama_list is None: - raise ExtractorError('No episodes found') - video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list) - return playlist_data, [ - self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key()) - for video_url in video_urls] - - def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) - - entries = [] - page_config = self._parse_json(self._search_regex( - r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), - show_id, transform_source=js_to_json) - first_page, initial_entries = self._extract_entries( - 'http://list.youku.com/show/module', show_id, - note='Downloading initial playlist data page', - query={ - 'id': page_config['showid'], - 'tab': 'showInfo', - }) - first_page_reload_id = self._html_search_regex( - r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id') - # The first reload_id has the same items as first_page - reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page) - entries.extend(initial_entries) - for idx, reload_id in enumerate(reload_ids): - if reload_id == first_page_reload_id: - continue - _, new_entries = self._extract_entries( - 'http://list.youku.com/show/episode', show_id, - note='Downloading playlist data page %d' % (idx + 1), - query={ - 'id': page_config['showid'], - 'stage': reload_id, - }) - if new_entries is not None: - entries.extend(new_entries) - desc = self._html_search_meta('description', webpage, fatal=False) - playlist_title = desc.split(',')[0] if desc else None - detail_li = get_element_by_class('p-intro', webpage) - playlist_description = get_element_by_class( - 'intro-more', detail_li) if detail_li else None - - return self.playlist_result( - entries, show_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py deleted file mode 100644 index 7084d3d12..000000000 --- a/youtube_dl/extractor/youporn.py +++ /dev/null @@ -1,184 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - extract_attributes, - int_or_none, - str_to_int, - unified_strdate, - url_or_none, -) - - -class YouPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' 
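    # [editor's note] How the pattern above carves up a watch URL, shown
    # against the first test URL below (the display_id group is optional,
    # so bare /watch/<id> and /embed/<id> URLs match as well):
    #   >>> m = re.match(YouPornIE._VALID_URL,
    #   ...     'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/')
    #   >>> m.group('id'), m.group('display_id')
    #   ('505835', 'sex-ed-is-it-safe-to-masturbate-daily')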
- _TESTS = [{ - 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - 'md5': '3744d24c50438cf5b6f6d59feb5055c2', - 'info_dict': { - 'id': '505835', - 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', - 'ext': 'mp4', - 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', - 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 210, - 'uploader': 'Ask Dan And Jennifer', - 'upload_date': '20101217', - 'average_rating': int, - 'view_count': int, - 'categories': list, - 'tags': list, - 'age_limit': 18, - }, - 'skip': 'This video has been disabled', - }, { - # Unknown uploader - 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', - 'info_dict': { - 'id': '561726', - 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show', - 'ext': 'mp4', - 'title': 'Big Tits Awesome Brunette On amazing webcam show', - 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Unknown', - 'upload_date': '20110418', - 'average_rating': int, - 'view_count': int, - 'categories': list, - 'tags': list, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - 'skip': '404', - }, { - 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', - 'only_matching': True, - }, { - 'url': 'http://www.youporn.com/watch/505835', - 'only_matching': True, - }, { - 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', - webpage) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - definitions = self._download_json( - 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, - display_id) - - formats = [] - for definition in definitions: - if not isinstance(definition, dict): - continue - video_url = url_or_none(definition.get('videoUrl')) - if not video_url: - continue - f = { - 'url': video_url, - 'filesize': int_or_none(definition.get('videoSize')), - } - height = int_or_none(definition.get('quality')) - # Video URL's path looks like this: - # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 - # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'tbr': bitrate, - }) - f['height'] = height - formats.append(f) - self._sort_formats(formats) - - webpage = self._download_webpage( - 'http://www.youporn.com/watch/%s' % video_id, display_id, - headers={'Cookie': 'age_verified=1'}) - - title = self._html_search_regex( - r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'title', webpage, 
fatal=True) - - description = self._html_search_regex( - r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>', - webpage, 'description', - default=None) or self._og_search_description( - webpage, default=None) - thumbnail = self._search_regex( - r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', - webpage, 'thumbnail', fatal=False, group='thumbnail') - duration = int_or_none(self._html_search_meta( - 'video:duration', webpage, 'duration', fatal=False)) - - uploader = self._html_search_regex( - r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', - webpage, 'uploader', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - [r'UPLOADED:\s*<span>([^<]+)', - r'Date\s+[Aa]dded:\s*<span>([^<]+)', - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], - webpage, 'upload date', fatal=False)) - - age_limit = self._rta_search(webpage) - - view_count = None - views = self._search_regex( - r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage, - 'views', default=None) - if views: - view_count = str_to_int(extract_attributes(views).get('data-value')) - comment_count = str_to_int(self._search_regex( - r'>All [Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', default=None)) - - def extract_tag_box(regex, title): - tag_box = self._search_regex(regex, webpage, title, default=None) - if not tag_box: - return [] - return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box) - - categories = extract_tag_box( - r'(?s)Categories:.*?</[^>]+>(.+?)</div>', 'categories') - tags = extract_tag_box( - r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>', - 'tags') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': uploader, - 'upload_date': upload_date, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - 'tags': tags, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py deleted file mode 100644 index dc4bd4a77..000000000 --- a/youtube_dl/extractor/youtube.py +++ /dev/null @@ -1,3257 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import itertools -import json -import os.path -import random -import re -import traceback - -from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_parse_qs, - compat_str, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) -from ..jsinterp import JSInterpreter -from ..utils import ( - ExtractorError, - clean_html, - dict_get, - float_or_none, - int_or_none, - mimetype2ext, - parse_codecs, - parse_duration, - qualities, - remove_start, - smuggle_url, - str_or_none, - str_to_int, - try_get, - unescapeHTML, - unified_strdate, - unsmuggle_url, - update_url_query, - url_or_none, - urlencode_postdata, - urljoin, -) - - -def parse_qs(url): - return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - - -class YoutubeBaseInfoExtractor(InfoExtractor): - """Provide base functions for Youtube extractors""" - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 
'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - - _NETRC_MACHINE = 'youtube' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = False - - _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - - def _login(self): - """ - Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - username, password = self._get_login_info() - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - # TODO: reverse actual botguard identifier generation algo - 'bgRequest': '["identifier",""]', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - def warn(message): - self._downloader.report_warning(message) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' 
else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' 
% challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - - def _initialize_consent(self): - cookies = self._get_cookies('https://www.youtube.com/') - if cookies.get('__Secure-3PSID'): - return - consent_id = None - consent = cookies.get('CONSENT') - if consent: - if 'YES' in consent.value: - return - consent_id = self._search_regex( - r'PENDING\+(\d+)', consent.value, 'consent', default=None) - if not consent_id: - consent_id = random.randint(100, 999) - self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) - - def _real_initialize(self): - self._initialize_consent() - if self._downloader is None: - return - if not self._login(): - return - - _DEFAULT_API_DATA = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, - } - - _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' - _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' - _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' - - def _call_api(self, ep, query, video_id, fatal=True): - data = self._DEFAULT_API_DATA.copy() - data.update(query) - - return self._download_json( - 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, - note='Downloading API JSON', errnote='Unable to download API page', - data=json.dumps(data).encode('utf8'), fatal=fatal, - headers={'content-type': 'application/json'}, - query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) - - def _extract_yt_initial_data(self, video_id, webpage): - return self._parse_json( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), - video_id) - - def _extract_ytcfg(self, video_id, webpage): - return self._parse_json( - self._search_regex( - r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) or {} - - def _extract_video(self, renderer): - video_id = renderer['videoId'] - title = try_get( - renderer, - (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) - description = try_get( - renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], - compat_str) - duration = parse_duration(try_get( - renderer, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get( - renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get( - renderer, - (lambda x: x['ownerText']['runs'][0]['text'], - lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) - return { - '_type': 'url', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - - -class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' - _INVIDIOUS_SITES = ( - # invidious-redirect websites - 
r'(?:www\.)?redirect\.invidious\.io', - r'(?:(?:www|dev)\.)?invidio\.us', - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md - r'(?:(?:www|no)\.)?invidiou\.sh', - r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', - r'(?:www\.)?invidious\.kabi\.tk', - r'(?:www\.)?invidious\.13ad\.de', - r'(?:www\.)?invidious\.mastodon\.host', - r'(?:www\.)?invidious\.zapashcanon\.fr', - r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', - r'(?:www\.)?invidious\.tinfoil-hat\.net', - r'(?:www\.)?invidious\.himiko\.cloud', - r'(?:www\.)?invidious\.reallyancient\.tech', - r'(?:www\.)?invidious\.tube', - r'(?:www\.)?invidiou\.site', - r'(?:www\.)?invidious\.site', - r'(?:www\.)?invidious\.xyz', - r'(?:www\.)?invidious\.nixnet\.xyz', - r'(?:www\.)?invidious\.048596\.xyz', - r'(?:www\.)?invidious\.drycat\.fr', - r'(?:www\.)?inv\.skyn3t\.in', - r'(?:www\.)?tube\.poal\.co', - r'(?:www\.)?tube\.connect\.cafe', - r'(?:www\.)?vid\.wxzm\.sx', - r'(?:www\.)?vid\.mint\.lgbt', - r'(?:www\.)?vid\.puffyan\.us', - r'(?:www\.)?yewtu\.be', - r'(?:www\.)?yt\.elukerio\.org', - r'(?:www\.)?yt\.lelux\.fi', - r'(?:www\.)?invidious\.ggc-project\.de', - r'(?:www\.)?yt\.maisputain\.ovh', - r'(?:www\.)?ytprivate\.com', - r'(?:www\.)?invidious\.13ad\.de', - r'(?:www\.)?invidious\.toot\.koeln', - r'(?:www\.)?invidious\.fdn\.fr', - r'(?:www\.)?watch\.nettohikari\.com', - r'(?:www\.)?invidious\.namazso\.eu', - r'(?:www\.)?invidious\.silkky\.cloud', - r'(?:www\.)?invidious\.exonip\.de', - r'(?:www\.)?invidious\.riverside\.rocks', - r'(?:www\.)?invidious\.blamefran\.net', - r'(?:www\.)?invidious\.moomoo\.de', - r'(?:www\.)?ytb\.trom\.tf', - r'(?:www\.)?yt\.cyberhost\.uk', - r'(?:www\.)?kgg2m7yk5aybusll\.onion', - r'(?:www\.)?qklhadlycap4cnod\.onion', - r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', - r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', - r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', - r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', - r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', - r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', - r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', - r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', - r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', - r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', - ) - _VALID_URL = r"""(?x)^ - ( - (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| - (?:www\.)?deturl\.com/www\.youtube\.com| - (?:www\.)?pwnyoutube\.com| - (?:www\.)?hooktube\.com| - (?:www\.)?yourepeat\.com| - tube\.majestyc\.net| - %(invidious)s| - youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?[&;])?? 
# any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) - v= - ) - )) - |(?: - youtu\.be| # just youtu.be/xxxx - vid\.plus| # or vid.plus/xxxx - zwearz\.com/watch| # or zwearz.com/watch/xxxx - %(invidious)s - )/ - |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= - ) - )? # all until now is optional -> you can pass the naked ID - (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?(1).+)? # if we found the ID, everything can follow - $""" % { - 'invidious': '|'.join(_INVIDIOUS_SITES), - } - _PLAYER_INFO_RE = ( - r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', - r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', - r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', - ) - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - }, - 'skip': 'Private video', - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 
'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - 'abr': 129.495, - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed), available via embed page - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - { - # Age-gated video only available with authentication (unavailable - # via embed page workaround) - 'url': 'XgnwCQzjau8', - 'only_matching': True, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', - 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # 
url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 - 'uploader': 'Airtek', - 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', - 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '135', # bestvideo - }, - 'skip': 'This live event has ended.', - }, - { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg', - 'info_dict': { - 'id': 'jvGDaLqkpTg', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'jvGDaLqkpTg', - 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10643, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', - }, - }, { - 'info_dict': { - 'id': '3AKt1R1aDnw', - 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10991, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', - }, - }, { - 'info_dict': { - 'id': 'RtAMM00gpVc', - 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10995, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', - }, - }, { - 'info_dict': { - 'id': '6N2fdlP3C5U', - 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10990, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', - }, - }], - 'params': { - 'skip_download': True, - }, - }, - { - # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) - 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', - 'info_dict': { - 'id': 'gVfLd0zydlo', - 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', - }, - 'playlist_count': 2, - 'skip': 'Not multifeed anymore', - }, - { - 'url': 'https://vid.plus/FlRa-iH7PGw', - 'only_matching': True, - }, - { - 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', - 'only_matching': True, - }, - { - # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) - # Also tests cut-off URL expansion in video description (see - # https://github.com/ytdl-org/youtube-dl/issues/1892, - # https://github.com/ytdl-org/youtube-dl/issues/8164) - 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', - 'info_dict': { - 'id': 'lsguqyKfVQg', - 'ext': 'mp4', - 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', - 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', - 'duration': 133, - 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', - 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) - 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', - 'only_matching': True, - }, - { - # Video with yt:stretch=17:0 - 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', - 'info_dict': { - 'id': 'Q39EVAstoRM', - 'ext': 'mp4', - 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', - 'description': 'md5:ee18a25c350637c8faff806845bddee9', - 'upload_date': '20151107', - 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', - 'uploader': 'CH GAMER DROID', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video does not exist.', - }, - { - # Video with incomplete 'yt:stretch=16:' - 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI', - 'only_matching': True, - }, - { - # Video licensed under Creative Commons - 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', - 'info_dict': { - 'id': 'M4gD1WSo5mA', - 'ext': 'mp4', - 'title': 'md5:e41008789470fc2533a3252216f1c1d1', - 'description': 'md5:a677553cf0840649b731a3024aeff4cc', - 'duration': 721, - 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'The Berkman Klein Center for Internet & Society', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Channel-like uploader_url - 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', - 'info_dict': { - 'id': 'eQcmzGIKrzg', - 'ext': 'mp4', - 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:13a2503d7b5904ef4b223aa101628f39', - 'duration': 4060, - 'upload_date': '20151119', - 'uploader': 'Bernie Sanders', - 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?feature=player_embedded&v=V36LpHqtcDY', - 'only_matching': True, - }, - { - # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) - 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', - 'only_matching': True, - }, - { - # Rental video preview - 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', - 'info_dict': { - 'id': 'uGpuVWrhIzE', - 'ext': 'mp4', - 'title': 'Piku - Trailer', - 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', - 'upload_date': '20150811', - 'uploader': 'FlixMatrix', - 'uploader_id': 'FlixMatrixKaravan', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', - 'license': 'Standard YouTube License', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # YouTube Red video with episode data - 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', - 'info_dict': { - 'id': 'iqKdEhx-dD4', - 'ext': 'mp4', - 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd', - 'duration': 2085, - 'upload_date': '20170118', - 'uploader': 'Vsauce', - 'uploader_id': 'Vsauce', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'series': 'Mind Field', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': [ - 'Skipping DASH manifest', - ], - }, - { - # The following content has been identified by the YouTube community - # as inappropriate or offensive to some audiences. - 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', - 'info_dict': { - 'id': '6SJNVb0GnPI', - 'ext': 'mp4', - 'title': 'Race Differences in Intelligence', - 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', - 'duration': 965, - 'upload_date': '20140124', - 'uploader': 'New Century Foundation', - 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.', - }, - { - # itag 212 - 'url': '1t24XAntNCY', - 'only_matching': True, - }, - { - # geo restricted to JP - 'url': 'sJL6WA-aGkQ', - 'only_matching': True, - }, - { - 'url': 'https://invidio.us/watch?v=BaW_jenozKc', - 'only_matching': True, - }, - { - 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc', - 'only_matching': True, - }, - { - # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m - 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA', - 'only_matching': True, - }, - { - # DRM protected - 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', - 'only_matching': True, - }, - { - # Video with unsupported adaptive stream type formats - 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', - 'info_dict': { - 'id': 'Z4Vy8R84T1U', - 'ext': 'mp4', - 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 433, - 'upload_date': '20130923', - 'uploader': 'Amelia Putri Harwita', - 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', - 'formats': 'maxcount:10', - }, - 'params': { - 'skip_download': True, - 
'youtube_include_dash_manifest': False, - }, - 'skip': 'not actual anymore', - }, - { - # Youtube Music Auto-generated description - 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', - 'info_dict': { - 'id': 'MgNrAu2pzNs', - 'ext': 'mp4', - 'title': 'Voyeur Girl', - 'description': 'md5:7ae382a65843d6df2685993e90a8628f', - 'upload_date': '20190312', - 'uploader': 'Stephen - Topic', - 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', - 'artist': 'Stephen', - 'track': 'Voyeur Girl', - 'album': 'it\'s too much love to know my dear', - 'release_date': '20190313', - 'release_year': 2019, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', - 'only_matching': True, - }, - { - # invalid -> valid video id redirection - 'url': 'DJztXj2GPfl', - 'info_dict': { - 'id': 'DJztXj2GPfk', - 'ext': 'mp4', - 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', - 'description': 'md5:bf577a41da97918e94fa9798d9228825', - 'upload_date': '20090125', - 'uploader': 'Prochorowka', - 'uploader_id': 'Prochorowka', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', - 'artist': 'Panjabi MC', - 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', - 'album': 'Beware of the Boys (Mundian To Bach Ke)', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Video unavailable', - }, - { - # empty description results in an empty string - 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', - 'info_dict': { - 'id': 'x41yOUIvK2k', - 'ext': 'mp4', - 'title': 'IMG 3456', - 'description': '', - 'upload_date': '20170613', - 'uploader_id': 'ElevageOrVert', - 'uploader': 'ElevageOrVert', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # with '};' inside yt initial data (see [1]) - # see [2] for an example with '};' inside ytInitialPlayerResponse - # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 - # 2. 
https://github.com/ytdl-org/youtube-dl/issues/27216 - 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', - 'info_dict': { - 'id': 'CHqg6qOn4no', - 'ext': 'mp4', - 'title': 'Part 77 Sort a list of simple types in c#', - 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', - 'upload_date': '20130831', - 'uploader_id': 'kudvenkat', - 'uploader': 'kudvenkat', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # another example of '};' in ytInitialData - 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', - 'only_matching': True, - }, - { - # https://github.com/ytdl-org/youtube-dl/pull/28094 - 'url': 'OtqTfy26tG0', - 'info_dict': { - 'id': 'OtqTfy26tG0', - 'ext': 'mp4', - 'title': 'Burn Out', - 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', - 'upload_date': '20141120', - 'uploader': 'The Cinematic Orchestra - Topic', - 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', - 'artist': 'The Cinematic Orchestra', - 'track': 'Burn Out', - 'album': 'Every Day', - 'release_date': None, - 'release_year': None, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # controversial video, only works with bpctr when authenticated with cookies - 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', - 'only_matching': True, - }, - { - # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 - 'url': 'cBvYw8_A0vQ', - 'info_dict': { - 'id': 'cBvYw8_A0vQ', - 'ext': 'mp4', - 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', - 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', - 'upload_date': '20201120', - 'uploader': 'Walk around Japan', - 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', - }, - 'params': { - 'skip_download': True, - }, - }, - ] - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 
'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 
'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. 
RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } - - @classmethod - def suitable(cls, url): - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('list', [None])[0]: - return False - return super(YoutubeIE, cls).suitable(url) - - def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) - self._code_cache = {} - self._player_cache = {} - - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - - @classmethod - def _extract_player_info(cls, player_url): - for player_re in cls._PLAYER_INFO_RE: - id_m = re.search(player_re, player_url) - if id_m: - break - else: - raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('id') - - def _extract_signature_function(self, video_id, player_url, example_sig): - player_id = self._extract_player_info(player_url) - - # Read from filesystem cache - func_id = 'js_%s_%s' % ( - player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) - - if player_id not in self._code_cache: - self._code_cache[player_id] = self._download_webpage( - player_url, video_id, - note='Downloading player ' + player_id, - errnote='Download of %s failed' % player_url) - code = self._code_cache[player_id] - res = self._parse_sig_js(code) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res - - def _print_sig_code(self, func, example_sig): - def gen_sig_code(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = (':%d' % (end + step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % 
(starts, ends, steps) - - step = None - # Quelch pyflakes warnings - start will be set when step is set - start = '(Never used)' - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = func(test_string) - cache_spec = [ord(c) for c in cache_res] - expr_code = ' + '.join(gen_sig_code(cache_spec)) - signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen('Extracted signature function:\n' + code) - - def _parse_sig_js(self, jscode): - funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', - r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', - r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', - # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), - jscode, 'Initial JS player signature function name', group='sig') - - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url): - """Turn the encrypted s field into a working signature""" - - if player_url is None: - raise ExtractorError('Cannot decrypt signature without player_url') - - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, s) - return func(s) - except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) - - def _mark_watched(self, video_id, player_response): - playback_url = 
url_or_none(try_get( - player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) - if not playback_url: - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) - - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. - CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) - - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] - - # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) - - # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None - - @classmethod - def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - def _extract_chapters_from_json(self, data, video_id, duration): - chapters_list = try_get( - data, - lambda x: x['playerOverlays'] - ['playerOverlayRenderer'] - ['decoratedPlayerBarRenderer'] - ['decoratedPlayerBarRenderer'] - ['playerBar'] - ['chapteredPlayerBarRenderer'] - ['chapters'], - list) - if not chapters_list: - return - - def chapter_time(chapter): - return float_or_none( - try_get( - chapter, - lambda x: x['chapterRenderer']['timeRangeStartMillis'], - int), - scale=1000) - chapters = [] - for next_num, chapter in enumerate(chapters_list, start=1): - start_time = chapter_time(chapter) - if start_time is None: - continue - end_time = (chapter_time(chapters_list[next_num]) - if next_num < len(chapters_list) else duration) - if end_time is None: - continue - title = try_get( - chapter, lambda x: x['chapterRenderer']['title']['simpleText'], - compat_str) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': title, - }) - return chapters - - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), - regex), webpage, name, default='{}'), video_id, fatal=False) - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - video_id = self._match_id(url) - base_url = self.http_scheme() + '//www.youtube.com/' - webpage_url = base_url + 'watch?v=' + video_id - webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999&has_verified=1', video_id, 
fatal=False) - - player_response = None - if webpage: - player_response = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, - video_id, 'initial player response') - if not player_response: - player_response = self._call_api( - 'player', {'videoId': video_id}, video_id) - - playability_status = player_response.get('playabilityStatus') or {} - if playability_status.get('reason') == 'Sign in to confirm your age': - video_info = self._download_webpage( - base_url + 'get_video_info', video_id, - 'Refetching age-gated info webpage', - 'unable to download video info webpage', query={ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'html5': 1, - # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 - 'c': 'TVHTML5', - 'cver': '6.20180913', - }, fatal=False) - if video_info: - pr = self._parse_json( - try_get( - compat_parse_qs(video_info), - lambda x: x['player_response'][0], compat_str) or '{}', - video_id, fatal=False) - if pr and isinstance(pr, dict): - player_response = pr - - trailer_video_id = try_get( - playability_status, - lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'], - compat_str) - if trailer_video_id: - return self.url_result( - trailer_video_id, self.ie_key(), trailer_video_id) - - def get_text(x): - if not x: - return - text = x.get('simpleText') - if text and isinstance(text, compat_str): - return text - runs = x.get('runs') - if not isinstance(runs, list): - return - return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) - - search_meta = ( - lambda x: self._html_search_meta(x, webpage, default=None)) \ - if webpage else lambda x: None - - video_details = player_response.get('videoDetails') or {} - microformat = try_get( - player_response, - lambda x: x['microformat']['playerMicroformatRenderer'], - dict) or {} - video_title = video_details.get('title') \ - or get_text(microformat.get('title')) \ - or search_meta(['og:title', 'twitter:title', 'title']) - video_description = video_details.get('shortDescription') - - if not smuggled_data.get('force_singlefeed', False): - if not self._downloader.params.get('noplaylist'): - multifeed_metadata_list = try_get( - player_response, - lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) - - def feed_entry(name): - return try_get( - feed_data, lambda x: x[name][0], compat_str) - - feed_id = feed_entry('id') - if not feed_id: - continue - feed_title = feed_entry('title') - title = video_title - if feed_title: - title += ' (%s)' % feed_title - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - base_url + 'watch?v=' + feed_data['id'][0], - {'force_singlefeed': True}), - 'title': title, - }) - feed_ids.append(feed_id) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result( - entries, video_id, video_title, video_description) - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - formats = [] - itags = [] - 
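The multifeed branch above relies on the ordering its comment hints at: each entry in metadataList is itself a percent-encoded query string, so a comma inside a textual field is still `%2C` when the raw list is split on `,`, and only becomes a literal comma once the individual feed is unquoted (the problem behind ytdl-org/youtube-dl#8536). A minimal standalone sketch of the same split-then-unquote sequence, using an invented metadataList value and the stdlib equivalents of the compat helpers:

    from urllib.parse import parse_qs, unquote_plus

    # Invented metadataList value: two feeds, the first title carrying an
    # encoded comma (%2C) that must survive the split on ',' and only be
    # decoded per feed afterwards.
    metadata_list = 'id=jvGDaLqkpTg&title=Main%2C+Camera,id=3AKt1R1aDnw&title=Camera+2'
    for feed in metadata_list.split(','):
        feed_data = parse_qs(unquote_plus(feed))
        print(feed_data['id'][0], '->', feed_data['title'][0])
    # jvGDaLqkpTg -> Main, Camera
    # 3AKt1R1aDnw -> Camera 2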
itag_qualities = {} - player_url = None - q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) - streaming_data = player_response.get('streamingData') or {} - streaming_formats = streaming_data.get('formats') or [] - streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) - for fmt in streaming_formats: - if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): - continue - - itag = str_or_none(fmt.get('itag')) - quality = fmt.get('quality') - if itag and quality: - itag_qualities[itag] = quality - # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment - # (adding `&sq=0` to the URL) and parsing emsg box to determine the - # number of fragments that would subsequently be requested with (`&sq=N`) - if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': - continue - - fmt_url = fmt.get('url') - if not fmt_url: - sc = compat_parse_qs(fmt.get('signatureCipher')) - fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) - encrypted_sig = try_get(sc, lambda x: x['s'][0]) - if not (sc and fmt_url and encrypted_sig): - continue - if not player_url: - if not webpage: - continue - player_url = self._search_regex( - r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', - webpage, 'player URL', fatal=False) - if not player_url: - continue - signature = self._decrypt_signature(sc['s'][0], video_id, player_url) - sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' - fmt_url += '&' + sp + '=' + signature - - if itag: - itags.append(itag) - tbr = float_or_none( - fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) - dct = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_id': itag, - 'format_note': fmt.get('qualityLabel') or quality, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), - 'tbr': tbr, - 'url': fmt_url, - 'width': fmt.get('width'), - } - mimetype = fmt.get('mimeType') - if mimetype: - mobj = re.match( - r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) - if mobj: - dct['ext'] = mimetype2ext(mobj.group(1)) - dct.update(parse_codecs(mobj.group(2))) - no_audio = dct.get('acodec') == 'none' - no_video = dct.get('vcodec') == 'none' - if no_audio: - dct['vbr'] = tbr - if no_video: - dct['abr'] = tbr - if no_audio or no_video: - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - if dct.get('ext'): - dct['container'] = dct['ext'] + '_dash' - formats.append(dct) - - hls_manifest_url = streaming_data.get('hlsManifestUrl') - if hls_manifest_url: - for f in self._extract_m3u8_formats( - hls_manifest_url, video_id, 'mp4', fatal=False): - itag = self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None) - if itag: - f['format_id'] = itag - formats.append(f) - - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_manifest_url = streaming_data.get('dashManifestUrl') - if dash_manifest_url: - for f in self._extract_mpd_formats( - dash_manifest_url, video_id, fatal=False): - itag = f['format_id'] - if itag in itags: - continue - if itag in itag_qualities: - f['quality'] = q(itag_qualities[itag]) - filesize = int_or_none(self._search_regex( - r'/clen/(\d+)', f.get('fragment_base_url') - or f['url'], 'file size', default=None)) - if filesize: - f['filesize'] = filesize - formats.append(f) - - if not formats: - if streaming_data.get('licenseInfos'): - raise ExtractorError( - 'This video is DRM protected.', expected=True) 
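For formats that arrive without a plain url, the loop above rebuilds one from signatureCipher: the cipher is an ordinary query string carrying the stream URL, the scrambled signature `s`, and the parameter name `sp` under which the unscrambled signature must be appended (falling back to `signature`). A minimal standalone sketch of that reassembly, with an invented cipher value and a placeholder transform standing in for the player-specific function that _decrypt_signature() extracts from base.js:

    from urllib.parse import parse_qs

    def descramble(s):
        # Placeholder only: the real transform is parsed out of the JS player.
        return s[::-1]

    # Invented signatureCipher value; the url component is percent-encoded.
    cipher = 's=ABCDEF&sp=sig&url=https%3A%2F%2Fexample.invalid%2Fvideoplayback%3Fitag%3D22'
    sc = parse_qs(cipher)
    fmt_url = sc['url'][0]                   # parse_qs already percent-decodes
    sp = (sc.get('sp') or ['signature'])[0]  # name to attach the signature under
    fmt_url += '&' + sp + '=' + descramble(sc['s'][0])
    print(fmt_url)
    # https://example.invalid/videoplayback?itag=22&sig=FEDCBA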
- pemr = try_get( - playability_status, - lambda x: x['errorScreen']['playerErrorMessageRenderer'], - dict) or {} - reason = get_text(pemr.get('reason')) or playability_status.get('reason') - subreason = pemr.get('subreason') - if subreason: - subreason = clean_html(get_text(subreason)) - if subreason == 'The uploader has not made this video available in your country.': - countries = microformat.get('availableCountries') - if not countries: - regions_allowed = search_meta('regionsAllowed') - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - subreason, countries) - reason += '\n' + subreason - if reason: - raise ExtractorError(reason, expected=True) - - self._sort_formats(formats) - - keywords = video_details.get('keywords') or [] - if not keywords and webpage: - keywords = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] - for keyword in keywords: - if keyword.startswith('yt:stretch='): - mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) - if mobj: - # NB: float is intentional for forcing float division - w, h = (float(v) for v in mobj.groups()) - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - break - - thumbnails = [] - for container in (video_details, microformat): - for thumbnail in (try_get( - container, - lambda x: x['thumbnail']['thumbnails'], list) or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'height': int_or_none(thumbnail.get('height')), - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - }) - if thumbnails: - break - else: - thumbnail = search_meta(['og:image', 'twitter:image']) - if thumbnail: - thumbnails = [{'url': thumbnail}] - - category = microformat.get('category') or search_meta('genre') - channel_id = video_details.get('channelId') \ - or microformat.get('externalChannelId') \ - or search_meta('channelId') - duration = int_or_none( - video_details.get('lengthSeconds') - or microformat.get('lengthSeconds')) \ - or parse_duration(search_meta('duration')) - is_live = video_details.get('isLive') - owner_profile_url = microformat.get('ownerProfileUrl') - - info = { - 'id': video_id, - 'title': self._live_title(video_title) if is_live else video_title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': video_description, - 'upload_date': unified_strdate( - microformat.get('uploadDate') - or search_meta('uploadDate')), - 'uploader': video_details['author'], - 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, - 'uploader_url': owner_profile_url, - 'channel_id': channel_id, - 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, - 'duration': duration, - 'view_count': int_or_none( - video_details.get('viewCount') - or microformat.get('viewCount') - or search_meta('interactionCount')), - 'average_rating': float_or_none(video_details.get('averageRating')), - 'age_limit': 18 if ( - microformat.get('isFamilySafe') is False - or search_meta('isFamilyFriendly') == 'false' - or search_meta('og:restrictions:age') == '18+') else 0, - 'webpage_url': webpage_url, - 'categories': [category] if category else None, - 'tags': keywords, - 'is_live': is_live, - } - - pctr = try_get( - player_response, - lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) - if pctr: - def process_language(container, 
base_url, lang_code, query): - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - query.update({ - 'fmt': fmt, - }) - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, query), - }) - container[lang_code] = lang_subs - - subtitles = {} - for caption_track in (pctr.get('captionTracks') or []): - base_url = caption_track.get('baseUrl') - if not base_url: - continue - if caption_track.get('kind') != 'asr': - lang_code = caption_track.get('languageCode') - if not lang_code: - continue - process_language( - subtitles, base_url, lang_code, {}) - continue - automatic_captions = {} - for translation_language in (pctr.get('translationLanguages') or []): - translation_language_code = translation_language.get('languageCode') - if not translation_language_code: - continue - process_language( - automatic_captions, base_url, translation_language_code, - {'tlang': translation_language_code}) - info['automatic_captions'] = automatic_captions - info['subtitles'] = subtitles - - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - for k, v in query.items(): - for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: - d_k += '_time' - if d_k not in info and k in s_ks: - info[d_k] = parse_duration(query[k][0]) - - if video_description: - mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) - if mobj: - release_year = mobj.group('release_year') - release_date = mobj.group('release_date') - if release_date: - release_date = release_date.replace('-', '') - if not release_year: - release_year = release_date[:4] - info.update({ - 'album': mobj.group('album').strip(), - 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), - 'track': mobj.group('track').strip(), - 'release_date': release_date, - 'release_year': int_or_none(release_year), - }) - - initial_data = None - if webpage: - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, - 'yt initial data') - if not initial_data: - initial_data = self._call_api( - 'next', {'videoId': video_id}, video_id, fatal=False) - - if initial_data: - chapters = self._extract_chapters_from_json( - initial_data, video_id, duration) - if not chapters: - for engagement_panel in (initial_data.get('engagementPanels') or []): - contents = try_get( - engagement_panel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], - list) - if not contents: - continue - - def chapter_time(mmlir): - return parse_duration( - get_text(mmlir.get('timeDescription'))) - - chapters = [] - for next_num, content in enumerate(contents, start=1): - mmlir = content.get('macroMarkersListItemRenderer') or {} - start_time = chapter_time(mmlir) - end_time = chapter_time(try_get( - contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ - if next_num < len(contents) else duration - if start_time is None or end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': get_text(mmlir.get('title')), - }) - if chapters: - break - if chapters: - info['chapters'] = chapters - - contents = try_get( - initial_data, - lambda x: 
x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], - list) or [] - for content in contents: - vpir = content.get('videoPrimaryInfoRenderer') - if vpir: - stl = vpir.get('superTitleLink') - if stl: - stl = get_text(stl) - if try_get( - vpir, - lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': - info['location'] = stl - else: - mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) - if mobj: - info.update({ - 'series': mobj.group(1), - 'season_number': int(mobj.group(2)), - 'episode_number': int(mobj.group(3)), - }) - for tlb in (try_get( - vpir, - lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], - list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} - for getter, regex in [( - lambda x: x['defaultText']['accessibility']['accessibilityData'], - r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ - lambda x: x['accessibility'], - lambda x: x['accessibilityData']['accessibilityData'], - ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: - label = (try_get(tbr, getter, dict) or {}).get('label') - if label: - mobj = re.match(regex, label) - if mobj: - info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) - break - sbr_tooltip = try_get( - vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) - if sbr_tooltip: - like_count, dislike_count = sbr_tooltip.split(' / ') - info.update({ - 'like_count': str_to_int(like_count), - 'dislike_count': str_to_int(dislike_count), - }) - vsir = content.get('videoSecondaryInfoRenderer') - if vsir: - info['channel'] = get_text(try_get( - vsir, - lambda x: x['owner']['videoOwnerRenderer']['title'], - dict)) - rows = try_get( - vsir, - lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], - list) or [] - multiple_songs = False - for row in rows: - if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: - multiple_songs = True - break - for row in rows: - mrr = row.get('metadataRowRenderer') or {} - mrr_title = mrr.get('title') - if not mrr_title: - continue - mrr_title = get_text(mrr['title']) - mrr_contents_text = get_text(mrr['contents'][0]) - if mrr_title == 'License': - info['license'] = mrr_contents_text - elif not multiple_songs: - if mrr_title == 'Album': - info['album'] = mrr_contents_text - elif mrr_title == 'Artist': - info['artist'] = mrr_contents_text - elif mrr_title == 'Song': - info['track'] = mrr_contents_text - - for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: - v = info.get(s_k) - if v: - info[d_k] = v - - self.mark_watched(video_id, player_response) - - return info - - -class YoutubeTabIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com tab' - _VALID_URL = r'''(?x) - https?:// - (?:\w+\.)? 
- (?: - youtube(?:kids)?\.com| - invidio\.us - )/ - (?: - (?:channel|c|user|feed|hashtag)/| - (?:playlist|watch)\?.*?\blist=| - (?!(?:watch|embed|v|e)\b) - ) - (?P<id>[^/?\#&]+) - ''' - IE_NAME = 'youtube:tab' - - _TESTS = [{ - # playlists, multipage - 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - }, - }, { - # playlists, multipage, different order - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - }, - }, { - # playlists, series - 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Playlists', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - }, - }, { - # playlists, singlepage - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'title': 'ThirstForScience - Playlists', - 'description': 'md5:609399d937ea957b0f53cbffb747a14c', - } - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }, { - # basic, single video playlist - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', - }, - 'playlist_count': 1, - }, { - # empty playlist - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', - }, - 'playlist_count': 0, - }, { - # Home tab - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Home', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - }, - 'playlist_mincount': 2, - }, { - # Videos tab - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - }, - 'playlist_mincount': 975, - }, { - # Videos tab, sorted by popular - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - }, - 'playlist_mincount': 199, - }, { - # Playlists tab - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Playlists', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - }, - 'playlist_mincount': 17, - }, { - # Community tab - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - }, - 'playlist_mincount': 18, 
- }, { - # Channels tab - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - }, - 'playlist_mincount': 138, - }, { - 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', - }, - 'playlist_count': 96, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', - }, - 'playlist_mincount': 1123, - }, { - # even larger playlist, 8832 videos - 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - }, - 'playlist_mincount': 21, - }, { - # https://github.com/ytdl-org/youtube-dl/issues/21844 - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'uploader': 'Computerphile', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'only_matching': True, - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', - 'info_dict': { - 'id': '9Auq9mYxFEE', - 'ext': 'mp4', - 'title': 'Watch Sky News live', - 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', - 'upload_date': '20191102', - 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', - 'categories': ['News & Politics'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/trending', - 'only_matching': True, - }, { - # needs auth - 'url': 'https://www.youtube.com/feed/library', - 'only_matching': True, - }, { - # needs auth - 'url': 'https://www.youtube.com/feed/history', - 'only_matching': True, - }, { - # needs auth - 'url': 'https://www.youtube.com/feed/subscriptions', - 'only_matching': True, - }, { - # needs auth - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { - # no longer available? - 'url': 'https://www.youtube.com/feed/recommended', - 'only_matching': True, - }, { - # inline playlist whose continuations do not always work - 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/zsecurity', - 'only_matching': True, - }, { - 'url': 'http://www.youtube.com/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/hashtag/cctv9', - 'info_dict': { - 'id': 'cctv9', - 'title': '#cctv9', - }, - 'playlist_mincount': 350, - }, { - 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) - - def _extract_channel_id(self, webpage): - channel_id = self._html_search_meta( - 'channelId', webpage, 'channel id', default=None) - if channel_id: - return channel_id - channel_url = self._html_search_meta( - ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', - 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', - 'twitter:app:url:googleplay'), webpage, 'channel url') - return self._search_regex( - r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)', - channel_url, 'channel id') - - @staticmethod - def _extract_grid_item_renderer(item): - assert isinstance(item, dict) - for key, renderer in item.items(): - if not key.startswith('grid') or not key.endswith('Renderer'): - continue - if not 
isinstance(renderer, dict): - continue - return renderer - - def _grid_entries(self, grid_renderer): - for item in grid_renderer['items']: - if not isinstance(item, dict): - continue - renderer = self._extract_grid_item_renderer(item) - if not isinstance(renderer, dict): - continue - title = try_get( - renderer, (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) - # playlist - playlist_id = renderer.get('playlistId') - if playlist_id: - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, - video_title=title) - continue - # video - video_id = renderer.get('videoId') - if video_id: - yield self._extract_video(renderer) - continue - # channel - channel_id = renderer.get('channelId') - if channel_id: - title = try_get( - renderer, lambda x: x['title']['simpleText'], compat_str) - yield self.url_result( - 'https://www.youtube.com/channel/%s' % channel_id, - ie=YoutubeTabIE.ie_key(), video_title=title) - continue - # generic endpoint URL support - ep_url = urljoin('https://www.youtube.com/', try_get( - renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) - if ep_url: - for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): - if ie.suitable(ep_url): - yield self.url_result( - ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) - break - - def _shelf_entries_from_content(self, shelf_renderer): - content = shelf_renderer.get('content') - if not isinstance(content, dict): - return - renderer = content.get('gridRenderer') - if renderer: - # TODO: add support for nested playlists so each shelf is processed - # as a separate playlist - # TODO: this includes only first N items - for entry in self._grid_entries(renderer): - yield entry - renderer = content.get('horizontalListRenderer') - if renderer: - # TODO - pass - - def _shelf_entries(self, shelf_renderer, skip_channels=False): - ep = try_get( - shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) - shelf_url = urljoin('https://www.youtube.com', ep) - if shelf_url: - # Skip links to other channels; note that checking for - # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL - # will not work - if skip_channels and '/channels?' 
in shelf_url: - return - title = try_get( - shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) - yield self.url_result(shelf_url, video_title=title) - # Shelf may not contain shelf URL, fall back to extraction from content - for entry in self._shelf_entries_from_content(shelf_renderer): - yield entry - - def _playlist_entries(self, video_list_renderer): - for content in video_list_renderer['contents']: - if not isinstance(content, dict): - continue - renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') - if not isinstance(renderer, dict): - continue - video_id = renderer.get('videoId') - if not video_id: - continue - yield self._extract_video(renderer) - - def _video_entry(self, video_renderer): - video_id = video_renderer.get('videoId') - if video_id: - return self._extract_video(video_renderer) - - def _post_thread_entries(self, post_thread_renderer): - post_renderer = try_get( - post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) - if not post_renderer: - return - # video attachment - video_renderer = try_get( - post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) - video_id = None - if video_renderer: - entry = self._video_entry(video_renderer) - if entry: - yield entry - # inline video links - runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] - for run in runs: - if not isinstance(run, dict): - continue - ep_url = try_get( - run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) - if not ep_url: - continue - if not YoutubeIE.suitable(ep_url): - continue - ep_video_id = YoutubeIE._match_id(ep_url) - if video_id == ep_video_id: - continue - yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id) - - def _post_thread_continuation_entries(self, post_thread_continuation): - contents = post_thread_continuation.get('contents') - if not isinstance(contents, list): - return - for content in contents: - renderer = content.get('backstagePostThreadRenderer') - if not isinstance(renderer, dict): - continue - for entry in self._post_thread_entries(renderer): - yield entry - - def _rich_grid_entries(self, contents): - for content in contents: - video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) - if video_renderer: - entry = self._video_entry(video_renderer) - if entry: - yield entry - - @staticmethod - def _build_continuation_query(continuation, ctp=None): - query = { - 'ctoken': continuation, - 'continuation': continuation, - } - if ctp: - query['itct'] = ctp - return query - - @staticmethod - def _extract_next_continuation_data(renderer): - next_continuation = try_get( - renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) - if not next_continuation: - return - continuation = next_continuation.get('continuation') - if not continuation: - return - ctp = next_continuation.get('clickTrackingParams') - return YoutubeTabIE._build_continuation_query(continuation, ctp) - - @classmethod - def _extract_continuation(cls, renderer): - next_continuation = cls._extract_next_continuation_data(renderer) - if next_continuation: - return next_continuation - contents = [] - for key in ('contents', 'items'): - contents.extend(try_get(renderer, lambda x: x[key], list) or []) - for content in contents: - if not isinstance(content, dict): - continue - continuation_ep = try_get( - content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], - dict) - if not continuation_ep: - 
continue - continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) - if not continuation: - continue - ctp = continuation_ep.get('clickTrackingParams') - return YoutubeTabIE._build_continuation_query(continuation, ctp) - - def _entries(self, tab, item_id, webpage): - tab_content = try_get(tab, lambda x: x['content'], dict) - if not tab_content: - return - slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - if slr_renderer: - is_channels_tab = tab.get('title') == 'Channels' - continuation = None - slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer, not is_channels_tab): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry - - if not continuation: - continuation = self._extract_continuation(is_renderer) - if not continuation: - continuation = self._extract_continuation(slr_renderer) - else: - rich_grid_renderer = tab_content.get('richGridRenderer') - if not rich_grid_renderer: - return - for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): - yield entry - continuation = self._extract_continuation(rich_grid_renderer) - - ytcfg = self._extract_ytcfg(item_id, webpage) - client_version = try_get( - ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00' - - headers = { - 'x-youtube-client-name': '1', - 'x-youtube-client-version': client_version, - 'content-type': 'application/json', - } - - context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or { - 'client': { - 'clientName': 'WEB', - 'clientVersion': client_version, - } - } - visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) - - identity_token = self._extract_identity_token(ytcfg, webpage) - if identity_token: - headers['x-youtube-identity-token'] = identity_token - - data = { - 'context': context, - } - - for page_num in itertools.count(1): - if not continuation: - break - if visitor_data: - headers['x-goog-visitor-id'] = visitor_data - data['continuation'] = continuation['continuation'] - data['clickTracking'] = { - 'clickTrackingParams': continuation['itct'] - } - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - response = self._download_json( - 
'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), - headers=headers, data=json.dumps(data).encode('utf8')) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise - if not response: - break - - visitor_data = try_get( - response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data - - continuation_contents = try_get( - response, lambda x: x['continuationContents'], dict) - if continuation_contents: - continuation_renderer = continuation_contents.get('playlistVideoListContinuation') - if continuation_renderer: - for entry in self._playlist_entries(continuation_renderer): - yield entry - continuation = self._extract_continuation(continuation_renderer) - continue - continuation_renderer = continuation_contents.get('gridContinuation') - if continuation_renderer: - for entry in self._grid_entries(continuation_renderer): - yield entry - continuation = self._extract_continuation(continuation_renderer) - continue - continuation_renderer = continuation_contents.get('itemSectionContinuation') - if continuation_renderer: - for entry in self._post_thread_continuation_entries(continuation_renderer): - yield entry - continuation = self._extract_continuation(continuation_renderer) - continue - - on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) - continuation_items = try_get( - on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) - if continuation_items: - continuation_item = continuation_items[0] - if not isinstance(continuation_item, dict): - continue - renderer = self._extract_grid_item_renderer(continuation_item) - if renderer: - grid_renderer = {'items': continuation_items} - for entry in self._grid_entries(grid_renderer): - yield entry - continuation = self._extract_continuation(grid_renderer) - continue - renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') - if renderer: - video_list_renderer = {'contents': continuation_items} - for entry in self._playlist_entries(video_list_renderer): - yield entry - continuation = self._extract_continuation(video_list_renderer) - continue - renderer = continuation_item.get('backstagePostThreadRenderer') - if renderer: - continuation_renderer = {'contents': continuation_items} - for entry in self._post_thread_continuation_entries(continuation_renderer): - yield entry - continuation = self._extract_continuation(continuation_renderer) - continue - renderer = continuation_item.get('richItemRenderer') - if renderer: - for entry in self._rich_grid_entries(continuation_items): - yield entry - continuation = self._extract_continuation({'contents': continuation_items}) - continue - - break - - @staticmethod - def _extract_selected_tab(tabs): - for tab in tabs: - if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): - return tab['tabRenderer'] - else: - raise ExtractorError('Unable to find selected tab') - - @staticmethod - def _extract_uploader(data): - uploader = {} - sidebar_renderer = try_get( - data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) - if sidebar_renderer: - for item in sidebar_renderer: - if not isinstance(item, dict): - continue - renderer = item.get('playlistSidebarSecondaryInfoRenderer') - if not 
isinstance(renderer, dict):
-                    continue
-                owner = try_get(
-                    renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
-                if owner:
-                    uploader['uploader'] = owner.get('text')
-                    uploader['uploader_id'] = try_get(
-                        owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
-                    uploader['uploader_url'] = urljoin(
-                        'https://www.youtube.com/',
-                        try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
-        return uploader
-
-    @staticmethod
-    def _extract_alert(data):
-        alerts = []
-        for alert in try_get(data, lambda x: x['alerts'], list) or []:
-            if not isinstance(alert, dict):
-                continue
-            alert_text = try_get(
-                alert, lambda x: x['alertRenderer']['text'], dict)
-            if not alert_text:
-                continue
-            text = try_get(
-                alert_text,
-                (lambda x: x['simpleText'], lambda x: x['runs'][0]['text']),
-                compat_str)
-            if text:
-                alerts.append(text)
-        return '\n'.join(alerts)
-
-    def _extract_from_tabs(self, item_id, webpage, data, tabs):
-        selected_tab = self._extract_selected_tab(tabs)
-        renderer = try_get(
-            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
-        playlist_id = item_id
-        title = description = None
-        if renderer:
-            channel_title = renderer.get('title') or item_id
-            tab_title = selected_tab.get('title')
-            title = channel_title or item_id
-            if tab_title:
-                title += ' - %s' % tab_title
-            description = renderer.get('description')
-            playlist_id = renderer.get('externalId')
-        else:
-            renderer = try_get(
-                data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
-            if renderer:
-                title = renderer.get('title')
-            else:
-                renderer = try_get(
-                    data, lambda x: x['header']['hashtagHeaderRenderer'], dict)
-                if renderer:
-                    title = try_get(renderer, lambda x: x['hashtag']['simpleText'])
-        playlist = self.playlist_result(
-            self._entries(selected_tab, item_id, webpage),
-            playlist_id=playlist_id, playlist_title=title,
-            playlist_description=description)
-        playlist.update(self._extract_uploader(data))
-        return playlist
-
-    def _extract_from_playlist(self, item_id, url, data, playlist):
-        title = playlist.get('title') or try_get(
-            data, lambda x: x['titleText']['simpleText'], compat_str)
-        playlist_id = playlist.get('playlistId') or item_id
-        # Inline playlist rendition continuations do not always work on
-        # YouTube's side, so delegate to regular tab-based playlist URL
-        # processing whenever possible.
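
The delegation described in the comment above is, concretely, a URL rewrite: take the list id from the inline watch-page playlist and re-dispatch to the tab-based /playlist form. A minimal sketch, assuming the helpers behave as used in this module (parse_qs is this file's small query-string wrapper, update_url_query comes from youtube_dl.utils); the watch URL is only an example:

    watch_url = 'https://www.youtube.com/watch?v=BaW_jenozKc&list=PL59FEE129ADFF2B12'
    playlist_id = parse_qs(watch_url)['list'][0]  # query values arrive as lists
    tab_url = update_url_query('https://www.youtube.com/playlist', {'list': playlist_id})
    # tab_url == 'https://www.youtube.com/playlist?list=PL59FEE129ADFF2B12'

Tab-based /playlist pages paginate reliably through the browse endpoint, which is why the code below prefers handing off to YoutubeTabIE over walking the inline rendition.
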
-        playlist_url = urljoin(url, try_get(
-            playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
-            compat_str))
-        if playlist_url and playlist_url != url:
-            return self.url_result(
-                playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
-                video_title=title)
-        return self.playlist_result(
-            self._playlist_entries(playlist), playlist_id=playlist_id,
-            playlist_title=title)
-
-    def _extract_identity_token(self, ytcfg, webpage):
-        if ytcfg:
-            token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
-            if token:
-                return token
-        return self._search_regex(
-            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
-            'identity token', default=None)
-
-    def _real_extract(self, url):
-        item_id = self._match_id(url)
-        url = compat_urlparse.urlunparse(
-            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
-        # Handle both video/playlist URLs
-        qs = parse_qs(url)
-        video_id = qs.get('v', [None])[0]
-        playlist_id = qs.get('list', [None])[0]
-        if video_id and playlist_id:
-            if self._downloader.params.get('noplaylist'):
-                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
-            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
-        webpage = self._download_webpage(url, item_id)
-        data = self._extract_yt_initial_data(item_id, webpage)
-        tabs = try_get(
-            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
-        if tabs:
-            return self._extract_from_tabs(item_id, webpage, data, tabs)
-        playlist = try_get(
-            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
-        if playlist:
-            return self._extract_from_playlist(item_id, url, data, playlist)
-        # Fall back to video extraction if no playlist-like page is recognized.
-        # First check for the current video, then try the v attribute of the URL query.
-        video_id = try_get(
-            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
-            compat_str) or video_id
-        if video_id:
-            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
-        # Capture and output alerts
-        alert = self._extract_alert(data)
-        if alert:
-            raise ExtractorError(alert, expected=True)
-        # Failed to recognize
-        raise ExtractorError('Unable to recognize tab page')
-
-
-class YoutubePlaylistIE(InfoExtractor):
-    IE_DESC = 'YouTube.com playlists'
-    _VALID_URL = r'''(?x)(?:
-                        (?:https?://)?
-                        (?:\w+\.)?
-                        (?:
-                            (?:
-                                youtube(?:kids)?\.com|
-                                invidio\.us
-                            )
-                            /.*?\?.*?\blist=
-                        )?
- (?P<id>%(playlist_id)s) - )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - IE_NAME = 'youtube:playlist' - _TESTS = [{ - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', - }, - 'playlist_mincount': 29, - }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 982, - 'info_dict': { - 'title': '2018 Chinese New Singles (11/6 updated)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'uploader': 'LBK', - 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', - } - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - if YoutubeTabIE.suitable(url): - return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('v', [None])[0]: - return False - return super(YoutubePlaylistIE, cls).suitable(url) - - def _real_extract(self, url): - playlist_id = self._match_id(url) - qs = parse_qs(url) - if not qs: - qs = {'list': playlist_id} - return self.url_result( - update_url_query('https://www.youtube.com/playlist', qs), - ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - - -class YoutubeYtBeIE(InfoExtractor): - _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TESTS = [{ - 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', - 'info_dict': { - 'id': 'yeWKywCrFtk', - 'ext': 'mp4', - 'title': 'Small Scale Baler and Braiding Rugs', - 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', - 'upload_date': '20161008', - 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', - 'categories': ['Nonprofits & Activism'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - }, { - 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - playlist_id = mobj.group('playlist_id') - return self.url_result( - update_url_query('https://www.youtube.com/watch', { - 'v': video_id, - 'list': playlist_id, - 'feature': 'youtu.be', - }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - - -class YoutubeYtUserIE(InfoExtractor): - _VALID_URL = r'ytuser:(?P<id>.+)' - _TESTS = [{ - 'url': 'ytuser:phihag', - 'only_matching': True, - }] - - def _real_extract(self, url): - user_id = self._match_id(url) - return self.url_result( - 
'https://www.youtube.com/user/%s' % user_id, - ie=YoutubeTabIE.ie_key(), video_id=user_id) - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - _TESTS = [{ - 'url': ':ytfav', - 'only_matching': True, - }, { - 'url': ':ytfavorites', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=LL', - ie=YoutubeTabIE.ie_key()) - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') - IE_NAME = 'youtube:search' - _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None - _TESTS = [] - - def _entries(self, query, n): - data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, - 'query': query, - } - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS - total = 0 - for page_num in itertools.count(1): - search = self._download_json( - 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - video_id='query "%s"' % query, - note='Downloading page %s' % page_num, - errnote='Unable to download API page', fatal=False, - data=json.dumps(data).encode('utf8'), - headers={'content-type': 'application/json'}) - if not search: - break - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - for slr_content in slr_contents: - isr_contents = try_get( - slr_content, - lambda x: x['itemSectionRenderer']['contents'], - list) - if not isr_contents: - continue - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - yield self._extract_video(video) - total += 1 - if total == n: - return - token = try_get( - slr_contents, - lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], - compat_str) - if not token: - break - data['continuation'] = token - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - return self.playlist_result(self._entries(query, n), query) - - -class YoutubeSearchDateIE(YoutubeSearchIE): - IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first' - _SEARCH_PARAMS = 'CAI%3D' - - -r""" -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _real_extract(self, url): - 
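
The search pagination in _entries() above reduces to re-POSTing the same innertube query with the continuation token scraped from the previous page. A standalone sketch of that loop, assuming the public WEB client key shown above is still accepted and using the third-party requests package (neither function below exists in the original module):

    import json
    import requests  # assumption: any HTTP client would work the same way

    SEARCH_API = ('https://www.youtube.com/youtubei/v1/search'
                  '?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8')

    def _continuation_token(page):
        # First pages and continuation pages nest the section list differently,
        # mirroring the two try_get() paths in _entries() above.
        try:
            sections = page['contents']['twoColumnSearchResultsRenderer'][
                'primaryContents']['sectionListRenderer']['contents']
        except KeyError:
            try:
                sections = page['onResponseReceivedCommands'][0][
                    'appendContinuationItemsAction']['continuationItems']
            except (KeyError, IndexError):
                return None
        try:
            return sections[-1]['continuationItemRenderer'][
                'continuationEndpoint']['continuationCommand']['token']
        except (KeyError, IndexError, TypeError):
            return None

    def search_pages(query):
        data = {'context': {'client': {'clientName': 'WEB',
                                       'clientVersion': '2.20201021.03.00'}},
                'query': query}
        while True:
            page = requests.post(
                SEARCH_API, data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'}).json()
            yield page
            token = _continuation_token(page)
            if not token:
                return
            data['continuation'] = token

Each yielded page carries videoRenderer items in the same sections the token is read from; the real extractor walks those to produce entries.
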
mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) -""" - - -class YoutubeFeedsInfoExtractor(YoutubeTabIE): - """ - Base class for feed extractors - Subclasses must define the _FEED_NAME property. - """ - _LOGIN_REQUIRED = True - - @property - def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - ie=YoutubeTabIE.ie_key()) - - -class YoutubeWatchLaterIE(InfoExtractor): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r':ytwatchlater' - _TESTS = [{ - 'url': ':ytwatchlater', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r':ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _TESTS = [{ - 'url': ':ytrec', - 'only_matching': True, - }, { - 'url': ':ytrecommended', - 'only_matching': True, - }] - - -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r':ytsubs(?:criptions)?' - _FEED_NAME = 'subscriptions' - _TESTS = [{ - 'url': ':ytsubs', - 'only_matching': True, - }, { - 'url': ':ytsubscriptions', - 'only_matching': True, - }] - - -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r':ythistory' - _FEED_NAME = 'history' - _TESTS = [{ - 'url': ':ythistory', - 'only_matching': True, - }] - - -class YoutubeTruncatedURLIE(InfoExtractor): - IE_NAME = 'youtube:truncated_url' - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:https?://)? - (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ - (?:watch\?(?: - feature=[a-z_]+| - annotation_id=annotation_[^&]+| - x-yt-cl=[0-9]+| - hl=[^&]*| - t=[0-9]+ - )? - | - attribution_link\?a=[^&]+ - ) - $ - ''' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?feature=foo', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?hl=en-GB', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?t=2372', - 'only_matching': True, - }] - - def _real_extract(self, url): - raise ExtractorError( - 'Did you forget to quote the URL? 
Remember that & is a meta ' - 'character in most shells, so you want to put the URL in quotes, ' - 'like youtube-dl ' - '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply youtube-dl BaW_jenozKc .', - expected=True) - - -class YoutubeTruncatedIDIE(InfoExtractor): - IE_NAME = 'youtube:truncated_id' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - raise ExtractorError( - 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), - expected=True) diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py deleted file mode 100644 index f6496f516..000000000 --- a/youtube_dl/extractor/zapiks.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, - xpath_with_ns, - xpath_text, - int_or_none, -) - - -class ZapiksIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))' - _TESTS = [ - { - 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', - 'md5': 'aeb3c473b2d564b2d46d664d28d5f050', - 'info_dict': { - 'id': '80798', - 'ext': 'mp4', - 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', - 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 528, - 'timestamp': 1359044972, - 'upload_date': '20130124', - 'view_count': int, - }, - }, - { - 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html', - 'only_matching': True, - }, - { - 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html', - 'only_matching': True, - }, - { - 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - if not video_id: - video_id = self._search_regex( - r'data-media-id="(\d+)"', webpage, 'video id') - - playlist = self._download_xml( - 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id, - display_id) - - NS_MAP = { - 'jwplayer': 'http://rss.jwpcdn.com/' - } - - def ns(path): - return xpath_with_ns(path, NS_MAP) - - item = playlist.find('./channel/item') - - title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage) - description = self._og_search_description(webpage, default=None) - thumbnail = xpath_text( - item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None) - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', default=None)) - timestamp = parse_iso8601(self._html_search_meta( - 'uploadDate', webpage, 'upload date', default=None), ' ') - - view_count = int_or_none(self._search_regex( - r'UserPlays:(\d+)', webpage, 'view count', default=None)) - comment_count = int_or_none(self._search_regex( - r'UserComments:(\d+)', webpage, 'comment count', default=None)) - - formats = [] - for source in item.findall(ns('./jwplayer:source')): - format_id = 
source.attrib['label'] - f = { - 'url': source.attrib['file'], - 'format_id': format_id, - } - m = re.search(r'^(?P<height>\d+)[pP]', format_id) - if m: - f['height'] = int(m.group('height')) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'view_count': view_count, - 'comment_count': comment_count, - 'formats': formats, - } diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py deleted file mode 100644 index 6bac3026e..000000000 --- a/youtube_dl/extractor/zattoo.py +++ /dev/null @@ -1,433 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from uuid import uuid4 - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - url_or_none, - urlencode_postdata, -) - - -class ZattooPlatformBaseIE(InfoExtractor): - _power_guide_hash = None - - def _host_url(self): - return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) - - def _login(self): - username, password = self._get_login_info() - if not username or not password: - self.raise_login_required( - 'A valid %s account is needed to access this media.' - % self._NETRC_MACHINE) - - try: - data = self._download_json( - '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in', - data=urlencode_postdata({ - 'login': username, - 'password': password, - 'remember': 'true', - }), headers={ - 'Referer': '%s/login' % self._host_url(), - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError( - 'Unable to login: incorrect username and/or password', - expected=True) - raise - - self._power_guide_hash = data['session']['power_guide_hash'] - - def _real_initialize(self): - webpage = self._download_webpage( - self._host_url(), None, 'Downloading app token') - app_token = self._html_search_regex( - r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', - webpage, 'app token', group='token') - app_version = self._html_search_regex( - r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') - - # Will setup appropriate cookies - self._request_webpage( - '%s/zapi/v2/session/hello' % self._host_url(), None, - 'Opening session', data=urlencode_postdata({ - 'client_app_token': app_token, - 'uuid': compat_str(uuid4()), - 'lang': 'en', - 'app_version': app_version, - 'format': 'json', - })) - - self._login() - - def _extract_cid(self, video_id, channel_name): - channel_groups = self._download_json( - '%s/zapi/v2/cached/channels/%s' % (self._host_url(), - self._power_guide_hash), - video_id, 'Downloading channel list', - query={'details': False})['channel_groups'] - channel_list = [] - for chgrp in channel_groups: - channel_list.extend(chgrp['channels']) - try: - return next( - chan['cid'] for chan in channel_list - if chan.get('cid') and ( - chan.get('display_alias') == channel_name - or chan.get('cid') == channel_name)) - except StopIteration: - raise ExtractorError('Could not extract channel id') - - def _extract_cid_and_video_info(self, video_id): - data = self._download_json( - '%s/zapi/v2/cached/program/power_details/%s' % ( - self._host_url(), self._power_guide_hash), - video_id, - 'Downloading video information', - query={ - 'program_ids': video_id, - 'complete': True, - }) - 
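
The channel-id lookup in _extract_cid above flattens every channel group and matches either the display alias or the cid itself; condensed, the pattern is a generator expression fed to next(). A small sketch with hypothetical sample data (the real list comes from the /zapi/v2/cached/channels/<power_guide_hash> endpoint):

    channel_groups = [  # hypothetical shape of the cached channel list
        {'channels': [{'cid': 'srf_eins', 'display_alias': 'srf1'},
                      {'cid': 'prosieben', 'display_alias': 'prosieben'}]},
    ]

    def find_cid(channel_name):
        channels = [c for grp in channel_groups for c in grp['channels']]
        return next(
            (c['cid'] for c in channels
             if c.get('cid') and (c.get('display_alias') == channel_name
                                  or c.get('cid') == channel_name)),
            None)

    assert find_cid('srf1') == 'srf_eins'        # matched via display_alias
    assert find_cid('prosieben') == 'prosieben'  # matched via cid

The original raises ExtractorError instead of returning None when nothing matches; the default is used here only to keep the sketch self-contained.
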
- p = data['programs'][0] - cid = p['cid'] - - info_dict = { - 'id': video_id, - 'title': p.get('t') or p['et'], - 'description': p.get('d'), - 'thumbnail': p.get('i_url'), - 'creator': p.get('channel_name'), - 'episode': p.get('et'), - 'episode_number': int_or_none(p.get('e_no')), - 'season_number': int_or_none(p.get('s_no')), - 'release_year': int_or_none(p.get('year')), - 'categories': try_get(p, lambda x: x['c'], list), - 'tags': try_get(p, lambda x: x['g'], list) - } - - return cid, info_dict - - def _extract_formats(self, cid, video_id, record_id=None, is_live=False): - postdata_common = { - 'https_watch_urls': True, - } - - if is_live: - postdata_common.update({'timeshift': 10800}) - url = '%s/zapi/watch/live/%s' % (self._host_url(), cid) - elif record_id: - url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id) - else: - url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id) - - formats = [] - for stream_type in ('dash', 'hls', 'hls5', 'hds'): - postdata = postdata_common.copy() - postdata['stream_type'] = stream_type - - data = self._download_json( - url, video_id, 'Downloading %s formats' % stream_type.upper(), - data=urlencode_postdata(postdata), fatal=False) - if not data: - continue - - watch_urls = try_get( - data, lambda x: x['stream']['watch_urls'], list) - if not watch_urls: - continue - - for watch in watch_urls: - if not isinstance(watch, dict): - continue - watch_url = url_or_none(watch.get('url')) - if not watch_url: - continue - format_id_list = [stream_type] - maxrate = watch.get('maxrate') - if maxrate: - format_id_list.append(compat_str(maxrate)) - audio_channel = watch.get('audio_channel') - if audio_channel: - format_id_list.append(compat_str(audio_channel)) - preference = 1 if audio_channel == 'A' else None - format_id = '-'.join(format_id_list) - if stream_type in ('dash', 'dash_widevine', 'dash_playready'): - this_formats = self._extract_mpd_formats( - watch_url, video_id, mpd_id=format_id, fatal=False) - elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): - this_formats = self._extract_m3u8_formats( - watch_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False) - elif stream_type == 'hds': - this_formats = self._extract_f4m_formats( - watch_url, video_id, f4m_id=format_id, fatal=False) - elif stream_type == 'smooth_playready': - this_formats = self._extract_ism_formats( - watch_url, video_id, ism_id=format_id, fatal=False) - else: - assert False - for this_format in this_formats: - this_format['preference'] = preference - formats.extend(this_formats) - self._sort_formats(formats) - return formats - - def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): - if is_live: - cid = self._extract_cid(video_id, channel_name) - info_dict = { - 'id': channel_name, - 'title': self._live_title(channel_name), - 'is_live': True, - } - else: - cid, info_dict = self._extract_cid_and_video_info(video_id) - formats = self._extract_formats( - cid, video_id, record_id=record_id, is_live=is_live) - info_dict['formats'] = formats - return info_dict - - -class QuicklineBaseIE(ZattooPlatformBaseIE): - _NETRC_MACHINE = 'quickline' - _HOST = 'mobiltv.quickline.com' - - -class QuicklineIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST) - - _TEST = { - 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', - 'only_matching': True, - } - - def 
_real_extract(self, url): - channel_name, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_video(channel_name, video_id) - - -class QuicklineLiveIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST) - - _TEST = { - 'url': 'https://mobiltv.quickline.com/watch/srf1', - 'only_matching': True, - } - - @classmethod - def suitable(cls, url): - return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = video_id = self._match_id(url) - return self._extract_video(channel_name, video_id, is_live=True) - - -class ZattooBaseIE(ZattooPlatformBaseIE): - _NETRC_MACHINE = 'zattoo' - _HOST = 'zattoo.com' - - -def _make_valid_url(tmpl, host): - return tmpl % re.escape(host) - - -class ZattooIE(ZattooBaseIE): - _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' - _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) - - # Since regular videos are only available for 7 days and recorded videos - # are only available for a specific user, we cannot have detailed tests. - _TESTS = [{ - 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', - 'only_matching': True, - }, { - 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups() - return self._extract_video(channel_name, video_id, record_id) - - -class ZattooLiveIE(ZattooBaseIE): - _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://zattoo.com/watch/srf1', - 'only_matching': True, - } - - @classmethod - def suitable(cls, url): - return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = video_id = self._match_id(url) - return self._extract_video(channel_name, video_id, is_live=True) - - -class NetPlusIE(ZattooIE): - _NETRC_MACHINE = 'netplus' - _HOST = 'netplus.tv' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.netplus.tv/watch/abc/123-abc', - 'only_matching': True, - }] - - -class MNetTVIE(ZattooIE): - _NETRC_MACHINE = 'mnettv' - _HOST = 'tvplus.m-net.de' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', - 'only_matching': True, - }] - - -class WalyTVIE(ZattooIE): - _NETRC_MACHINE = 'walytv' - _HOST = 'player.waly.tv' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://player.waly.tv/watch/abc/123-abc', - 'only_matching': True, - }] - - -class BBVTVIE(ZattooIE): - _NETRC_MACHINE = 'bbvtv' - _HOST = 'bbv-tv.net' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', - 'only_matching': True, - }] - - -class VTXTVIE(ZattooIE): - _NETRC_MACHINE = 'vtxtv' - _HOST = 'vtxtv.ch' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', - 'only_matching': True, - }] - - -class MyVisionTVIE(ZattooIE): - _NETRC_MACHINE 
= 'myvisiontv' - _HOST = 'myvisiontv.ch' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.myvisiontv.ch/watch/abc/123-abc', - 'only_matching': True, - }] - - -class GlattvisionTVIE(ZattooIE): - _NETRC_MACHINE = 'glattvisiontv' - _HOST = 'iptv.glattvision.ch' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', - 'only_matching': True, - }] - - -class SAKTVIE(ZattooIE): - _NETRC_MACHINE = 'saktv' - _HOST = 'saktv.ch' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.saktv.ch/watch/abc/123-abc', - 'only_matching': True, - }] - - -class EWETVIE(ZattooIE): - _NETRC_MACHINE = 'ewetv' - _HOST = 'tvonline.ewe.de' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', - 'only_matching': True, - }] - - -class QuantumTVIE(ZattooIE): - _NETRC_MACHINE = 'quantumtv' - _HOST = 'quantum-tv.com' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', - 'only_matching': True, - }] - - -class OsnatelTVIE(ZattooIE): - _NETRC_MACHINE = 'osnateltv' - _HOST = 'tvonline.osnatel.de' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', - 'only_matching': True, - }] - - -class EinsUndEinsTVIE(ZattooIE): - _NETRC_MACHINE = '1und1tv' - _HOST = '1und1.tv' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://www.1und1.tv/watch/abc/123-abc', - 'only_matching': True, - }] - - -class SaltTVIE(ZattooIE): - _NETRC_MACHINE = 'salttv' - _HOST = 'tv.salt.ch' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) - - _TESTS = [{ - 'url': 'https://tv.salt.ch/watch/abc/123-abc', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py deleted file mode 100644 index 4dd56f66d..000000000 --- a/youtube_dl/extractor/zdf.py +++ /dev/null @@ -1,378 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - merge_dicts, - NO_DEFAULT, - orderedSet, - parse_codecs, - qualities, - try_get, - unified_timestamp, - update_url_query, - url_or_none, - urljoin, -) - - -class ZDFBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') - - def _call_api(self, url, video_id, item, api_token=None, referrer=None): - headers = {} - if api_token: - headers['Api-Auth'] = 'Bearer %s' % api_token - if referrer: - headers['Referer'] = referrer - return self._download_json( - url, video_id, 'Downloading JSON %s' % item, headers=headers) - - @staticmethod - def _extract_subtitles(src): - subtitles = {} - for caption in try_get(src, lambda x: x['captions'], list) or []: - subtitle_url = url_or_none(caption.get('uri')) - if subtitle_url: - lang = caption.get('language', 'deu') - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - }) - return subtitles - - def _extract_format(self, video_id, formats, format_urls, meta): - format_url = 
url_or_none(meta.get('url')) - if not format_url: - return - if format_url in format_urls: - return - format_urls.add(format_url) - mime_type = meta.get('mimeType') - ext = determine_ext(format_url) - if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) - elif mime_type == 'application/f4m+xml' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) - else: - f = parse_codecs(meta.get('mimeCodec')) - format_id = ['http'] - for p in (meta.get('type'), meta.get('quality')): - if p and isinstance(p, compat_str): - format_id.append(p) - f.update({ - 'url': format_url, - 'format_id': '-'.join(format_id), - 'format_note': meta.get('quality'), - 'language': meta.get('language'), - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - 'preference': -10, - }) - formats.append(f) - - def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): - ptmd = self._call_api( - ptmd_url, video_id, 'metadata', api_token, referrer) - - content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] - - formats = [] - track_uris = set() - for p in ptmd['priorityList']: - formitaeten = p.get('formitaeten') - if not isinstance(formitaeten, list): - continue - for f in formitaeten: - f_qualities = f.get('qualities') - if not isinstance(f_qualities, list): - continue - for quality in f_qualities: - tracks = try_get(quality, lambda x: x['audio']['tracks'], list) - if not tracks: - continue - for track in tracks: - self._extract_format( - content_id, formats, track_uris, { - 'url': track.get('uri'), - 'type': f.get('type'), - 'mimeType': f.get('mimeType'), - 'quality': quality.get('quality'), - 'language': track.get('language'), - }) - self._sort_formats(formats) - - duration = float_or_none(try_get( - ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) - - return { - 'extractor_key': ZDFIE.ie_key(), - 'id': content_id, - 'duration': duration, - 'formats': formats, - 'subtitles': self._extract_subtitles(ptmd), - } - - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) - - -class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' - _TESTS = [{ - # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', - 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', - 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613948400, - 'upload_date': '20210221', - }, - }, { - # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html - 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', - 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', - 'info_dict': { - 'id': '141007_ab18_10wochensommer_film', - 'ext': 'mp4', - 'title': 'Ab 18! 
- 10 Wochen Sommer', - 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', - 'duration': 2660, - 'timestamp': 1608604200, - 'upload_date': '20201222', - }, - }, { - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', - 'info_dict': { - 'id': '151025_magie_farben2_tex', - 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', - 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - }, - }, { - # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', - 'only_matching': True, - }, { - # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html - 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html', - 'only_matching': True, - }, { - # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids - 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', - 'only_matching': True, - }] - - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - - if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') - - info = self._extract_ptmd( - urljoin(url, ptmd_path), video_id, player['apiToken'], url) - - thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) - - return merge_dicts(info, { - 'title': title, - 'description': content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - }) - - def _extract_regular(self, url, player, video_id): - content = self._call_api( - player['content'], video_id, 'content', player['apiToken'], url) - return self._extract_entry(player['content'], player, content, video_id) - - def _extract_mobile(self, video_id): - video = self._download_json( - 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, - video_id) - - document = video['document'] - - title = document['titel'] - content_id = document['basename'] - - formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(content_id, formats, format_urls, f) - self._sort_formats(formats) - - thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for 
thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], compat_str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': content_id, - 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( - try_get(video, lambda x: x['meta']['editorialDate'], compat_str)), - 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), - 'formats': formats, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id) - - return self._extract_mobile(video_id) - - -class ZDFChannelIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', - 'info_dict': { - 'id': 'das-aktuelle-sportstudio', - 'title': 'das aktuelle sportstudio | ZDF', - }, - 'playlist_mincount': 23, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e', - 'info_dict': { - 'id': 'planet-e', - 'title': 'planet e.', - }, - 'playlist_mincount': 50, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) - - def _real_extract(self, url): - channel_id = self._match_id(url) - - webpage = self._download_webpage(url, channel_id) - - entries = [ - self.url_result(item_url, ie=ZDFIE.ie_key()) - for item_url in orderedSet(re.findall( - r'data-plusbar-url=["\'](http.+?\.html)', webpage))] - - return self.playlist_result( - entries, channel_id, self._og_search_title(webpage, fatal=False)) - - r""" - player = self._extract_player(webpage, channel_id) - - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py deleted file mode 100644 index 207c04f5e..000000000 --- a/youtube_dl/extractor/zingmp3.py +++ 
/dev/null @@ -1,161 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, -) - - -class ZingMp3BaseIE(InfoExtractor): - _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html' - _GEO_COUNTRIES = ['VN'] - - def _extract_item(self, item, fatal): - item_id = item['id'] - title = item.get('name') or item['title'] - - formats = [] - for k, v in (item.get('source') or {}).items(): - if not v: - continue - if k in ('mp4', 'hls'): - for res, video_url in v.items(): - if not video_url: - continue - if k == 'hls': - formats.extend(self._extract_m3u8_formats( - video_url, item_id, 'mp4', - 'm3u8_native', m3u8_id=k, fatal=False)) - elif k == 'mp4': - formats.append({ - 'format_id': 'mp4-' + res, - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'^(\d+)p', res, 'resolution', default=None)), - }) - else: - formats.append({ - 'ext': 'mp3', - 'format_id': k, - 'tbr': int_or_none(k), - 'url': self._proto_relative_url(v), - 'vcodec': 'none', - }) - if not formats: - if not fatal: - return - msg = item['msg'] - if msg == 'Sorry, this content is not available in your country.': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - raise ExtractorError(msg, expected=True) - self._sort_formats(formats) - - subtitles = None - lyric = item.get('lyric') - if lyric: - subtitles = { - 'origin': [{ - 'url': lyric, - }], - } - - album = item.get('album') or {} - - return { - 'id': item_id, - 'title': title, - 'formats': formats, - 'thumbnail': item.get('thumbnail'), - 'subtitles': subtitles, - 'duration': int_or_none(item.get('duration')), - 'track': title, - 'artist': item.get('artists_names'), - 'album': album.get('name') or album.get('title'), - 'album_artist': album.get('artists_names'), - } - - def _real_extract(self, url): - page_id = self._match_id(url) - webpage = self._download_webpage( - url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), - page_id, query={'play_song': 1}) - data_path = self._search_regex( - r'data-xml="([^"]+)', webpage, 'data path') - return self._process_data(self._download_json( - 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) - - -class ZingMp3IE(ZingMp3BaseIE): - _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' - _TESTS = [{ - 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', - 'md5': 'ead7ae13693b3205cbc89536a077daed', - 'info_dict': { - 'id': 'ZWZB9WAB', - 'title': 'Xa Mãi Xa', - 'ext': 'mp3', - 'thumbnail': r're:^https?://.+\.jpg', - 'subtitles': { - 'origin': [{ - 'ext': 'lrc', - }] - }, - 'duration': 255, - 'track': 'Xa Mãi Xa', - 'artist': 'Bảo Thy', - 'album': 'Special Album', - 'album_artist': 'Bảo Thy', - }, - }, { - 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', - 'md5': 'e9c972b693aa88301ef981c8151c4343', - 'info_dict': { - 'id': 'ZO8ZF7C7', - 'title': 'Sương Hoa Đưa Lối', - 'ext': 'mp4', - 'thumbnail': r're:^https?://.+\.jpg', - 'duration': 207, - 'track': 'Sương Hoa Đưa Lối', - 'artist': 'K-ICM, RYO', - }, - }, { - 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', - 'only_matching': True, - }] - IE_NAME = 'zingmp3' - IE_DESC = 'mp3.zing.vn' - - def _process_data(self, data): - return self._extract_item(data, True) - - -class ZingMp3AlbumIE(ZingMp3BaseIE): - _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' - _TESTS = [{ - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 
- 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái', - }, - 'playlist_count': 10, - }, { - 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', - 'only_matching': True, - }, { - 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'only_matching': True, - }] - IE_NAME = 'zingmp3:album' - - def _process_data(self, data): - def entries(): - for item in (data.get('items') or []): - entry = self._extract_item(item, False) - if entry: - yield entry - info = data.get('info') or {} - return self.playlist_result( - entries(), info.get('id'), info.get('name') or info.get('title')) diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py deleted file mode 100644 index db073d91d..000000000 --- a/youtube_dl/extractor/zoom.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - js_to_json, - parse_filesize, - urlencode_postdata, -) - - -class ZoomIE(InfoExtractor): - IE_NAME = 'zoom' - _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)' - _TEST = { - 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', - 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', - 'info_dict': { - 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', - 'ext': 'mp4', - 'title': 'China\'s "two sessions" and the new five-year plan', - } - } - - def _real_extract(self, url): - base_url, play_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, play_id) - - try: - form = self._form_hidden_inputs('password_form', webpage) - except ExtractorError: - form = None - if form: - password = self._downloader.params.get('videopassword') - if not password: - raise ExtractorError( - 'This video is protected by a passcode, use the --video-password option', expected=True) - is_meeting = form.get('useWhichPasswd') == 'meeting' - validation = self._download_json( - base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), - play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ - 'id': form[('meet' if is_meeting else 'file') + 'Id'], - 'passwd': password, - 'action': form.get('action'), - })) - if not validation.get('status'): - raise ExtractorError(validation['errorMessage'], expected=True) - webpage = self._download_webpage(url, play_id) - - data = self._parse_json(self._search_regex( - r'(?s)window\.__data__\s*=\s*({.+?});', - webpage, 'data'), play_id, js_to_json) - - return { - 'id': play_id, - 'title': data['topic'], - 'url': data['viewMp4Url'], - 'width': int_or_none(data.get('viewResolvtionsWidth')), - 'height': int_or_none(data.get('viewResolvtionsHeight')), - 'http_headers': { - 'Referer': base_url, - }, - 'filesize_approx': parse_filesize(data.get('fileSize')), - } diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py deleted file mode 100644 index f20f953cb..000000000 --- a/youtube_dl/extractor/zype.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - dict_get, - ExtractorError, - int_or_none, - js_to_json, - parse_iso8601, -) - - -class 
ZypeIE(InfoExtractor): - _ID_RE = r'[\da-fA-F]+' - _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)=' - _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE)) - _TEST = { - 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', - 'md5': 'eaee31d474c76a955bdaba02a505c595', - 'info_dict': { - 'id': '5b400b834b32992a310622b9', - 'ext': 'mp4', - 'title': 'Smoky Barbecue Favorites', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'timestamp': 1504915200, - 'upload_date': '20170909', - }, - } - - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE), - webpage)] - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - response = self._download_json(re.sub( - r'\.(?:js|html)\?', '.json?', url), video_id)['response'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403): - raise ExtractorError(self._parse_json( - e.cause.read().decode(), video_id)['message'], expected=True) - raise - - body = response['body'] - video = response['video'] - title = video['title'] - - if isinstance(body, dict): - formats = [] - for output in body.get('outputs', []): - output_url = output.get('url') - if not output_url: - continue - name = output.get('name') - if name == 'm3u8': - formats = self._extract_m3u8_formats( - output_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - else: - f = { - 'format_id': name, - 'tbr': int_or_none(output.get('bitrate')), - 'url': output_url, - } - if name in ('m4a', 'mp3'): - f['vcodec'] = 'none' - else: - f.update({ - 'height': int_or_none(output.get('height')), - 'width': int_or_none(output.get('width')), - }) - formats.append(f) - text_tracks = body.get('subtitles') or [] - else: - m3u8_url = self._search_regex( - r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', - body, 'm3u8 url', group='url', default=None) - if not m3u8_url: - source = self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') - - def get_attr(key): - return self._search_regex( - r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, - source, key, group='val') - - if get_attr('integration') == 'verizon-media': - m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - text_tracks = self._search_regex( - r'textTracks\s*:\s*(\[[^]]+\])', - body, 'text tracks', default=None) - if text_tracks: - text_tracks = self._parse_json( - text_tracks, video_id, js_to_json, False) - self._sort_formats(formats) - - subtitles = {} - if text_tracks: - for text_track in text_tracks: - tt_url = dict_get(text_track, ('file', 'src')) - if not tt_url: - continue - subtitles.setdefault(text_track.get('label') or 'English', []).append({ - 'url': tt_url, - }) - - thumbnails = [] - for thumbnail in video.get('thumbnails', []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': video_id, - 'display_id': video.get('friendly_title'), - 'title': 
title, - 'thumbnails': thumbnails, - 'description': dict_get(video, ('description', 'ott_description', 'short_description')), - 'timestamp': parse_iso8601(video.get('published_at')), - 'duration': int_or_none(video.get('duration')), - 'view_count': int_or_none(video.get('request_count')), - 'average_rating': int_or_none(video.get('rating')), - 'season_number': int_or_none(video.get('season')), - 'episode_number': int_or_none(video.get('episode')), - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/options.py b/youtube_dl/options.py deleted file mode 100644 index 378d66cad..000000000 --- a/youtube_dl/options.py +++ /dev/null @@ -1,916 +0,0 @@ -from __future__ import unicode_literals - -import os.path -import optparse -import re -import sys - -from .downloader.external import list_external_downloaders -from .compat import ( - compat_expanduser, - compat_get_terminal_size, - compat_getenv, - compat_kwargs, - compat_shlex_split, -) -from .utils import ( - preferredencoding, - write_string, -) -from .version import __version__ - - -def _hide_login_info(opts): - PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) - eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') - - def _scrub_eq(o): - m = eqre.match(o) - if m: - return m.group('key') + '=PRIVATE' - else: - return o - - opts = list(map(_scrub_eq, opts)) - for idx, opt in enumerate(opts): - if opt in PRIVATE_OPTS and idx + 1 < len(opts): - opts[idx + 1] = 'PRIVATE' - return opts - - -def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes, default=[]): - try: - optionf = open(filename_bytes) - except IOError: - return default # silently skip if file is not present - try: - # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 - contents = optionf.read() - if sys.version_info < (3,): - contents = contents.decode(preferredencoding()) - res = compat_shlex_split(contents, comments=True) - finally: - optionf.close() - return res - - def _readUserConf(): - xdg_config_home = compat_getenv('XDG_CONFIG_HOME') - if xdg_config_home: - userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') - if not os.path.isfile(userConfFile): - userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') - else: - userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl', 'config') - if not os.path.isfile(userConfFile): - userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl.conf') - userConf = _readOptions(userConfFile, None) - - if userConf is None: - appdata_dir = compat_getenv('appdata') - if appdata_dir: - userConf = _readOptions( - os.path.join(appdata_dir, 'youtube-dl', 'config'), - default=None) - if userConf is None: - userConf = _readOptions( - os.path.join(appdata_dir, 'youtube-dl', 'config.txt'), - default=None) - - if userConf is None: - userConf = _readOptions( - os.path.join(compat_expanduser('~'), 'youtube-dl.conf'), - default=None) - if userConf is None: - userConf = _readOptions( - os.path.join(compat_expanduser('~'), 'youtube-dl.conf.txt'), - default=None) - - if userConf is None: - userConf = [] - - return userConf - - def _format_option_string(option): - ''' ('-o', '--option') -> -o, --format METAVAR''' - - opts = [] - - if option._short_opts: - opts.append(option._short_opts[0]) - if option._long_opts: - opts.append(option._long_opts[0]) - if len(opts) > 1: - opts.insert(1, ', ') - - if option.takes_value(): 
- opts.append(' %s' % option.metavar) - - return ''.join(opts) - - def _comma_separated_values_options_callback(option, opt_str, value, parser): - setattr(parser.values, option.dest, value.split(',')) - - # No need to wrap help messages if we're on a wide console - columns = compat_get_terminal_size().columns - max_width = columns if columns else 80 - max_help_position = 80 - - fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) - fmt.format_option_strings = _format_option_string - - kw = { - 'version': __version__, - 'formatter': fmt, - 'usage': '%prog [OPTIONS] URL [URL...]', - 'conflict_handler': 'resolve', - } - - parser = optparse.OptionParser(**compat_kwargs(kw)) - - general = optparse.OptionGroup(parser, 'General Options') - general.add_option( - '-h', '--help', - action='help', - help='Print this help text and exit') - general.add_option( - '--version', - action='version', - help='Print program version and exit') - general.add_option( - '-i', '--ignore-errors', - action='store_true', dest='ignoreerrors', default=False, - help='Continue on download errors, for example to skip unavailable videos in a playlist') - general.add_option( - '--abort-on-error', - action='store_false', dest='ignoreerrors', - help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') - general.add_option( - '--dump-user-agent', - action='store_true', dest='dump_user_agent', default=False, - help='Display the current browser identification') - general.add_option( - '--list-extractors', - action='store_true', dest='list_extractors', default=False, - help='List all supported extractors') - general.add_option( - '--extractor-descriptions', - action='store_true', dest='list_extractor_descriptions', default=False, - help='Output descriptions of all supported extractors') - general.add_option( - '--force-generic-extractor', - action='store_true', dest='force_generic_extractor', default=False, - help='Force extraction to use the generic extractor') - general.add_option( - '--default-search', - dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.') - general.add_option( - '--ignore-config', - action='store_true', - help='Do not read configuration files. 
' - 'When given in the global configuration file /etc/youtube-dl.conf: ' - 'Do not read the user configuration in ~/.config/youtube-dl/config ' - '(%APPDATA%/youtube-dl/config.txt on Windows)') - general.add_option( - '--config-location', - dest='config_location', metavar='PATH', - help='Location of the configuration file; either the path to the config or its containing directory.') - general.add_option( - '--flat-playlist', - action='store_const', dest='extract_flat', const='in_playlist', - default=False, - help='Do not extract the videos of a playlist, only list them.') - general.add_option( - '--mark-watched', - action='store_true', dest='mark_watched', default=False, - help='Mark videos watched (YouTube only)') - general.add_option( - '--no-mark-watched', - action='store_false', dest='mark_watched', default=False, - help='Do not mark videos watched (YouTube only)') - general.add_option( - '--no-color', '--no-colors', - action='store_true', dest='no_color', - default=False, - help='Do not emit color codes in output') - - network = optparse.OptionGroup(parser, 'Network Options') - network.add_option( - '--proxy', dest='proxy', - default=None, metavar='URL', - help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' - 'SOCKS proxy, specify a proper scheme. For example ' - 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' - 'for direct connection') - network.add_option( - '--socket-timeout', - dest='socket_timeout', type=float, default=None, metavar='SECONDS', - help='Time to wait before giving up, in seconds') - network.add_option( - '--source-address', - metavar='IP', dest='source_address', default=None, - help='Client-side IP address to bind to', - ) - network.add_option( - '-4', '--force-ipv4', - action='store_const', const='0.0.0.0', dest='source_address', - help='Make all connections via IPv4', - ) - network.add_option( - '-6', '--force-ipv6', - action='store_const', const='::', dest='source_address', - help='Make all connections via IPv6', - ) - - geo = optparse.OptionGroup(parser, 'Geo Restriction') - geo.add_option( - '--geo-verification-proxy', - dest='geo_verification_proxy', default=None, metavar='URL', - help='Use this proxy to verify the IP address for some geo-restricted sites. 
' - 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.') - geo.add_option( - '--cn-verification-proxy', - dest='cn_verification_proxy', default=None, metavar='URL', - help=optparse.SUPPRESS_HELP) - geo.add_option( - '--geo-bypass', - action='store_true', dest='geo_bypass', default=True, - help='Bypass geographic restriction via faking X-Forwarded-For HTTP header') - geo.add_option( - '--no-geo-bypass', - action='store_false', dest='geo_bypass', default=True, - help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') - geo.add_option( - '--geo-bypass-country', metavar='CODE', - dest='geo_bypass_country', default=None, - help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') - geo.add_option( - '--geo-bypass-ip-block', metavar='IP_BLOCK', - dest='geo_bypass_ip_block', default=None, - help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') - - selection = optparse.OptionGroup(parser, 'Video Selection') - selection.add_option( - '--playlist-start', - dest='playliststart', metavar='NUMBER', default=1, type=int, - help='Playlist video to start at (default is %default)') - selection.add_option( - '--playlist-end', - dest='playlistend', metavar='NUMBER', default=None, type=int, - help='Playlist video to end at (default is last)') - selection.add_option( - '--playlist-items', - dest='playlist_items', metavar='ITEM_SPEC', default=None, - help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') - selection.add_option( - '--match-title', - dest='matchtitle', metavar='REGEX', - help='Download only matching titles (regex or caseless sub-string)') - selection.add_option( - '--reject-title', - dest='rejecttitle', metavar='REGEX', - help='Skip download for matching titles (regex or caseless sub-string)') - selection.add_option( - '--max-downloads', - dest='max_downloads', metavar='NUMBER', type=int, default=None, - help='Abort after downloading NUMBER files') - selection.add_option( - '--min-filesize', - metavar='SIZE', dest='min_filesize', default=None, - help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)') - selection.add_option( - '--max-filesize', - metavar='SIZE', dest='max_filesize', default=None, - help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)') - selection.add_option( - '--date', - metavar='DATE', dest='date', default=None, - help='Download only videos uploaded in this date') - selection.add_option( - '--datebefore', - metavar='DATE', dest='datebefore', default=None, - help='Download only videos uploaded on or before this date (i.e. inclusive)') - selection.add_option( - '--dateafter', - metavar='DATE', dest='dateafter', default=None, - help='Download only videos uploaded on or after this date (i.e. 
inclusive)') - selection.add_option( - '--min-views', - metavar='COUNT', dest='min_views', default=None, type=int, - help='Do not download any videos with less than COUNT views') - selection.add_option( - '--max-views', - metavar='COUNT', dest='max_views', default=None, type=int, - help='Do not download any videos with more than COUNT views') - selection.add_option( - '--match-filter', - metavar='FILTER', dest='match_filter', default=None, - help=( - 'Generic video filter. ' - 'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to ' - 'match if the key is present, ' - '!key to check if the key is not present, ' - 'key > NUMBER (like "comment_count > 12", also works with ' - '>=, <, <=, !=, =) to compare against a number, ' - 'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) ' - 'to match against a string literal ' - 'and & to require multiple matches. ' - 'Values which are not known are excluded unless you ' - 'put a question mark (?) after the operator. ' - 'For example, to only match videos that have been liked more than ' - '100 times and disliked less than 50 times (or the dislike ' - 'functionality is not available at the given service), but who ' - 'also have a description, use --match-filter ' - '"like_count > 100 & dislike_count <? 50 & description" .' - )) - selection.add_option( - '--no-playlist', - action='store_true', dest='noplaylist', default=False, - help='Download only the video, if the URL refers to a video and a playlist.') - selection.add_option( - '--yes-playlist', - action='store_false', dest='noplaylist', default=False, - help='Download the playlist, if the URL refers to a video and a playlist.') - selection.add_option( - '--age-limit', - metavar='YEARS', dest='age_limit', default=None, type=int, - help='Download only videos suitable for the given age') - selection.add_option( - '--download-archive', metavar='FILE', - dest='download_archive', - help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') - selection.add_option( - '--include-ads', - dest='include_ads', action='store_true', - help='Download advertisements as well (experimental)') - - authentication = optparse.OptionGroup(parser, 'Authentication Options') - authentication.add_option( - '-u', '--username', - dest='username', metavar='USERNAME', - help='Login with this account ID') - authentication.add_option( - '-p', '--password', - dest='password', metavar='PASSWORD', - help='Account password. If this option is left out, youtube-dl will ask interactively.') - authentication.add_option( - '-2', '--twofactor', - dest='twofactor', metavar='TWOFACTOR', - help='Two-factor authentication code') - authentication.add_option( - '-n', '--netrc', - action='store_true', dest='usenetrc', default=False, - help='Use .netrc authentication data') - authentication.add_option( - '--video-password', - dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, youku)') - - adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') - adobe_pass.add_option( - '--ap-mso', - dest='ap_mso', metavar='MSO', - help='Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs') - adobe_pass.add_option( - '--ap-username', - dest='ap_username', metavar='USERNAME', - help='Multiple-system operator account login') - adobe_pass.add_option( - '--ap-password', - dest='ap_password', metavar='PASSWORD', - help='Multiple-system operator account password. 
If this option is left out, youtube-dl will ask interactively.') - adobe_pass.add_option( - '--ap-list-mso', - action='store_true', dest='ap_list_mso', default=False, - help='List all supported multiple-system operators') - - video_format = optparse.OptionGroup(parser, 'Video Format Options') - video_format.add_option( - '-f', '--format', - action='store', dest='format', metavar='FORMAT', default=None, - help='Video format code, see the "FORMAT SELECTION" for all the info') - video_format.add_option( - '--all-formats', - action='store_const', dest='format', const='all', - help='Download all available video formats') - video_format.add_option( - '--prefer-free-formats', - action='store_true', dest='prefer_free_formats', default=False, - help='Prefer free video formats unless a specific one is requested') - video_format.add_option( - '-F', '--list-formats', - action='store_true', dest='listformats', - help='List all available formats of requested videos') - video_format.add_option( - '--youtube-include-dash-manifest', - action='store_true', dest='youtube_include_dash_manifest', default=True, - help=optparse.SUPPRESS_HELP) - video_format.add_option( - '--youtube-skip-dash-manifest', - action='store_false', dest='youtube_include_dash_manifest', - help='Do not download the DASH manifests and related data on YouTube videos') - video_format.add_option( - '--merge-output-format', - action='store', dest='merge_output_format', metavar='FORMAT', default=None, - help=( - 'If a merge is required (e.g. bestvideo+bestaudio), ' - 'output to given container format. One of mkv, mp4, ogg, webm, flv. ' - 'Ignored if no merge is required')) - - subtitles = optparse.OptionGroup(parser, 'Subtitle Options') - subtitles.add_option( - '--write-sub', '--write-srt', - action='store_true', dest='writesubtitles', default=False, - help='Write subtitle file') - subtitles.add_option( - '--write-auto-sub', '--write-automatic-sub', - action='store_true', dest='writeautomaticsub', default=False, - help='Write automatically generated subtitle file (YouTube only)') - subtitles.add_option( - '--all-subs', - action='store_true', dest='allsubtitles', default=False, - help='Download all the available subtitles of the video') - subtitles.add_option( - '--list-subs', - action='store_true', dest='listsubtitles', default=False, - help='List all available subtitles for the video') - subtitles.add_option( - '--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='best', - help='Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"') - subtitles.add_option( - '--sub-lang', '--sub-langs', '--srt-lang', - action='callback', dest='subtitleslangs', metavar='LANGS', type='str', - default=[], callback=_comma_separated_values_options_callback, - help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags') - - downloader = optparse.OptionGroup(parser, 'Download Options') - downloader.add_option( - '-r', '--limit-rate', '--rate-limit', - dest='ratelimit', metavar='RATE', - help='Maximum download rate in bytes per second (e.g. 
50K or 4.2M)') - downloader.add_option( - '-R', '--retries', - dest='retries', metavar='RETRIES', default=10, - help='Number of retries (default is %default), or "infinite".') - downloader.add_option( - '--fragment-retries', - dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') - downloader.add_option( - '--skip-unavailable-fragments', - action='store_true', dest='skip_unavailable_fragments', default=True, - help='Skip unavailable fragments (DASH, hlsnative and ISM)') - downloader.add_option( - '--abort-on-unavailable-fragment', - action='store_false', dest='skip_unavailable_fragments', - help='Abort downloading when some fragment is not available') - downloader.add_option( - '--keep-fragments', - action='store_true', dest='keep_fragments', default=False, - help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default') - downloader.add_option( - '--buffer-size', - dest='buffersize', metavar='SIZE', default='1024', - help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') - downloader.add_option( - '--no-resize-buffer', - action='store_true', dest='noresizebuffer', default=False, - help='Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.') - downloader.add_option( - '--http-chunk-size', - dest='http_chunk_size', metavar='SIZE', default=None, - help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' - 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)') - downloader.add_option( - '--test', - action='store_true', dest='test', default=False, - help=optparse.SUPPRESS_HELP) - downloader.add_option( - '--playlist-reverse', - action='store_true', - help='Download playlist videos in reverse order') - downloader.add_option( - '--playlist-random', - action='store_true', - help='Download playlist videos in random order') - downloader.add_option( - '--xattr-set-filesize', - dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected file size') - downloader.add_option( - '--hls-prefer-native', - dest='hls_prefer_native', action='store_true', default=None, - help='Use the native HLS downloader instead of ffmpeg') - downloader.add_option( - '--hls-prefer-ffmpeg', - dest='hls_prefer_native', action='store_false', default=None, - help='Use ffmpeg instead of the native HLS downloader') - downloader.add_option( - '--hls-use-mpegts', - dest='hls_use_mpegts', action='store_true', - help='Use the mpegts container for HLS videos, allowing to play the ' - 'video while downloading (some players may not be able to play it)') - downloader.add_option( - '--external-downloader', - dest='external_downloader', metavar='COMMAND', - help='Use the specified external downloader. 
' - 'Currently supports %s' % ','.join(list_external_downloaders())) - downloader.add_option( - '--external-downloader-args', - dest='external_downloader_args', metavar='ARGS', - help='Give these arguments to the external downloader') - - workarounds = optparse.OptionGroup(parser, 'Workarounds') - workarounds.add_option( - '--encoding', - dest='encoding', metavar='ENCODING', - help='Force the specified encoding (experimental)') - workarounds.add_option( - '--no-check-certificate', - action='store_true', dest='no_check_certificate', default=False, - help='Suppress HTTPS certificate validation') - workarounds.add_option( - '--prefer-insecure', - '--prefer-unsecure', action='store_true', dest='prefer_insecure', - help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') - workarounds.add_option( - '--user-agent', - metavar='UA', dest='user_agent', - help='Specify a custom user agent') - workarounds.add_option( - '--referer', - metavar='URL', dest='referer', default=None, - help='Specify a custom referer, use if the video access is restricted to one domain', - ) - workarounds.add_option( - '--add-header', - metavar='FIELD:VALUE', dest='headers', action='append', - help='Specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', - ) - workarounds.add_option( - '--bidi-workaround', - dest='bidi_workaround', action='store_true', - help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') - workarounds.add_option( - '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', - dest='sleep_interval', type=float, - help=( - 'Number of seconds to sleep before each download when used alone ' - 'or a lower bound of a range for randomized sleep before each download ' - '(minimum possible number of seconds to sleep) when used along with ' - '--max-sleep-interval.')) - workarounds.add_option( - '--max-sleep-interval', metavar='SECONDS', - dest='max_sleep_interval', type=float, - help=( - 'Upper bound of a range for randomized sleep before each download ' - '(maximum possible number of seconds to sleep). 
Must only be used ' - 'along with --min-sleep-interval.')) - - verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') - verbosity.add_option( - '-q', '--quiet', - action='store_true', dest='quiet', default=False, - help='Activate quiet mode') - verbosity.add_option( - '--no-warnings', - dest='no_warnings', action='store_true', default=False, - help='Ignore warnings') - verbosity.add_option( - '-s', '--simulate', - action='store_true', dest='simulate', default=False, - help='Do not download the video and do not write anything to disk') - verbosity.add_option( - '--skip-download', - action='store_true', dest='skip_download', default=False, - help='Do not download the video') - verbosity.add_option( - '-g', '--get-url', - action='store_true', dest='geturl', default=False, - help='Simulate, quiet but print URL') - verbosity.add_option( - '-e', '--get-title', - action='store_true', dest='gettitle', default=False, - help='Simulate, quiet but print title') - verbosity.add_option( - '--get-id', - action='store_true', dest='getid', default=False, - help='Simulate, quiet but print id') - verbosity.add_option( - '--get-thumbnail', - action='store_true', dest='getthumbnail', default=False, - help='Simulate, quiet but print thumbnail URL') - verbosity.add_option( - '--get-description', - action='store_true', dest='getdescription', default=False, - help='Simulate, quiet but print video description') - verbosity.add_option( - '--get-duration', - action='store_true', dest='getduration', default=False, - help='Simulate, quiet but print video length') - verbosity.add_option( - '--get-filename', - action='store_true', dest='getfilename', default=False, - help='Simulate, quiet but print output filename') - verbosity.add_option( - '--get-format', - action='store_true', dest='getformat', default=False, - help='Simulate, quiet but print output format') - verbosity.add_option( - '-j', '--dump-json', - action='store_true', dest='dumpjson', default=False, - help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.') - verbosity.add_option( - '-J', '--dump-single-json', - action='store_true', dest='dump_single_json', default=False, - help='Simulate, quiet but print JSON information for each command-line argument. 
If the URL refers to a playlist, dump the whole playlist information in a single line.') - verbosity.add_option( - '--print-json', - action='store_true', dest='print_json', default=False, - help='Be quiet and print the video information as JSON (video is still being downloaded).', - ) - verbosity.add_option( - '--newline', - action='store_true', dest='progress_with_newline', default=False, - help='Output progress bar as new lines') - verbosity.add_option( - '--no-progress', - action='store_true', dest='noprogress', default=False, - help='Do not print progress bar') - verbosity.add_option( - '--console-title', - action='store_true', dest='consoletitle', default=False, - help='Display progress in console titlebar') - verbosity.add_option( - '-v', '--verbose', - action='store_true', dest='verbose', default=False, - help='Print various debugging information') - verbosity.add_option( - '--dump-pages', '--dump-intermediate-pages', - action='store_true', dest='dump_intermediate_pages', default=False, - help='Print downloaded pages encoded using base64 to debug problems (very verbose)') - verbosity.add_option( - '--write-pages', - action='store_true', dest='write_pages', default=False, - help='Write downloaded intermediary pages to files in the current directory to debug problems') - verbosity.add_option( - '--youtube-print-sig-code', - action='store_true', dest='youtube_print_sig_code', default=False, - help=optparse.SUPPRESS_HELP) - verbosity.add_option( - '--print-traffic', '--dump-headers', - dest='debug_printtraffic', action='store_true', default=False, - help='Display sent and read HTTP traffic') - verbosity.add_option( - '-C', '--call-home', - dest='call_home', action='store_true', default=False, - help='Contact the youtube-dl server for debugging') - verbosity.add_option( - '--no-call-home', - dest='call_home', action='store_false', default=False, - help='Do NOT contact the youtube-dl server for debugging') - - filesystem = optparse.OptionGroup(parser, 'Filesystem Options') - filesystem.add_option( - '-a', '--batch-file', - dest='batchfile', metavar='FILE', - help="File containing URLs to download ('-' for stdin), one URL per line. 
" - "Lines starting with '#', ';' or ']' are considered as comments and ignored.") - filesystem.add_option( - '--id', default=False, - action='store_true', dest='useid', help='Use only video ID in file name') - filesystem.add_option( - '-o', '--output', - dest='outtmpl', metavar='TEMPLATE', - help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) - filesystem.add_option( - '--output-na-placeholder', - dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA', - help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")')) - filesystem.add_option( - '--autonumber-size', - dest='autonumber_size', metavar='NUMBER', type=int, - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '--autonumber-start', - dest='autonumber_start', metavar='NUMBER', default=1, type=int, - help='Specify the start value for %(autonumber)s (default is %default)') - filesystem.add_option( - '--restrict-filenames', - action='store_true', dest='restrictfilenames', default=False, - help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames') - filesystem.add_option( - '-A', '--auto-number', - action='store_true', dest='autonumber', default=False, - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '-t', '--title', - action='store_true', dest='usetitle', default=False, - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '-l', '--literal', default=False, - action='store_true', dest='usetitle', - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '-w', '--no-overwrites', - action='store_true', dest='nooverwrites', default=False, - help='Do not overwrite files') - filesystem.add_option( - '-c', '--continue', - action='store_true', dest='continue_dl', default=True, - help='Force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.') - filesystem.add_option( - '--no-continue', - action='store_false', dest='continue_dl', - help='Do not resume partially downloaded files (restart from beginning)') - filesystem.add_option( - '--no-part', - action='store_true', dest='nopart', default=False, - help='Do not use .part files - write directly into output file') - filesystem.add_option( - '--no-mtime', - action='store_false', dest='updatetime', default=True, - help='Do not use the Last-modified header to set the file modification time') - filesystem.add_option( - '--write-description', - action='store_true', dest='writedescription', default=False, - help='Write video description to a .description file') - filesystem.add_option( - '--write-info-json', - action='store_true', dest='writeinfojson', default=False, - help='Write video metadata to a .info.json file') - filesystem.add_option( - '--write-annotations', - action='store_true', dest='writeannotations', default=False, - help='Write video annotations to a .annotations.xml file') - filesystem.add_option( - '--load-info-json', '--load-info', - dest='load_info_filename', metavar='FILE', - help='JSON file containing the video information (created with the "--write-info-json" option)') - filesystem.add_option( - '--cookies', - dest='cookiefile', metavar='FILE', - help='File to read cookies from and dump cookie jar in') - filesystem.add_option( - '--cache-dir', dest='cachedir', default=None, metavar='DIR', - help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . 
At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.') - filesystem.add_option( - '--no-cache-dir', action='store_const', const=False, dest='cachedir', - help='Disable filesystem caching') - filesystem.add_option( - '--rm-cache-dir', - action='store_true', dest='rm_cachedir', - help='Delete all filesystem cache files') - - thumbnail = optparse.OptionGroup(parser, 'Thumbnail Options') - thumbnail.add_option( - '--write-thumbnail', - action='store_true', dest='writethumbnail', default=False, - help='Write thumbnail image to disk') - thumbnail.add_option( - '--write-all-thumbnails', - action='store_true', dest='write_all_thumbnails', default=False, - help='Write all thumbnail image formats to disk') - thumbnail.add_option( - '--list-thumbnails', - action='store_true', dest='list_thumbnails', default=False, - help='Simulate and list all available thumbnail formats') - - postproc = optparse.OptionGroup(parser, 'Post-processing Options') - postproc.add_option( - '-x', '--extract-audio', - action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)') - postproc.add_option( - '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') - postproc.add_option( - '--audio-quality', metavar='QUALITY', - dest='audioquality', default='5', - help='Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)') - postproc.add_option( - '--recode-video', - metavar='FORMAT', dest='recodevideo', default=None, - help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)') - postproc.add_option( - '--postprocessor-args', - dest='postprocessor_args', metavar='ARGS', - help='Give these arguments to the postprocessor') - postproc.add_option( - '-k', '--keep-video', - action='store_true', dest='keepvideo', default=False, - help='Keep the video file on disk after the post-processing; the video is erased by default') - postproc.add_option( - '--no-post-overwrites', - action='store_true', dest='nopostoverwrites', default=False, - help='Do not overwrite post-processed files; the post-processed files are overwritten by default') - postproc.add_option( - '--embed-subs', - action='store_true', dest='embedsubtitles', default=False, - help='Embed subtitles in the video (only for mp4, webm and mkv videos)') - postproc.add_option( - '--embed-thumbnail', - action='store_true', dest='embedthumbnail', default=False, - help='Embed thumbnail in the audio as cover art') - postproc.add_option( - '--add-metadata', - action='store_true', dest='addmetadata', default=False, - help='Write metadata to the video file') - postproc.add_option( - '--metadata-from-title', - metavar='FORMAT', dest='metafromtitle', - help='Parse additional metadata like song title / artist from the video title. ' - 'The format syntax is the same as --output. Regular expression with ' - 'named capture groups may also be used. ' - 'The parsed parameters replace existing values. ' - 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' - '"Coldplay - Paradise". ' - 'Example (regex): --metadata-from-title "(?P<artist>.+?) 
- (?P<title>.+)"') - postproc.add_option( - '--xattrs', - action='store_true', dest='xattrs', default=False, - help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)') - postproc.add_option( - '--fixup', - metavar='POLICY', dest='fixup', default='detect_or_warn', - help='Automatically correct known faults of the file. ' - 'One of never (do nothing), warn (only emit a warning), ' - 'detect_or_warn (the default; fix file if we can, warn otherwise)') - postproc.add_option( - '--prefer-avconv', - action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors') - postproc.add_option( - '--prefer-ffmpeg', - action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors (default)') - postproc.add_option( - '--ffmpeg-location', '--avconv-location', metavar='PATH', - dest='ffmpeg_location', - help='Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory.') - postproc.add_option( - '--exec', - metavar='CMD', dest='exec_cmd', - help='Execute a command on the file after downloading and post-processing, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'') - postproc.add_option( - '--convert-subs', '--convert-subtitles', - metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') - - parser.add_option_group(general) - parser.add_option_group(network) - parser.add_option_group(geo) - parser.add_option_group(selection) - parser.add_option_group(downloader) - parser.add_option_group(filesystem) - parser.add_option_group(thumbnail) - parser.add_option_group(verbosity) - parser.add_option_group(workarounds) - parser.add_option_group(video_format) - parser.add_option_group(subtitles) - parser.add_option_group(authentication) - parser.add_option_group(adobe_pass) - parser.add_option_group(postproc) - - if overrideArguments is not None: - opts, args = parser.parse_args(overrideArguments) - if opts.verbose: - write_string('[debug] Override config: ' + repr(overrideArguments) + '\n') - else: - def compat_conf(conf): - if sys.version_info < (3,): - return [a.decode(preferredencoding(), 'replace') for a in conf] - return conf - - command_line_conf = compat_conf(sys.argv[1:]) - opts, args = parser.parse_args(command_line_conf) - - system_conf = user_conf = custom_conf = [] - - if '--config-location' in command_line_conf: - location = compat_expanduser(opts.config_location) - if os.path.isdir(location): - location = os.path.join(location, 'youtube-dl.conf') - if not os.path.exists(location): - parser.error('config-location %s does not exist.' 
% location) - custom_conf = _readOptions(location) - elif '--ignore-config' in command_line_conf: - pass - else: - system_conf = _readOptions('/etc/youtube-dl.conf') - if '--ignore-config' not in system_conf: - user_conf = _readUserConf() - - argv = system_conf + user_conf + custom_conf + command_line_conf - opts, args = parser.parse_args(argv) - if opts.verbose: - for conf_label, conf in ( - ('System config', system_conf), - ('User config', user_conf), - ('Custom config', custom_conf), - ('Command-line args', command_line_conf)): - write_string('[debug] %s: %s\n' % (conf_label, repr(_hide_login_info(conf)))) - - return parser, opts, args diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py deleted file mode 100644 index 3ea518399..000000000 --- a/youtube_dl/postprocessor/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import unicode_literals - -from .embedthumbnail import EmbedThumbnailPP -from .ffmpeg import ( - FFmpegPostProcessor, - FFmpegEmbedSubtitlePP, - FFmpegExtractAudioPP, - FFmpegFixupStretchedPP, - FFmpegFixupM3u8PP, - FFmpegFixupM4aPP, - FFmpegMergerPP, - FFmpegMetadataPP, - FFmpegVideoConvertorPP, - FFmpegSubtitlesConvertorPP, -) -from .xattrpp import XAttrMetadataPP -from .execafterdownload import ExecAfterDownloadPP -from .metadatafromtitle import MetadataFromTitlePP - - -def get_postprocessor(key): - return globals()[key + 'PP'] - - -__all__ = [ - 'EmbedThumbnailPP', - 'ExecAfterDownloadPP', - 'FFmpegEmbedSubtitlePP', - 'FFmpegExtractAudioPP', - 'FFmpegFixupM3u8PP', - 'FFmpegFixupM4aPP', - 'FFmpegFixupStretchedPP', - 'FFmpegMergerPP', - 'FFmpegMetadataPP', - 'FFmpegPostProcessor', - 'FFmpegSubtitlesConvertorPP', - 'FFmpegVideoConvertorPP', - 'MetadataFromTitlePP', - 'XAttrMetadataPP', -] diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py deleted file mode 100644 index 599dd1df2..000000000 --- a/youtube_dl/postprocessor/common.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import unicode_literals - -import os - -from ..utils import ( - PostProcessingError, - cli_configuration_args, - encodeFilename, -) - - -class PostProcessor(object): - """Post Processor class. - - PostProcessor objects can be added to downloaders with their - add_post_processor() method. When the downloader has finished a - successful download, it will take its internal chain of PostProcessors - and start calling the run() method on each one of them, first with - an initial argument and then with the returned value of the previous - PostProcessor. - - The chain will be stopped if one of them ever returns None or the end - of the chain is reached. - - PostProcessor objects follow a "mutual registration" process similar - to InfoExtractor objects. - - Optionally PostProcessor can use a list of additional command-line arguments - with self._configuration_args. - """ - - _downloader = None - - def __init__(self, downloader=None): - self._downloader = downloader - - def set_downloader(self, downloader): - """Sets the downloader for this PP.""" - self._downloader = downloader - - def run(self, information): - """Run the PostProcessor. - - The "information" argument is a dictionary like the ones - composed by InfoExtractors. The only difference is that this - one has an extra field called "filepath" that points to the - downloaded file. - - This method returns a tuple, the first element is a list of the files - that can be deleted, and the second of which is the updated - information. 
- - In addition, this method may raise a PostProcessingError - exception if post processing fails. - """ - return [], information # by default, keep file and do nothing - - def try_utime(self, path, atime, mtime, errnote='Cannot update utime of file'): - try: - os.utime(encodeFilename(path), (atime, mtime)) - except Exception: - self._downloader.report_warning(errnote) - - def _configuration_args(self, default=[]): - return cli_configuration_args(self._downloader.params, 'postprocessor_args', default) - - -class AudioConversionError(PostProcessingError): - pass diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py deleted file mode 100644 index 3990908b6..000000000 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ /dev/null @@ -1,130 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -import os -import subprocess - -from .ffmpeg import FFmpegPostProcessor - -from ..utils import ( - check_executable, - encodeArgument, - encodeFilename, - PostProcessingError, - prepend_extension, - replace_extension, - shell_quote -) - - -class EmbedThumbnailPPError(PostProcessingError): - pass - - -class EmbedThumbnailPP(FFmpegPostProcessor): - def __init__(self, downloader=None, already_have_thumbnail=False): - super(EmbedThumbnailPP, self).__init__(downloader) - self._already_have_thumbnail = already_have_thumbnail - - def run(self, info): - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - - if not info.get('thumbnails'): - self._downloader.to_screen('[embedthumbnail] There aren\'t any thumbnails to embed') - return [], info - - thumbnail_filename = info['thumbnails'][-1]['filename'] - - if not os.path.exists(encodeFilename(thumbnail_filename)): - self._downloader.report_warning( - 'Skipping embedding the thumbnail because the file is missing.') - return [], info - - def is_webp(path): - with open(encodeFilename(path), 'rb') as f: - b = f.read(12) - return b[0:4] == b'RIFF' and b[8:] == b'WEBP' - - # Correct extension for WebP file with wrong extension (see #25687, #25717) - _, thumbnail_ext = os.path.splitext(thumbnail_filename) - if thumbnail_ext: - thumbnail_ext = thumbnail_ext[1:].lower() - if thumbnail_ext != 'webp' and is_webp(thumbnail_filename): - self._downloader.to_screen( - '[ffmpeg] Correcting extension to webp and escaping path for thumbnail "%s"' % thumbnail_filename) - thumbnail_webp_filename = replace_extension(thumbnail_filename, 'webp') - os.rename(encodeFilename(thumbnail_filename), encodeFilename(thumbnail_webp_filename)) - thumbnail_filename = thumbnail_webp_filename - thumbnail_ext = 'webp' - - # Convert unsupported thumbnail formats to JPEG (see #25687, #25717) - if thumbnail_ext not in ['jpg', 'png']: - # NB: % is supposed to be escaped with %% but this does not work - # for input files so working around with standard substitution - escaped_thumbnail_filename = thumbnail_filename.replace('%', '#') - os.rename(encodeFilename(thumbnail_filename), encodeFilename(escaped_thumbnail_filename)) - escaped_thumbnail_jpg_filename = replace_extension(escaped_thumbnail_filename, 'jpg') - self._downloader.to_screen('[ffmpeg] Converting thumbnail "%s" to JPEG' % escaped_thumbnail_filename) - self.run_ffmpeg(escaped_thumbnail_filename, escaped_thumbnail_jpg_filename, ['-bsf:v', 'mjpeg2jpeg']) - os.remove(encodeFilename(escaped_thumbnail_filename)) - thumbnail_jpg_filename = replace_extension(thumbnail_filename, 'jpg') - # Rename back to unescaped for further processing - 
os.rename(encodeFilename(escaped_thumbnail_jpg_filename), encodeFilename(thumbnail_jpg_filename)) - thumbnail_filename = thumbnail_jpg_filename - - if info['ext'] == 'mp3': - options = [ - '-c', 'copy', '-map', '0', '-map', '1', - '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] - - self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename) - - self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) - - if not self._already_have_thumbnail: - os.remove(encodeFilename(thumbnail_filename)) - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - elif info['ext'] in ['m4a', 'mp4']: - atomicparsley = next((x - for x in ['AtomicParsley', 'atomicparsley'] - if check_executable(x, ['-v'])), None) - - if atomicparsley is None: - raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - - cmd = [encodeFilename(atomicparsley, True), - encodeFilename(filename, True), - encodeArgument('--artwork'), - encodeFilename(thumbnail_filename, True), - encodeArgument('-o'), - encodeFilename(temp_filename, True)] - - self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename) - - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd)) - - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - - if p.returncode != 0: - msg = stderr.decode('utf-8', 'replace').strip() - raise EmbedThumbnailPPError(msg) - - if not self._already_have_thumbnail: - os.remove(encodeFilename(thumbnail_filename)) - # for formats that don't support thumbnails (like 3gp) AtomicParsley - # won't create to the temporary file - if b'No changes' in stdout: - self._downloader.report_warning('The file format doesn\'t support embedding a thumbnail') - else: - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - else: - raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.') - - return [], info diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py deleted file mode 100644 index 64dabe790..000000000 --- a/youtube_dl/postprocessor/execafterdownload.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -import subprocess - -from .common import PostProcessor -from ..compat import compat_shlex_quote -from ..utils import ( - encodeArgument, - PostProcessingError, -) - - -class ExecAfterDownloadPP(PostProcessor): - def __init__(self, downloader, exec_cmd): - super(ExecAfterDownloadPP, self).__init__(downloader) - self.exec_cmd = exec_cmd - - def run(self, information): - cmd = self.exec_cmd - if '{}' not in cmd: - cmd += ' {}' - - cmd = cmd.replace('{}', compat_shlex_quote(information['filepath'])) - - self._downloader.to_screen('[exec] Executing command: %s' % cmd) - retCode = subprocess.call(encodeArgument(cmd), shell=True) - if retCode != 0: - raise PostProcessingError( - 'Command returned error code %d' % retCode) - - return [], information diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py deleted file mode 100644 index 9f76c9d4e..000000000 --- a/youtube_dl/postprocessor/ffmpeg.py +++ /dev/null @@ -1,660 +0,0 @@ -from __future__ import unicode_literals - -import io -import os -import subprocess -import time -import re - - -from .common 
import AudioConversionError, PostProcessor - -from ..utils import ( - encodeArgument, - encodeFilename, - get_exe_version, - is_outdated_version, - PostProcessingError, - prepend_extension, - shell_quote, - subtitles_filename, - dfxp2srt, - ISO639Utils, - replace_extension, -) - - -EXT_TO_OUT_FORMATS = { - 'aac': 'adts', - 'flac': 'flac', - 'm4a': 'ipod', - 'mka': 'matroska', - 'mkv': 'matroska', - 'mpg': 'mpeg', - 'ogv': 'ogg', - 'ts': 'mpegts', - 'wma': 'asf', - 'wmv': 'asf', -} -ACODECS = { - 'mp3': 'libmp3lame', - 'aac': 'aac', - 'flac': 'flac', - 'm4a': 'aac', - 'opus': 'libopus', - 'vorbis': 'libvorbis', - 'wav': None, -} - - -class FFmpegPostProcessorError(PostProcessingError): - pass - - -class FFmpegPostProcessor(PostProcessor): - def __init__(self, downloader=None): - PostProcessor.__init__(self, downloader) - self._determine_executables() - - def check_version(self): - if not self.available: - raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.') - - required_version = '10-0' if self.basename == 'avconv' else '1.0' - if is_outdated_version( - self._versions[self.basename], required_version): - warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % ( - self.basename, self.basename, required_version) - if self._downloader: - self._downloader.report_warning(warning) - - @staticmethod - def get_versions(downloader=None): - return FFmpegPostProcessor(downloader)._versions - - def _determine_executables(self): - programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = True - - def get_ffmpeg_version(path): - ver = get_exe_version(path, args=['-version']) - if ver: - regexs = [ - r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] - r'n([0-9.]+)$', # Arch Linux - # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ - ] - for regex in regexs: - mobj = re.match(regex, ver) - if mobj: - ver = mobj.group(1) - return ver - - self.basename = None - self.probe_basename = None - - self._paths = None - self._versions = None - if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) - location = self._downloader.params.get('ffmpeg_location') - if location is not None: - if not os.path.exists(location): - self._downloader.report_warning( - 'ffmpeg-location %s does not exist! ' - 'Continuing without avconv/ffmpeg.' % (location)) - self._versions = {} - return - elif not os.path.isdir(location): - basename = os.path.splitext(os.path.basename(location))[0] - if basename not in programs: - self._downloader.report_warning( - 'Cannot identify executable %s, its basename should be one of %s. ' - 'Continuing without avconv/ffmpeg.' 
% - (location, ', '.join(programs))) - self._versions = {} - return None - location = os.path.dirname(os.path.abspath(location)) - if basename in ('ffmpeg', 'ffprobe'): - prefer_ffmpeg = True - - self._paths = dict( - (p, os.path.join(location, p)) for p in programs) - self._versions = dict( - (p, get_ffmpeg_version(self._paths[p])) for p in programs) - if self._versions is None: - self._versions = dict( - (p, get_ffmpeg_version(p)) for p in programs) - self._paths = dict((p, p) for p in programs) - - if prefer_ffmpeg is False: - prefs = ('avconv', 'ffmpeg') - else: - prefs = ('ffmpeg', 'avconv') - for p in prefs: - if self._versions[p]: - self.basename = p - break - - if prefer_ffmpeg is False: - prefs = ('avprobe', 'ffprobe') - else: - prefs = ('ffprobe', 'avprobe') - for p in prefs: - if self._versions[p]: - self.probe_basename = p - break - - @property - def available(self): - return self.basename is not None - - @property - def executable(self): - return self._paths[self.basename] - - @property - def probe_available(self): - return self.probe_basename is not None - - @property - def probe_executable(self): - return self._paths[self.probe_basename] - - def get_audio_codec(self, path): - if not self.probe_available and not self.available: - raise PostProcessingError('ffprobe/avprobe and ffmpeg/avconv not found. Please install one.') - try: - if self.probe_available: - cmd = [ - encodeFilename(self.probe_executable, True), - encodeArgument('-show_streams')] - else: - cmd = [ - encodeFilename(self.executable, True), - encodeArgument('-i')] - cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) - handle = subprocess.Popen( - cmd, stderr=subprocess.PIPE, - stdout=subprocess.PIPE, stdin=subprocess.PIPE) - stdout_data, stderr_data = handle.communicate() - expected_ret = 0 if self.probe_available else 1 - if handle.wait() != expected_ret: - return None - except (IOError, OSError): - return None - output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore') - if self.probe_available: - audio_codec = None - for line in output.split('\n'): - if line.startswith('codec_name='): - audio_codec = line.split('=')[1].strip() - elif line.strip() == 'codec_type=audio' and audio_codec is not None: - return audio_codec - else: - # Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME - mobj = re.search( - r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)', - output) - if mobj: - return mobj.group(1) - return None - - def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): - self.check_version() - - oldest_mtime = min( - os.stat(encodeFilename(path)).st_mtime for path in input_paths) - - opts += self._configuration_args() - - files_cmd = [] - for path in input_paths: - files_cmd.extend([ - encodeArgument('-i'), - encodeFilename(self._ffmpeg_filename_argument(path), True) - ]) - cmd = [encodeFilename(self.executable, True), encodeArgument('-y')] - # avconv does not have repeat option - if self.basename == 'ffmpeg': - cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')] - cmd += (files_cmd - + [encodeArgument(o) for o in opts] - + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) - - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) - p = 
subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - stderr = stderr.decode('utf-8', 'replace') - msgs = stderr.strip().split('\n') - msg = msgs[-1] - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] ' + '\n'.join(msgs[:-1])) - raise FFmpegPostProcessorError(msg) - self.try_utime(out_path, oldest_mtime, oldest_mtime) - - def run_ffmpeg(self, path, out_path, opts): - self.run_ffmpeg_multiple_files([path], out_path, opts) - - def _ffmpeg_filename_argument(self, fn): - # Always use 'file:' because the filename may contain ':' (ffmpeg - # interprets that as a protocol) or can start with '-' (-- is broken in - # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) - # Also leave '-' intact in order not to break streaming to stdout. - return 'file:' + fn if fn != '-' else fn - - -class FFmpegExtractAudioPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False): - FFmpegPostProcessor.__init__(self, downloader) - if preferredcodec is None: - preferredcodec = 'best' - self._preferredcodec = preferredcodec - self._preferredquality = preferredquality - self._nopostoverwrites = nopostoverwrites - - def run_ffmpeg(self, path, out_path, codec, more_opts): - if codec is None: - acodec_opts = [] - else: - acodec_opts = ['-acodec', codec] - opts = ['-vn'] + acodec_opts + more_opts - try: - FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts) - except FFmpegPostProcessorError as err: - raise AudioConversionError(err.msg) - - def run(self, information): - path = information['filepath'] - - filecodec = self.get_audio_codec(path) - if filecodec is None: - raise PostProcessingError('WARNING: unable to obtain file audio codec with ffprobe') - - more_opts = [] - if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'): - if filecodec == 'aac' and self._preferredcodec in ['m4a', 'best']: - # Lossless, but in another container - acodec = 'copy' - extension = 'm4a' - more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: - # Lossless if possible - acodec = 'copy' - extension = filecodec - if filecodec == 'aac': - more_opts = ['-f', 'adts'] - if filecodec == 'vorbis': - extension = 'ogg' - else: - # MP3 otherwise. 
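
The `file:` trick in `_ffmpeg_filename_argument` above protects filenames that contain `:` (which ffmpeg parses as protocol syntax) or begin with `-` (which it parses as an option), while leaving a bare `-` intact so streaming to stdout keeps working. A tiny check of the same rule:

```python
def ffmpeg_filename_argument(fn):
    # 'file:' disarms ':' (protocol syntax) and a leading '-' (option
    # syntax); a bare '-' must stay as-is so stdout streaming works.
    return 'file:' + fn if fn != '-' else fn

assert ffmpeg_filename_argument('rtmp://clip.mp4') == 'file:rtmp://clip.mp4'
assert ffmpeg_filename_argument('-weird-name.mp4') == 'file:-weird-name.mp4'
assert ffmpeg_filename_argument('-') == '-'
```
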
- acodec = 'libmp3lame' - extension = 'mp3' - more_opts = [] - if self._preferredquality is not None: - if int(self._preferredquality) < 10: - more_opts += ['-q:a', self._preferredquality] - else: - more_opts += ['-b:a', self._preferredquality + 'k'] - else: - # We convert the audio (lossy if codec is lossy) - acodec = ACODECS[self._preferredcodec] - extension = self._preferredcodec - more_opts = [] - if self._preferredquality is not None: - # The opus codec doesn't support the -aq option - if int(self._preferredquality) < 10 and extension != 'opus': - more_opts += ['-q:a', self._preferredquality] - else: - more_opts += ['-b:a', self._preferredquality + 'k'] - if self._preferredcodec == 'aac': - more_opts += ['-f', 'adts'] - if self._preferredcodec == 'm4a': - more_opts += ['-bsf:a', 'aac_adtstoasc'] - if self._preferredcodec == 'vorbis': - extension = 'ogg' - if self._preferredcodec == 'wav': - extension = 'wav' - more_opts += ['-f', 'wav'] - - prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups - new_path = prefix + sep + extension - - information['filepath'] = new_path - information['ext'] = extension - - # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. - if (new_path == path - or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): - self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path) - return [], information - - try: - self._downloader.to_screen('[ffmpeg] Destination: ' + new_path) - self.run_ffmpeg(path, new_path, acodec, more_opts) - except AudioConversionError as e: - raise PostProcessingError( - 'audio conversion failed: ' + e.msg) - except Exception: - raise PostProcessingError('error running ' + self.basename) - - # Try to update the date time for extracted audio file. 
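
In the `FFmpegExtractAudioPP.run` branches above, a `--audio-quality` value below 10 is treated as a VBR quality level (`-q:a`) and anything else as a bitrate in kbit/s (`-b:a`), with opus excluded from the `-q:a` scale. A condensed sketch of just that rule, faithful to the deleted branches:

```python
def quality_opts(preferredquality, extension):
    """Mirror the deleted logic: small numbers mean VBR quality,
    larger numbers a bitrate in kbit/s; opus has no -q:a scale."""
    if preferredquality is None:
        return []
    if int(preferredquality) < 10 and extension != 'opus':
        return ['-q:a', preferredquality]
    return ['-b:a', preferredquality + 'k']

assert quality_opts('5', 'mp3') == ['-q:a', '5']
assert quality_opts('192', 'mp3') == ['-b:a', '192k']
assert quality_opts('5', 'opus') == ['-b:a', '5k']
```
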
- if information.get('filetime') is not None: - self.try_utime( - new_path, time.time(), information['filetime'], - errnote='Cannot update utime of audio file') - - return [path], information - - -class FFmpegVideoConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferedformat=None): - super(FFmpegVideoConvertorPP, self).__init__(downloader) - self._preferedformat = preferedformat - - def run(self, information): - path = information['filepath'] - if information['ext'] == self._preferedformat: - self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat)) - return [], information - options = [] - if self._preferedformat == 'avi': - options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) - prefix, sep, ext = path.rpartition('.') - outpath = prefix + sep + self._preferedformat - self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) - self.run_ffmpeg(path, outpath, options) - information['filepath'] = outpath - information['format'] = self._preferedformat - information['ext'] = self._preferedformat - return [path], information - - -class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): - def run(self, information): - if information['ext'] not in ('mp4', 'webm', 'mkv'): - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') - return [], information - subtitles = information.get('requested_subtitles') - if not subtitles: - self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') - return [], information - - filename = information['filepath'] - - ext = information['ext'] - sub_langs = [] - sub_filenames = [] - webm_vtt_warn = False - - for lang, sub_info in subtitles.items(): - sub_ext = sub_info['ext'] - if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': - sub_langs.append(lang) - sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) - else: - if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': - webm_vtt_warn = True - self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') - - if not sub_langs: - return [], information - - input_files = [filename] + sub_filenames - - opts = [ - '-map', '0', - '-c', 'copy', - # Don't copy the existing subtitles, we may be running the - # postprocessor a second time - '-map', '-0:s', - # Don't copy Apple TV chapters track, bin_data (see #19042, #19024, - # https://trac.ffmpeg.org/ticket/6016) - '-map', '-0:d', - ] - if information['ext'] == 'mp4': - opts += ['-c:s', 'mov_text'] - for (i, lang) in enumerate(sub_langs): - opts.extend(['-map', '%d:0' % (i + 1)]) - lang_code = ISO639Utils.short2long(lang) or lang - opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) - - temp_filename = prepend_extension(filename, 'temp') - self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename) - self.run_ffmpeg_multiple_files(input_files, temp_filename, opts) - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - return sub_filenames, information - - -class FFmpegMetadataPP(FFmpegPostProcessor): - def run(self, info): - metadata = {} - - def add(meta_list, info_list=None): - if not info_list: - info_list = meta_list - if not isinstance(meta_list, (list, tuple)): - meta_list = (meta_list,) - if not isinstance(info_list, (list, tuple)): - info_list = (info_list,) - for info_f in info_list: 
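
`FFmpegEmbedSubtitlePP` above remuxes with `-map 0 -c copy`, drops any subtitle and data streams already present (`-map -0:s`, `-map -0:d`), then maps each subtitle input and tags it with an ISO 639-2 language code; mp4 additionally needs the `mov_text` subtitle codec. A sketch of that option assembly for hypothetical inputs:

```python
def embed_sub_opts(container_ext, sub_langs):
    """Build the ffmpeg options the deleted postprocessor assembles.
    sub_langs are assumed to be ISO 639-2 codes already."""
    opts = [
        '-map', '0', '-c', 'copy',
        '-map', '-0:s',  # drop pre-existing subtitle streams
        '-map', '-0:d',  # drop data streams (e.g. Apple TV chapters)
    ]
    if container_ext == 'mp4':
        opts += ['-c:s', 'mov_text']
    for i, lang in enumerate(sub_langs):
        opts += ['-map', '%d:0' % (i + 1)]           # input 0 is the video
        opts += ['-metadata:s:s:%d' % i, 'language=%s' % lang]
    return opts

print(embed_sub_opts('mp4', ['eng', 'fre']))
```
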
- if info.get(info_f) is not None: - for meta_f in meta_list: - metadata[meta_f] = info[info_f] - break - - # See [1-4] for some info on media metadata/metadata supported - # by ffmpeg. - # 1. https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/ - # 2. https://wiki.multimedia.cx/index.php/FFmpeg_Metadata - # 3. https://kodi.wiki/view/Video_file_tagging - # 4. http://atomicparsley.sourceforge.net/mpeg-4files.html - - add('title', ('track', 'title')) - add('date', 'upload_date') - add(('description', 'comment'), 'description') - add('purl', 'webpage_url') - add('track', 'track_number') - add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) - add('genre') - add('album') - add('album_artist') - add('disc', 'disc_number') - add('show', 'series') - add('season_number') - add('episode_id', ('episode', 'episode_id')) - add('episode_sort', 'episode_number') - - if not metadata: - self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') - return [], info - - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - in_filenames = [filename] - options = [] - - if info['ext'] == 'm4a': - options.extend(['-vn', '-acodec', 'copy']) - else: - options.extend(['-c', 'copy']) - - for (name, value) in metadata.items(): - options.extend(['-metadata', '%s=%s' % (name, value)]) - - chapters = info.get('chapters', []) - if chapters: - metadata_filename = replace_extension(filename, 'meta') - with io.open(metadata_filename, 'wt', encoding='utf-8') as f: - def ffmpeg_escape(text): - return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) - - metadata_file_content = ';FFMETADATA1\n' - for chapter in chapters: - metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' - metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) - metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) - chapter_title = chapter.get('title') - if chapter_title: - metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) - f.write(metadata_file_content) - in_filenames.append(metadata_filename) - options.extend(['-map_metadata', '1']) - - self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) - self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options) - if chapters: - os.remove(metadata_filename) - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - return [], info - - -class FFmpegMergerPP(FFmpegPostProcessor): - def run(self, info): - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0'] - self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename) - self.run_ffmpeg_multiple_files(info['__files_to_merge'], temp_filename, args) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - return info['__files_to_merge'], info - - def can_merge(self): - # TODO: figure out merge-capable ffmpeg version - if self.basename != 'avconv': - return True - - required_version = '10-0' - if is_outdated_version( - self._versions[self.basename], required_version): - warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, ' - 'youtube-dl will download single file media. 
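
The chapter handling above serializes into ffmpeg's FFMETADATA1 format, backslash-escaping `=`, `;`, `#`, `\` and newlines and expressing times in a 1/1000 timebase. A standalone sketch producing the same file content:

```python
import re

def ffmpeg_escape(text):
    return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)

def chapters_to_ffmetadata(chapters):
    """chapters: [{'start_time': sec, 'end_time': sec, 'title': ...}, ...]"""
    content = ';FFMETADATA1\n'
    for chapter in chapters:
        content += '[CHAPTER]\nTIMEBASE=1/1000\n'
        content += 'START=%d\n' % (chapter['start_time'] * 1000)
        content += 'END=%d\n' % (chapter['end_time'] * 1000)
        if chapter.get('title'):
            content += 'title=%s\n' % ffmpeg_escape(chapter['title'])
    return content

print(chapters_to_ffmetadata(
    [{'start_time': 0, 'end_time': 90.5, 'title': 'Intro; part #1'}]))
```
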
' - 'Update %s to version %s or newer to fix this.') % ( - self.basename, self.basename, required_version) - if self._downloader: - self._downloader.report_warning(warning) - return False - return True - - -class FFmpegFixupStretchedPP(FFmpegPostProcessor): - def run(self, info): - stretched_ratio = info.get('stretched_ratio') - if stretched_ratio is None or stretched_ratio == 1: - return [], info - - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - - options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio] - self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - return [], info - - -class FFmpegFixupM4aPP(FFmpegPostProcessor): - def run(self, info): - if info.get('container') != 'm4a_dash': - return [], info - - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - - options = ['-c', 'copy', '-f', 'mp4'] - self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - return [], info - - -class FFmpegFixupM3u8PP(FFmpegPostProcessor): - def run(self, info): - filename = info['filepath'] - if self.get_audio_codec(filename) == 'aac': - temp_filename = prepend_extension(filename, 'temp') - - options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - return [], info - - -class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, format=None): - super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) - self.format = format - - def run(self, info): - subs = info.get('requested_subtitles') - filename = info['filepath'] - new_ext = self.format - new_format = new_ext - if new_format == 'vtt': - new_format = 'webvtt' - if subs is None: - self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert') - return [], info - self._downloader.to_screen('[ffmpeg] Converting subtitles') - sub_filenames = [] - for lang, sub in subs.items(): - ext = sub['ext'] - if ext == new_ext: - self._downloader.to_screen( - '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) - continue - old_file = subtitles_filename(filename, lang, ext, info.get('ext')) - sub_filenames.append(old_file) - new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) - - if ext in ('dfxp', 'ttml', 'tt'): - self._downloader.report_warning( - 'You have requested to convert dfxp (TTML) subtitles into another format, ' - 'which results in style information loss') - - dfxp_file = old_file - srt_file = subtitles_filename(filename, lang, 'srt', info.get('ext')) - - with open(dfxp_file, 'rb') as f: - srt_data = dfxp2srt(f.read()) - - with io.open(srt_file, 'wt', encoding='utf-8') as f: - f.write(srt_data) - old_file = srt_file - - subs[lang] = { - 'ext': 'srt', - 'data': srt_data - } - - if new_ext == 'srt': - continue - else: - sub_filenames.append(srt_file) - - self.run_ffmpeg(old_file, new_file, ['-f', new_format]) - - with io.open(new_file, 'rt', encoding='utf-8') as f: - 
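
All three fixup postprocessors above follow one pattern: stream-copy into a temp file with a corrective flag, then replace the original. For the HLS case, the equivalent standalone command remuxes ADTS-framed AAC into an MP4-compatible bitstream. A hedged sketch (the file names are placeholders):

```python
import subprocess

# Same flags FFmpegFixupM3u8PP passes: copy streams, force the mp4
# container, convert AAC framing with the aac_adtstoasc bitstream
# filter. 'file:' prefixes as run_ffmpeg_multiple_files adds above.
subprocess.check_call([
    'ffmpeg', '-y', '-i', 'file:input.mp4',
    '-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc',
    'file:output.mp4',
])
```
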
subs[lang] = { - 'ext': new_ext, - 'data': f.read(), - } - - return sub_filenames, info diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py deleted file mode 100644 index f5c14d974..000000000 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import PostProcessor - - -class MetadataFromTitlePP(PostProcessor): - def __init__(self, downloader, titleformat): - super(MetadataFromTitlePP, self).__init__(downloader) - self._titleformat = titleformat - self._titleregex = (self.format_to_regex(titleformat) - if re.search(r'%\(\w+\)s', titleformat) - else titleformat) - - def format_to_regex(self, fmt): - r""" - Converts a string like - '%(title)s - %(artist)s' - to a regex like - '(?P<title>.+)\ \-\ (?P<artist>.+)' - """ - lastpos = 0 - regex = '' - # replace %(..)s with regex group and escape other string parts - for match in re.finditer(r'%\((\w+)\)s', fmt): - regex += re.escape(fmt[lastpos:match.start()]) - regex += r'(?P<' + match.group(1) + '>.+)' - lastpos = match.end() - if lastpos < len(fmt): - regex += re.escape(fmt[lastpos:]) - return regex - - def run(self, info): - title = info['title'] - match = re.match(self._titleregex, title) - if match is None: - self._downloader.to_screen( - '[fromtitle] Could not interpret title of video as "%s"' - % self._titleformat) - return [], info - for attribute, value in match.groupdict().items(): - info[attribute] = value - self._downloader.to_screen( - '[fromtitle] parsed %s: %s' - % (attribute, value if value is not None else 'NA')) - - return [], info diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py deleted file mode 100644 index 814dabecf..000000000 --- a/youtube_dl/postprocessor/xattrpp.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import unicode_literals - -from .common import PostProcessor -from ..compat import compat_os_name -from ..utils import ( - hyphenate_date, - write_xattr, - XAttrMetadataError, - XAttrUnavailableError, -) - - -class XAttrMetadataPP(PostProcessor): - - # - # More info about extended attributes for media: - # http://freedesktop.org/wiki/CommonExtendedAttributes/ - # http://www.freedesktop.org/wiki/PhreedomDraft/ - # http://dublincore.org/documents/usageguide/elements.shtml - # - # TODO: - # * capture youtube keywords and put them in 'user.dublincore.subject' (comma-separated) - # * figure out which xattrs can be used for 'duration', 'thumbnail', 'resolution' - # - - def run(self, info): - """ Set extended attributes on downloaded file (if xattr support is found). 
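
The deleted `format_to_regex` turns an output template such as `%(artist)s - %(title)s` into a regex with named groups, escaping the literal parts. A worked round trip under the same rules:

```python
import re

def format_to_regex(fmt):
    """Replace %(field)s with (?P<field>.+) and escape literal parts."""
    lastpos, regex = 0, ''
    for match in re.finditer(r'%\((\w+)\)s', fmt):
        regex += re.escape(fmt[lastpos:match.start()])
        regex += r'(?P<' + match.group(1) + '>.+)'
        lastpos = match.end()
    if lastpos < len(fmt):
        regex += re.escape(fmt[lastpos:])
    return regex

pattern = format_to_regex('%(artist)s - %(title)s')
m = re.match(pattern, 'Rick Astley - Never Gonna Give You Up')
assert m.groupdict() == {'artist': 'Rick Astley',
                         'title': 'Never Gonna Give You Up'}
```
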
""" - - # Write the metadata to the file's xattrs - self._downloader.to_screen('[metadata] Writing metadata to file\'s xattrs') - - filename = info['filepath'] - - try: - xattr_mapping = { - 'user.xdg.referrer.url': 'webpage_url', - # 'user.xdg.comment': 'description', - 'user.dublincore.title': 'title', - 'user.dublincore.date': 'upload_date', - 'user.dublincore.description': 'description', - 'user.dublincore.contributor': 'uploader', - 'user.dublincore.format': 'format', - } - - num_written = 0 - for xattrname, infoname in xattr_mapping.items(): - - value = info.get(infoname) - - if value: - if infoname == 'upload_date': - value = hyphenate_date(value) - - byte_value = value.encode('utf-8') - write_xattr(filename, xattrname, byte_value) - num_written += 1 - - return [], info - - except XAttrUnavailableError as e: - self._downloader.report_error(str(e)) - return [], info - - except XAttrMetadataError as e: - if e.reason == 'NO_SPACE': - self._downloader.report_warning( - 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' - + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) - elif e.reason == 'VALUE_TOO_LONG': - self._downloader.report_warning( - 'Unable to write extended attributes due to too long values.') - else: - msg = 'This filesystem doesn\'t support extended attributes. ' - if compat_os_name == 'nt': - msg += 'You need to use NTFS.' - else: - msg += '(You may have to enable them in your /etc/fstab)' - self._downloader.report_error(msg) - return [], info diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py deleted file mode 100644 index cb3cfcc17..000000000 --- a/youtube_dl/utils.py +++ /dev/null @@ -1,5774 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from __future__ import unicode_literals - -import base64 -import binascii -import calendar -import codecs -import collections -import contextlib -import ctypes -import datetime -import email.utils -import email.header -import errno -import functools -import gzip -import io -import itertools -import json -import locale -import math -import operator -import os -import platform -import random -import re -import socket -import ssl -import subprocess -import sys -import tempfile -import time -import traceback -import xml.etree.ElementTree -import zlib - -from .compat import ( - compat_HTMLParseError, - compat_HTMLParser, - compat_HTTPError, - compat_basestring, - compat_chr, - compat_cookiejar, - compat_ctypes_WINFUNCTYPE, - compat_etree_fromstring, - compat_expanduser, - compat_html_entities, - compat_html_entities_html5, - compat_http_client, - compat_integer_types, - compat_kwargs, - compat_os_name, - compat_parse_qs, - compat_shlex_quote, - compat_str, - compat_struct_pack, - compat_struct_unpack, - compat_urllib_error, - compat_urllib_parse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urllib_parse_unquote_plus, - compat_urllib_request, - compat_urlparse, - compat_xpath, -) - -from .socks import ( - ProxyType, - sockssocket, -) - - -def register_socks_protocols(): - # "Register" SOCKS protocols - # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 - # URLs with protocols not in urlparse.uses_netloc are not handled correctly - for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in compat_urlparse.uses_netloc: - compat_urlparse.uses_netloc.append(scheme) - - -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) - - -def random_user_agent(): - 
_USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36' - _CHROME_VERSIONS = ( - '74.0.3729.129', - '76.0.3780.3', - '76.0.3780.2', - '74.0.3729.128', - '76.0.3780.1', - '76.0.3780.0', - '75.0.3770.15', - '74.0.3729.127', - '74.0.3729.126', - '76.0.3779.1', - '76.0.3779.0', - '75.0.3770.14', - '74.0.3729.125', - '76.0.3778.1', - '76.0.3778.0', - '75.0.3770.13', - '74.0.3729.124', - '74.0.3729.123', - '73.0.3683.121', - '76.0.3777.1', - '76.0.3777.0', - '75.0.3770.12', - '74.0.3729.122', - '76.0.3776.4', - '75.0.3770.11', - '74.0.3729.121', - '76.0.3776.3', - '76.0.3776.2', - '73.0.3683.120', - '74.0.3729.120', - '74.0.3729.119', - '74.0.3729.118', - '76.0.3776.1', - '76.0.3776.0', - '76.0.3775.5', - '75.0.3770.10', - '74.0.3729.117', - '76.0.3775.4', - '76.0.3775.3', - '74.0.3729.116', - '75.0.3770.9', - '76.0.3775.2', - '76.0.3775.1', - '76.0.3775.0', - '75.0.3770.8', - '74.0.3729.115', - '74.0.3729.114', - '76.0.3774.1', - '76.0.3774.0', - '75.0.3770.7', - '74.0.3729.113', - '74.0.3729.112', - '74.0.3729.111', - '76.0.3773.1', - '76.0.3773.0', - '75.0.3770.6', - '74.0.3729.110', - '74.0.3729.109', - '76.0.3772.1', - '76.0.3772.0', - '75.0.3770.5', - '74.0.3729.108', - '74.0.3729.107', - '76.0.3771.1', - '76.0.3771.0', - '75.0.3770.4', - '74.0.3729.106', - '74.0.3729.105', - '75.0.3770.3', - '74.0.3729.104', - '74.0.3729.103', - '74.0.3729.102', - '75.0.3770.2', - '74.0.3729.101', - '75.0.3770.1', - '75.0.3770.0', - '74.0.3729.100', - '75.0.3769.5', - '75.0.3769.4', - '74.0.3729.99', - '75.0.3769.3', - '75.0.3769.2', - '75.0.3768.6', - '74.0.3729.98', - '75.0.3769.1', - '75.0.3769.0', - '74.0.3729.97', - '73.0.3683.119', - '73.0.3683.118', - '74.0.3729.96', - '75.0.3768.5', - '75.0.3768.4', - '75.0.3768.3', - '75.0.3768.2', - '74.0.3729.95', - '74.0.3729.94', - '75.0.3768.1', - '75.0.3768.0', - '74.0.3729.93', - '74.0.3729.92', - '73.0.3683.117', - '74.0.3729.91', - '75.0.3766.3', - '74.0.3729.90', - '75.0.3767.2', - '75.0.3767.1', - '75.0.3767.0', - '74.0.3729.89', - '73.0.3683.116', - '75.0.3766.2', - '74.0.3729.88', - '75.0.3766.1', - '75.0.3766.0', - '74.0.3729.87', - '73.0.3683.115', - '74.0.3729.86', - '75.0.3765.1', - '75.0.3765.0', - '74.0.3729.85', - '73.0.3683.114', - '74.0.3729.84', - '75.0.3764.1', - '75.0.3764.0', - '74.0.3729.83', - '73.0.3683.113', - '75.0.3763.2', - '75.0.3761.4', - '74.0.3729.82', - '75.0.3763.1', - '75.0.3763.0', - '74.0.3729.81', - '73.0.3683.112', - '75.0.3762.1', - '75.0.3762.0', - '74.0.3729.80', - '75.0.3761.3', - '74.0.3729.79', - '73.0.3683.111', - '75.0.3761.2', - '74.0.3729.78', - '74.0.3729.77', - '75.0.3761.1', - '75.0.3761.0', - '73.0.3683.110', - '74.0.3729.76', - '74.0.3729.75', - '75.0.3760.0', - '74.0.3729.74', - '75.0.3759.8', - '75.0.3759.7', - '75.0.3759.6', - '74.0.3729.73', - '75.0.3759.5', - '74.0.3729.72', - '73.0.3683.109', - '75.0.3759.4', - '75.0.3759.3', - '74.0.3729.71', - '75.0.3759.2', - '74.0.3729.70', - '73.0.3683.108', - '74.0.3729.69', - '75.0.3759.1', - '75.0.3759.0', - '74.0.3729.68', - '73.0.3683.107', - '74.0.3729.67', - '75.0.3758.1', - '75.0.3758.0', - '74.0.3729.66', - '73.0.3683.106', - '74.0.3729.65', - '75.0.3757.1', - '75.0.3757.0', - '74.0.3729.64', - '73.0.3683.105', - '74.0.3729.63', - '75.0.3756.1', - '75.0.3756.0', - '74.0.3729.62', - '73.0.3683.104', - '75.0.3755.3', - '75.0.3755.2', - '73.0.3683.103', - '75.0.3755.1', - '75.0.3755.0', - '74.0.3729.61', - '73.0.3683.102', - '74.0.3729.60', - '75.0.3754.2', - '74.0.3729.59', - 
'75.0.3753.4', - '74.0.3729.58', - '75.0.3754.1', - '75.0.3754.0', - '74.0.3729.57', - '73.0.3683.101', - '75.0.3753.3', - '75.0.3752.2', - '75.0.3753.2', - '74.0.3729.56', - '75.0.3753.1', - '75.0.3753.0', - '74.0.3729.55', - '73.0.3683.100', - '74.0.3729.54', - '75.0.3752.1', - '75.0.3752.0', - '74.0.3729.53', - '73.0.3683.99', - '74.0.3729.52', - '75.0.3751.1', - '75.0.3751.0', - '74.0.3729.51', - '73.0.3683.98', - '74.0.3729.50', - '75.0.3750.0', - '74.0.3729.49', - '74.0.3729.48', - '74.0.3729.47', - '75.0.3749.3', - '74.0.3729.46', - '73.0.3683.97', - '75.0.3749.2', - '74.0.3729.45', - '75.0.3749.1', - '75.0.3749.0', - '74.0.3729.44', - '73.0.3683.96', - '74.0.3729.43', - '74.0.3729.42', - '75.0.3748.1', - '75.0.3748.0', - '74.0.3729.41', - '75.0.3747.1', - '73.0.3683.95', - '75.0.3746.4', - '74.0.3729.40', - '74.0.3729.39', - '75.0.3747.0', - '75.0.3746.3', - '75.0.3746.2', - '74.0.3729.38', - '75.0.3746.1', - '75.0.3746.0', - '74.0.3729.37', - '73.0.3683.94', - '75.0.3745.5', - '75.0.3745.4', - '75.0.3745.3', - '75.0.3745.2', - '74.0.3729.36', - '75.0.3745.1', - '75.0.3745.0', - '75.0.3744.2', - '74.0.3729.35', - '73.0.3683.93', - '74.0.3729.34', - '75.0.3744.1', - '75.0.3744.0', - '74.0.3729.33', - '73.0.3683.92', - '74.0.3729.32', - '74.0.3729.31', - '73.0.3683.91', - '75.0.3741.2', - '75.0.3740.5', - '74.0.3729.30', - '75.0.3741.1', - '75.0.3741.0', - '74.0.3729.29', - '75.0.3740.4', - '73.0.3683.90', - '74.0.3729.28', - '75.0.3740.3', - '73.0.3683.89', - '75.0.3740.2', - '74.0.3729.27', - '75.0.3740.1', - '75.0.3740.0', - '74.0.3729.26', - '73.0.3683.88', - '73.0.3683.87', - '74.0.3729.25', - '75.0.3739.1', - '75.0.3739.0', - '73.0.3683.86', - '74.0.3729.24', - '73.0.3683.85', - '75.0.3738.4', - '75.0.3738.3', - '75.0.3738.2', - '75.0.3738.1', - '75.0.3738.0', - '74.0.3729.23', - '73.0.3683.84', - '74.0.3729.22', - '74.0.3729.21', - '75.0.3737.1', - '75.0.3737.0', - '74.0.3729.20', - '73.0.3683.83', - '74.0.3729.19', - '75.0.3736.1', - '75.0.3736.0', - '74.0.3729.18', - '73.0.3683.82', - '74.0.3729.17', - '75.0.3735.1', - '75.0.3735.0', - '74.0.3729.16', - '73.0.3683.81', - '75.0.3734.1', - '75.0.3734.0', - '74.0.3729.15', - '73.0.3683.80', - '74.0.3729.14', - '75.0.3733.1', - '75.0.3733.0', - '75.0.3732.1', - '74.0.3729.13', - '74.0.3729.12', - '73.0.3683.79', - '74.0.3729.11', - '75.0.3732.0', - '74.0.3729.10', - '73.0.3683.78', - '74.0.3729.9', - '74.0.3729.8', - '74.0.3729.7', - '75.0.3731.3', - '75.0.3731.2', - '75.0.3731.0', - '74.0.3729.6', - '73.0.3683.77', - '73.0.3683.76', - '75.0.3730.5', - '75.0.3730.4', - '73.0.3683.75', - '74.0.3729.5', - '73.0.3683.74', - '75.0.3730.3', - '75.0.3730.2', - '74.0.3729.4', - '73.0.3683.73', - '73.0.3683.72', - '75.0.3730.1', - '75.0.3730.0', - '74.0.3729.3', - '73.0.3683.71', - '74.0.3729.2', - '73.0.3683.70', - '74.0.3729.1', - '74.0.3729.0', - '74.0.3726.4', - '73.0.3683.69', - '74.0.3726.3', - '74.0.3728.0', - '74.0.3726.2', - '73.0.3683.68', - '74.0.3726.1', - '74.0.3726.0', - '74.0.3725.4', - '73.0.3683.67', - '73.0.3683.66', - '74.0.3725.3', - '74.0.3725.2', - '74.0.3725.1', - '74.0.3724.8', - '74.0.3725.0', - '73.0.3683.65', - '74.0.3724.7', - '74.0.3724.6', - '74.0.3724.5', - '74.0.3724.4', - '74.0.3724.3', - '74.0.3724.2', - '74.0.3724.1', - '74.0.3724.0', - '73.0.3683.64', - '74.0.3723.1', - '74.0.3723.0', - '73.0.3683.63', - '74.0.3722.1', - '74.0.3722.0', - '73.0.3683.62', - '74.0.3718.9', - '74.0.3702.3', - '74.0.3721.3', - '74.0.3721.2', - '74.0.3721.1', - '74.0.3721.0', - '74.0.3720.6', - '73.0.3683.61', - 
'72.0.3626.122', - '73.0.3683.60', - '74.0.3720.5', - '72.0.3626.121', - '74.0.3718.8', - '74.0.3720.4', - '74.0.3720.3', - '74.0.3718.7', - '74.0.3720.2', - '74.0.3720.1', - '74.0.3720.0', - '74.0.3718.6', - '74.0.3719.5', - '73.0.3683.59', - '74.0.3718.5', - '74.0.3718.4', - '74.0.3719.4', - '74.0.3719.3', - '74.0.3719.2', - '74.0.3719.1', - '73.0.3683.58', - '74.0.3719.0', - '73.0.3683.57', - '73.0.3683.56', - '74.0.3718.3', - '73.0.3683.55', - '74.0.3718.2', - '74.0.3718.1', - '74.0.3718.0', - '73.0.3683.54', - '74.0.3717.2', - '73.0.3683.53', - '74.0.3717.1', - '74.0.3717.0', - '73.0.3683.52', - '74.0.3716.1', - '74.0.3716.0', - '73.0.3683.51', - '74.0.3715.1', - '74.0.3715.0', - '73.0.3683.50', - '74.0.3711.2', - '74.0.3714.2', - '74.0.3713.3', - '74.0.3714.1', - '74.0.3714.0', - '73.0.3683.49', - '74.0.3713.1', - '74.0.3713.0', - '72.0.3626.120', - '73.0.3683.48', - '74.0.3712.2', - '74.0.3712.1', - '74.0.3712.0', - '73.0.3683.47', - '72.0.3626.119', - '73.0.3683.46', - '74.0.3710.2', - '72.0.3626.118', - '74.0.3711.1', - '74.0.3711.0', - '73.0.3683.45', - '72.0.3626.117', - '74.0.3710.1', - '74.0.3710.0', - '73.0.3683.44', - '72.0.3626.116', - '74.0.3709.1', - '74.0.3709.0', - '74.0.3704.9', - '73.0.3683.43', - '72.0.3626.115', - '74.0.3704.8', - '74.0.3704.7', - '74.0.3708.0', - '74.0.3706.7', - '74.0.3704.6', - '73.0.3683.42', - '72.0.3626.114', - '74.0.3706.6', - '72.0.3626.113', - '74.0.3704.5', - '74.0.3706.5', - '74.0.3706.4', - '74.0.3706.3', - '74.0.3706.2', - '74.0.3706.1', - '74.0.3706.0', - '73.0.3683.41', - '72.0.3626.112', - '74.0.3705.1', - '74.0.3705.0', - '73.0.3683.40', - '72.0.3626.111', - '73.0.3683.39', - '74.0.3704.4', - '73.0.3683.38', - '74.0.3704.3', - '74.0.3704.2', - '74.0.3704.1', - '74.0.3704.0', - '73.0.3683.37', - '72.0.3626.110', - '72.0.3626.109', - '74.0.3703.3', - '74.0.3703.2', - '73.0.3683.36', - '74.0.3703.1', - '74.0.3703.0', - '73.0.3683.35', - '72.0.3626.108', - '74.0.3702.2', - '74.0.3699.3', - '74.0.3702.1', - '74.0.3702.0', - '73.0.3683.34', - '72.0.3626.107', - '73.0.3683.33', - '74.0.3701.1', - '74.0.3701.0', - '73.0.3683.32', - '73.0.3683.31', - '72.0.3626.105', - '74.0.3700.1', - '74.0.3700.0', - '73.0.3683.29', - '72.0.3626.103', - '74.0.3699.2', - '74.0.3699.1', - '74.0.3699.0', - '73.0.3683.28', - '72.0.3626.102', - '73.0.3683.27', - '73.0.3683.26', - '74.0.3698.0', - '74.0.3696.2', - '72.0.3626.101', - '73.0.3683.25', - '74.0.3696.1', - '74.0.3696.0', - '74.0.3694.8', - '72.0.3626.100', - '74.0.3694.7', - '74.0.3694.6', - '74.0.3694.5', - '74.0.3694.4', - '72.0.3626.99', - '72.0.3626.98', - '74.0.3694.3', - '73.0.3683.24', - '72.0.3626.97', - '72.0.3626.96', - '72.0.3626.95', - '73.0.3683.23', - '72.0.3626.94', - '73.0.3683.22', - '73.0.3683.21', - '72.0.3626.93', - '74.0.3694.2', - '72.0.3626.92', - '74.0.3694.1', - '74.0.3694.0', - '74.0.3693.6', - '73.0.3683.20', - '72.0.3626.91', - '74.0.3693.5', - '74.0.3693.4', - '74.0.3693.3', - '74.0.3693.2', - '73.0.3683.19', - '74.0.3693.1', - '74.0.3693.0', - '73.0.3683.18', - '72.0.3626.90', - '74.0.3692.1', - '74.0.3692.0', - '73.0.3683.17', - '72.0.3626.89', - '74.0.3687.3', - '74.0.3691.1', - '74.0.3691.0', - '73.0.3683.16', - '72.0.3626.88', - '72.0.3626.87', - '73.0.3683.15', - '74.0.3690.1', - '74.0.3690.0', - '73.0.3683.14', - '72.0.3626.86', - '73.0.3683.13', - '73.0.3683.12', - '74.0.3689.1', - '74.0.3689.0', - '73.0.3683.11', - '72.0.3626.85', - '73.0.3683.10', - '72.0.3626.84', - '73.0.3683.9', - '74.0.3688.1', - '74.0.3688.0', - '73.0.3683.8', - '72.0.3626.83', - 
'74.0.3687.2', - '74.0.3687.1', - '74.0.3687.0', - '73.0.3683.7', - '72.0.3626.82', - '74.0.3686.4', - '72.0.3626.81', - '74.0.3686.3', - '74.0.3686.2', - '74.0.3686.1', - '74.0.3686.0', - '73.0.3683.6', - '72.0.3626.80', - '74.0.3685.1', - '74.0.3685.0', - '73.0.3683.5', - '72.0.3626.79', - '74.0.3684.1', - '74.0.3684.0', - '73.0.3683.4', - '72.0.3626.78', - '72.0.3626.77', - '73.0.3683.3', - '73.0.3683.2', - '72.0.3626.76', - '73.0.3683.1', - '73.0.3683.0', - '72.0.3626.75', - '71.0.3578.141', - '73.0.3682.1', - '73.0.3682.0', - '72.0.3626.74', - '71.0.3578.140', - '73.0.3681.4', - '73.0.3681.3', - '73.0.3681.2', - '73.0.3681.1', - '73.0.3681.0', - '72.0.3626.73', - '71.0.3578.139', - '72.0.3626.72', - '72.0.3626.71', - '73.0.3680.1', - '73.0.3680.0', - '72.0.3626.70', - '71.0.3578.138', - '73.0.3678.2', - '73.0.3679.1', - '73.0.3679.0', - '72.0.3626.69', - '71.0.3578.137', - '73.0.3678.1', - '73.0.3678.0', - '71.0.3578.136', - '73.0.3677.1', - '73.0.3677.0', - '72.0.3626.68', - '72.0.3626.67', - '71.0.3578.135', - '73.0.3676.1', - '73.0.3676.0', - '73.0.3674.2', - '72.0.3626.66', - '71.0.3578.134', - '73.0.3674.1', - '73.0.3674.0', - '72.0.3626.65', - '71.0.3578.133', - '73.0.3673.2', - '73.0.3673.1', - '73.0.3673.0', - '72.0.3626.64', - '71.0.3578.132', - '72.0.3626.63', - '72.0.3626.62', - '72.0.3626.61', - '72.0.3626.60', - '73.0.3672.1', - '73.0.3672.0', - '72.0.3626.59', - '71.0.3578.131', - '73.0.3671.3', - '73.0.3671.2', - '73.0.3671.1', - '73.0.3671.0', - '72.0.3626.58', - '71.0.3578.130', - '73.0.3670.1', - '73.0.3670.0', - '72.0.3626.57', - '71.0.3578.129', - '73.0.3669.1', - '73.0.3669.0', - '72.0.3626.56', - '71.0.3578.128', - '73.0.3668.2', - '73.0.3668.1', - '73.0.3668.0', - '72.0.3626.55', - '71.0.3578.127', - '73.0.3667.2', - '73.0.3667.1', - '73.0.3667.0', - '72.0.3626.54', - '71.0.3578.126', - '73.0.3666.1', - '73.0.3666.0', - '72.0.3626.53', - '71.0.3578.125', - '73.0.3665.4', - '73.0.3665.3', - '72.0.3626.52', - '73.0.3665.2', - '73.0.3664.4', - '73.0.3665.1', - '73.0.3665.0', - '72.0.3626.51', - '71.0.3578.124', - '72.0.3626.50', - '73.0.3664.3', - '73.0.3664.2', - '73.0.3664.1', - '73.0.3664.0', - '73.0.3663.2', - '72.0.3626.49', - '71.0.3578.123', - '73.0.3663.1', - '73.0.3663.0', - '72.0.3626.48', - '71.0.3578.122', - '73.0.3662.1', - '73.0.3662.0', - '72.0.3626.47', - '71.0.3578.121', - '73.0.3661.1', - '72.0.3626.46', - '73.0.3661.0', - '72.0.3626.45', - '71.0.3578.120', - '73.0.3660.2', - '73.0.3660.1', - '73.0.3660.0', - '72.0.3626.44', - '71.0.3578.119', - '73.0.3659.1', - '73.0.3659.0', - '72.0.3626.43', - '71.0.3578.118', - '73.0.3658.1', - '73.0.3658.0', - '72.0.3626.42', - '71.0.3578.117', - '73.0.3657.1', - '73.0.3657.0', - '72.0.3626.41', - '71.0.3578.116', - '73.0.3656.1', - '73.0.3656.0', - '72.0.3626.40', - '71.0.3578.115', - '73.0.3655.1', - '73.0.3655.0', - '72.0.3626.39', - '71.0.3578.114', - '73.0.3654.1', - '73.0.3654.0', - '72.0.3626.38', - '71.0.3578.113', - '73.0.3653.1', - '73.0.3653.0', - '72.0.3626.37', - '71.0.3578.112', - '73.0.3652.1', - '73.0.3652.0', - '72.0.3626.36', - '71.0.3578.111', - '73.0.3651.1', - '73.0.3651.0', - '72.0.3626.35', - '71.0.3578.110', - '73.0.3650.1', - '73.0.3650.0', - '72.0.3626.34', - '71.0.3578.109', - '73.0.3649.1', - '73.0.3649.0', - '72.0.3626.33', - '71.0.3578.108', - '73.0.3648.2', - '73.0.3648.1', - '73.0.3648.0', - '72.0.3626.32', - '71.0.3578.107', - '73.0.3647.2', - '73.0.3647.1', - '73.0.3647.0', - '72.0.3626.31', - '71.0.3578.106', - '73.0.3635.3', - '73.0.3646.2', - '73.0.3646.1', - 
'73.0.3646.0', - '72.0.3626.30', - '71.0.3578.105', - '72.0.3626.29', - '73.0.3645.2', - '73.0.3645.1', - '73.0.3645.0', - '72.0.3626.28', - '71.0.3578.104', - '72.0.3626.27', - '72.0.3626.26', - '72.0.3626.25', - '72.0.3626.24', - '73.0.3644.0', - '73.0.3643.2', - '72.0.3626.23', - '71.0.3578.103', - '73.0.3643.1', - '73.0.3643.0', - '72.0.3626.22', - '71.0.3578.102', - '73.0.3642.1', - '73.0.3642.0', - '72.0.3626.21', - '71.0.3578.101', - '73.0.3641.1', - '73.0.3641.0', - '72.0.3626.20', - '71.0.3578.100', - '72.0.3626.19', - '73.0.3640.1', - '73.0.3640.0', - '72.0.3626.18', - '73.0.3639.1', - '71.0.3578.99', - '73.0.3639.0', - '72.0.3626.17', - '73.0.3638.2', - '72.0.3626.16', - '73.0.3638.1', - '73.0.3638.0', - '72.0.3626.15', - '71.0.3578.98', - '73.0.3635.2', - '71.0.3578.97', - '73.0.3637.1', - '73.0.3637.0', - '72.0.3626.14', - '71.0.3578.96', - '71.0.3578.95', - '72.0.3626.13', - '71.0.3578.94', - '73.0.3636.2', - '71.0.3578.93', - '73.0.3636.1', - '73.0.3636.0', - '72.0.3626.12', - '71.0.3578.92', - '73.0.3635.1', - '73.0.3635.0', - '72.0.3626.11', - '71.0.3578.91', - '73.0.3634.2', - '73.0.3634.1', - '73.0.3634.0', - '72.0.3626.10', - '71.0.3578.90', - '71.0.3578.89', - '73.0.3633.2', - '73.0.3633.1', - '73.0.3633.0', - '72.0.3610.4', - '72.0.3626.9', - '71.0.3578.88', - '73.0.3632.5', - '73.0.3632.4', - '73.0.3632.3', - '73.0.3632.2', - '73.0.3632.1', - '73.0.3632.0', - '72.0.3626.8', - '71.0.3578.87', - '73.0.3631.2', - '73.0.3631.1', - '73.0.3631.0', - '72.0.3626.7', - '71.0.3578.86', - '72.0.3626.6', - '73.0.3630.1', - '73.0.3630.0', - '72.0.3626.5', - '71.0.3578.85', - '72.0.3626.4', - '73.0.3628.3', - '73.0.3628.2', - '73.0.3629.1', - '73.0.3629.0', - '72.0.3626.3', - '71.0.3578.84', - '73.0.3628.1', - '73.0.3628.0', - '71.0.3578.83', - '73.0.3627.1', - '73.0.3627.0', - '72.0.3626.2', - '71.0.3578.82', - '71.0.3578.81', - '71.0.3578.80', - '72.0.3626.1', - '72.0.3626.0', - '71.0.3578.79', - '70.0.3538.124', - '71.0.3578.78', - '72.0.3623.4', - '72.0.3625.2', - '72.0.3625.1', - '72.0.3625.0', - '71.0.3578.77', - '70.0.3538.123', - '72.0.3624.4', - '72.0.3624.3', - '72.0.3624.2', - '71.0.3578.76', - '72.0.3624.1', - '72.0.3624.0', - '72.0.3623.3', - '71.0.3578.75', - '70.0.3538.122', - '71.0.3578.74', - '72.0.3623.2', - '72.0.3610.3', - '72.0.3623.1', - '72.0.3623.0', - '72.0.3622.3', - '72.0.3622.2', - '71.0.3578.73', - '70.0.3538.121', - '72.0.3622.1', - '72.0.3622.0', - '71.0.3578.72', - '70.0.3538.120', - '72.0.3621.1', - '72.0.3621.0', - '71.0.3578.71', - '70.0.3538.119', - '72.0.3620.1', - '72.0.3620.0', - '71.0.3578.70', - '70.0.3538.118', - '71.0.3578.69', - '72.0.3619.1', - '72.0.3619.0', - '71.0.3578.68', - '70.0.3538.117', - '71.0.3578.67', - '72.0.3618.1', - '72.0.3618.0', - '71.0.3578.66', - '70.0.3538.116', - '72.0.3617.1', - '72.0.3617.0', - '71.0.3578.65', - '70.0.3538.115', - '72.0.3602.3', - '71.0.3578.64', - '72.0.3616.1', - '72.0.3616.0', - '71.0.3578.63', - '70.0.3538.114', - '71.0.3578.62', - '72.0.3615.1', - '72.0.3615.0', - '71.0.3578.61', - '70.0.3538.113', - '72.0.3614.1', - '72.0.3614.0', - '71.0.3578.60', - '70.0.3538.112', - '72.0.3613.1', - '72.0.3613.0', - '71.0.3578.59', - '70.0.3538.111', - '72.0.3612.2', - '72.0.3612.1', - '72.0.3612.0', - '70.0.3538.110', - '71.0.3578.58', - '70.0.3538.109', - '72.0.3611.2', - '72.0.3611.1', - '72.0.3611.0', - '71.0.3578.57', - '70.0.3538.108', - '72.0.3610.2', - '71.0.3578.56', - '71.0.3578.55', - '72.0.3610.1', - '72.0.3610.0', - '71.0.3578.54', - '70.0.3538.107', - '71.0.3578.53', - '72.0.3609.3', - 
'71.0.3578.52', - '72.0.3609.2', - '71.0.3578.51', - '72.0.3608.5', - '72.0.3609.1', - '72.0.3609.0', - '71.0.3578.50', - '70.0.3538.106', - '72.0.3608.4', - '72.0.3608.3', - '72.0.3608.2', - '71.0.3578.49', - '72.0.3608.1', - '72.0.3608.0', - '70.0.3538.105', - '71.0.3578.48', - '72.0.3607.1', - '72.0.3607.0', - '71.0.3578.47', - '70.0.3538.104', - '72.0.3606.2', - '72.0.3606.1', - '72.0.3606.0', - '71.0.3578.46', - '70.0.3538.103', - '70.0.3538.102', - '72.0.3605.3', - '72.0.3605.2', - '72.0.3605.1', - '72.0.3605.0', - '71.0.3578.45', - '70.0.3538.101', - '71.0.3578.44', - '71.0.3578.43', - '70.0.3538.100', - '70.0.3538.99', - '71.0.3578.42', - '72.0.3604.1', - '72.0.3604.0', - '71.0.3578.41', - '70.0.3538.98', - '71.0.3578.40', - '72.0.3603.2', - '72.0.3603.1', - '72.0.3603.0', - '71.0.3578.39', - '70.0.3538.97', - '72.0.3602.2', - '71.0.3578.38', - '71.0.3578.37', - '72.0.3602.1', - '72.0.3602.0', - '71.0.3578.36', - '70.0.3538.96', - '72.0.3601.1', - '72.0.3601.0', - '71.0.3578.35', - '70.0.3538.95', - '72.0.3600.1', - '72.0.3600.0', - '71.0.3578.34', - '70.0.3538.94', - '72.0.3599.3', - '72.0.3599.2', - '72.0.3599.1', - '72.0.3599.0', - '71.0.3578.33', - '70.0.3538.93', - '72.0.3598.1', - '72.0.3598.0', - '71.0.3578.32', - '70.0.3538.87', - '72.0.3597.1', - '72.0.3597.0', - '72.0.3596.2', - '71.0.3578.31', - '70.0.3538.86', - '71.0.3578.30', - '71.0.3578.29', - '72.0.3596.1', - '72.0.3596.0', - '71.0.3578.28', - '70.0.3538.85', - '72.0.3595.2', - '72.0.3591.3', - '72.0.3595.1', - '72.0.3595.0', - '71.0.3578.27', - '70.0.3538.84', - '72.0.3594.1', - '72.0.3594.0', - '71.0.3578.26', - '70.0.3538.83', - '72.0.3593.2', - '72.0.3593.1', - '72.0.3593.0', - '71.0.3578.25', - '70.0.3538.82', - '72.0.3589.3', - '72.0.3592.2', - '72.0.3592.1', - '72.0.3592.0', - '71.0.3578.24', - '72.0.3589.2', - '70.0.3538.81', - '70.0.3538.80', - '72.0.3591.2', - '72.0.3591.1', - '72.0.3591.0', - '71.0.3578.23', - '70.0.3538.79', - '71.0.3578.22', - '72.0.3590.1', - '72.0.3590.0', - '71.0.3578.21', - '70.0.3538.78', - '70.0.3538.77', - '72.0.3589.1', - '72.0.3589.0', - '71.0.3578.20', - '70.0.3538.76', - '71.0.3578.19', - '70.0.3538.75', - '72.0.3588.1', - '72.0.3588.0', - '71.0.3578.18', - '70.0.3538.74', - '72.0.3586.2', - '72.0.3587.0', - '71.0.3578.17', - '70.0.3538.73', - '72.0.3586.1', - '72.0.3586.0', - '71.0.3578.16', - '70.0.3538.72', - '72.0.3585.1', - '72.0.3585.0', - '71.0.3578.15', - '70.0.3538.71', - '71.0.3578.14', - '72.0.3584.1', - '72.0.3584.0', - '71.0.3578.13', - '70.0.3538.70', - '72.0.3583.2', - '71.0.3578.12', - '72.0.3583.1', - '72.0.3583.0', - '71.0.3578.11', - '70.0.3538.69', - '71.0.3578.10', - '72.0.3582.0', - '72.0.3581.4', - '71.0.3578.9', - '70.0.3538.67', - '72.0.3581.3', - '72.0.3581.2', - '72.0.3581.1', - '72.0.3581.0', - '71.0.3578.8', - '70.0.3538.66', - '72.0.3580.1', - '72.0.3580.0', - '71.0.3578.7', - '70.0.3538.65', - '71.0.3578.6', - '72.0.3579.1', - '72.0.3579.0', - '71.0.3578.5', - '70.0.3538.64', - '71.0.3578.4', - '71.0.3578.3', - '71.0.3578.2', - '71.0.3578.1', - '71.0.3578.0', - '70.0.3538.63', - '69.0.3497.128', - '70.0.3538.62', - '70.0.3538.61', - '70.0.3538.60', - '70.0.3538.59', - '71.0.3577.1', - '71.0.3577.0', - '70.0.3538.58', - '69.0.3497.127', - '71.0.3576.2', - '71.0.3576.1', - '71.0.3576.0', - '70.0.3538.57', - '70.0.3538.56', - '71.0.3575.2', - '70.0.3538.55', - '69.0.3497.126', - '70.0.3538.54', - '71.0.3575.1', - '71.0.3575.0', - '71.0.3574.1', - '71.0.3574.0', - '70.0.3538.53', - '69.0.3497.125', - '70.0.3538.52', - '71.0.3573.1', - 
'71.0.3573.0', - '70.0.3538.51', - '69.0.3497.124', - '71.0.3572.1', - '71.0.3572.0', - '70.0.3538.50', - '69.0.3497.123', - '71.0.3571.2', - '70.0.3538.49', - '69.0.3497.122', - '71.0.3571.1', - '71.0.3571.0', - '70.0.3538.48', - '69.0.3497.121', - '71.0.3570.1', - '71.0.3570.0', - '70.0.3538.47', - '69.0.3497.120', - '71.0.3568.2', - '71.0.3569.1', - '71.0.3569.0', - '70.0.3538.46', - '69.0.3497.119', - '70.0.3538.45', - '71.0.3568.1', - '71.0.3568.0', - '70.0.3538.44', - '69.0.3497.118', - '70.0.3538.43', - '70.0.3538.42', - '71.0.3567.1', - '71.0.3567.0', - '70.0.3538.41', - '69.0.3497.117', - '71.0.3566.1', - '71.0.3566.0', - '70.0.3538.40', - '69.0.3497.116', - '71.0.3565.1', - '71.0.3565.0', - '70.0.3538.39', - '69.0.3497.115', - '71.0.3564.1', - '71.0.3564.0', - '70.0.3538.38', - '69.0.3497.114', - '71.0.3563.0', - '71.0.3562.2', - '70.0.3538.37', - '69.0.3497.113', - '70.0.3538.36', - '70.0.3538.35', - '71.0.3562.1', - '71.0.3562.0', - '70.0.3538.34', - '69.0.3497.112', - '70.0.3538.33', - '71.0.3561.1', - '71.0.3561.0', - '70.0.3538.32', - '69.0.3497.111', - '71.0.3559.6', - '71.0.3560.1', - '71.0.3560.0', - '71.0.3559.5', - '71.0.3559.4', - '70.0.3538.31', - '69.0.3497.110', - '71.0.3559.3', - '70.0.3538.30', - '69.0.3497.109', - '71.0.3559.2', - '71.0.3559.1', - '71.0.3559.0', - '70.0.3538.29', - '69.0.3497.108', - '71.0.3558.2', - '71.0.3558.1', - '71.0.3558.0', - '70.0.3538.28', - '69.0.3497.107', - '71.0.3557.2', - '71.0.3557.1', - '71.0.3557.0', - '70.0.3538.27', - '69.0.3497.106', - '71.0.3554.4', - '70.0.3538.26', - '71.0.3556.1', - '71.0.3556.0', - '70.0.3538.25', - '71.0.3554.3', - '69.0.3497.105', - '71.0.3554.2', - '70.0.3538.24', - '69.0.3497.104', - '71.0.3555.2', - '70.0.3538.23', - '71.0.3555.1', - '71.0.3555.0', - '70.0.3538.22', - '69.0.3497.103', - '71.0.3554.1', - '71.0.3554.0', - '70.0.3538.21', - '69.0.3497.102', - '71.0.3553.3', - '70.0.3538.20', - '69.0.3497.101', - '71.0.3553.2', - '69.0.3497.100', - '71.0.3553.1', - '71.0.3553.0', - '70.0.3538.19', - '69.0.3497.99', - '69.0.3497.98', - '69.0.3497.97', - '71.0.3552.6', - '71.0.3552.5', - '71.0.3552.4', - '71.0.3552.3', - '71.0.3552.2', - '71.0.3552.1', - '71.0.3552.0', - '70.0.3538.18', - '69.0.3497.96', - '71.0.3551.3', - '71.0.3551.2', - '71.0.3551.1', - '71.0.3551.0', - '70.0.3538.17', - '69.0.3497.95', - '71.0.3550.3', - '71.0.3550.2', - '71.0.3550.1', - '71.0.3550.0', - '70.0.3538.16', - '69.0.3497.94', - '71.0.3549.1', - '71.0.3549.0', - '70.0.3538.15', - '69.0.3497.93', - '69.0.3497.92', - '71.0.3548.1', - '71.0.3548.0', - '70.0.3538.14', - '69.0.3497.91', - '71.0.3547.1', - '71.0.3547.0', - '70.0.3538.13', - '69.0.3497.90', - '71.0.3546.2', - '69.0.3497.89', - '71.0.3546.1', - '71.0.3546.0', - '70.0.3538.12', - '69.0.3497.88', - '71.0.3545.4', - '71.0.3545.3', - '71.0.3545.2', - '71.0.3545.1', - '71.0.3545.0', - '70.0.3538.11', - '69.0.3497.87', - '71.0.3544.5', - '71.0.3544.4', - '71.0.3544.3', - '71.0.3544.2', - '71.0.3544.1', - '71.0.3544.0', - '69.0.3497.86', - '70.0.3538.10', - '69.0.3497.85', - '70.0.3538.9', - '69.0.3497.84', - '71.0.3543.4', - '70.0.3538.8', - '71.0.3543.3', - '71.0.3543.2', - '71.0.3543.1', - '71.0.3543.0', - '70.0.3538.7', - '69.0.3497.83', - '71.0.3542.2', - '71.0.3542.1', - '71.0.3542.0', - '70.0.3538.6', - '69.0.3497.82', - '69.0.3497.81', - '71.0.3541.1', - '71.0.3541.0', - '70.0.3538.5', - '69.0.3497.80', - '71.0.3540.1', - '71.0.3540.0', - '70.0.3538.4', - '69.0.3497.79', - '70.0.3538.3', - '71.0.3539.1', - '71.0.3539.0', - '69.0.3497.78', - '68.0.3440.134', - 
'69.0.3497.77', - '70.0.3538.2', - '70.0.3538.1', - '70.0.3538.0', - '69.0.3497.76', - '68.0.3440.133', - '69.0.3497.75', - '70.0.3537.2', - '70.0.3537.1', - '70.0.3537.0', - '69.0.3497.74', - '68.0.3440.132', - '70.0.3536.0', - '70.0.3535.5', - '70.0.3535.4', - '70.0.3535.3', - '69.0.3497.73', - '68.0.3440.131', - '70.0.3532.8', - '70.0.3532.7', - '69.0.3497.72', - '69.0.3497.71', - '70.0.3535.2', - '70.0.3535.1', - '70.0.3535.0', - '69.0.3497.70', - '68.0.3440.130', - '69.0.3497.69', - '68.0.3440.129', - '70.0.3534.4', - '70.0.3534.3', - '70.0.3534.2', - '70.0.3534.1', - '70.0.3534.0', - '69.0.3497.68', - '68.0.3440.128', - '70.0.3533.2', - '70.0.3533.1', - '70.0.3533.0', - '69.0.3497.67', - '68.0.3440.127', - '70.0.3532.6', - '70.0.3532.5', - '70.0.3532.4', - '69.0.3497.66', - '68.0.3440.126', - '70.0.3532.3', - '70.0.3532.2', - '70.0.3532.1', - '69.0.3497.60', - '69.0.3497.65', - '69.0.3497.64', - '70.0.3532.0', - '70.0.3531.0', - '70.0.3530.4', - '70.0.3530.3', - '70.0.3530.2', - '69.0.3497.58', - '68.0.3440.125', - '69.0.3497.57', - '69.0.3497.56', - '69.0.3497.55', - '69.0.3497.54', - '70.0.3530.1', - '70.0.3530.0', - '69.0.3497.53', - '68.0.3440.124', - '69.0.3497.52', - '70.0.3529.3', - '70.0.3529.2', - '70.0.3529.1', - '70.0.3529.0', - '69.0.3497.51', - '70.0.3528.4', - '68.0.3440.123', - '70.0.3528.3', - '70.0.3528.2', - '70.0.3528.1', - '70.0.3528.0', - '69.0.3497.50', - '68.0.3440.122', - '70.0.3527.1', - '70.0.3527.0', - '69.0.3497.49', - '68.0.3440.121', - '70.0.3526.1', - '70.0.3526.0', - '68.0.3440.120', - '69.0.3497.48', - '69.0.3497.47', - '68.0.3440.119', - '68.0.3440.118', - '70.0.3525.5', - '70.0.3525.4', - '70.0.3525.3', - '68.0.3440.117', - '69.0.3497.46', - '70.0.3525.2', - '70.0.3525.1', - '70.0.3525.0', - '69.0.3497.45', - '68.0.3440.116', - '70.0.3524.4', - '70.0.3524.3', - '69.0.3497.44', - '70.0.3524.2', - '70.0.3524.1', - '70.0.3524.0', - '70.0.3523.2', - '69.0.3497.43', - '68.0.3440.115', - '70.0.3505.9', - '69.0.3497.42', - '70.0.3505.8', - '70.0.3523.1', - '70.0.3523.0', - '69.0.3497.41', - '68.0.3440.114', - '70.0.3505.7', - '69.0.3497.40', - '70.0.3522.1', - '70.0.3522.0', - '70.0.3521.2', - '69.0.3497.39', - '68.0.3440.113', - '70.0.3505.6', - '70.0.3521.1', - '70.0.3521.0', - '69.0.3497.38', - '68.0.3440.112', - '70.0.3520.1', - '70.0.3520.0', - '69.0.3497.37', - '68.0.3440.111', - '70.0.3519.3', - '70.0.3519.2', - '70.0.3519.1', - '70.0.3519.0', - '69.0.3497.36', - '68.0.3440.110', - '70.0.3518.1', - '70.0.3518.0', - '69.0.3497.35', - '69.0.3497.34', - '68.0.3440.109', - '70.0.3517.1', - '70.0.3517.0', - '69.0.3497.33', - '68.0.3440.108', - '69.0.3497.32', - '70.0.3516.3', - '70.0.3516.2', - '70.0.3516.1', - '70.0.3516.0', - '69.0.3497.31', - '68.0.3440.107', - '70.0.3515.4', - '68.0.3440.106', - '70.0.3515.3', - '70.0.3515.2', - '70.0.3515.1', - '70.0.3515.0', - '69.0.3497.30', - '68.0.3440.105', - '68.0.3440.104', - '70.0.3514.2', - '70.0.3514.1', - '70.0.3514.0', - '69.0.3497.29', - '68.0.3440.103', - '70.0.3513.1', - '70.0.3513.0', - '69.0.3497.28', - ) - return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) - - -std_headers = { - 'User-Agent': random_user_agent(), - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - 'Accept-Language': 'en-us,en;q=0.5', -} - - -USER_AGENTS = { - 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', -} - - -NO_DEFAULT = 
object() - -ENGLISH_MONTH_NAMES = [ - 'January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', 'December'] - -MONTH_NAMES = { - 'en': ENGLISH_MONTH_NAMES, - 'fr': [ - 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', - 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], -} - -KNOWN_EXTENSIONS = ( - 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', - 'flv', 'f4v', 'f4a', 'f4b', - 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', - 'mkv', 'mka', 'mk3d', - 'avi', 'divx', - 'mov', - 'asf', 'wmv', 'wma', - '3gp', '3g2', - 'mp3', - 'flac', - 'ape', - 'wav', - 'f4f', 'f4m', 'm3u8', 'smil') - -# needed for sanitizing filenames in restricted mode -ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) - -DATE_FORMATS = ( - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%B %dst %Y', - '%B %dnd %Y', - '%B %drd %Y', - '%B %dth %Y', - '%b %d %Y', - '%b %dst %Y', - '%b %dnd %Y', - '%b %drd %Y', - '%b %dth %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %drd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - '%b %d %Y at %H:%M', - '%b %d %Y at %H:%M:%S', - '%B %d %Y at %H:%M', - '%B %d %Y at %H:%M:%S', -) - -DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) -DATE_FORMATS_DAY_FIRST.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', -]) - -DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) -DATE_FORMATS_MONTH_FIRST.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', -]) - -PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' - - -def preferredencoding(): - """Get preferred encoding. - - Returns the best encoding scheme for the system, based on - locale.getpreferredencoding() and some further tweaks. - """ - try: - pref = locale.getpreferredencoding() - 'TEST'.encode(pref) - except Exception: - pref = 'UTF-8' - - return pref - - -def write_json_file(obj, fn): - """ Encode obj as JSON and write it to fn, atomically if possible """ - - fn = encodeFilename(fn) - if sys.version_info < (3, 0) and sys.platform != 'win32': - encoding = get_filesystem_encoding() - # os.path.basename returns a bytes object, but NamedTemporaryFile - # will fail if the filename contains non ascii characters unless we - # use a unicode object - path_basename = lambda f: os.path.basename(fn).decode(encoding) - # the same for os.path.dirname - path_dirname = lambda f: os.path.dirname(fn).decode(encoding) - else: - path_basename = os.path.basename - path_dirname = os.path.dirname - - args = { - 'suffix': '.tmp', - 'prefix': path_basename(fn) + '.', - 'dir': path_dirname(fn), - 'delete': False, - } - - # In Python 2.x, json.dump expects a bytestream. 
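
The `DATE_FORMATS` table above covers ordinal dates ("May 21st 2020") by pairing `%d` with a literal suffix, plus day-first and month-first variants for ambiguous numeric dates. A quick demonstration, assuming an English locale for `%B`/`%b`:

```python
from datetime import datetime

# '%B %dst %Y' pairs %d with a literal 'st', so ordinal dates
# parse without preprocessing.
assert datetime.strptime('May 21st 2020', '%B %dst %Y') == datetime(2020, 5, 21)
assert datetime.strptime('Jan 2nd 2020 11:30', '%b %dnd %Y %I:%M') == \
    datetime(2020, 1, 2, 11, 30)
```
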
- # In Python 3.x, it writes to a character stream - if sys.version_info < (3, 0): - args['mode'] = 'wb' - else: - args.update({ - 'mode': 'w', - 'encoding': 'utf-8', - }) - - tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) - - try: - with tf: - json.dump(obj, tf) - if sys.platform == 'win32': - # Need to remove existing file on Windows, else os.rename raises - # WindowsError or FileExistsError. - try: - os.unlink(fn) - except OSError: - pass - try: - mask = os.umask(0) - os.umask(mask) - os.chmod(tf.name, 0o666 & ~mask) - except OSError: - pass - os.rename(tf.name, fn) - except Exception: - try: - os.remove(tf.name) - except OSError: - pass - raise - - -if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val=None): - """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z_-]+$', key) - expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) - return node.find(expr) -else: - def find_xpath_attr(node, xpath, key, val=None): - for f in node.findall(compat_xpath(xpath)): - if key not in f.attrib: - continue - if val is None or f.attrib.get(key) == val: - return f - return None - -# On python2.6 the xml.etree.ElementTree.Element methods don't support -# the namespace parameter - - -def xpath_with_ns(path, ns_map): - components = [c.split(':') for c in path.split('/')] - replaced = [] - for c in components: - if len(c) == 1: - replaced.append(c[0]) - else: - ns, tag = c - replaced.append('{%s}%s' % (ns_map[ns], tag)) - return '/'.join(replaced) - - -def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - def _find_xpath(xpath): - return node.find(compat_xpath(xpath)) - - if isinstance(xpath, (str, compat_str)): - n = _find_xpath(xpath) - else: - for xp in xpath: - n = _find_xpath(xp) - if n is not None: - break - - if n is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = xpath if name is None else name - raise ExtractorError('Could not find XML element %s' % name) - else: - return None - return n - - -def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - n = xpath_element(node, xpath, name, fatal=fatal, default=default) - if n is None or n == default: - return n - if n.text is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = xpath if name is None else name - raise ExtractorError('Could not find XML element\'s text %s' % name) - else: - return None - return n.text - - -def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): - n = find_xpath_attr(node, xpath, key) - if n is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = '%s[@%s]' % (xpath, key) if name is None else name - raise ExtractorError('Could not find XML attribute %s' % name) - else: - return None - return n.attrib[key] - - -def get_element_by_id(id, html): - """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html) - - -def get_element_by_class(class_name, html): - """Return the content of the first tag with the specified class in the passed HTML document""" - retval = get_elements_by_class(class_name, html) - return retval[0] if retval else None - - -def get_element_by_attribute(attribute, value, html, escape_value=True): - retval = get_elements_by_attribute(attribute, value, html, escape_value) - return retval[0] if retval else None - - -def get_elements_by_class(class_name, html): - """Return the content of all tags with the specified class in the 
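
`write_json_file` above gets atomicity by writing into a temporary file in the destination directory and renaming over the target, with extra dances for Windows (unlink first) and for permissions (tempfiles are created 0600, so it re-chmods against the umask). A simplified Python-3-only sketch of the same pattern, not the original's dual-version implementation:

```python
import json
import os
import tempfile

def write_json_file(obj, fn):
    """Atomic JSON write: temp file in the same directory, then rename."""
    fd, tmp = tempfile.mkstemp(
        suffix='.tmp', prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn) or '.')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
        # os.replace overwrites atomically on POSIX and handles the
        # Windows "destination exists" case the original unlinks around.
        os.replace(tmp, fn)
    except Exception:
        os.unlink(tmp)
        raise

write_json_file({'id': 'abc'}, 'info.json')
```
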
passed HTML document as a list""" - return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), - html, escape_value=False) - - -def get_elements_by_attribute(attribute, value, html, escape_value=True): - """Return the content of the tag with the specified attribute in the passed HTML document""" - - value = re.escape(value) if escape_value else value - - retlist = [] - for m in re.finditer(r'''(?xs) - <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s*> - (?P<content>.*?) - </\1> - ''' % (re.escape(attribute), value), html): - res = m.group('content') - - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] - - retlist.append(unescapeHTML(res)) - - return retlist - - -class HTMLAttributeParser(compat_HTMLParser): - """Trivial HTML parser to gather the attributes for a single element""" - def __init__(self): - self.attrs = {} - compat_HTMLParser.__init__(self) - - def handle_starttag(self, tag, attrs): - self.attrs = dict(attrs) - - -def extract_attributes(html_element): - """Given a string for an HTML element such as - <el - a="foo" B="bar" c="&98;az" d=boz - empty= noval entity="&" - sq='"' dq="'" - > - Decode and return a dictionary of attributes. - { - 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', - 'empty': '', 'noval': None, 'entity': '&', - 'sq': '"', 'dq': '\'' - }. - NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, - but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. - """ - parser = HTMLAttributeParser() - try: - parser.feed(html_element) - parser.close() - # Older Python may throw HTMLParseError in case of malformed HTML - except compat_HTMLParseError: - pass - return parser.attrs - - -def clean_html(html): - """Clean an HTML snippet into a readable string""" - - if html is None: # Convenience for sanitizing descriptions etc. - return html - - # Newline vs <br /> - html = html.replace('\n', ' ') - html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) - # Strip html tags - html = re.sub('<.*?>', '', html) - # Replace html entities - html = unescapeHTML(html) - return html.strip() - - -def sanitize_open(filename, open_mode): - """Try to open the given filename, and slightly tweak it if this fails. - - Attempts to open the given filename. If this fails, it tries to change - the filename slightly, step by step, until it's either able to open it - or it fails and raises a final exception, like the standard open() - function. - - It returns the tuple (stream, definitive_file_name). 
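
`extract_attributes` above leans on the stdlib HTML parser rather than regexes to decode a single element's attributes, which gets entity decoding and valueless attributes right for free. The same idea in a Python-3-only form:

```python
from html.parser import HTMLParser

class HTMLAttributeParser(HTMLParser):
    """Collect the attributes of the first start tag fed in."""
    def __init__(self):
        super().__init__()
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)

def extract_attributes(html_element):
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs

# Entities are decoded and valueless attributes come back as None:
assert extract_attributes('<el a="foo" empty="" noval entity="&amp;">') == \
    {'a': 'foo', 'empty': '', 'noval': None, 'entity': '&'}
```
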
- """ - try: - if filename == '-': - if sys.platform == 'win32': - import msvcrt - msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) - except (IOError, OSError) as err: - if err.errno in (errno.EACCES,): - raise - - # In case of error, try to remove win32 forbidden chars - alt_filename = sanitize_path(filename) - if alt_filename == filename: - raise - else: - # An exception here should be caught in the caller - stream = open(encodeFilename(alt_filename), open_mode) - return (stream, alt_filename) - - -def timeconvert(timestr): - """Convert RFC 2822 defined time string into system timestamp""" - timestamp = None - timetuple = email.utils.parsedate_tz(timestr) - if timetuple is not None: - timestamp = email.utils.mktime_tz(timetuple) - return timestamp - - -def sanitize_filename(s, restricted=False, is_id=False): - """Sanitizes a string so it could be used as part of a filename. - If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept - if possible. - """ - def replace_insane(char): - if restricted and char in ACCENT_CHARS: - return ACCENT_CHARS[char] - if char == '?' or ord(char) < 32 or ord(char) == 127: - return '' - elif char == '"': - return '' if restricted else '\'' - elif char == ':': - return '_-' if restricted else ' -' - elif char in '\\/|*<>': - return '_' - if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): - return '_' - if restricted and ord(char) > 127: - return '_' - return char - - # Handle timestamps - s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) - result = ''.join(map(replace_insane, s)) - if not is_id: - while '__' in result: - result = result.replace('__', '_') - result = result.strip('_') - # Common case of "Foreign band name - English song title" - if restricted and result.startswith('-_'): - result = result[2:] - if result.startswith('-'): - result = '_' + result[len('-'):] - result = result.lstrip('.') - if not result: - result = '_' - return result - - -def sanitize_path(s): - """Sanitizes and normalizes path on Windows""" - if sys.platform != 'win32': - return s - drive_or_unc, _ = os.path.splitdrive(s) - if sys.version_info < (2, 7) and not drive_or_unc: - drive_or_unc, _ = os.path.splitunc(s) - norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) - if drive_or_unc: - norm_path.pop(0) - sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) - for path_part in norm_path] - if drive_or_unc: - sanitized_path.insert(0, drive_or_unc + os.path.sep) - return os.path.join(*sanitized_path) - - -def sanitize_url(url): - # Prepend protocol-less URLs with `http:` scheme in order to mitigate - # the number of unwanted failures due to missing protocol - if url.startswith('//'): - return 'http:%s' % url - # Fix some common typos seen so far - COMMON_TYPOS = ( - # https://github.com/ytdl-org/youtube-dl/issues/15649 - (r'^httpss://', r'https://'), - # https://bx1.be/lives/direct-tv/ - (r'^rmtp([es]?)://', r'rtmp\1://'), - ) - for mistake, fixup in COMMON_TYPOS: - if re.match(mistake, url): - return re.sub(mistake, fixup, url) - return url - - -def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) - - -def expand_path(s): - 
"""Expand shell variables and ~""" - return os.path.expandvars(compat_expanduser(s)) - - -def orderedSet(iterable): - """ Remove all duplicates from the input iterable """ - res = [] - for el in iterable: - if el not in res: - res.append(el) - return res - - -def _htmlentity_transform(entity_with_semicolon): - """Transforms an HTML entity to a character.""" - entity = entity_with_semicolon[:-1] - - # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) - - # TODO: HTML5 allows entities without a semicolon. For example, - # 'Éric' should be decoded as 'Éric'. - if entity_with_semicolon in compat_html_entities_html5: - return compat_html_entities_html5[entity_with_semicolon] - - mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith('x'): - base = 16 - numstr = '0%s' % numstr - else: - base = 10 - # See https://github.com/ytdl-org/youtube-dl/issues/7518 - try: - return compat_chr(int(numstr, base)) - except ValueError: - pass - - # Unknown entity in name, return its literal representation - return '&%s;' % entity - - -def unescapeHTML(s): - if s is None: - return None - assert type(s) == compat_str - - return re.sub( - r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) - - -def get_subprocess_encoding(): - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return encoding - - -def encodeFilename(s, for_subprocess=False): - """ - @param s The name of the file - """ - - assert type(s) == compat_str - - # Python 3 has a Unicode API - if sys.version_info >= (3, 0): - return s - - # Pass '' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
- if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - return s - - # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible - if sys.platform.startswith('java'): - return s - - return s.encode(get_subprocess_encoding(), 'ignore') - - -def decodeFilename(b, for_subprocess=False): - - if sys.version_info >= (3, 0): - return b - - if not isinstance(b, bytes): - return b - - return b.decode(get_subprocess_encoding(), 'ignore') - - -def encodeArgument(s): - if not isinstance(s, compat_str): - # Legacy code that uses byte strings - # Uncomment the following line after fixing all post processors - # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) - s = s.decode('ascii') - return encodeFilename(s, True) - - -def decodeArgument(b): - return decodeFilename(b, True) - - -def decodeOption(optval): - if optval is None: - return optval - if isinstance(optval, bytes): - optval = optval.decode(preferredencoding()) - - assert isinstance(optval, compat_str) - return optval - - -def formatSeconds(secs): - if secs > 3600: - return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) - elif secs > 60: - return '%d:%02d' % (secs // 60, secs % 60) - else: - return '%d' % secs - - -def make_HTTPS_handler(params, **kwargs): - opts_no_check_certificate = params.get('nocheckcertificate', False) - if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 - context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) - if opts_no_check_certificate: - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - try: - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - except TypeError: - # Python 2.7.8 - # (create_default_context present but HTTPSHandler has no context=) - pass - - if sys.version_info < (3, 2): - return YoutubeDLHTTPSHandler(params, **kwargs) - else: # Python < 3.4 - context = ssl.SSLContext(ssl.PROTOCOL_TLSv1) - context.verify_mode = (ssl.CERT_NONE - if opts_no_check_certificate - else ssl.CERT_REQUIRED) - context.set_default_verify_paths() - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - - -def bug_reports_message(): - if ytdl_is_updateable(): - update_cmd = 'type doas pacman -Sy hypervideo to update' - else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg = '; please report this issue on https://yt-dl.org/bug .' - msg += ' Make sure you are using the latest version; %s.' % update_cmd - msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' - return msg - - -class YoutubeDLError(Exception): - """Base exception for YoutubeDL errors.""" - pass - - -class ExtractorError(YoutubeDLError): - """Error during info extraction.""" - - def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): - """ tb, if given, is the original traceback (so that it can be printed out). - If expected is set, this is a normal error message and most likely not a bug in youtube-dl. 
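
The modern-Python branch of make_HTTPS_handler above boils down to configuring one ssl.SSLContext; a sketch of just that part:

import ssl

def make_ssl_context(no_check_certificate=False):
    ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
    if no_check_certificate:
        # check_hostname must be disabled before verify_mode can be
        # relaxed, or CPython raises ValueError
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    return ctx
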
- """ - - if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): - expected = True - if video_id is not None: - msg = video_id + ': ' + msg - if cause: - msg += ' (caused by %r)' % cause - if not expected: - msg += bug_reports_message() - super(ExtractorError, self).__init__(msg) - - self.traceback = tb - self.exc_info = sys.exc_info() # preserve original exception - self.cause = cause - self.video_id = video_id - - def format_traceback(self): - if self.traceback is None: - return None - return ''.join(traceback.format_tb(self.traceback)) - - -class UnsupportedError(ExtractorError): - def __init__(self, url): - super(UnsupportedError, self).__init__( - 'Unsupported URL: %s' % url, expected=True) - self.url = url - - -class RegexNotFoundError(ExtractorError): - """Error when a regex didn't match""" - pass - - -class GeoRestrictedError(ExtractorError): - """Geographic restriction Error exception. - - This exception may be thrown when a video is not available from your - geographic location due to geographic restrictions imposed by a website. - """ - def __init__(self, msg, countries=None): - super(GeoRestrictedError, self).__init__(msg, expected=True) - self.msg = msg - self.countries = countries - - -class DownloadError(YoutubeDLError): - """Download Error exception. - - This exception may be thrown by FileDownloader objects if they are not - configured to continue on errors. They will contain the appropriate - error message. - """ - - def __init__(self, msg, exc_info=None): - """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ - super(DownloadError, self).__init__(msg) - self.exc_info = exc_info - - -class SameFileError(YoutubeDLError): - """Same File exception. - - This exception will be thrown by FileDownloader objects if they detect - multiple files would have to be downloaded to the same file on disk. - """ - pass - - -class PostProcessingError(YoutubeDLError): - """Post Processing exception. - - This exception may be raised by PostProcessor's .run() method to - indicate an error in the postprocessing task. - """ - - def __init__(self, msg): - super(PostProcessingError, self).__init__(msg) - self.msg = msg - - -class MaxDownloadsReached(YoutubeDLError): - """ --max-downloads limit has been reached. """ - pass - - -class UnavailableVideoError(YoutubeDLError): - """Unavailable Format exception. - - This exception will be thrown when a video is requested - in a format that is not available for that video. - """ - pass - - -class ContentTooShortError(YoutubeDLError): - """Content Too Short exception. - - This exception may be raised by FileDownloader objects when a file they - download is too small for what the server announced first, indicating - the connection was probably interrupted. 
- """ - - def __init__(self, downloaded, expected): - super(ContentTooShortError, self).__init__( - 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) - ) - # Both in bytes - self.downloaded = downloaded - self.expected = expected - - -class XAttrMetadataError(YoutubeDLError): - def __init__(self, code=None, msg='Unknown error'): - super(XAttrMetadataError, self).__init__(msg) - self.code = code - self.msg = msg - - # Parsing code and msg - if (self.code in (errno.ENOSPC, errno.EDQUOT) - or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): - self.reason = 'NO_SPACE' - elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: - self.reason = 'VALUE_TOO_LONG' - else: - self.reason = 'NOT_SUPPORTED' - - -class XAttrUnavailableError(YoutubeDLError): - pass - - -def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): - # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting - # expected HTTP responses to meet HTTP/1.0 or later (see also - # https://github.com/ytdl-org/youtube-dl/issues/6727) - if sys.version_info < (3, 0): - kwargs['strict'] = True - hc = http_class(*args, **compat_kwargs(kwargs)) - source_address = ydl_handler._params.get('source_address') - - if source_address is not None: - # This is to workaround _create_connection() from socket where it will try all - # address data from getaddrinfo() including IPv6. This filters the result from - # getaddrinfo() based on the source_address value. - # This is based on the cpython socket.create_connection() function. - # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 - def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): - host, port = address - err = None - addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - af = socket.AF_INET if '.' 
in source_address[0] else socket.AF_INET6 - ip_addrs = [addr for addr in addrs if addr[0] == af] - if addrs and not ip_addrs: - ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise socket.error( - "No remote IP%s addresses available for connect, can't use '%s' as source address" - % (ip_version, source_address[0])) - for res in ip_addrs: - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - sock.settimeout(timeout) - sock.bind(source_address) - sock.connect(sa) - err = None # Explicitly break reference cycle - return sock - except socket.error as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise socket.error('getaddrinfo returns an empty list') - if hasattr(hc, '_create_connection'): - hc._create_connection = _create_connection - sa = (source_address, 0) - if hasattr(hc, 'source_address'): # Python 2.7+ - hc.source_address = sa - else: # Python 2.6 - def _hc_connect(self, *args, **kwargs): - sock = _create_connection( - (self.host, self.port), self.timeout, sa) - if is_https: - self.sock = ssl.wrap_socket( - sock, self.key_file, self.cert_file, - ssl_version=ssl.PROTOCOL_TLSv1) - else: - self.sock = sock - hc.connect = functools.partial(_hc_connect, hc) - - return hc - - -def handle_youtubedl_headers(headers): - filtered_headers = headers - - if 'Youtubedl-no-compression' in filtered_headers: - filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') - del filtered_headers['Youtubedl-no-compression'] - - return filtered_headers - - -class YoutubeDLHandler(compat_urllib_request.HTTPHandler): - """Handler for HTTP requests and responses. - - This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-no-compression", which will be - removed before making the real request. - - Part of this code was copied from: - - http://techknack.net/python-urllib2-handlers/ - - Andrew Rowls, the author of that code, agreed to release it to the - public domain. - """ - - def __init__(self, params, *args, **kwargs): - compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs) - self._params = params - - def http_open(self, req): - conn_class = compat_http_client.HTTPConnection - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, False), - req) - - @staticmethod - def deflate(data): - try: - return zlib.decompress(data, -zlib.MAX_WBITS) - except zlib.error: - return zlib.decompress(data) - - def http_request(self, req): - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - # Since redirects are also affected (e.g. 
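
For the simple case, the stdlib can bind a client socket itself; the custom _create_connection above exists mainly to drop IPv6 results from getaddrinfo() when an IPv4 source address is forced (and vice versa). A sketch of the plain stdlib route, with placeholder host and address:

import socket

sock = socket.create_connection(('example.com', 80), timeout=10,
                                source_address=('192.0.2.10', 0))  # port 0 = ephemeral
sock.close()
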
http://www.southpark.de/alle-episoden/s18e09) - # the code of this workaround has been moved here from YoutubeDL.urlopen() - url = req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - req = update_Request(req, url=url_escaped) - - for h, v in std_headers.items(): - # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 - # The dict keys are capitalized because of this bug by urllib - if h.capitalize() not in req.headers: - req.add_header(h, v) - - req.headers = handle_youtubedl_headers(req.headers) - - if sys.version_info < (2, 7) and '#' in req.get_full_url(): - # Python 2.6 is brain-dead when it comes to fragments - req._Request__original = req._Request__original.partition('#')[0] - req._Request__r_type = req._Request__r_type.partition('#')[0] - - return req - - def http_response(self, req, resp): - old_resp = resp - # gzip - if resp.headers.get('Content-encoding', '') == 'gzip': - content = resp.read() - gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') - try: - uncompressed = io.BytesIO(gz.read()) - except IOError as original_ioerror: - # There may be junk add the end of the file - # See http://stackoverflow.com/q/4928560/35070 for details - for i in range(1, 1024): - try: - gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') - uncompressed = io.BytesIO(gz.read()) - except IOError: - continue - break - else: - raise original_ioerror - resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - del resp.headers['Content-encoding'] - # deflate - if resp.headers.get('Content-encoding', '') == 'deflate': - gz = io.BytesIO(self.deflate(resp.read())) - resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - del resp.headers['Content-encoding'] - # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see - # https://github.com/ytdl-org/youtube-dl/issues/6457). 
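
The deflate() fallback above handles both raw and zlib-wrapped streams; a quick self-contained check of that behavior:

import zlib

def inflate(data):
    try:
        return zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate, no header
    except zlib.error:
        return zlib.decompress(data)                   # zlib-wrapped

raw = zlib.compressobj(wbits=-zlib.MAX_WBITS)
payload = raw.compress(b'hello') + raw.flush()
assert inflate(payload) == b'hello'                    # raw stream
assert inflate(zlib.compress(b'hello')) == b'hello'    # wrapped stream
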
- if 300 <= resp.code < 400: - location = resp.headers.get('Location') - if location: - # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 - if sys.version_info >= (3, 0): - location = location.encode('iso-8859-1').decode('utf-8') - else: - location = location.decode('utf-8') - location_escaped = escape_url(location) - if location != location_escaped: - del resp.headers['Location'] - if sys.version_info < (3, 0): - location_escaped = location_escaped.encode('utf-8') - resp.headers['Location'] = location_escaped - return resp - - https_request = http_request - https_response = http_response - - -def make_socks_conn_class(base_class, socks_proxy): - assert issubclass(base_class, ( - compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) - - url_components = compat_urlparse.urlparse(socks_proxy) - if url_components.scheme.lower() == 'socks5': - socks_type = ProxyType.SOCKS5 - elif url_components.scheme.lower() in ('socks', 'socks4'): - socks_type = ProxyType.SOCKS4 - elif url_components.scheme.lower() == 'socks4a': - socks_type = ProxyType.SOCKS4A - - def unquote_if_non_empty(s): - if not s: - return s - return compat_urllib_parse_unquote_plus(s) - - proxy_args = ( - socks_type, - url_components.hostname, url_components.port or 1080, - True, # Remote DNS - unquote_if_non_empty(url_components.username), - unquote_if_non_empty(url_components.password), - ) - - class SocksConnection(base_class): - def connect(self): - self.sock = sockssocket() - self.sock.setproxy(*proxy_args) - if type(self.timeout) in (int, float): - self.sock.settimeout(self.timeout) - self.sock.connect((self.host, self.port)) - - if isinstance(self, compat_http_client.HTTPSConnection): - if hasattr(self, '_context'): # Python > 2.6 - self.sock = self._context.wrap_socket( - self.sock, server_hostname=self.host) - else: - self.sock = ssl.wrap_socket(self.sock) - - return SocksConnection - - -class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): - def __init__(self, params, https_conn_class=None, *args, **kwargs): - compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) - self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection - self._params = params - - def https_open(self, req): - kwargs = {} - conn_class = self._https_conn_class - - if hasattr(self, '_context'): # python > 2.6 - kwargs['context'] = self._context - if hasattr(self, '_check_hostname'): # python 3.x - kwargs['check_hostname'] = self._check_hostname - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, True), - req, **kwargs) - - -class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): - """ - See [1] for cookie file format. - - 1. https://curl.haxx.se/docs/http-cookies.html - """ - _HTTPONLY_PREFIX = '#HttpOnly_' - _ENTRY_LEN = 7 - _HEADER = '''# Netscape HTTP Cookie File -# This file is generated by youtube-dl. Do not edit. - -''' - _CookieFileEntry = collections.namedtuple( - 'CookieFileEntry', - ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) - - def save(self, filename=None, ignore_discard=False, ignore_expires=False): - """ - Save cookies to a file. - - Most of the code is taken from CPython 3.8 and slightly adapted - to support cookie files with UTF-8 in both python 2 and 3. 
- """ - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) - - # Store session cookies with `expires` set to 0 instead of an empty - # string - for cookie in self: - if cookie.expires is None: - cookie.expires = 0 - - with io.open(filename, 'w', encoding='utf-8') as f: - f.write(self._HEADER) - now = time.time() - for cookie in self: - if not ignore_discard and cookie.discard: - continue - if not ignore_expires and cookie.is_expired(now): - continue - if cookie.secure: - secure = 'TRUE' - else: - secure = 'FALSE' - if cookie.domain.startswith('.'): - initial_dot = 'TRUE' - else: - initial_dot = 'FALSE' - if cookie.expires is not None: - expires = compat_str(cookie.expires) - else: - expires = '' - if cookie.value is None: - # cookies.txt regards 'Set-Cookie: foo' as a cookie - # with no name, whereas http.cookiejar regards it as a - # cookie with no value. - name = '' - value = cookie.name - else: - name = cookie.name - value = cookie.value - f.write( - '\t'.join([cookie.domain, initial_dot, cookie.path, - secure, expires, name, value]) + '\n') - - def load(self, filename=None, ignore_discard=False, ignore_expires=False): - """Load cookies from a file.""" - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) - - def prepare_line(line): - if line.startswith(self._HTTPONLY_PREFIX): - line = line[len(self._HTTPONLY_PREFIX):] - # comments and empty lines are fine - if line.startswith('#') or not line.strip(): - return line - cookie_list = line.split('\t') - if len(cookie_list) != self._ENTRY_LEN: - raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list)) - cookie = self._CookieFileEntry(*cookie_list) - if cookie.expires_at and not cookie.expires_at.isdigit(): - raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) - return line - - cf = io.StringIO() - with io.open(filename, encoding='utf-8') as f: - for line in f: - try: - cf.write(prepare_line(line)) - except compat_cookiejar.LoadError as e: - write_string( - 'WARNING: skipping cookie file entry due to %s: %r\n' - % (e, line), sys.stderr) - continue - cf.seek(0) - self._really_load(cf, filename, ignore_discard, ignore_expires) - # Session cookies are denoted by either `expires` field set to - # an empty string or 0. MozillaCookieJar only recognizes the former - # (see [1]). So we need force the latter to be recognized as session - # cookies on our own. - # Session cookies may be important for cookies-based authentication, - # e.g. usually, when user does not check 'Remember me' check box while - # logging in on a site, some important cookies are stored as session - # cookies so that not recognizing them will result in failed login. - # 1. https://bugs.python.org/issue17164 - for cookie in self: - # Treat `expires=0` cookies as session cookies - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True - - -class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): - def __init__(self, cookiejar=None): - compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) - - def http_response(self, request, response): - # Python 2 will choke on next HTTP request in row if there are non-ASCII - # characters in Set-Cookie HTTP header of last response (see - # https://github.com/ytdl-org/youtube-dl/issues/6769). 
- # In order to at least prevent crashing we will percent encode Set-Cookie - # header before HTTPCookieProcessor starts processing it. - # if sys.version_info < (3, 0) and response.headers: - # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): - # set_cookie = response.headers.get(set_cookie_header) - # if set_cookie: - # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") - # if set_cookie != set_cookie_escaped: - # del response.headers[set_cookie_header] - # response.headers[set_cookie_header] = set_cookie_escaped - return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) - - https_request = compat_urllib_request.HTTPCookieProcessor.http_request - https_response = http_response - - -class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """YoutubeDL redirect handler - - The code is based on HTTPRedirectHandler implementation from CPython [1]. - - This redirect handler solves two issues: - - ensures redirect URL is always unicode under python 2 - - introduces support for experimental HTTP response status code - 308 Permanent Redirect [2] used by some sites [3] - - 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py - 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 - 3. https://github.com/ytdl-org/youtube-dl/issues/28768 - """ - - http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 - - def redirect_request(self, req, fp, code, msg, headers, newurl): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a - redirection response is received. If a redirection should - take place, return a new Request to allow http_error_30x to - perform the redirect. Otherwise, raise HTTPError if no-one - else should try to handle this url. Return None if you can't - but another Handler might. - """ - m = req.get_method() - if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): - raise compat_HTTPError(req.full_url, code, msg, headers, fp) - # Strictly (according to RFC 2616), 301 or 302 in response to - # a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib.request, in this case). In practice, - # essentially all clients do redirect in this case, so we do - # the same. - - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - if sys.version_info[0] < 3: - newurl = compat_str(newurl) - - # Be conciliant with URIs containing a space. This is mainly - # redundant with the more complete encoding done in http_error_302(), - # but it is kept for compatibility with other callers. 
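
The method/status rule that redirect_request above enforces, restated as a tiny standalone function (POST is auto-redirected as GET because the rebuilt request drops the body and content headers):

def redirect_method(code, method):
    if code in (301, 302, 303) and method == 'POST':
        return 'GET'
    if code in (301, 302, 303, 307, 308) and method in ('GET', 'HEAD'):
        return method
    raise ValueError('redirect not followed for %s %s' % (method, code))

assert redirect_method(302, 'POST') == 'GET'
assert redirect_method(308, 'GET') == 'GET'
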
- newurl = newurl.replace(' ', '%20') - - CONTENT_HEADERS = ("content-length", "content-type") - # NB: don't use dict comprehension for python 2.6 compatibility - newheaders = dict((k, v) for k, v in req.headers.items() - if k.lower() not in CONTENT_HEADERS) - return compat_urllib_request.Request( - newurl, headers=newheaders, origin_req_host=req.origin_req_host, - unverifiable=True) - - -def extract_timezone(date_str): - m = re.search( - r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group('tz'))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) - return timezone, date_str - - -def parse_iso8601(date_str, delimiter='T', timezone=None): - """ Return a UNIX timestamp from the given date """ - - if date_str is None: - return None - - date_str = re.sub(r'\.[0-9]+', '', date_str) - - if timezone is None: - timezone, date_str = extract_timezone(date_str) - - try: - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) - except ValueError: - pass - - -def date_formats(day_first=True): - return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST - - -def unified_strdate(date_str, day_first=True): - """Return a string with the date in the format YYYYMMDD""" - - if date_str is None: - return None - upload_date = None - # Replace commas - date_str = date_str.replace(',', ' ') - # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) - _, date_str = extract_timezone(date_str) - - for expression in date_formats(day_first): - try: - upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') - except ValueError: - pass - if upload_date is None: - timetuple = email.utils.parsedate_tz(date_str) - if timetuple: - try: - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - except ValueError: - pass - if upload_date is not None: - return compat_str(upload_date) - - -def unified_timestamp(date_str, day_first=True): - if date_str is None: - return None - - date_str = re.sub(r'[,|]', '', date_str) - - pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 - timezone, date_str = extract_timezone(date_str) - - # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) - - # Remove unrecognized timezones from ISO 8601 alike timestamps - m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) - if m: - date_str = date_str[:-len(m.group('tz'))] - - # Python only supports microseconds, so remove nanoseconds - m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) - if m: - date_str = m.group(1) - - for expression in date_formats(day_first): - try: - dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) - return calendar.timegm(dt.timetuple()) - except ValueError: - pass - timetuple = email.utils.parsedate_tz(date_str) - if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 - - -def determine_ext(url, default_ext='unknown_video'): - if url is None or '.' 
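
What parse_iso8601 above computes for a fixed-offset timestamp, spelled out ('2014-12-31T18:36:02+0100' is an arbitrary sample):

import calendar
import datetime

dt = (datetime.datetime.strptime('2014-12-31T18:36:02', '%Y-%m-%dT%H:%M:%S')
      - datetime.timedelta(hours=1))  # subtract the +0100 offset to reach UTC
ts = calendar.timegm(dt.timetuple())
assert ts == datetime.datetime(2014, 12, 31, 17, 36, 2,
                               tzinfo=datetime.timezone.utc).timestamp()
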
not in url: - return default_ext - guess = url.partition('?')[0].rpartition('.')[2] - if re.match(r'^[A-Za-z0-9]+$', guess): - return guess - # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download - elif guess.rstrip('/') in KNOWN_EXTENSIONS: - return guess.rstrip('/') - else: - return default_ext - - -def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): - return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) - - -def date_from_str(date_str): - """ - Return a datetime object from a string in the format YYYYMMDD or - (now|today)[+-][0-9](day|week|month|year)(s)?""" - today = datetime.date.today() - if date_str in ('now', 'today'): - return today - if date_str == 'yesterday': - return today - datetime.timedelta(days=1) - match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) - if match is not None: - sign = match.group('sign') - time = int(match.group('time')) - if sign == '-': - time = -time - unit = match.group('unit') - # A bad approximation? - if unit == 'month': - unit = 'day' - time *= 30 - elif unit == 'year': - unit = 'day' - time *= 365 - unit += 's' - delta = datetime.timedelta(**{unit: time}) - return today + delta - return datetime.datetime.strptime(date_str, '%Y%m%d').date() - - -def hyphenate_date(date_str): - """ - Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format""" - match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str) - if match is not None: - return '-'.join(match.groups()) - else: - return date_str - - -class DateRange(object): - """Represents a time interval between two dates""" - - def __init__(self, start=None, end=None): - """start and end must be strings in the format accepted by date""" - if start is not None: - self.start = date_from_str(start) - else: - self.start = datetime.datetime.min.date() - if end is not None: - self.end = date_from_str(end) - else: - self.end = datetime.datetime.max.date() - if self.start > self.end: - raise ValueError('Date range: "%s" , the start date must be before the end date' % self) - - @classmethod - def day(cls, day): - """Returns a range that only contains the given day""" - return cls(day, day) - - def __contains__(self, date): - """Check if the date is in the range""" - if not isinstance(date, datetime.date): - date = date_from_str(date) - return self.start <= date <= self.end - - def __str__(self): - return '%s - %s' % (self.start.isoformat(), self.end.isoformat()) - - -def platform_name(): - """ Returns the platform name as a compat_str """ - res = platform.platform() - if isinstance(res, bytes): - res = res.decode(preferredencoding()) - - assert isinstance(res, compat_str) - return res - - -def _windows_write_string(s, out): - """ Returns True if the string was written using special methods, - False if it has yet to be written out.""" - # Adapted from http://stackoverflow.com/a/3259271/35070 - - import ctypes - import ctypes.wintypes - - WIN_OUTPUT_IDS = { - 1: -11, - 2: -12, - } - - try: - fileno = out.fileno() - except AttributeError: - # If the output stream doesn't have a fileno, it's virtual - return False - except io.UnsupportedOperation: - # Some strange Windows pseudo files? 
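
The relative-date grammar of date_from_str above, reduced to a standalone sketch (it keeps the same month ≈ 30 days / year ≈ 365 days approximation):

import datetime
import re

def relative_date(date_str):
    m = re.match(r'(?:now|today)(?P<sign>[+-])(?P<time>\d+)'
                 r'(?P<unit>day|week|month|year)s?$', date_str)
    if not m:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    n = int(m.group('time')) * (-1 if m.group('sign') == '-' else 1)
    days = {'day': 1, 'week': 7, 'month': 30, 'year': 365}[m.group('unit')]
    return datetime.date.today() + datetime.timedelta(days=n * days)

relative_date('now-2weeks')  # 14 days ago
relative_date('20201231')    # datetime.date(2020, 12, 31)
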
- return False - if fileno not in WIN_OUTPUT_IDS: - return False - - GetStdHandle = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - ('GetStdHandle', ctypes.windll.kernel32)) - h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - - WriteConsoleW = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, - ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) - written = ctypes.wintypes.DWORD(0) - - GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) - FILE_TYPE_CHAR = 0x0002 - FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, - ctypes.POINTER(ctypes.wintypes.DWORD))( - ('GetConsoleMode', ctypes.windll.kernel32)) - INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value - - def not_a_console(handle): - if handle == INVALID_HANDLE_VALUE or handle is None: - return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) - - if not_a_console(h): - return False - - def next_nonbmp_pos(s): - try: - return next(i for i, c in enumerate(s) if ord(c) > 0xffff) - except StopIteration: - return len(s) - - while s: - count = min(next_nonbmp_pos(s), 1024) - - ret = WriteConsoleW( - h, s, count if count else 2, ctypes.byref(written), None) - if ret == 0: - raise OSError('Failed to write string') - if not count: # We just wrote a non-BMP character - assert written.value == 2 - s = s[1:] - else: - assert written.value > 0 - s = s[written.value:] - return True - - -def write_string(s, out=None, encoding=None): - if out is None: - out = sys.stderr - assert type(s) == compat_str - - if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'): - if _windows_write_string(s, out): - return - - if ('b' in getattr(out, 'mode', '') - or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr - byt = s.encode(encoding or preferredencoding(), 'ignore') - out.write(byt) - elif hasattr(out, 'buffer'): - enc = encoding or getattr(out, 'encoding', None) or preferredencoding() - byt = s.encode(enc, 'ignore') - out.buffer.write(byt) - else: - out.write(s) - out.flush() - - -def bytes_to_intlist(bs): - if not bs: - return [] - if isinstance(bs[0], int): # Python 3 - return list(bs) - else: - return [ord(c) for c in bs] - - -def intlist_to_bytes(xs): - if not xs: - return b'' - return compat_struct_pack('%dB' % len(xs), *xs) - - -# Cross-platform file locking -if sys.platform == 'win32': - import ctypes.wintypes - import msvcrt - - class OVERLAPPED(ctypes.Structure): - _fields_ = [ - ('Internal', ctypes.wintypes.LPVOID), - ('InternalHigh', ctypes.wintypes.LPVOID), - ('Offset', ctypes.wintypes.DWORD), - ('OffsetHigh', ctypes.wintypes.DWORD), - ('hEvent', ctypes.wintypes.HANDLE), - ] - - kernel32 = ctypes.windll.kernel32 - LockFileEx = kernel32.LockFileEx - LockFileEx.argtypes = [ - ctypes.wintypes.HANDLE, # hFile - ctypes.wintypes.DWORD, # dwFlags - ctypes.wintypes.DWORD, # dwReserved - ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow - ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh - ctypes.POINTER(OVERLAPPED) # Overlapped - ] - LockFileEx.restype = ctypes.wintypes.BOOL - UnlockFileEx = kernel32.UnlockFileEx - UnlockFileEx.argtypes = [ - ctypes.wintypes.HANDLE, # hFile - ctypes.wintypes.DWORD, # dwReserved - 
ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow - ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh - ctypes.POINTER(OVERLAPPED) # Overlapped - ] - UnlockFileEx.restype = ctypes.wintypes.BOOL - whole_low = 0xffffffff - whole_high = 0x7fffffff - - def _lock_file(f, exclusive): - overlapped = OVERLAPPED() - overlapped.Offset = 0 - overlapped.OffsetHigh = 0 - overlapped.hEvent = 0 - f._lock_file_overlapped_p = ctypes.pointer(overlapped) - handle = msvcrt.get_osfhandle(f.fileno()) - if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0, - whole_low, whole_high, f._lock_file_overlapped_p): - raise OSError('Locking file failed: %r' % ctypes.FormatError()) - - def _unlock_file(f): - assert f._lock_file_overlapped_p - handle = msvcrt.get_osfhandle(f.fileno()) - if not UnlockFileEx(handle, 0, - whole_low, whole_high, f._lock_file_overlapped_p): - raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) - -else: - # Some platforms, such as Jython, is missing fcntl - try: - import fcntl - - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) - - def _unlock_file(f): - fcntl.flock(f, fcntl.LOCK_UN) - except ImportError: - UNSUPPORTED_MSG = 'file locking is not supported on this platform' - - def _lock_file(f, exclusive): - raise IOError(UNSUPPORTED_MSG) - - def _unlock_file(f): - raise IOError(UNSUPPORTED_MSG) - - -class locked_file(object): - def __init__(self, filename, mode, encoding=None): - assert mode in ['r', 'a', 'w'] - self.f = io.open(filename, mode, encoding=encoding) - self.mode = mode - - def __enter__(self): - exclusive = self.mode != 'r' - try: - _lock_file(self.f, exclusive) - except IOError: - self.f.close() - raise - return self - - def __exit__(self, etype, value, traceback): - try: - _unlock_file(self.f) - finally: - self.f.close() - - def __iter__(self): - return iter(self.f) - - def write(self, *args): - return self.f.write(*args) - - def read(self, *args): - return self.f.read(*args) - - -def get_filesystem_encoding(): - encoding = sys.getfilesystemencoding() - return encoding if encoding is not None else 'utf-8' - - -def shell_quote(args): - quoted_args = [] - encoding = get_filesystem_encoding() - for a in args: - if isinstance(a, bytes): - # We may get a filename encoded with 'encodeFilename' - a = a.decode(encoding) - quoted_args.append(compat_shlex_quote(a)) - return ' '.join(quoted_args) - - -def smuggle_url(url, data): - """ Pass additional data in a URL for internal use. 
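
A trimmed-down, POSIX-only version of what locked_file above does (the Windows path goes through LockFileEx instead; the file name is arbitrary):

import fcntl

with open('state.json', 'a') as f:
    fcntl.flock(f, fcntl.LOCK_EX)  # exclusive for writing; 'r' mode would use LOCK_SH
    try:
        f.write('...')
    finally:
        fcntl.flock(f, fcntl.LOCK_UN)
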
""" - - url, idata = unsmuggle_url(url, {}) - data.update(idata) - sdata = compat_urllib_parse_urlencode( - {'__youtubedl_smuggle': json.dumps(data)}) - return url + '#' + sdata - - -def unsmuggle_url(smug_url, default=None): - if '#__youtubedl_smuggle' not in smug_url: - return smug_url, default - url, _, sdata = smug_url.rpartition('#') - jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0] - data = json.loads(jsond) - return url, data - - -def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = int(math.log(bytes, 1024.0)) - suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) - - -def lookup_unit_table(unit_table, s): - units_re = '|'.join(re.escape(u) for u in unit_table) - m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) - if not m: - return None - num_str = m.group('num').replace(',', '.') - mult = unit_table[m.group('unit')] - return int(float(num_str) * mult) - - -def parse_filesize(s): - if s is None: - return None - - # The lower-case forms are of course incorrect and unofficial, - # but we support those too - _UNIT_TABLE = { - 'B': 1, - 'b': 1, - 'bytes': 1, - 'KiB': 1024, - 'KB': 1000, - 'kB': 1024, - 'Kb': 1000, - 'kb': 1000, - 'kilobytes': 1000, - 'kibibytes': 1024, - 'MiB': 1024 ** 2, - 'MB': 1000 ** 2, - 'mB': 1024 ** 2, - 'Mb': 1000 ** 2, - 'mb': 1000 ** 2, - 'megabytes': 1000 ** 2, - 'mebibytes': 1024 ** 2, - 'GiB': 1024 ** 3, - 'GB': 1000 ** 3, - 'gB': 1024 ** 3, - 'Gb': 1000 ** 3, - 'gb': 1000 ** 3, - 'gigabytes': 1000 ** 3, - 'gibibytes': 1024 ** 3, - 'TiB': 1024 ** 4, - 'TB': 1000 ** 4, - 'tB': 1024 ** 4, - 'Tb': 1000 ** 4, - 'tb': 1000 ** 4, - 'terabytes': 1000 ** 4, - 'tebibytes': 1024 ** 4, - 'PiB': 1024 ** 5, - 'PB': 1000 ** 5, - 'pB': 1024 ** 5, - 'Pb': 1000 ** 5, - 'pb': 1000 ** 5, - 'petabytes': 1000 ** 5, - 'pebibytes': 1024 ** 5, - 'EiB': 1024 ** 6, - 'EB': 1000 ** 6, - 'eB': 1024 ** 6, - 'Eb': 1000 ** 6, - 'eb': 1000 ** 6, - 'exabytes': 1000 ** 6, - 'exbibytes': 1024 ** 6, - 'ZiB': 1024 ** 7, - 'ZB': 1000 ** 7, - 'zB': 1024 ** 7, - 'Zb': 1000 ** 7, - 'zb': 1000 ** 7, - 'zettabytes': 1000 ** 7, - 'zebibytes': 1024 ** 7, - 'YiB': 1024 ** 8, - 'YB': 1000 ** 8, - 'yB': 1024 ** 8, - 'Yb': 1000 ** 8, - 'yb': 1000 ** 8, - 'yottabytes': 1000 ** 8, - 'yobibytes': 1024 ** 8, - } - - return lookup_unit_table(_UNIT_TABLE, s) - - -def parse_count(s): - if s is None: - return None - - s = s.strip() - - if re.match(r'^[\d,.]+$', s): - return str_to_int(s) - - _UNIT_TABLE = { - 'k': 1000, - 'K': 1000, - 'm': 1000 ** 2, - 'M': 1000 ** 2, - 'kk': 1000 ** 2, - 'KK': 1000 ** 2, - } - - return lookup_unit_table(_UNIT_TABLE, s) - - -def parse_resolution(s): - if s is None: - return {} - - mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) - if mobj: - return { - 'width': int(mobj.group('w')), - 'height': int(mobj.group('h')), - } - - mobj = re.search(r'\b(\d+)[pPiI]\b', s) - if mobj: - return {'height': int(mobj.group(1))} - - mobj = re.search(r'\b([48])[kK]\b', s) - if mobj: - return {'height': int(mobj.group(1)) * 540} - - return {} - - -def parse_bitrate(s): - if not isinstance(s, compat_str): - return - mobj = re.search(r'\b(\d+)\s*kbps', s) - if mobj: - return int(mobj.group(1)) - - -def month_by_name(name, lang='en'): - """ Return the number of a month by (locale-independently) English name """ - - 
month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en']) - - try: - return month_names.index(name) + 1 - except ValueError: - return None - - -def month_by_abbreviation(abbrev): - """ Return the number of a month by (locale-independently) English - abbreviations """ - - try: - return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1 - except ValueError: - return None - - -def fix_xml_ampersands(xml_str): - """Replace all the '&' by '&' in XML""" - return re.sub( - r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', - '&', - xml_str) - - -def setproctitle(title): - assert isinstance(title, compat_str) - - # ctypes in Jython is not complete - # http://bugs.jython.org/issue2148 - if sys.platform.startswith('java'): - return - - try: - libc = ctypes.cdll.LoadLibrary('libc.so.6') - except OSError: - return - except TypeError: - # LoadLibrary in Windows Python 2.7.13 only expects - # a bytestring, but since unicode_literals turns - # every string into a unicode string, it fails. - return - title_bytes = title.encode('utf-8') - buf = ctypes.create_string_buffer(len(title_bytes)) - buf.value = title_bytes - try: - libc.prctl(15, buf, 0, 0, 0) - except AttributeError: - return # Strange libc, just skip this - - -def remove_start(s, start): - return s[len(start):] if s is not None and s.startswith(start) else s - - -def remove_end(s, end): - return s[:-len(end)] if s is not None and s.endswith(end) else s - - -def remove_quotes(s): - if s is None or len(s) < 2: - return s - for quote in ('"', "'", ): - if s[0] == quote and s[-1] == quote: - return s[1:-1] - return s - - -def url_basename(url): - path = compat_urlparse.urlparse(url).path - return path.strip('/').split('/')[-1] - - -def base_url(url): - return re.match(r'https?://[^?#&]+/', url).group() - - -def urljoin(base, path): - if isinstance(path, bytes): - path = path.decode('utf-8') - if not isinstance(path, compat_str) or not path: - return None - if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): - return path - if isinstance(base, bytes): - base = base.decode('utf-8') - if not isinstance(base, compat_str) or not re.match( - r'^(?:https?:)?//', base): - return None - return compat_urlparse.urljoin(base, path) - - -class HEADRequest(compat_urllib_request.Request): - def get_method(self): - return 'HEAD' - - -class PUTRequest(compat_urllib_request.Request): - def get_method(self): - return 'PUT' - - -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): - if get_attr: - if v is not None: - v = getattr(v, get_attr, None) - if v == '': - v = None - if v is None: - return default - try: - return int(v) * invscale // scale - except (ValueError, TypeError): - return default - - -def str_or_none(v, default=None): - return default if v is None else compat_str(v) - - -def str_to_int(int_str): - """ A more relaxed version of int_or_none """ - if isinstance(int_str, compat_integer_types): - return int_str - elif isinstance(int_str, compat_str): - int_str = re.sub(r'[,\.\+]', '', int_str) - return int_or_none(int_str) - - -def float_or_none(v, scale=1, invscale=1, default=None): - if v is None: - return default - try: - return float(v) * invscale / scale - except (ValueError, TypeError): - return default - - -def bool_or_none(v, default=None): - return v if isinstance(v, bool) else default - - -def strip_or_none(v, default=None): - return v.strip() if isinstance(v, compat_str) else default - - -def url_or_none(url): - if not url or not isinstance(url, compat_str): - return None - url = url.strip() - return url if 
re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None - - -def parse_duration(s): - if not isinstance(s, compat_basestring): - return None - - s = s.strip() - - days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) - if m: - days, hours, mins, secs, ms = m.groups() - else: - m = re.match( - r'''(?ix)(?:P? - (?: - [0-9]+\s*y(?:ears?)?\s* - )? - (?: - [0-9]+\s*m(?:onths?)?\s* - )? - (?: - [0-9]+\s*w(?:eeks?)?\s* - )? - (?: - (?P<days>[0-9]+)\s*d(?:ays?)?\s* - )? - T)? - (?: - (?P<hours>[0-9]+)\s*h(?:ours?)?\s* - )? - (?: - (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s* - )? - (?: - (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* - )?Z?$''', s) - if m: - days, hours, mins, secs, ms = m.groups() - else: - m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) - if m: - hours, mins = m.groups() - else: - return None - - duration = 0 - if secs: - duration += float(secs) - if mins: - duration += float(mins) * 60 - if hours: - duration += float(hours) * 60 * 60 - if days: - duration += float(days) * 24 * 60 * 60 - if ms: - duration += float(ms) - return duration - - -def prepend_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return ( - '{0}.{1}{2}'.format(name, ext, real_ext) - if not expected_real_ext or real_ext[1:] == expected_real_ext - else '{0}.{1}'.format(filename, ext)) - - -def replace_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return '{0}.{1}'.format( - name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, - ext) - - -def check_executable(exe, args=[]): - """ Checks if the given binary is installed somewhere in PATH, and returns its name. - args can be a list of arguments for a short output (like -version) """ - try: - subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() - except OSError: - return False - return exe - - -def get_exe_version(exe, args=['--version'], - version_re=None, unrecognized='present'): - """ Returns the version of the specified executable, - or False if the executable is not present """ - try: - # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers - # SIGTTOU if youtube-dl is run in the background. 
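
The first branch of parse_duration above handles [[[DD:]HH:]MM:]SS timestamps; tracing it on one sample value:

import re

m = re.match(
    r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?'
    r'(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', '1:02:03.5')
duration = (float(m.group('secs'))
            + float(m.group('mins') or 0) * 60
            + float(m.group('hours') or 0) * 3600
            + float(m.group('days') or 0) * 86400
            + float(m.group('ms') or 0))
assert duration == 3723.5
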
- # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - out, _ = subprocess.Popen( - [encodeArgument(exe)] + args, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() - except OSError: - return False - if isinstance(out, bytes): # Python 2.x - out = out.decode('ascii', 'ignore') - return detect_exe_version(out, version_re, unrecognized) - - -def detect_exe_version(output, version_re=None, unrecognized='present'): - assert isinstance(output, compat_str) - if version_re is None: - version_re = r'version\s+([-0-9._a-zA-Z]+)' - m = re.search(version_re, output) - if m: - return m.group(1) - else: - return unrecognized - - -class PagedList(object): - def __len__(self): - # This is only useful for tests - return len(self.getslice()) - - -class OnDemandPagedList(PagedList): - def __init__(self, pagefunc, pagesize, use_cache=True): - self._pagefunc = pagefunc - self._pagesize = pagesize - self._use_cache = use_cache - if use_cache: - self._cache = {} - - def getslice(self, start=0, end=None): - res = [] - for pagenum in itertools.count(start // self._pagesize): - firstid = pagenum * self._pagesize - nextfirstid = pagenum * self._pagesize + self._pagesize - if start >= nextfirstid: - continue - - page_results = None - if self._use_cache: - page_results = self._cache.get(pagenum) - if page_results is None: - page_results = list(self._pagefunc(pagenum)) - if self._use_cache: - self._cache[pagenum] = page_results - - startv = ( - start % self._pagesize - if firstid <= start < nextfirstid - else 0) - - endv = ( - ((end - 1) % self._pagesize) + 1 - if (end is not None and firstid <= end <= nextfirstid) - else None) - - if startv != 0 or endv is not None: - page_results = page_results[startv:endv] - res.extend(page_results) - - # A little optimization - if current page is not "full", ie. does - # not contain page_size videos then we can assume that this page - # is the last one - there are no more ids on further pages - - # i.e. no need to query again. 
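
The index arithmetic getslice uses above, shown in isolation: map an absolute [start, end) range onto per-page sub-slices (page size and bounds are invented):

pagesize = 50
start, end = 120, 175
for pagenum in range(start // pagesize, (end - 1) // pagesize + 1):
    firstid = pagenum * pagesize
    nextfirstid = firstid + pagesize
    startv = start % pagesize if firstid <= start < nextfirstid else 0
    endv = ((end - 1) % pagesize) + 1 if firstid <= end <= nextfirstid else None
    print(pagenum, startv, endv)  # page 2: 20, None; page 3: 0, 25
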
- if len(page_results) + startv < self._pagesize: - break - - # If we got the whole page, but the next page is not interesting, - # break out early as well - if end == nextfirstid: - break - return res - - -class InAdvancePagedList(PagedList): - def __init__(self, pagefunc, pagecount, pagesize): - self._pagefunc = pagefunc - self._pagecount = pagecount - self._pagesize = pagesize - - def getslice(self, start=0, end=None): - res = [] - start_page = start // self._pagesize - end_page = ( - self._pagecount if end is None else (end // self._pagesize + 1)) - skip_elems = start - start_page * self._pagesize - only_more = None if end is None else end - start - for pagenum in range(start_page, end_page): - page = list(self._pagefunc(pagenum)) - if skip_elems: - page = page[skip_elems:] - skip_elems = None - if only_more is not None: - if len(page) < only_more: - only_more -= len(page) - else: - page = page[:only_more] - res.extend(page) - break - res.extend(page) - return res - - -def uppercase_escape(s): - unicode_escape = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\U[0-9a-fA-F]{8}', - lambda m: unicode_escape(m.group(0))[0], - s) - - -def lowercase_escape(s): - unicode_escape = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\u[0-9a-fA-F]{4}', - lambda m: unicode_escape(m.group(0))[0], - s) - - -def escape_rfc3986(s): - """Escape non-ASCII characters as suggested by RFC 3986""" - if sys.version_info < (3, 0) and isinstance(s, compat_str): - s = s.encode('utf-8') - return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") - - -def escape_url(url): - """Escape URL as suggested by RFC 3986""" - url_parsed = compat_urllib_parse_urlparse(url) - return url_parsed._replace( - netloc=url_parsed.netloc.encode('idna').decode('ascii'), - path=escape_rfc3986(url_parsed.path), - params=escape_rfc3986(url_parsed.params), - query=escape_rfc3986(url_parsed.query), - fragment=escape_rfc3986(url_parsed.fragment) - ).geturl() - - -def read_batch_urls(batch_fd): - def fixup(url): - if not isinstance(url, compat_str): - url = url.decode('utf-8', 'replace') - BOM_UTF8 = '\xef\xbb\xbf' - if url.startswith(BOM_UTF8): - url = url[len(BOM_UTF8):] - url = url.strip() - if url.startswith(('#', ';', ']')): - return False - return url - - with contextlib.closing(batch_fd) as fd: - return [url for url in map(fixup, fd) if url] - - -def urlencode_postdata(*args, **kargs): - return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') - - -def update_url_query(url, query): - if not query: - return url - parsed_url = compat_urlparse.urlparse(url) - qs = compat_parse_qs(parsed_url.query) - qs.update(query) - return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse_urlencode(qs, True))) - - -def update_Request(req, url=None, data=None, headers={}, query={}): - req_headers = req.headers.copy() - req_headers.update(headers) - req_data = data or req.data - req_url = update_url_query(url or req.get_full_url(), query) - req_get_method = req.get_method() - if req_get_method == 'HEAD': - req_type = HEADRequest - elif req_get_method == 'PUT': - req_type = PUTRequest - else: - req_type = compat_urllib_request.Request - new_req = req_type( - req_url, data=req_data, headers=req_headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - if hasattr(req, 'timeout'): - new_req.timeout = req.timeout - return new_req - - -def _multipart_encode_impl(data, boundary): - content_type = 'multipart/form-data; boundary=%s' % boundary - - out = b'' - for k, v in 
data.items(): - out += b'--' + boundary.encode('ascii') + b'\r\n' - if isinstance(k, compat_str): - k = k.encode('utf-8') - if isinstance(v, compat_str): - v = v.encode('utf-8') - # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 - # suggests sending UTF-8 directly. Firefox sends UTF-8, too - content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' - if boundary.encode('ascii') in content: - raise ValueError('Boundary overlaps with data') - out += content - - out += b'--' + boundary.encode('ascii') + b'--\r\n' - - return out, content_type - - -def multipart_encode(data, boundary=None): - ''' - Encode a dict to RFC 7578-compliant form-data - - data: - A dict where keys and values can be either Unicode or bytes-like - objects. - boundary: - If specified a Unicode object, it's used as the boundary. Otherwise - a random boundary is generated. - - Reference: https://tools.ietf.org/html/rfc7578 - ''' - has_specified_boundary = boundary is not None - - while True: - if boundary is None: - boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) - - try: - out, content_type = _multipart_encode_impl(data, boundary) - break - except ValueError: - if has_specified_boundary: - raise - boundary = None - - return out, content_type - - -def dict_get(d, key_or_keys, default=None, skip_false_values=True): - if isinstance(key_or_keys, (list, tuple)): - for key in key_or_keys: - if key not in d or d[key] is None or skip_false_values and not d[key]: - continue - return d[key] - return default - return d.get(key_or_keys, default) - - -def try_get(src, getter, expected_type=None): - if not isinstance(getter, (list, tuple)): - getter = [getter] - for get in getter: - try: - v = get(src) - except (AttributeError, KeyError, TypeError, IndexError): - pass - else: - if expected_type is None or isinstance(v, expected_type): - return v - - -def merge_dicts(*dicts): - merged = {} - for a_dict in dicts: - for k, v in a_dict.items(): - if v is None: - continue - if (k not in merged - or (isinstance(v, compat_str) and v - and isinstance(merged[k], compat_str) - and not merged[k])): - merged[k] = v - return merged - - -def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): - return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) - - -US_RATINGS = { - 'G': 0, - 'PG': 10, - 'PG-13': 13, - 'R': 16, - 'NC': 18, -} - - -TV_PARENTAL_GUIDELINES = { - 'TV-Y': 0, - 'TV-Y7': 7, - 'TV-G': 0, - 'TV-PG': 0, - 'TV-14': 14, - 'TV-MA': 17, -} - - -def parse_age_limit(s): - if type(s) == int: - return s if 0 <= s <= 21 else None - if not isinstance(s, compat_basestring): - return None - m = re.match(r'^(?P<age>\d{1,2})\+?$', s) - if m: - return int(m.group('age')) - if s in US_RATINGS: - return US_RATINGS[s] - m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) - if m: - return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] - return None - - -def strip_jsonp(code): - return re.sub( - r'''(?sx)^ - (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) - (?:\s*&&\s*(?P=func_name))? - \s*\(\s*(?P<callback_data>.*)\);? 
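
The try_get pattern above, copied out as a standalone helper and applied to a nested structure (the sample data is invented):

def safe_get(src, *getters):
    # Apply candidate accessors until one does not raise
    for get in getters:
        try:
            return get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass

info = {'items': [{'snippet': {'title': 'hello'}}]}
assert safe_get(info, lambda x: x['items'][0]['snippet']['title']) == 'hello'
assert safe_get(info, lambda x: x['missing'][0]) is None
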
- \s*?(?://[^\n]*)*$''', - r'\g<callback_data>', code) - - -def js_to_json(code): - COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' - SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) - INTEGER_TABLE = ( - (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), - (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), - ) - - def fix_kv(m): - v = m.group(0) - if v in ('true', 'false', 'null'): - return v - elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': - return "" - - if v[0] in ("'", '"'): - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) - else: - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i - - return '"%s"' % v - - return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - {comment}|,(?={skip}[\]}}])| - (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:)| - !+ - '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) - - -def qualities(quality_ids): - """ Get a numeric quality value out of a list of possible values """ - def q(qid): - try: - return quality_ids.index(qid) - except ValueError: - return -1 - return q - - -DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s' - - -def limit_length(s, length): - """ Add ellipses to overly long strings """ - if s is None: - return None - ELLIPSES = '...' - if len(s) > length: - return s[:length - len(ELLIPSES)] + ELLIPSES - return s - - -def version_tuple(v): - return tuple(int(e) for e in re.split(r'[-.]', v)) - - -def is_outdated_version(version, limit, assume_new=True): - if not version: - return not assume_new - try: - return version_tuple(version) < version_tuple(limit) - except ValueError: - return not assume_new - - -def ytdl_is_updateable(): - """ Returns if youtube-dl can be updated with -U """ - from zipimport import zipimporter - - return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen') - - -def args_to_str(args): - # Get a short string representation for a subprocess command - return ' '.join(compat_shlex_quote(a) for a in args) - - -def error_to_compat_str(err): - err_str = str(err) - # On python 2 error byte string must be decoded with proper - # encoding rather than ascii - if sys.version_info[0] < 3: - err_str = err_str.decode(preferredencoding()) - return err_str - - -def mimetype2ext(mt): - if mt is None: - return None - - ext = { - 'audio/mp4': 'm4a', - # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. 
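The qualities factory above turns a preference list into a comparable rank; hypothetical values:

    q = qualities(['240p', '360p', '720p'])
    q('720p')   # -> 2 (later entries rank higher)
    q('1080p')  # -> -1 (unknown values sort below all known ones)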
Here use .mp3 as - # it's the most popular one - 'audio/mpeg': 'mp3', - }.get(mt) - if ext is not None: - return ext - - _, _, res = mt.rpartition('/') - res = res.split(';')[0].strip().lower() - - return { - '3gpp': '3gp', - 'smptett+xml': 'tt', - 'ttaf+xml': 'dfxp', - 'ttml+xml': 'ttml', - 'x-flv': 'flv', - 'x-mp4-fragmented': 'mp4', - 'x-ms-sami': 'sami', - 'x-ms-wmv': 'wmv', - 'mpegurl': 'm3u8', - 'x-mpegurl': 'm3u8', - 'vnd.apple.mpegurl': 'm3u8', - 'dash+xml': 'mpd', - 'f4m+xml': 'f4m', - 'hds+xml': 'f4m', - 'vnd.ms-sstr+xml': 'ism', - 'quicktime': 'mov', - 'mp2t': 'ts', - 'x-wav': 'wav', - }.get(res, res) - - -def parse_codecs(codecs_str): - # http://tools.ietf.org/html/rfc6381 - if not codecs_str: - return {} - split_codecs = list(filter(None, map( - lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) - vcodec, acodec = None, None - for full_codec in split_codecs: - codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'): - if not vcodec: - vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): - if not acodec: - acodec = full_codec - else: - write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) - if not vcodec and not acodec: - if len(split_codecs) == 2: - return { - 'vcodec': split_codecs[0], - 'acodec': split_codecs[1], - } - else: - return { - 'vcodec': vcodec or 'none', - 'acodec': acodec or 'none', - } - return {} - - -def urlhandle_detect_ext(url_handle): - getheader = url_handle.headers.get - - cd = getheader('Content-Disposition') - if cd: - m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) - if m: - e = determine_ext(m.group('filename'), default_ext=None) - if e: - return e - - return mimetype2ext(getheader('Content-Type')) - - -def encode_data_uri(data, mime_type): - return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) - - -def age_restricted(content_limit, age_limit): - """ Returns True iff the content should be blocked """ - - if age_limit is None: # No limit set - return False - if content_limit is None: - return False # Content available for everyone - return age_limit < content_limit - - -def is_html(first_bytes): - """ Detect whether a file contains HTML by examining its first bytes. 
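A sketch of parse_codecs on typical RFC 6381 codec strings (example values, not from the diff):

    parse_codecs('avc1.64001f, mp4a.40.2')
    # -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
    parse_codecs('opus')
    # -> {'vcodec': 'none', 'acodec': 'opus'}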
""" - - BOMS = [ - (b'\xef\xbb\xbf', 'utf-8'), - (b'\x00\x00\xfe\xff', 'utf-32-be'), - (b'\xff\xfe\x00\x00', 'utf-32-le'), - (b'\xff\xfe', 'utf-16-le'), - (b'\xfe\xff', 'utf-16-be'), - ] - for bom, enc in BOMS: - if first_bytes.startswith(bom): - s = first_bytes[len(bom):].decode(enc, 'replace') - break - else: - s = first_bytes.decode('utf-8', 'replace') - - return re.match(r'^\s*<', s) - - -def determine_protocol(info_dict): - protocol = info_dict.get('protocol') - if protocol is not None: - return protocol - - url = info_dict['url'] - if url.startswith('rtmp'): - return 'rtmp' - elif url.startswith('mms'): - return 'mms' - elif url.startswith('rtsp'): - return 'rtsp' - - ext = determine_ext(url) - if ext == 'm3u8': - return 'm3u8' - elif ext == 'f4m': - return 'f4m' - - return compat_urllib_parse_urlparse(url).scheme - - -def render_table(header_row, data): - """ Render a list of rows, each as a list of values """ - table = [header_row] + data - max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] - format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' - return '\n'.join(format_str % tuple(row) for row in table) - - -def _match_one(filter_part, dct): - COMPARISON_OPERATORS = { - '<': operator.lt, - '<=': operator.le, - '>': operator.gt, - '>=': operator.ge, - '=': operator.eq, - '!=': operator.ne, - } - operator_rex = re.compile(r'''(?x)\s* - (?P<key>[a-z_]+) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* - (?: - (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| - (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| - (?P<strval>(?![0-9.])[a-z0-9A-Z]*) - ) - \s*$ - ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) - m = operator_rex.search(filter_part) - if m: - op = COMPARISON_OPERATORS[m.group('op')] - actual_value = dct.get(m.group('key')) - if (m.group('quotedstrval') is not None - or m.group('strval') is not None - # If the original field is a string and matching comparisonvalue is - # a number we should respect the origin of the original field - # and process comparison value as a string (see - # https://github.com/ytdl-org/youtube-dl/issues/11082). - or actual_value is not None and m.group('intval') is not None - and isinstance(actual_value, compat_str)): - if m.group('op') not in ('=', '!='): - raise ValueError( - 'Operator %s does not support string values!' 
% m.group('op')) - comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval') - quote = m.group('quote') - if quote is not None: - comparison_value = comparison_value.replace(r'\%s' % quote, quote) - else: - try: - comparison_value = int(m.group('intval')) - except ValueError: - comparison_value = parse_filesize(m.group('intval')) - if comparison_value is None: - comparison_value = parse_filesize(m.group('intval') + 'B') - if comparison_value is None: - raise ValueError( - 'Invalid integer value %r in filter part %r' % ( - m.group('intval'), filter_part)) - if actual_value is None: - return m.group('none_inclusive') - return op(actual_value, comparison_value) - - UNARY_OPERATORS = { - '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), - '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), - } - operator_rex = re.compile(r'''(?x)\s* - (?P<op>%s)\s*(?P<key>[a-z_]+) - \s*$ - ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) - m = operator_rex.search(filter_part) - if m: - op = UNARY_OPERATORS[m.group('op')] - actual_value = dct.get(m.group('key')) - return op(actual_value) - - raise ValueError('Invalid filter part %r' % filter_part) - - -def match_str(filter_str, dct): - """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ - - return all( - _match_one(filter_part, dct) for filter_part in filter_str.split('&')) - - -def match_filter_func(filter_str): - def _match_func(info_dict): - if match_str(filter_str, info_dict): - return None - else: - video_title = info_dict.get('title', info_dict.get('id', 'video')) - return '%s does not pass filter %s, skipping ..' % (video_title, filter_str) - return _match_func - - -def parse_dfxp_time_expr(time_expr): - if not time_expr: - return - - mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr) - if mobj: - return float(mobj.group('time_offset')) - - mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) - if mobj: - return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) - - -def srt_subtitles_timecode(seconds): - return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) - - -def dfxp2srt(dfxp_data): - ''' - @param dfxp_data A bytes-like object containing DFXP data - @returns A unicode object containing converted SRT data - ''' - LEGACY_NAMESPACES = ( - (b'http://www.w3.org/ns/ttml', [ - b'http://www.w3.org/2004/11/ttaf1', - b'http://www.w3.org/2006/04/ttaf1', - b'http://www.w3.org/2006/10/ttaf1', - ]), - (b'http://www.w3.org/ns/ttml#styling', [ - b'http://www.w3.org/ns/ttml#style', - ]), - ) - - SUPPORTED_STYLING = [ - 'color', - 'fontFamily', - 'fontSize', - 'fontStyle', - 'fontWeight', - 'textDecoration' - ] - - _x = functools.partial(xpath_with_ns, ns_map={ - 'xml': 'http://www.w3.org/XML/1998/namespace', - 'ttml': 'http://www.w3.org/ns/ttml', - 'tts': 'http://www.w3.org/ns/ttml#styling', - }) - - styles = {} - default_style = {} - - class TTMLPElementParser(object): - _out = '' - _unclosed_elements = [] - _applied_styles = [] - - def start(self, tag, attrib): - if tag in (_x('ttml:br'), 'br'): - self._out += '\n' - else: - unclosed_elements = [] - style = {} - element_style_id = attrib.get('style') - if default_style: - style.update(default_style) - if element_style_id: - style.update(styles.get(element_style_id, {})) - for prop in SUPPORTED_STYLING: - prop_val = attrib.get(_x('tts:' + prop)) - if prop_val: - style[prop] = 
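The filter mini-language implemented by _match_one/match_str above, sketched with made-up fields:

    video = {'like_count': 190, 'description': 'open source'}
    match_str('like_count > 100 & description', video)  # -> True
    match_str('dislike_count <? 50 & !is_live', video)  # -> True ('?' lets missing fields pass)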
prop_val - if style: - font = '' - for k, v in sorted(style.items()): - if self._applied_styles and self._applied_styles[-1].get(k) == v: - continue - if k == 'color': - font += ' color="%s"' % v - elif k == 'fontSize': - font += ' size="%s"' % v - elif k == 'fontFamily': - font += ' face="%s"' % v - elif k == 'fontWeight' and v == 'bold': - self._out += '<b>' - unclosed_elements.append('b') - elif k == 'fontStyle' and v == 'italic': - self._out += '<i>' - unclosed_elements.append('i') - elif k == 'textDecoration' and v == 'underline': - self._out += '<u>' - unclosed_elements.append('u') - if font: - self._out += '<font' + font + '>' - unclosed_elements.append('font') - applied_style = {} - if self._applied_styles: - applied_style.update(self._applied_styles[-1]) - applied_style.update(style) - self._applied_styles.append(applied_style) - self._unclosed_elements.append(unclosed_elements) - - def end(self, tag): - if tag not in (_x('ttml:br'), 'br'): - unclosed_elements = self._unclosed_elements.pop() - for element in reversed(unclosed_elements): - self._out += '</%s>' % element - if unclosed_elements and self._applied_styles: - self._applied_styles.pop() - - def data(self, data): - self._out += data - - def close(self): - return self._out.strip() - - def parse_node(node): - target = TTMLPElementParser() - parser = xml.etree.ElementTree.XMLParser(target=target) - parser.feed(xml.etree.ElementTree.tostring(node)) - return parser.close() - - for k, v in LEGACY_NAMESPACES: - for ns in v: - dfxp_data = dfxp_data.replace(ns, k) - - dfxp = compat_etree_fromstring(dfxp_data) - out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') - - if not paras: - raise ValueError('Invalid dfxp/TTML subtitle') - - repeat = False - while True: - for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') or style.get(_x('xml:id')) - if not style_id: - continue - parent_style_id = style.get('style') - if parent_style_id: - if parent_style_id not in styles: - repeat = True - continue - styles[style_id] = styles[parent_style_id].copy() - for prop in SUPPORTED_STYLING: - prop_val = style.get(_x('tts:' + prop)) - if prop_val: - styles.setdefault(style_id, {})[prop] = prop_val - if repeat: - repeat = False - else: - break - - for p in ('body', 'div'): - ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) - if ele is None: - continue - style = styles.get(ele.get('style')) - if not style: - continue - default_style.update(style) - - for para, index in zip(paras, itertools.count(1)): - begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) - end_time = parse_dfxp_time_expr(para.attrib.get('end')) - dur = parse_dfxp_time_expr(para.attrib.get('dur')) - if begin_time is None: - continue - if not end_time: - if not dur: - continue - end_time = begin_time + dur - out.append('%d\n%s --> %s\n%s\n\n' % ( - index, - srt_subtitles_timecode(begin_time), - srt_subtitles_timecode(end_time), - parse_node(para))) - - return ''.join(out) - - -def cli_option(params, command_option, param): - param = params.get(param) - if param: - param = compat_str(param) - return [command_option, param] if param is not None else [] - - -def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): - param = params.get(param) - if param is None: - return [] - assert isinstance(param, bool) - if separator: - return [command_option + separator + (true_value if param else false_value)] - return [command_option, true_value if param else false_value] - - -def 
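How the cli_*_option helpers above translate params into external-tool flags (the flag names here are hypothetical):

    params = {'nocheckcertificate': True}
    cli_bool_option(params, '--no-check-certificate', 'nocheckcertificate')
    # -> ['--no-check-certificate', 'true']
    cli_bool_option(params, '--ssl-verify', 'nocheckcertificate', 'false', 'true', '=')
    # -> ['--ssl-verify=false']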
cli_valueless_option(params, command_option, param, expected_value=True): - param = params.get(param) - return [command_option] if param == expected_value else [] - - -def cli_configuration_args(params, param, default=[]): - ex_args = params.get(param) - if ex_args is None: - return default - assert isinstance(ex_args, list) - return ex_args - - -class ISO639Utils(object): - # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt - _lang_map = { - 'aa': 'aar', - 'ab': 'abk', - 'ae': 'ave', - 'af': 'afr', - 'ak': 'aka', - 'am': 'amh', - 'an': 'arg', - 'ar': 'ara', - 'as': 'asm', - 'av': 'ava', - 'ay': 'aym', - 'az': 'aze', - 'ba': 'bak', - 'be': 'bel', - 'bg': 'bul', - 'bh': 'bih', - 'bi': 'bis', - 'bm': 'bam', - 'bn': 'ben', - 'bo': 'bod', - 'br': 'bre', - 'bs': 'bos', - 'ca': 'cat', - 'ce': 'che', - 'ch': 'cha', - 'co': 'cos', - 'cr': 'cre', - 'cs': 'ces', - 'cu': 'chu', - 'cv': 'chv', - 'cy': 'cym', - 'da': 'dan', - 'de': 'deu', - 'dv': 'div', - 'dz': 'dzo', - 'ee': 'ewe', - 'el': 'ell', - 'en': 'eng', - 'eo': 'epo', - 'es': 'spa', - 'et': 'est', - 'eu': 'eus', - 'fa': 'fas', - 'ff': 'ful', - 'fi': 'fin', - 'fj': 'fij', - 'fo': 'fao', - 'fr': 'fra', - 'fy': 'fry', - 'ga': 'gle', - 'gd': 'gla', - 'gl': 'glg', - 'gn': 'grn', - 'gu': 'guj', - 'gv': 'glv', - 'ha': 'hau', - 'he': 'heb', - 'iw': 'heb', # Replaced by he in 1989 revision - 'hi': 'hin', - 'ho': 'hmo', - 'hr': 'hrv', - 'ht': 'hat', - 'hu': 'hun', - 'hy': 'hye', - 'hz': 'her', - 'ia': 'ina', - 'id': 'ind', - 'in': 'ind', # Replaced by id in 1989 revision - 'ie': 'ile', - 'ig': 'ibo', - 'ii': 'iii', - 'ik': 'ipk', - 'io': 'ido', - 'is': 'isl', - 'it': 'ita', - 'iu': 'iku', - 'ja': 'jpn', - 'jv': 'jav', - 'ka': 'kat', - 'kg': 'kon', - 'ki': 'kik', - 'kj': 'kua', - 'kk': 'kaz', - 'kl': 'kal', - 'km': 'khm', - 'kn': 'kan', - 'ko': 'kor', - 'kr': 'kau', - 'ks': 'kas', - 'ku': 'kur', - 'kv': 'kom', - 'kw': 'cor', - 'ky': 'kir', - 'la': 'lat', - 'lb': 'ltz', - 'lg': 'lug', - 'li': 'lim', - 'ln': 'lin', - 'lo': 'lao', - 'lt': 'lit', - 'lu': 'lub', - 'lv': 'lav', - 'mg': 'mlg', - 'mh': 'mah', - 'mi': 'mri', - 'mk': 'mkd', - 'ml': 'mal', - 'mn': 'mon', - 'mr': 'mar', - 'ms': 'msa', - 'mt': 'mlt', - 'my': 'mya', - 'na': 'nau', - 'nb': 'nob', - 'nd': 'nde', - 'ne': 'nep', - 'ng': 'ndo', - 'nl': 'nld', - 'nn': 'nno', - 'no': 'nor', - 'nr': 'nbl', - 'nv': 'nav', - 'ny': 'nya', - 'oc': 'oci', - 'oj': 'oji', - 'om': 'orm', - 'or': 'ori', - 'os': 'oss', - 'pa': 'pan', - 'pi': 'pli', - 'pl': 'pol', - 'ps': 'pus', - 'pt': 'por', - 'qu': 'que', - 'rm': 'roh', - 'rn': 'run', - 'ro': 'ron', - 'ru': 'rus', - 'rw': 'kin', - 'sa': 'san', - 'sc': 'srd', - 'sd': 'snd', - 'se': 'sme', - 'sg': 'sag', - 'si': 'sin', - 'sk': 'slk', - 'sl': 'slv', - 'sm': 'smo', - 'sn': 'sna', - 'so': 'som', - 'sq': 'sqi', - 'sr': 'srp', - 'ss': 'ssw', - 'st': 'sot', - 'su': 'sun', - 'sv': 'swe', - 'sw': 'swa', - 'ta': 'tam', - 'te': 'tel', - 'tg': 'tgk', - 'th': 'tha', - 'ti': 'tir', - 'tk': 'tuk', - 'tl': 'tgl', - 'tn': 'tsn', - 'to': 'ton', - 'tr': 'tur', - 'ts': 'tso', - 'tt': 'tat', - 'tw': 'twi', - 'ty': 'tah', - 'ug': 'uig', - 'uk': 'ukr', - 'ur': 'urd', - 'uz': 'uzb', - 've': 'ven', - 'vi': 'vie', - 'vo': 'vol', - 'wa': 'wln', - 'wo': 'wol', - 'xh': 'xho', - 'yi': 'yid', - 'ji': 'yid', # Replaced by yi in 1989 revision - 'yo': 'yor', - 'za': 'zha', - 'zh': 'zho', - 'zu': 'zul', - } - - @classmethod - def short2long(cls, code): - """Convert language code from ISO 639-1 to ISO 639-2/T""" - return cls._lang_map.get(code[:2]) - - @classmethod - def long2short(cls, 
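Usage sketch for ISO639Utils (long2short completes just below this point in the diff):

    ISO639Utils.short2long('en')   # -> 'eng'
    ISO639Utils.long2short('deu')  # -> 'de'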
code): - """Convert language code from ISO 639-2/T to ISO 639-1""" - for short_name, long_name in cls._lang_map.items(): - if long_name == code: - return short_name - - -class ISO3166Utils(object): - # From http://data.okfn.org/data/core/country-list - _country_map = { - 'AF': 'Afghanistan', - 'AX': 'Åland Islands', - 'AL': 'Albania', - 'DZ': 'Algeria', - 'AS': 'American Samoa', - 'AD': 'Andorra', - 'AO': 'Angola', - 'AI': 'Anguilla', - 'AQ': 'Antarctica', - 'AG': 'Antigua and Barbuda', - 'AR': 'Argentina', - 'AM': 'Armenia', - 'AW': 'Aruba', - 'AU': 'Australia', - 'AT': 'Austria', - 'AZ': 'Azerbaijan', - 'BS': 'Bahamas', - 'BH': 'Bahrain', - 'BD': 'Bangladesh', - 'BB': 'Barbados', - 'BY': 'Belarus', - 'BE': 'Belgium', - 'BZ': 'Belize', - 'BJ': 'Benin', - 'BM': 'Bermuda', - 'BT': 'Bhutan', - 'BO': 'Bolivia, Plurinational State of', - 'BQ': 'Bonaire, Sint Eustatius and Saba', - 'BA': 'Bosnia and Herzegovina', - 'BW': 'Botswana', - 'BV': 'Bouvet Island', - 'BR': 'Brazil', - 'IO': 'British Indian Ocean Territory', - 'BN': 'Brunei Darussalam', - 'BG': 'Bulgaria', - 'BF': 'Burkina Faso', - 'BI': 'Burundi', - 'KH': 'Cambodia', - 'CM': 'Cameroon', - 'CA': 'Canada', - 'CV': 'Cape Verde', - 'KY': 'Cayman Islands', - 'CF': 'Central African Republic', - 'TD': 'Chad', - 'CL': 'Chile', - 'CN': 'China', - 'CX': 'Christmas Island', - 'CC': 'Cocos (Keeling) Islands', - 'CO': 'Colombia', - 'KM': 'Comoros', - 'CG': 'Congo', - 'CD': 'Congo, the Democratic Republic of the', - 'CK': 'Cook Islands', - 'CR': 'Costa Rica', - 'CI': 'Côte d\'Ivoire', - 'HR': 'Croatia', - 'CU': 'Cuba', - 'CW': 'Curaçao', - 'CY': 'Cyprus', - 'CZ': 'Czech Republic', - 'DK': 'Denmark', - 'DJ': 'Djibouti', - 'DM': 'Dominica', - 'DO': 'Dominican Republic', - 'EC': 'Ecuador', - 'EG': 'Egypt', - 'SV': 'El Salvador', - 'GQ': 'Equatorial Guinea', - 'ER': 'Eritrea', - 'EE': 'Estonia', - 'ET': 'Ethiopia', - 'FK': 'Falkland Islands (Malvinas)', - 'FO': 'Faroe Islands', - 'FJ': 'Fiji', - 'FI': 'Finland', - 'FR': 'France', - 'GF': 'French Guiana', - 'PF': 'French Polynesia', - 'TF': 'French Southern Territories', - 'GA': 'Gabon', - 'GM': 'Gambia', - 'GE': 'Georgia', - 'DE': 'Germany', - 'GH': 'Ghana', - 'GI': 'Gibraltar', - 'GR': 'Greece', - 'GL': 'Greenland', - 'GD': 'Grenada', - 'GP': 'Guadeloupe', - 'GU': 'Guam', - 'GT': 'Guatemala', - 'GG': 'Guernsey', - 'GN': 'Guinea', - 'GW': 'Guinea-Bissau', - 'GY': 'Guyana', - 'HT': 'Haiti', - 'HM': 'Heard Island and McDonald Islands', - 'VA': 'Holy See (Vatican City State)', - 'HN': 'Honduras', - 'HK': 'Hong Kong', - 'HU': 'Hungary', - 'IS': 'Iceland', - 'IN': 'India', - 'ID': 'Indonesia', - 'IR': 'Iran, Islamic Republic of', - 'IQ': 'Iraq', - 'IE': 'Ireland', - 'IM': 'Isle of Man', - 'IL': 'Israel', - 'IT': 'Italy', - 'JM': 'Jamaica', - 'JP': 'Japan', - 'JE': 'Jersey', - 'JO': 'Jordan', - 'KZ': 'Kazakhstan', - 'KE': 'Kenya', - 'KI': 'Kiribati', - 'KP': 'Korea, Democratic People\'s Republic of', - 'KR': 'Korea, Republic of', - 'KW': 'Kuwait', - 'KG': 'Kyrgyzstan', - 'LA': 'Lao People\'s Democratic Republic', - 'LV': 'Latvia', - 'LB': 'Lebanon', - 'LS': 'Lesotho', - 'LR': 'Liberia', - 'LY': 'Libya', - 'LI': 'Liechtenstein', - 'LT': 'Lithuania', - 'LU': 'Luxembourg', - 'MO': 'Macao', - 'MK': 'Macedonia, the Former Yugoslav Republic of', - 'MG': 'Madagascar', - 'MW': 'Malawi', - 'MY': 'Malaysia', - 'MV': 'Maldives', - 'ML': 'Mali', - 'MT': 'Malta', - 'MH': 'Marshall Islands', - 'MQ': 'Martinique', - 'MR': 'Mauritania', - 'MU': 'Mauritius', - 'YT': 'Mayotte', - 'MX': 'Mexico', - 'FM': 'Micronesia, Federated 
States of', - 'MD': 'Moldova, Republic of', - 'MC': 'Monaco', - 'MN': 'Mongolia', - 'ME': 'Montenegro', - 'MS': 'Montserrat', - 'MA': 'Morocco', - 'MZ': 'Mozambique', - 'MM': 'Myanmar', - 'NA': 'Namibia', - 'NR': 'Nauru', - 'NP': 'Nepal', - 'NL': 'Netherlands', - 'NC': 'New Caledonia', - 'NZ': 'New Zealand', - 'NI': 'Nicaragua', - 'NE': 'Niger', - 'NG': 'Nigeria', - 'NU': 'Niue', - 'NF': 'Norfolk Island', - 'MP': 'Northern Mariana Islands', - 'NO': 'Norway', - 'OM': 'Oman', - 'PK': 'Pakistan', - 'PW': 'Palau', - 'PS': 'Palestine, State of', - 'PA': 'Panama', - 'PG': 'Papua New Guinea', - 'PY': 'Paraguay', - 'PE': 'Peru', - 'PH': 'Philippines', - 'PN': 'Pitcairn', - 'PL': 'Poland', - 'PT': 'Portugal', - 'PR': 'Puerto Rico', - 'QA': 'Qatar', - 'RE': 'Réunion', - 'RO': 'Romania', - 'RU': 'Russian Federation', - 'RW': 'Rwanda', - 'BL': 'Saint Barthélemy', - 'SH': 'Saint Helena, Ascension and Tristan da Cunha', - 'KN': 'Saint Kitts and Nevis', - 'LC': 'Saint Lucia', - 'MF': 'Saint Martin (French part)', - 'PM': 'Saint Pierre and Miquelon', - 'VC': 'Saint Vincent and the Grenadines', - 'WS': 'Samoa', - 'SM': 'San Marino', - 'ST': 'Sao Tome and Principe', - 'SA': 'Saudi Arabia', - 'SN': 'Senegal', - 'RS': 'Serbia', - 'SC': 'Seychelles', - 'SL': 'Sierra Leone', - 'SG': 'Singapore', - 'SX': 'Sint Maarten (Dutch part)', - 'SK': 'Slovakia', - 'SI': 'Slovenia', - 'SB': 'Solomon Islands', - 'SO': 'Somalia', - 'ZA': 'South Africa', - 'GS': 'South Georgia and the South Sandwich Islands', - 'SS': 'South Sudan', - 'ES': 'Spain', - 'LK': 'Sri Lanka', - 'SD': 'Sudan', - 'SR': 'Suriname', - 'SJ': 'Svalbard and Jan Mayen', - 'SZ': 'Swaziland', - 'SE': 'Sweden', - 'CH': 'Switzerland', - 'SY': 'Syrian Arab Republic', - 'TW': 'Taiwan, Province of China', - 'TJ': 'Tajikistan', - 'TZ': 'Tanzania, United Republic of', - 'TH': 'Thailand', - 'TL': 'Timor-Leste', - 'TG': 'Togo', - 'TK': 'Tokelau', - 'TO': 'Tonga', - 'TT': 'Trinidad and Tobago', - 'TN': 'Tunisia', - 'TR': 'Turkey', - 'TM': 'Turkmenistan', - 'TC': 'Turks and Caicos Islands', - 'TV': 'Tuvalu', - 'UG': 'Uganda', - 'UA': 'Ukraine', - 'AE': 'United Arab Emirates', - 'GB': 'United Kingdom', - 'US': 'United States', - 'UM': 'United States Minor Outlying Islands', - 'UY': 'Uruguay', - 'UZ': 'Uzbekistan', - 'VU': 'Vanuatu', - 'VE': 'Venezuela, Bolivarian Republic of', - 'VN': 'Viet Nam', - 'VG': 'Virgin Islands, British', - 'VI': 'Virgin Islands, U.S.', - 'WF': 'Wallis and Futuna', - 'EH': 'Western Sahara', - 'YE': 'Yemen', - 'ZM': 'Zambia', - 'ZW': 'Zimbabwe', - } - - @classmethod - def short2full(cls, code): - """Convert an ISO 3166-2 country code to the corresponding full name""" - return cls._country_map.get(code.upper()) - - -class GeoUtils(object): - # Major IPv4 address blocks per country - _country_ip_map = { - 'AD': '46.172.224.0/19', - 'AE': '94.200.0.0/13', - 'AF': '149.54.0.0/17', - 'AG': '209.59.64.0/18', - 'AI': '204.14.248.0/21', - 'AL': '46.99.0.0/16', - 'AM': '46.70.0.0/15', - 'AO': '105.168.0.0/13', - 'AP': '182.50.184.0/21', - 'AQ': '23.154.160.0/24', - 'AR': '181.0.0.0/12', - 'AS': '202.70.112.0/20', - 'AT': '77.116.0.0/14', - 'AU': '1.128.0.0/11', - 'AW': '181.41.0.0/18', - 'AX': '185.217.4.0/22', - 'AZ': '5.197.0.0/16', - 'BA': '31.176.128.0/17', - 'BB': '65.48.128.0/17', - 'BD': '114.130.0.0/16', - 'BE': '57.0.0.0/8', - 'BF': '102.178.0.0/15', - 'BG': '95.42.0.0/15', - 'BH': '37.131.0.0/17', - 'BI': '154.117.192.0/18', - 'BJ': '137.255.0.0/16', - 'BL': '185.212.72.0/23', - 'BM': '196.12.64.0/18', - 'BN': '156.31.0.0/16', - 'BO': 
'161.56.0.0/16', - 'BQ': '161.0.80.0/20', - 'BR': '191.128.0.0/12', - 'BS': '24.51.64.0/18', - 'BT': '119.2.96.0/19', - 'BW': '168.167.0.0/16', - 'BY': '178.120.0.0/13', - 'BZ': '179.42.192.0/18', - 'CA': '99.224.0.0/11', - 'CD': '41.243.0.0/16', - 'CF': '197.242.176.0/21', - 'CG': '160.113.0.0/16', - 'CH': '85.0.0.0/13', - 'CI': '102.136.0.0/14', - 'CK': '202.65.32.0/19', - 'CL': '152.172.0.0/14', - 'CM': '102.244.0.0/14', - 'CN': '36.128.0.0/10', - 'CO': '181.240.0.0/12', - 'CR': '201.192.0.0/12', - 'CU': '152.206.0.0/15', - 'CV': '165.90.96.0/19', - 'CW': '190.88.128.0/17', - 'CY': '31.153.0.0/16', - 'CZ': '88.100.0.0/14', - 'DE': '53.0.0.0/8', - 'DJ': '197.241.0.0/17', - 'DK': '87.48.0.0/12', - 'DM': '192.243.48.0/20', - 'DO': '152.166.0.0/15', - 'DZ': '41.96.0.0/12', - 'EC': '186.68.0.0/15', - 'EE': '90.190.0.0/15', - 'EG': '156.160.0.0/11', - 'ER': '196.200.96.0/20', - 'ES': '88.0.0.0/11', - 'ET': '196.188.0.0/14', - 'EU': '2.16.0.0/13', - 'FI': '91.152.0.0/13', - 'FJ': '144.120.0.0/16', - 'FK': '80.73.208.0/21', - 'FM': '119.252.112.0/20', - 'FO': '88.85.32.0/19', - 'FR': '90.0.0.0/9', - 'GA': '41.158.0.0/15', - 'GB': '25.0.0.0/8', - 'GD': '74.122.88.0/21', - 'GE': '31.146.0.0/16', - 'GF': '161.22.64.0/18', - 'GG': '62.68.160.0/19', - 'GH': '154.160.0.0/12', - 'GI': '95.164.0.0/16', - 'GL': '88.83.0.0/19', - 'GM': '160.182.0.0/15', - 'GN': '197.149.192.0/18', - 'GP': '104.250.0.0/19', - 'GQ': '105.235.224.0/20', - 'GR': '94.64.0.0/13', - 'GT': '168.234.0.0/16', - 'GU': '168.123.0.0/16', - 'GW': '197.214.80.0/20', - 'GY': '181.41.64.0/18', - 'HK': '113.252.0.0/14', - 'HN': '181.210.0.0/16', - 'HR': '93.136.0.0/13', - 'HT': '148.102.128.0/17', - 'HU': '84.0.0.0/14', - 'ID': '39.192.0.0/10', - 'IE': '87.32.0.0/12', - 'IL': '79.176.0.0/13', - 'IM': '5.62.80.0/20', - 'IN': '117.192.0.0/10', - 'IO': '203.83.48.0/21', - 'IQ': '37.236.0.0/14', - 'IR': '2.176.0.0/12', - 'IS': '82.221.0.0/16', - 'IT': '79.0.0.0/10', - 'JE': '87.244.64.0/18', - 'JM': '72.27.0.0/17', - 'JO': '176.29.0.0/16', - 'JP': '133.0.0.0/8', - 'KE': '105.48.0.0/12', - 'KG': '158.181.128.0/17', - 'KH': '36.37.128.0/17', - 'KI': '103.25.140.0/22', - 'KM': '197.255.224.0/20', - 'KN': '198.167.192.0/19', - 'KP': '175.45.176.0/22', - 'KR': '175.192.0.0/10', - 'KW': '37.36.0.0/14', - 'KY': '64.96.0.0/15', - 'KZ': '2.72.0.0/13', - 'LA': '115.84.64.0/18', - 'LB': '178.135.0.0/16', - 'LC': '24.92.144.0/20', - 'LI': '82.117.0.0/19', - 'LK': '112.134.0.0/15', - 'LR': '102.183.0.0/16', - 'LS': '129.232.0.0/17', - 'LT': '78.56.0.0/13', - 'LU': '188.42.0.0/16', - 'LV': '46.109.0.0/16', - 'LY': '41.252.0.0/14', - 'MA': '105.128.0.0/11', - 'MC': '88.209.64.0/18', - 'MD': '37.246.0.0/16', - 'ME': '178.175.0.0/17', - 'MF': '74.112.232.0/21', - 'MG': '154.126.0.0/17', - 'MH': '117.103.88.0/21', - 'MK': '77.28.0.0/15', - 'ML': '154.118.128.0/18', - 'MM': '37.111.0.0/17', - 'MN': '49.0.128.0/17', - 'MO': '60.246.0.0/16', - 'MP': '202.88.64.0/20', - 'MQ': '109.203.224.0/19', - 'MR': '41.188.64.0/18', - 'MS': '208.90.112.0/22', - 'MT': '46.11.0.0/16', - 'MU': '105.16.0.0/12', - 'MV': '27.114.128.0/18', - 'MW': '102.70.0.0/15', - 'MX': '187.192.0.0/11', - 'MY': '175.136.0.0/13', - 'MZ': '197.218.0.0/15', - 'NA': '41.182.0.0/16', - 'NC': '101.101.0.0/18', - 'NE': '197.214.0.0/18', - 'NF': '203.17.240.0/22', - 'NG': '105.112.0.0/12', - 'NI': '186.76.0.0/15', - 'NL': '145.96.0.0/11', - 'NO': '84.208.0.0/13', - 'NP': '36.252.0.0/15', - 'NR': '203.98.224.0/19', - 'NU': '49.156.48.0/22', - 'NZ': '49.224.0.0/14', - 'OM': '5.36.0.0/15', - 'PA': 
'186.72.0.0/15', - 'PE': '186.160.0.0/14', - 'PF': '123.50.64.0/18', - 'PG': '124.240.192.0/19', - 'PH': '49.144.0.0/13', - 'PK': '39.32.0.0/11', - 'PL': '83.0.0.0/11', - 'PM': '70.36.0.0/20', - 'PR': '66.50.0.0/16', - 'PS': '188.161.0.0/16', - 'PT': '85.240.0.0/13', - 'PW': '202.124.224.0/20', - 'PY': '181.120.0.0/14', - 'QA': '37.210.0.0/15', - 'RE': '102.35.0.0/16', - 'RO': '79.112.0.0/13', - 'RS': '93.86.0.0/15', - 'RU': '5.136.0.0/13', - 'RW': '41.186.0.0/16', - 'SA': '188.48.0.0/13', - 'SB': '202.1.160.0/19', - 'SC': '154.192.0.0/11', - 'SD': '102.120.0.0/13', - 'SE': '78.64.0.0/12', - 'SG': '8.128.0.0/10', - 'SI': '188.196.0.0/14', - 'SK': '78.98.0.0/15', - 'SL': '102.143.0.0/17', - 'SM': '89.186.32.0/19', - 'SN': '41.82.0.0/15', - 'SO': '154.115.192.0/18', - 'SR': '186.179.128.0/17', - 'SS': '105.235.208.0/21', - 'ST': '197.159.160.0/19', - 'SV': '168.243.0.0/16', - 'SX': '190.102.0.0/20', - 'SY': '5.0.0.0/16', - 'SZ': '41.84.224.0/19', - 'TC': '65.255.48.0/20', - 'TD': '154.68.128.0/19', - 'TG': '196.168.0.0/14', - 'TH': '171.96.0.0/13', - 'TJ': '85.9.128.0/18', - 'TK': '27.96.24.0/21', - 'TL': '180.189.160.0/20', - 'TM': '95.85.96.0/19', - 'TN': '197.0.0.0/11', - 'TO': '175.176.144.0/21', - 'TR': '78.160.0.0/11', - 'TT': '186.44.0.0/15', - 'TV': '202.2.96.0/19', - 'TW': '120.96.0.0/11', - 'TZ': '156.156.0.0/14', - 'UA': '37.52.0.0/14', - 'UG': '102.80.0.0/13', - 'US': '6.0.0.0/8', - 'UY': '167.56.0.0/13', - 'UZ': '84.54.64.0/18', - 'VA': '212.77.0.0/19', - 'VC': '207.191.240.0/21', - 'VE': '186.88.0.0/13', - 'VG': '66.81.192.0/20', - 'VI': '146.226.0.0/16', - 'VN': '14.160.0.0/11', - 'VU': '202.80.32.0/20', - 'WF': '117.20.32.0/21', - 'WS': '202.4.32.0/19', - 'YE': '134.35.0.0/16', - 'YT': '41.242.116.0/22', - 'ZA': '41.0.0.0/11', - 'ZM': '102.144.0.0/13', - 'ZW': '102.177.192.0/18', - } - - @classmethod - def random_ipv4(cls, code_or_block): - if len(code_or_block) == 2: - block = cls._country_ip_map.get(code_or_block.upper()) - if not block: - return None - else: - block = code_or_block - addr, preflen = block.split('/') - addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] - addr_max = addr_min | (0xffffffff >> int(preflen)) - return compat_str(socket.inet_ntoa( - compat_struct_pack('!L', random.randint(addr_min, addr_max)))) - - -class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): - def __init__(self, proxies=None): - # Set default handlers - for type in ('http', 'https'): - setattr(self, '%s_open' % type, - lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: - meth(r, proxy, type)) - compat_urllib_request.ProxyHandler.__init__(self, proxies) - - def proxy_open(self, req, proxy, type): - req_proxy = req.headers.get('Ytdl-request-proxy') - if req_proxy is not None: - proxy = req_proxy - del req.headers['Ytdl-request-proxy'] - - if proxy == '__noproxy__': - return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): - req.add_header('Ytdl-socks-proxy', proxy) - # youtube-dl's http/https handlers do wrapping the socket with socks - return None - return compat_urllib_request.ProxyHandler.proxy_open( - self, req, proxy, type) - - -# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is -# released into Public Domain -# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 - -def long_to_bytes(n, blocksize=0): - """long_to_bytes(n:long, blocksize:int) : string - Convert a long integer to a byte string. 
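The CIDR arithmetic in GeoUtils.random_ipv4 above, worked through for one lookup (the returned address is random):

    # 'US' -> '6.0.0.0/8': addr_min = 0x06000000 and
    # addr_max = addr_min | (0xffffffff >> 8), i.e. 6.255.255.255
    GeoUtils.random_ipv4('US')  # e.g. '6.93.218.4'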
- - If optional blocksize is given and greater than zero, pad the front of the - byte string with binary zeros so that the length is a multiple of - blocksize. - """ - # after much testing, this algorithm was deemed to be the fastest - s = b'' - n = int(n) - while n > 0: - s = compat_struct_pack('>I', n & 0xffffffff) + s - n = n >> 32 - # strip off leading zeros - for i in range(len(s)): - if s[i] != b'\000'[0]: - break - else: - # only happens when n == 0 - s = b'\000' - i = 0 - s = s[i:] - # add back some pad bytes. this could be done more efficiently w.r.t. the - # de-padding being done above, but sigh... - if blocksize > 0 and len(s) % blocksize: - s = (blocksize - len(s) % blocksize) * b'\000' + s - return s - - -def bytes_to_long(s): - """bytes_to_long(string) : long - Convert a byte string to a long integer. - - This is (essentially) the inverse of long_to_bytes(). - """ - acc = 0 - length = len(s) - if length % 4: - extra = (4 - length % 4) - s = b'\000' * extra + s - length = length + extra - for i in range(0, length, 4): - acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] - return acc - - -def ohdave_rsa_encrypt(data, exponent, modulus): - ''' - Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ - - Input: - data: data to encrypt, bytes-like object - exponent, modulus: parameter e and N of RSA algorithm, both integer - Output: hex string of encrypted data - - Limitation: supports one block encryption only - ''' - - payload = int(binascii.hexlify(data[::-1]), 16) - encrypted = pow(payload, exponent, modulus) - return '%x' % encrypted - - -def pkcs1pad(data, length): - """ - Padding input data with PKCS#1 scheme - - @param {int[]} data input data - @param {int} length target length - @returns {int[]} padded data - """ - if len(data) > length - 11: - raise ValueError('Input data too long for PKCS#1 padding') - - pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] - return [0, 2] + pseudo_random + [0] + data - - -def encode_base_n(num, n, table=None): - FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - if not table: - table = FULL_TABLE[:n] - - if n > len(table): - raise ValueError('base %d exceeds table length %d' % (n, len(table))) - - if num == 0: - return table[0] - - ret = '' - while num: - ret = table[num % n] + ret - num = num // n - return ret - - -def decode_packed_codes(code): - mobj = re.search(PACKED_CODES_RE, code) - obfuscated_code, base, count, symbols = mobj.groups() - base = int(base) - count = int(count) - symbols = symbols.split('|') - symbol_table = {} - - while count: - count -= 1 - base_n_count = encode_base_n(count, base) - symbol_table[base_n_count] = symbols[count] or base_n_count - - return re.sub( - r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfuscated_code) - - -def caesar(s, alphabet, shift): - if shift == 0: - return s - l = len(alphabet) - return ''.join( - alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c - for c in s) - - -def rot47(s): - return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) - - -def parse_m3u8_attributes(attrib): - info = {} - for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): - if val.startswith('"'): - val = val[1:-1] - info[key] = val - return info - - -def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - - -# Based on png2str() written by @gdkchan and improved by @yokrysty -# 
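Quick sanity checks for the number/string helpers above (values chosen for illustration):

    encode_base_n(255, 16)  # -> 'ff'
    bytes_to_long(b'abc')   # -> 6382179 (0x616263)
    long_to_bytes(6382179)  # -> b'abc' (inverse round trip)
    rot47('nob')            # -> '?@3'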
Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 -def decode_png(png_data): - # Reference: https://www.w3.org/TR/PNG/ - header = png_data[8:] - - if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': - raise IOError('Not a valid PNG file.') - - int_map = {1: '>B', 2: '>H', 4: '>I'} - unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] - - chunks = [] - - while header: - length = unpack_integer(header[:4]) - header = header[4:] - - chunk_type = header[:4] - header = header[4:] - - chunk_data = header[:length] - header = header[length:] - - header = header[4:] # Skip CRC - - chunks.append({ - 'type': chunk_type, - 'length': length, - 'data': chunk_data - }) - - ihdr = chunks[0]['data'] - - width = unpack_integer(ihdr[:4]) - height = unpack_integer(ihdr[4:8]) - - idat = b'' - - for chunk in chunks: - if chunk['type'] == b'IDAT': - idat += chunk['data'] - - if not idat: - raise IOError('Unable to read PNG data.') - - decompressed_data = bytearray(zlib.decompress(idat)) - - stride = width * 3 - pixels = [] - - def _get_pixel(idx): - x = idx % stride - y = idx // stride - return pixels[y][x] - - for y in range(height): - basePos = y * (1 + stride) - filter_type = decompressed_data[basePos] - - current_row = [] - - pixels.append(current_row) - - for x in range(stride): - color = decompressed_data[1 + basePos + x] - basex = y * stride + x - left = 0 - up = 0 - - if x > 2: - left = _get_pixel(basex - 3) - if y > 0: - up = _get_pixel(basex - stride) - - if filter_type == 1: # Sub - color = (color + left) & 0xff - elif filter_type == 2: # Up - color = (color + up) & 0xff - elif filter_type == 3: # Average - color = (color + ((left + up) >> 1)) & 0xff - elif filter_type == 4: # Paeth - a = left - b = up - c = 0 - - if x > 2 and y > 0: - c = _get_pixel(basex - stride - 3) - - p = a + b - c - - pa = abs(p - a) - pb = abs(p - b) - pc = abs(p - c) - - if pa <= pb and pa <= pc: - color = (color + a) & 0xff - elif pb <= pc: - color = (color + b) & 0xff - else: - color = (color + c) & 0xff - - current_row.append(color) - - return width, height, pixels - - -def write_xattr(path, key, value): - # This mess below finds the best xattr tool for the job - try: - # try the pyxattr module... - import xattr - - if hasattr(xattr, 'set'): # pyxattr - # Unicode arguments are not supported in python-pyxattr until - # version 0.5.0 - # See https://github.com/ytdl-org/youtube-dl/issues/5498 - pyxattr_required_version = '0.5.0' - if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): - # TODO: fallback to CLI tools - raise XAttrUnavailableError( - 'python-pyxattr is detected but is too old. ' - 'youtube-dl requires %s or above while your version is %s. 
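The densest part of decode_png above is the Paeth case (filter type 4); the predictor it implements, restated standalone:

    def paeth_predict(a, b, c):
        # a = left, b = up, c = upper-left; pick the neighbour closest to a + b - c
        p = a + b - c
        pa, pb, pc = abs(p - a), abs(p - b), abs(p - c)
        if pa <= pb and pa <= pc:
            return a
        return b if pb <= pc else c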
' - 'Falling back to other xattr implementations' % ( - pyxattr_required_version, xattr.__version__)) - - setxattr = xattr.set - else: # xattr - setxattr = xattr.setxattr - - try: - setxattr(path, key, value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - - except ImportError: - if compat_os_name == 'nt': - # Write xattrs to NTFS Alternate Data Streams: - # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - assert ':' not in key - assert os.path.exists(path) - - ads_fn = path + ':' + key - try: - with open(ads_fn, 'wb') as f: - f.write(value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - else: - user_has_setfattr = check_executable('setfattr', ['--version']) - user_has_xattr = check_executable('xattr', ['-h']) - - if user_has_setfattr or user_has_xattr: - - value = value.decode('utf-8') - if user_has_setfattr: - executable = 'setfattr' - opts = ['-n', key, '-v', value] - elif user_has_xattr: - executable = 'xattr' - opts = ['-w', key, value] - - cmd = ([encodeFilename(executable, True)] - + [encodeArgument(o) for o in opts] - + [encodeFilename(path, True)]) - - try: - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate() - stderr = stderr.decode('utf-8', 'replace') - if p.returncode != 0: - raise XAttrMetadataError(p.returncode, stderr) - - else: - # On Unix, and can't find pyxattr, setfattr, or xattr. - if sys.platform.startswith('linux'): - raise XAttrUnavailableError( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'pyxattr' or 'xattr' " - "modules, or the GNU 'attr' package " - "(which contains the 'setfattr' tool).") - else: - raise XAttrUnavailableError( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'xattr' module, " - "or the 'xattr' binary.") - - -def random_birthday(year_field, month_field, day_field): - start_date = datetime.date(1950, 1, 1) - end_date = datetime.date(1995, 12, 31) - offset = random.randint(0, (end_date - start_date).days) - random_date = start_date + datetime.timedelta(offset) - return { - year_field: str(random_date.year), - month_field: str(random_date.month), - day_field: str(random_date.day), - } - - -def clean_podcast_url(url): - return re.sub(r'''(?x) - (?: - (?: - chtbl\.com/track| - media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ - play\.podtrac\.com - )/[^/]+| - (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure - flex\.acast\.com| - pd(?: - cn\.co| # https://podcorn.com/analytics-prefix/ - st\.fm # https://podsights.com/docs/ - )/e - )/''', '', url) diff --git a/youtube_dl/version.py b/youtube_dl/version.py deleted file mode 100644 index 461dd87ca..000000000 --- a/youtube_dl/version.py +++ /dev/null @@ -1,3 +0,0 @@ -from __future__ import unicode_literals - -__version__ = '2021.06.06' diff --git a/yt-dlp.cmd b/yt-dlp.cmd new file mode 100644 index 000000000..2b651a41e --- /dev/null +++ b/yt-dlp.cmd @@ -0,0 +1 @@ +@py "%~dp0yt_dlp\__main__.py" %*
\ No newline at end of file diff --git a/yt-dlp.sh b/yt-dlp.sh new file mode 100644 index 000000000..71a9aa163 --- /dev/null +++ b/yt-dlp.sh @@ -0,0 +1,2 @@ +#!/bin/sh +exec python3 "$(dirname "$(realpath "$0")")/yt_dlp/__main__.py" "$@" diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py new file mode 100644 index 000000000..4a7712cb6 --- /dev/null +++ b/yt_dlp/YoutubeDL.py @@ -0,0 +1,3552 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +from __future__ import absolute_import, unicode_literals + +import collections +import contextlib +import copy +import datetime +import errno +import fileinput +import functools +import io +import itertools +import json +import locale +import operator +import os +import platform +import re +import shutil +import subprocess +import sys +import tempfile +import time +import tokenize +import traceback +import random +import unicodedata + +from string import ascii_letters + +from .compat import ( + compat_basestring, + compat_get_terminal_size, + compat_kwargs, + compat_numeric_types, + compat_os_name, + compat_pycrypto_AES, + compat_shlex_quote, + compat_str, + compat_tokenize_tokenize, + compat_urllib_error, + compat_urllib_request, + compat_urllib_request_DataHandler, + windows_enable_vt_mode, +) +from .cookies import load_cookies +from .utils import ( + age_restricted, + args_to_str, + ContentTooShortError, + date_from_str, + DateRange, + DEFAULT_OUTTMPL, + determine_ext, + determine_protocol, + DOT_DESKTOP_LINK_TEMPLATE, + DOT_URL_LINK_TEMPLATE, + DOT_WEBLOC_LINK_TEMPLATE, + DownloadError, + encode_compat_str, + encodeFilename, + EntryNotInPlaylist, + error_to_compat_str, + ExistingVideoReached, + expand_path, + ExtractorError, + float_or_none, + format_bytes, + format_field, + formatSeconds, + GeoRestrictedError, + HEADRequest, + int_or_none, + iri_to_uri, + ISO3166Utils, + LazyList, + locked_file, + make_dir, + make_HTTPS_handler, + MaxDownloadsReached, + network_exceptions, + orderedSet, + OUTTMPL_TYPES, + PagedList, + parse_filesize, + PerRequestProxyHandler, + platform_name, + PostProcessingError, + preferredencoding, + prepend_extension, + process_communicate_or_kill, + register_socks_protocols, + RejectedVideoReached, + render_table, + replace_extension, + SameFileError, + sanitize_filename, + sanitize_path, + sanitize_url, + sanitized_Request, + std_headers, + STR_FORMAT_RE_TMPL, + STR_FORMAT_TYPES, + str_or_none, + strftime_or_none, + subtitles_filename, + supports_terminal_sequences, + TERMINAL_SEQUENCES, + ThrottledDownload, + to_high_limit_path, + traverse_obj, + try_get, + UnavailableVideoError, + url_basename, + variadic, + version_tuple, + write_json_file, + write_string, + YoutubeDLCookieProcessor, + YoutubeDLHandler, + YoutubeDLRedirectHandler, +) +from .cache import Cache +from .extractor import ( + gen_extractor_classes, + get_info_extractor, + _LAZY_LOADER, + _PLUGIN_CLASSES as plugin_extractors +) +from .extractor.openload import PhantomJSwrapper +from .downloader import ( + FFmpegFD, + get_suitable_downloader, + shorten_protocol_name +) +from .downloader.rtmp import rtmpdump_version +from .postprocessor import ( + get_postprocessor, + EmbedThumbnailPP, + FFmpegFixupDurationPP, + FFmpegFixupM3u8PP, + FFmpegFixupM4aPP, + FFmpegFixupStretchedPP, + FFmpegFixupTimestampPP, + FFmpegMergerPP, + FFmpegPostProcessor, + MoveFilesAfterDownloadPP, + _PLUGIN_CLASSES as plugin_postprocessors +) +from .update import detect_variant +from .version import __version__ + +if compat_os_name == 'nt': + import ctypes + + +class YoutubeDL(object): + 
"""YoutubeDL class. + + YoutubeDL objects are the ones responsible of downloading the + actual video file and writing it to disk if the user has requested + it, among some other tasks. In most cases there should be one per + program. As, given a video URL, the downloader doesn't know how to + extract all the needed information, task that InfoExtractors do, it + has to pass the URL to one of them. + + For this, YoutubeDL objects have a method that allows + InfoExtractors to be registered in a given order. When it is passed + a URL, the YoutubeDL object handles it to the first InfoExtractor it + finds that reports being able to handle it. The InfoExtractor extracts + all the information about the video or videos the URL refers to, and + YoutubeDL process the extracted information, possibly using a File + Downloader to download the video. + + YoutubeDL objects accept a lot of parameters. In order not to saturate + the object constructor with arguments, it receives a dictionary of + options instead. These options are available through the params + attribute for the InfoExtractors to use. The YoutubeDL also + registers itself as the downloader in charge for the InfoExtractors + that are added to it, so this is a "mutual registration". + + Available options: + + username: Username for authentication purposes. + password: Password for authentication purposes. + videopassword: Password for accessing a video. + ap_mso: Adobe Pass multiple-system operator identifier. + ap_username: Multiple-system operator account username. + ap_password: Multiple-system operator account password. + usenetrc: Use netrc for authentication instead. + verbose: Print additional info to stdout. + quiet: Do not print messages to stdout. + no_warnings: Do not print out anything for warnings. + forceprint: A list of templates to force print + forceurl: Force printing final URL. (Deprecated) + forcetitle: Force printing title. (Deprecated) + forceid: Force printing ID. (Deprecated) + forcethumbnail: Force printing thumbnail URL. (Deprecated) + forcedescription: Force printing description. (Deprecated) + forcefilename: Force printing final filename. (Deprecated) + forceduration: Force printing duration. (Deprecated) + forcejson: Force printing info_dict as JSON. + dump_single_json: Force printing the info_dict of the whole playlist + (or video) as a single JSON line. + force_write_download_archive: Force writing download archive regardless + of 'skip_download' or 'simulate'. + simulate: Do not download the video files. If unset (or None), + simulate only if listsubtitles, listformats or list_thumbnails is used + format: Video format code. see "FORMAT SELECTION" for more details. + allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded. + ignore_no_formats_error: Ignore "No video formats" error. Usefull for + extracting metadata even if the video is not actually + available for download (experimental) + format_sort: How to sort the video formats. see "Sorting Formats" + for more details. + format_sort_force: Force the given format_sort. see "Sorting Formats" + for more details. + allow_multiple_video_streams: Allow multiple video streams to be merged + into a single file + allow_multiple_audio_streams: Allow multiple audio streams to be merged + into a single file + check_formats Whether to test if the formats are downloadable. + Can be True (check all), False (check none) + or None (check only if requested by extractor) + paths: Dictionary of output paths. 
The allowed keys are 'home', + 'temp' and the keys of OUTTMPL_TYPES (in utils.py) + outtmpl: Dictionary of templates for output names. Allowed keys + are 'default' and the keys of OUTTMPL_TYPES (in utils.py). + For compatibility with youtube-dl, a single string can also be used + outtmpl_na_placeholder: Placeholder for unavailable meta fields. + restrictfilenames: Do not allow "&" and spaces in file names + trim_file_name: Limit length of filename (extension excluded) + windowsfilenames: Force the filenames to be Windows-compatible + ignoreerrors: Do not stop on download/postprocessing errors. + Can be 'only_download' to ignore only download errors. + Default is 'only_download' for CLI, but False for API + skip_playlist_after_errors: Number of allowed failures until the rest of + the playlist is skipped + force_generic_extractor: Force downloader to use the generic extractor + overwrites: Overwrite all video and metadata files if True, + overwrite only non-video files if None + and don't overwrite any file if False + For compatibility with youtube-dl, + "nooverwrites" may also be used instead + playliststart: Playlist item to start at. + playlistend: Playlist item to end at. + playlist_items: Specific indices of playlist to download. + playlistreverse: Download playlist items in reverse order. + playlistrandom: Download playlist items in random order. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. + logger: Log messages to a logging.Logger instance. + logtostderr: Log messages to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. + writedescription: Write the video description to a .description file + writeinfojson: Write the video metadata to a .info.json file + clean_infojson: Remove private fields from the infojson + getcomments: Extract video comments. This will not be written to disk + unless writeinfojson is also given + writeannotations: Write the video annotations to a .annotations.xml file + writethumbnail: Write the thumbnail image to a file + allow_playlist_files: Whether to write playlists' description, infojson, etc. + also to disk when using the 'write*' options + write_all_thumbnails: Write all thumbnail formats to files + writelink: Write an internet shortcut file, depending on the + current platform (.url/.webloc/.desktop) + writeurllink: Write a Windows internet shortcut file (.url) + writewebloclink: Write a macOS internet shortcut file (.webloc) + writedesktoplink: Write a Linux internet shortcut file (.desktop) + writesubtitles: Write the video subtitles to a file + writeautomaticsub: Write the automatically generated subtitles to a file + allsubtitles: Deprecated - Use subtitleslangs = ['all'] + Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) + listsubtitles: Lists all available subtitles for the video + subtitlesformat: The format code for subtitles + subtitleslangs: List of languages of the subtitles to download (can be regex). + The list may contain "all" to refer to all the available + subtitles. The language can be prefixed with a "-" to + exclude it from the requested languages. Eg: ['all', '-live_chat'] + keepvideo: Keep the video file after post-processing + daterange: A DateRange object, download only if the upload_date is in the range. + skip_download: Skip the actual download of the video file + cachedir: Location of the cache files in the filesystem. + False to disable filesystem cache.
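For illustration, the two dictionary-valued options just described might be combined like this (paths are hypothetical):

    ydl_opts = {
        'paths': {'home': '/media/videos', 'temp': '/tmp/yt-dlp'},
        'outtmpl': {'default': '%(title)s-%(id)s.%(ext)s'},
    }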
+ noplaylist: Download single video instead of a playlist if in doubt. + age_limit: An integer representing the user's age in years. + Unsuitable videos for the given age are skipped. + min_views: An integer representing the minimum view count the video + must have in order to not be skipped. + Videos without view count information are always + downloaded. None for no limit. + max_views: An integer representing the maximum view count. + Videos that are more popular than that are not + downloaded. + Videos without view count information are always + downloaded. None for no limit. + download_archive: File name of a file where all downloads are recorded. + Videos already present in the file are not downloaded + again. + break_on_existing: Stop the download process after attempting to download a + file that is in the archive. + break_on_reject: Stop the download process when encountering a video that + has been filtered out. + cookiefile: File name where cookies should be read from and dumped to + cookiesfrombrowser: A tuple containing the name of the browser and the profile + name/path from where cookies are loaded. + Eg: ('chrome', ) or ('vivaldi', 'default') + nocheckcertificate:Do not verify SSL certificates + prefer_insecure: Use HTTP instead of HTTPS to retrieve information. + At the moment, this is only supported by YouTube. + proxy: URL of the proxy server to use + geo_verification_proxy: URL of the proxy to use for IP address verification + on geo-restricted sites. + socket_timeout: Time to wait for unresponsive hosts, in seconds + bidi_workaround: Work around buggy terminals without bidirectional text + support, using fribidi + debug_printtraffic:Print out sent and received HTTP traffic + include_ads: Download ads as well + default_search: Prepend this string if an input URL is not valid. + 'auto' for elaborate guessing + encoding: Use this encoding instead of the system-specified. + extract_flat: Do not resolve URLs, return the immediate result. + Pass in 'in_playlist' to only show this behavior for + playlist items. + postprocessors: A list of dictionaries, each with an entry + * key: The name of the postprocessor. See + yt_dlp/postprocessor/__init__.py for a list. + * when: When to run the postprocessor. Can be one of + pre_process|before_dl|post_process|after_move. + Assumed to be 'post_process' if not given + post_hooks: Deprecated - Register a custom postprocessor instead + A list of functions that get called as the final step + for each video file, after all postprocessors have been + called. The filename will be passed as the only argument. + progress_hooks: A list of functions that get called on download + progress, with a dictionary with the entries + * status: One of "downloading", "error", or "finished". + Check this first and ignore unknown values. + * info_dict: The extracted info_dict + + If status is one of "downloading" or "finished", the + following properties may also be present: + * filename: The final filename (always present) + * tmpfilename: The filename we're currently writing to + * downloaded_bytes: Bytes on disk + * total_bytes: Size of the whole file, None if unknown + * total_bytes_estimate: Guess of the eventual file size, + None if unavailable. + * elapsed: The number of seconds since download started. + * eta: The estimated time in seconds, None if unknown + * speed: The download speed in bytes/second, None if + unknown + * fragment_index: The counter of the currently + downloaded video fragment.
+ * fragment_count: The number of fragments (= individual + files that will be merged) + + Progress hooks are guaranteed to be called at least once + (with status "finished") if the download is successful. + postprocessor_hooks: A list of functions that get called on postprocessing + progress, with a dictionary with the entries + * status: One of "started", "processing", or "finished". + Check this first and ignore unknown values. + * postprocessor: Name of the postprocessor + * info_dict: The extracted info_dict + + Progress hooks are guaranteed to be called at least twice + (with status "started" and "finished") if the processing is successful. + merge_output_format: Extension to use when merging formats. + final_ext: Expected final extension; used to detect when the file was + already downloaded and converted. "merge_output_format" is + replaced by this extension when given + fixup: Automatically correct known faults of the file. + One of: + - "never": do nothing + - "warn": only emit a warning + - "detect_or_warn": check whether we can do anything + about it, warn otherwise (default) + source_address: Client-side IP address to bind to. + call_home: Boolean, true iff we are allowed to contact the + yt-dlp servers for debugging. (BROKEN) + sleep_interval_requests: Number of seconds to sleep between requests + during extraction + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. + sleep_interval_subtitles: Number of seconds to sleep before each subtitle download + listformats: Print an overview of available video formats and exit. + list_thumbnails: Print a table of all thumbnails and exit. + match_filter: A function that gets called with the info_dict of + every video. + If it returns a message, the video is ignored. + If it returns None, the video is downloaded. + match_filter_func in utils.py is one example for this. + no_color: Do not emit color codes in output. + geo_bypass: Bypass geographic restriction via faking X-Forwarded-For + HTTP header + geo_bypass_country: + Two-letter ISO 3166-2 country code that will be used for + explicit geographic restriction bypassing via faking + X-Forwarded-For HTTP header + geo_bypass_ip_block: + IP range in CIDR notation that will be used similarly to + geo_bypass_country + + The following options determine which downloader is picked: + external_downloader: A dictionary of protocol keys and the executable of the + external downloader to use for it. The allowed protocols + are default|http|ftp|m3u8|dash|rtsp|rtmp|mms. + Set the value to 'native' to use the native downloader + hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'} + or {'m3u8': 'ffmpeg'} instead. + Use the native HLS downloader instead of ffmpeg/avconv + if True, otherwise use ffmpeg/avconv if False, otherwise + use downloader suggested by extractor if None. + compat_opts: Compatibility options. See "Differences in default behavior". 
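A sketch of wiring up the hook and filter options described above (assumes match_filter_func from yt_dlp.utils; field names illustrative):

    def on_progress(d):
        if d['status'] == 'finished':
            print('Downloaded %s' % d['filename'])

    ydl_opts = {
        'progress_hooks': [on_progress],
        'match_filter': match_filter_func('like_count > 100 & !is_live'),
    }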
+ The following options do not work when used through the API:
+ filename, abort-on-error, multistreams, no-live-chat, format-sort,
+ no-clean-infojson, no-playlist-metafiles, no-keep-subs.
+ Refer to __init__.py for their implementation
+ progress_template: Dictionary of templates for progress outputs.
+ Allowed keys are 'download', 'postprocess',
+ 'download-title' (console title) and 'postprocess-title'.
+ The template is mapped on a dictionary with keys 'progress' and 'info'
+
+ The following parameters are not used by YoutubeDL itself; they are used by
+ the downloader (see yt_dlp/downloader/common.py):
+ nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
+ max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl,
+ noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
+ external_downloader_args.
+
+ The following options are used by the post processors:
+ prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
+ otherwise prefer ffmpeg. (avconv support is deprecated)
+ ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
+ to the binary or its containing directory.
+ postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
+ and a list of additional command-line arguments for the
+ postprocessor/executable. The dict can also have "PP+EXE" keys
+ which are used when the given exe is used by the given PP.
+ Use 'default' as the name for arguments to be passed to all PP
+ For compatibility with youtube-dl, a single list of args
+ can also be used
+
+ The following options are used by the extractors:
+ extractor_retries: Number of times to retry for known errors
+ dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
+ hls_split_discontinuity: Split HLS playlists to different formats at
+ discontinuities such as ad breaks (default: False)
+ extractor_args: A dictionary of arguments to be passed to the extractors.
+ See "EXTRACTOR ARGUMENTS" for details.
+ Eg: {'youtube': {'skip': ['dash', 'hls']}}
+ youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
+ If True (default), DASH manifests and related
+ data will be downloaded and processed by extractor.
+ You can reduce network I/O by disabling it if you don't
+ care about DASH. (only for youtube)
+ youtube_include_hls_manifest: Deprecated - Use extractor_args instead.
+ If True (default), HLS manifests and related
+ data will be downloaded and processed by extractor.
+ You can reduce network I/O by disabling it if you don't
+ care about HLS. (only for youtube)
+ """
+
+ _NUMERIC_FIELDS = set((
+ 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+ 'timestamp', 'release_timestamp',
+ 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+ 'average_rating', 'comment_count', 'age_limit',
+ 'start_time', 'end_time',
+ 'chapter_number', 'season_number', 'episode_number',
+ 'track_number', 'disc_number', 'release_year',
+ ))
+
+ _format_selection_exts = {
+ 'audio': {'m4a', 'mp3', 'ogg', 'aac'},
+ 'video': {'mp4', 'flv', 'webm', '3gp'},
+ 'storyboards': {'mhtml'},
+ }
+
+ params = None
+ _ies = {}
+ _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+ _printed_messages = set()
+ _first_webpage_request = True
+ _download_retcode = None
+ _num_downloads = None
+ _playlist_level = 0
+ _playlist_urls = set()
+ _screen_file = None
+
+ def __init__(self, params=None, auto_init=True):
+ """Create a FileDownloader object with the given options.
+ @param auto_init Whether to load the default extractors and print header (if verbose).
+ Set to 'no_verbose_header' to not print the header
+ """
+ if params is None:
+ params = {}
+ self._ies = {}
+ self._ies_instances = {}
+ self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+ self._printed_messages = set()
+ self._first_webpage_request = True
+ self._post_hooks = []
+ self._progress_hooks = []
+ self._postprocessor_hooks = []
+ self._download_retcode = 0
+ self._num_downloads = 0
+ self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+ self._err_file = sys.stderr
+ self.params = params
+ self.cache = Cache(self)
+
+ windows_enable_vt_mode()
+ # FIXME: This will break if we ever print color to stdout
+ self.params['no_color'] = self.params.get('no_color') or not supports_terminal_sequences(self._err_file)
+
+ if sys.version_info < (3, 6):
+ self.report_warning(
+ 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])
+
+ if self.params.get('allow_unplayable_formats'):
+ self.report_warning(
+ f'You have asked for {self._color_text("unplayable formats", "blue")} to be listed/downloaded. '
+ 'This is a developer option intended for debugging. \n'
+ ' If you experience any issues while using this option, '
+ f'{self._color_text("DO NOT", "red")} open a bug report')
+
+ def check_deprecated(param, option, suggestion):
+ if self.params.get(param) is not None:
+ self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion))
+ return True
+ return False
+
+ if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
+ if self.params.get('geo_verification_proxy') is None:
+ self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
+
+ check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
+ check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
+ check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')
+
+ for msg in self.params.get('warnings', []):
+ self.report_warning(msg)
+
+ if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
+ # nooverwrites was unnecessarily changed to overwrites
+ # in 0c3d0f51778b153f65c21906031c2e091fcfb641
+ # This ensures compatibility with both keys
+ self.params['overwrites'] = not self.params['nooverwrites']
+ elif self.params.get('overwrites') is None:
+ self.params.pop('overwrites', None)
+ else:
+ self.params['nooverwrites'] = not self.params['overwrites']
+
+ if params.get('bidi_workaround', False):
+ try:
+ import pty
+ master, slave = pty.openpty()
+ width = compat_get_terminal_size().columns
+ if width is None:
+ width_args = []
+ else:
+ width_args = ['-w', str(width)]
+ sp_kwargs = dict(
+ stdin=subprocess.PIPE,
+ stdout=slave,
+ stderr=self._err_file)
+ try:
+ self._output_process = subprocess.Popen(
+ ['bidiv'] + width_args, **sp_kwargs
+ )
+ except OSError:
+ self._output_process = subprocess.Popen(
+ ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+ self._output_channel = os.fdopen(master, 'rb')
+ except OSError as ose:
+ if ose.errno == errno.ENOENT:
+ self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.')
+ else:
+ raise
+
+ if (sys.platform != 'win32'
+ and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
+ and not params.get('restrictfilenames', False)):
+ # Unicode filesystem API will throw errors (#1474, #13027)
+ self.report_warning(
+ 'Assuming --restrict-filenames since file system encoding '
+ 'cannot encode all characters. 
' + 'Set the LC_ALL environment variable to fix this.') + self.params['restrictfilenames'] = True + + self.outtmpl_dict = self.parse_outtmpl() + + # Creating format selector here allows us to catch syntax errors before the extraction + self.format_selector = ( + None if self.params.get('format') is None + else self.build_format_selector(self.params['format'])) + + self._setup_opener() + + if auto_init: + if auto_init != 'no_verbose_header': + self.print_debug_header() + self.add_default_info_extractors() + + for pp_def_raw in self.params.get('postprocessors', []): + pp_def = dict(pp_def_raw) + when = pp_def.pop('when', 'post_process') + pp_class = get_postprocessor(pp_def.pop('key')) + pp = pp_class(self, **compat_kwargs(pp_def)) + self.add_post_processor(pp, when=when) + + for ph in self.params.get('post_hooks', []): + self.add_post_hook(ph) + + for ph in self.params.get('progress_hooks', []): + self.add_progress_hook(ph) + + register_socks_protocols() + + def preload_download_archive(fn): + """Preload the archive, if any is specified""" + if fn is None: + return False + self.write_debug('Loading archive file %r\n' % fn) + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + self.archive.add(line.strip()) + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return False + return True + + self.archive = set() + preload_download_archive(self.params.get('download_archive')) + + def warn_if_short_id(self, argv): + # short YouTube ID starting with dash? + idxs = [ + i for i, a in enumerate(argv) + if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] + if idxs: + correct_argv = ( + ['yt-dlp'] + + [a for i, a in enumerate(argv) if i not in idxs] + + ['--'] + [argv[i] for i in idxs] + ) + self.report_warning( + 'Long argument string detected. ' + 'Use -- to separate parameters and URLs, like this:\n%s\n' % + args_to_str(correct_argv)) + + def add_info_extractor(self, ie): + """Add an InfoExtractor object to the end of the list.""" + ie_key = ie.ie_key() + self._ies[ie_key] = ie + if not isinstance(ie, type): + self._ies_instances[ie_key] = ie + ie.set_downloader(self) + + def _get_info_extractor_class(self, ie_key): + ie = self._ies.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key) + self.add_info_extractor(ie) + return ie + + def get_info_extractor(self, ie_key): + """ + Get an instance of an IE with name ie_key, it will try to get one from + the _ies list, if there's no instance it will create a new one and add + it to the extractor list. 
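+ (An illustrative call: get_info_extractor('Youtube') returns the cached
+ YoutubeIE instance, creating and registering it on first use.)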
+ """ + ie = self._ies_instances.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key)() + self.add_info_extractor(ie) + return ie + + def add_default_info_extractors(self): + """ + Add the InfoExtractors returned by gen_extractors to the end of the list + """ + for ie in gen_extractor_classes(): + self.add_info_extractor(ie) + + def add_post_processor(self, pp, when='post_process'): + """Add a PostProcessor object to the end of the chain.""" + self._pps[when].append(pp) + pp.set_downloader(self) + + def add_post_hook(self, ph): + """Add the post hook""" + self._post_hooks.append(ph) + + def add_progress_hook(self, ph): + """Add the download progress hook""" + self._progress_hooks.append(ph) + + def add_postprocessor_hook(self, ph): + """Add the postprocessing progress hook""" + self._postprocessor_hooks.append(ph) + + def _bidi_workaround(self, message): + if not hasattr(self, '_output_channel'): + return message + + assert hasattr(self, '_output_process') + assert isinstance(message, compat_str) + line_count = message.count('\n') + 1 + self._output_process.stdin.write((message + '\n').encode('utf-8')) + self._output_process.stdin.flush() + res = ''.join(self._output_channel.readline().decode('utf-8') + for _ in range(line_count)) + return res[:-len('\n')] + + def _write_string(self, message, out=None, only_once=False): + if only_once: + if message in self._printed_messages: + return + self._printed_messages.add(message) + write_string(message, out=out, encoding=self.params.get('encoding')) + + def to_stdout(self, message, skip_eol=False, quiet=False): + """Print message to stdout""" + if self.params.get('logger'): + self.params['logger'].debug(message) + elif not quiet or self.params.get('verbose'): + self._write_string( + '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), + self._err_file if quiet else self._screen_file) + + def to_stderr(self, message, only_once=False): + """Print message to stderr""" + assert isinstance(message, compat_str) + if self.params.get('logger'): + self.params['logger'].error(message) + else: + self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once) + + def to_console_title(self, message): + if not self.params.get('consoletitle', False): + return + if compat_os_name == 'nt': + if ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + elif 'TERM' in os.environ: + self._write_string('\033]0;%s\007' % message, self._screen_file) + + def save_console_title(self): + if not self.params.get('consoletitle', False): + return + if self.params.get('simulate'): + return + if compat_os_name != 'nt' and 'TERM' in os.environ: + # Save the title on stack + self._write_string('\033[22;0t', self._screen_file) + + def restore_console_title(self): + if not self.params.get('consoletitle', False): + return + if self.params.get('simulate'): + return + if compat_os_name != 'nt' and 'TERM' in os.environ: + # Restore the title from stack + self._write_string('\033[23;0t', self._screen_file) + + def __enter__(self): + self.save_console_title() + return self + + def __exit__(self, *args): + self.restore_console_title() + + if self.params.get('cookiefile') is not None: + self.cookiejar.save(ignore_discard=True, ignore_expires=True) + + def trouble(self, message=None, tb=None): + """Determine action to take when a download problem appears. 
+ + Depending on if the downloader has been configured to ignore + download errors or not, this method may throw an exception or + not when errors are found, after printing the message. + + tb, if given, is additional traceback information. + """ + if message is not None: + self.to_stderr(message) + if self.params.get('verbose'): + if tb is None: + if sys.exc_info()[0]: # if .trouble has been called from an except block + tb = '' + if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) + tb += encode_compat_str(traceback.format_exc()) + else: + tb_data = traceback.format_list(traceback.extract_stack()) + tb = ''.join(tb_data) + if tb: + self.to_stderr(tb) + if not self.params.get('ignoreerrors'): + if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + exc_info = sys.exc_info()[1].exc_info + else: + exc_info = sys.exc_info() + raise DownloadError(message, exc_info) + self._download_retcode = 1 + + def to_screen(self, message, skip_eol=False): + """Print message to stdout if not in quiet mode""" + self.to_stdout( + message, skip_eol, quiet=self.params.get('quiet', False)) + + def _color_text(self, text, color): + if self.params.get('no_color'): + return text + return f'{TERMINAL_SEQUENCES[color.upper()]}{text}{TERMINAL_SEQUENCES["RESET_STYLE"]}' + + def report_warning(self, message, only_once=False): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if self.params.get('logger') is not None: + self.params['logger'].warning(message) + else: + if self.params.get('no_warnings'): + return + self.to_stderr(f'{self._color_text("WARNING:", "yellow")} {message}', only_once) + + def report_error(self, message, tb=None): + ''' + Do the same as trouble, but prefixes the message with 'ERROR:', colored + in red if stderr is a tty file. + ''' + self.trouble(f'{self._color_text("ERROR:", "red")} {message}', tb) + + def write_debug(self, message, only_once=False): + '''Log debug message or Print message to stderr''' + if not self.params.get('verbose', False): + return + message = '[debug] %s' % message + if self.params.get('logger'): + self.params['logger'].debug(message) + else: + self.to_stderr(message, only_once) + + def report_file_already_downloaded(self, file_name): + """Report file has already been fully downloaded.""" + try: + self.to_screen('[download] %s has already been downloaded' % file_name) + except UnicodeEncodeError: + self.to_screen('[download] The file has already been downloaded') + + def report_file_delete(self, file_name): + """Report that existing file will be deleted.""" + try: + self.to_screen('Deleting existing file %s' % file_name) + except UnicodeEncodeError: + self.to_screen('Deleting existing file') + + def raise_no_formats(self, info, forced=False): + has_drm = info.get('__has_drm') + msg = 'This video is DRM protected' if has_drm else 'No video formats found!' 
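+ # 'ignore_no_formats_error' makes this condition merely "expected": it is
+ # warned about below instead of raised. forced=True raises regardless,
+ # while has_drm only selects the message and marks the error as expected.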
+ expected = self.params.get('ignore_no_formats_error') + if forced or not expected: + raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'], + expected=has_drm or expected) + else: + self.report_warning(msg) + + def parse_outtmpl(self): + outtmpl_dict = self.params.get('outtmpl', {}) + if not isinstance(outtmpl_dict, dict): + outtmpl_dict = {'default': outtmpl_dict} + # Remove spaces in the default template + if self.params.get('restrictfilenames'): + sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-') + else: + sanitize = lambda x: x + outtmpl_dict.update({ + k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() + if outtmpl_dict.get(k) is None}) + for key, val in outtmpl_dict.items(): + if isinstance(val, bytes): + self.report_warning( + 'Parameter outtmpl is bytes, but should be a unicode string. ' + 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') + return outtmpl_dict + + def get_output_path(self, dir_type='', filename=None): + paths = self.params.get('paths', {}) + assert isinstance(paths, dict) + path = os.path.join( + expand_path(paths.get('home', '').strip()), + expand_path(paths.get(dir_type, '').strip()) if dir_type else '', + filename or '') + + # Temporary fix for #4787 + # 'Treat' all problem characters by passing filename through preferredencoding + # to workaround encoding issues with subprocess on python2 @ Windows + if sys.version_info < (3, 0) and sys.platform == 'win32': + path = encodeFilename(path, True).decode(preferredencoding()) + return sanitize_path(path, force=self.params.get('windowsfilenames')) + + @staticmethod + def _outtmpl_expandpath(outtmpl): + # expand_path translates '%%' into '%' and '$$' into '$' + # correspondingly that is not what we want since we need to keep + # '%%' intact for template dict substitution step. Working around + # with boundary-alike separator hack. + sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) + outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) + + # outtmpl should be expand_path'ed before template dict substitution + # because meta fields may contain env variables we don't want to + # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and + # title "Hello $PATH", we don't want `$PATH` to be expanded. + return expand_path(outtmpl).replace(sep, '') + + @staticmethod + def escape_outtmpl(outtmpl): + ''' Escape any remaining strings like %s, %abc% etc. 
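+ e.g. a lone '%d' is doubled to '%%d', while '%(field)s'-style template
+ keys are left untouched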
''' + return re.sub( + STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'), + lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0), + outtmpl) + + @classmethod + def validate_outtmpl(cls, outtmpl): + ''' @return None or Exception object ''' + outtmpl = re.sub( + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'), + lambda mobj: f'{mobj.group(0)[:-1]}s', + cls._outtmpl_expandpath(outtmpl)) + try: + cls.escape_outtmpl(outtmpl) % collections.defaultdict(int) + return None + except ValueError as err: + return err + + @staticmethod + def _copy_infodict(info_dict): + info_dict = dict(info_dict) + for key in ('__original_infodict', '__postprocessors'): + info_dict.pop(key, None) + return info_dict + + def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): + """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """ + info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set + + info_dict = self._copy_infodict(info_dict) + info_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs + formatSeconds(info_dict['duration'], '-' if sanitize else ':') + if info_dict.get('duration', None) is not None + else None) + info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads + if info_dict.get('resolution') is None: + info_dict['resolution'] = self.format_resolution(info_dict, default=None) + + # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences + # of %(field)s to %(field)0Nd for backward compatibility + field_size_compat_map = { + 'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')), + 'playlist_autonumber': len(str(info_dict.get('n_entries') or '')), + 'autonumber': self.params.get('autonumber_size') or 5, + } + + TMPL_DICT = {} + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]')) + MATH_FUNCTIONS = { + '+': float.__add__, + '-': float.__sub__, + } + # Field is of the form key1.key2... + # where keys (except first) can be string, int or slice + FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') + MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') + MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) + INTERNAL_FORMAT_RE = re.compile(r'''(?x) + (?P<negate>-)? + (?P<fields>{field}) + (?P<maths>(?:{math_op}{math_field})*) + (?:>(?P<strf_format>.+?))? + (?P<alternate>(?<!\\),[^|)]+)? + (?:\|(?P<default>.*?))? 
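+ # Verbose-mode notes (ignored by the regex engine): 'fields' is the
+ # dot-path into the info dict, 'maths' applies +/- offsets to it,
+ # '>strf_format' date-formats the value, ',alternate' names fallback
+ # fields and '|default' supplies a literal placeholder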
+ $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE)) + + def _traverse_infodict(k): + k = k.split('.') + if k[0] == '': + k.pop(0) + return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True) + + def get_value(mdict): + # Object traversal + value = _traverse_infodict(mdict['fields']) + # Negative + if mdict['negate']: + value = float_or_none(value) + if value is not None: + value *= -1 + # Do maths + offset_key = mdict['maths'] + if offset_key: + value = float_or_none(value) + operator = None + while offset_key: + item = re.match( + MATH_FIELD_RE if operator else MATH_OPERATORS_RE, + offset_key).group(0) + offset_key = offset_key[len(item):] + if operator is None: + operator = MATH_FUNCTIONS[item] + continue + item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1) + offset = float_or_none(item) + if offset is None: + offset = float_or_none(_traverse_infodict(item)) + try: + value = operator(value, multiplier * offset) + except (TypeError, ZeroDivisionError): + return None + operator = None + # Datetime formatting + if mdict['strf_format']: + value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ',')) + + return value + + na = self.params.get('outtmpl_na_placeholder', 'NA') + + def _dumpjson_default(obj): + if isinstance(obj, (set, LazyList)): + return list(obj) + raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable') + + def create_key(outer_mobj): + if not outer_mobj.group('has_key'): + return outer_mobj.group(0) + key = outer_mobj.group('key') + mobj = re.match(INTERNAL_FORMAT_RE, key) + initial_field = mobj.group('fields').split('.')[-1] if mobj else '' + value, default = None, na + while mobj: + mobj = mobj.groupdict() + default = mobj['default'] if mobj['default'] is not None else default + value = get_value(mobj) + if value is None and mobj['alternate']: + mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:]) + else: + break + + fmt = outer_mobj.group('format') + if fmt == 's' and value is not None and key in field_size_compat_map.keys(): + fmt = '0{:d}d'.format(field_size_compat_map[key]) + + value = default if value is None else value + + str_fmt = f'{fmt[:-1]}s' + if fmt[-1] == 'l': # list + delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', ' + value, fmt = delim.join(variadic(value)), str_fmt + elif fmt[-1] == 'j': # json + value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt + elif fmt[-1] == 'q': # quoted + value, fmt = compat_shlex_quote(str(value)), str_fmt + elif fmt[-1] == 'B': # bytes + value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') + value, fmt = value.decode('utf-8', 'ignore'), 's' + elif fmt[-1] == 'U': # unicode normalized + opts = outer_mobj.group('conversion') or '' + value, fmt = unicodedata.normalize( + # "+" = compatibility equivalence, "#" = NFD + 'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'), + value), str_fmt + elif fmt[-1] == 'c': + if value: + value = str(value)[0] + else: + fmt = str_fmt + elif fmt[-1] not in 'rs': # numeric + value = float_or_none(value) + if value is None: + value, fmt = default, 's' + + if sanitize: + if fmt[-1] == 'r': + # If value is an object, sanitize might convert it to a string + # So we convert it to repr first + value, fmt = repr(value), str_fmt + if fmt[-1] in 'csr': + value = sanitize(initial_field, value) + + key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) + TMPL_DICT[key] = value + return 
'{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))
+
+ return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
+
+ def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
+ outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
+ return self.escape_outtmpl(outtmpl) % info_dict
+
+ def _prepare_filename(self, info_dict, tmpl_type='default'):
+ try:
+ sanitize = lambda k, v: sanitize_filename(
+ compat_str(v),
+ restricted=self.params.get('restrictfilenames'),
+ is_id=(k == 'id' or k.endswith('_id')))
+ outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
+ filename = self.evaluate_outtmpl(outtmpl, info_dict, sanitize)
+
+ force_ext = OUTTMPL_TYPES.get(tmpl_type)
+ if filename and force_ext is not None:
+ filename = replace_extension(filename, force_ext, info_dict.get('ext'))
+
+ # https://github.com/blackjack4494/youtube-dlc/issues/85
+ trim_file_name = self.params.get('trim_file_name', False)
+ if trim_file_name:
+ fn_groups = filename.rsplit('.')
+ ext = fn_groups[-1]
+ sub_ext = ''
+ if len(fn_groups) > 2:
+ sub_ext = fn_groups[-2]
+ filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext]))
+
+ return filename
+ except ValueError as err:
+ self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
+ return None
+
+ def prepare_filename(self, info_dict, dir_type='', warn=False):
+ """Generate the output filename."""
+
+ filename = self._prepare_filename(info_dict, dir_type or 'default')
+ if not filename and dir_type not in ('', 'temp'):
+ return ''
+
+ if warn:
+ if not self.params.get('paths'):
+ pass
+ elif filename == '-':
+ self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
+ elif os.path.isabs(filename):
+ self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
+ if filename == '-' or not filename:
+ return filename
+
+ return self.get_output_path(dir_type, filename)
+
+ def _match_entry(self, info_dict, incomplete=False, silent=False):
+ """ Returns None if the file should be downloaded """
+
+ video_title = info_dict.get('title', info_dict.get('id', 'video'))
+
+ def check_filter():
+ if 'title' in info_dict:
+ # This can happen when we're just evaluating the playlist
+ title = info_dict['title']
+ matchtitle = self.params.get('matchtitle', False)
+ if matchtitle:
+ if not re.search(matchtitle, title, re.IGNORECASE):
+ return '"' + title + '" title did not match pattern "' + matchtitle + '"'
+ rejecttitle = self.params.get('rejecttitle', False)
+ if rejecttitle:
+ if re.search(rejecttitle, title, re.IGNORECASE):
+ return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+ date = info_dict.get('upload_date')
+ if date is not None:
+ dateRange = self.params.get('daterange', DateRange())
+ if date not in dateRange:
+ return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+ view_count = info_dict.get('view_count')
+ if view_count is not None:
+ min_views = self.params.get('min_views')
+ if min_views is not None and view_count < min_views:
+ return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
+ max_views = self.params.get('max_views')
+ if max_views is not None and view_count > max_views:
+ return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, 
max_views) + if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): + return 'Skipping "%s" because it is age restricted' % video_title + + match_filter = self.params.get('match_filter') + if match_filter is not None: + try: + ret = match_filter(info_dict, incomplete=incomplete) + except TypeError: + # For backward compatibility + ret = None if incomplete else match_filter(info_dict) + if ret is not None: + return ret + return None + + if self.in_download_archive(info_dict): + reason = '%s has already been recorded in the archive' % video_title + break_opt, break_err = 'break_on_existing', ExistingVideoReached + else: + reason = check_filter() + break_opt, break_err = 'break_on_reject', RejectedVideoReached + if reason is not None: + if not silent: + self.to_screen('[download] ' + reason) + if self.params.get(break_opt, False): + raise break_err() + return reason + + @staticmethod + def add_extra_info(info_dict, extra_info): + '''Set the keys from extra_info in info dict if they are missing''' + for key, value in extra_info.items(): + info_dict.setdefault(key, value) + + def extract_info(self, url, download=True, ie_key=None, extra_info=None, + process=True, force_generic_extractor=False): + """ + Return a list with a dictionary for each video extracted. + + Arguments: + url -- URL to extract + + Keyword arguments: + download -- whether to download videos during extraction + ie_key -- extractor key hint + extra_info -- dictionary containing the extra values to add to each result + process -- whether to resolve all unresolved references (URLs, playlist items), + must be True for download to work. + force_generic_extractor -- force using the generic extractor + """ + + if extra_info is None: + extra_info = {} + + if not ie_key and force_generic_extractor: + ie_key = 'Generic' + + if ie_key: + ies = {ie_key: self._get_info_extractor_class(ie_key)} + else: + ies = self._ies + + for ie_key, ie in ies.items(): + if not ie.suitable(url): + continue + + if not ie.working(): + self.report_warning('The program functionality for this site has been marked as broken, ' + 'and will probably not work.') + + temp_id = ie.get_temp_id(url) + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): + self.to_screen("[%s] %s: has already been recorded in archive" % ( + ie_key, temp_id)) + break + return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + except ExtractorError as e: # An error we somewhat expected + self.report_error(compat_str(e), e.format_traceback()) + except ThrottledDownload: + self.to_stderr('\r') + self.report_warning('The download speed is below throttle limit. 
Re-extracting data') + return wrapper(self, *args, **kwargs) + except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError): + raise + except Exception as e: + if self.params.get('ignoreerrors'): + self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) + else: + raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + if extra_info.get('original_url'): + ie_result.setdefault('original_url', extra_info['original_url']) + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result + + def add_default_extra_info(self, ie_result, ie, url): + if url is not None: + self.add_extra_info(ie_result, { + 'webpage_url': url, + 'original_url': url, + 'webpage_url_basename': url_basename(url), + }) + if ie is not None: + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'extractor_key': ie.ie_key(), + }) + + def process_ie_result(self, ie_result, download=True, extra_info=None): + """ + Take the result of the ie(may be modified) and resolve all unresolved + references (URLs, playlist items). + + It will also download the videos if 'download'. + Returns the resolved ie_result. + """ + if extra_info is None: + extra_info = {} + result_type = ie_result.get('_type', 'video') + + if result_type in ('url', 'url_transparent'): + ie_result['url'] = sanitize_url(ie_result['url']) + if ie_result.get('original_url'): + extra_info.setdefault('original_url', ie_result['original_url']) + + extract_flat = self.params.get('extract_flat', False) + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) + or extract_flat is True): + info_copy = ie_result.copy() + ie = try_get(ie_result.get('ie_key'), self.get_info_extractor) + if ie and not ie_result.get('id'): + info_copy['id'] = ie.get_temp_id(ie_result['url']) + self.add_default_extra_info(info_copy, ie, ie_result['url']) + self.add_extra_info(info_copy, extra_info) + self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + if self.params.get('force_write_download_archive', False): + self.record_download_archive(info_copy) + return ie_result + + if result_type == 'video': + self.add_extra_info(ie_result, extra_info) + ie_result = self.process_video_result(ie_result, download=download) + additional_urls = (ie_result or {}).get('additional_urls') + if additional_urls: + # TODO: Improve MetadataParserPP to allow setting a list + if isinstance(additional_urls, compat_str): + additional_urls = [additional_urls] + self.to_screen( + '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls))) + self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls)) + ie_result['additional_entries'] = [ + self.extract_info( + url, download, extra_info, + force_generic_extractor=self.params.get('force_generic_extractor')) + for url in additional_urls + ] + return ie_result + elif result_type == 'url': + # We have to add extra_info to the results because it may be + # contained in a playlist + return self.extract_info( + ie_result['url'], download, + 
ie_key=ie_result.get('ie_key'),
+ extra_info=extra_info)
+ elif result_type == 'url_transparent':
+ # Use the information from the embedding page
+ info = self.extract_info(
+ ie_result['url'], ie_key=ie_result.get('ie_key'),
+ extra_info=extra_info, download=False, process=False)
+
+ # extract_info may return None when ignoreerrors is enabled and
+ # extraction failed with an error, don't crash and return early
+ # in this case
+ if not info:
+ return info
+
+ force_properties = dict(
+ (k, v) for k, v in ie_result.items() if v is not None)
+ for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
+ if f in force_properties:
+ del force_properties[f]
+ new_result = info.copy()
+ new_result.update(force_properties)
+
+ # Extracted info may not be a video result (i.e.
+ # info.get('_type', 'video') != video) but rather an url or
+ # url_transparent. In such cases outer metadata (from ie_result)
+ # should be propagated to inner one (info). For this to happen
+ # _type of info should be overridden with url_transparent. This
+ # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
+ if new_result.get('_type') == 'url':
+ new_result['_type'] = 'url_transparent'
+
+ return self.process_ie_result(
+ new_result, download=download, extra_info=extra_info)
+ elif result_type in ('playlist', 'multi_video'):
+ # Protect from infinite recursion due to recursively nested playlists
+ # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
+ webpage_url = ie_result['webpage_url']
+ if webpage_url in self._playlist_urls:
+ self.to_screen(
+ '[download] Skipping already downloaded playlist: %s'
+ % (ie_result.get('title') or ie_result.get('id')))
+ return
+
+ self._playlist_level += 1
+ self._playlist_urls.add(webpage_url)
+ self._sanitize_thumbnails(ie_result)
+ try:
+ return self.__process_playlist(ie_result, download)
+ finally:
+ self._playlist_level -= 1
+ if not self._playlist_level:
+ self._playlist_urls.clear()
+ elif result_type == 'compat_list':
+ self.report_warning(
+ 'Extractor %s returned a compat_list result. '
+ 'It needs to be updated.'
% ie_result.get('extractor')) + + def _fixup(r): + self.add_extra_info(r, { + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + }) + return r + ie_result['entries'] = [ + self.process_ie_result(_fixup(r), download, extra_info) + for r in ie_result['entries'] + ] + return ie_result + else: + raise Exception('Invalid result type: %s' % result_type) + + def _ensure_dir_exists(self, path): + return make_dir(path, self.report_error) + + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + self.to_screen('[download] Downloading playlist: %s' % playlist) + + if 'entries' not in ie_result: + raise EntryNotInPlaylist() + incomplete_entries = bool(ie_result.get('requested_entries')) + if incomplete_entries: + def fill_missing_entries(entries, indexes): + ret = [None] * max(*indexes) + for i, entry in zip(indexes, entries): + ret[i - 1] = entry + return ret + ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + msg = ( + 'Downloading %d videos' if not isinstance(ie_entries, list) + else 'Collected %d videos; downloading %%d of them' % len(ie_entries)) + + if isinstance(ie_entries, list): + def get_entry(i): + return ie_entries[i - 1] + else: + if not isinstance(ie_entries, PagedList): + ie_entries = LazyList(ie_entries) + + def get_entry(i): + return YoutubeDL.__handle_extraction_exceptions( + lambda self, i: ie_entries[i - 1] + )(self, i) + + entries = [] + items = playlistitems if playlistitems is not None else itertools.count(playliststart) + for i in items: + if i == 0: + continue + if playlistitems is None and playlistend is not None and playlistend < i: + break + entry = None + try: + entry = get_entry(i) + if entry is None: + raise EntryNotInPlaylist() + except (IndexError, EntryNotInPlaylist): + if incomplete_entries: + raise EntryNotInPlaylist() + elif not playlistitems: + break + entries.append(entry) + try: + if entry is not None: + self._match_entry(entry, incomplete=True, silent=True) + except (ExistingVideoReached, RejectedVideoReached): + break + ie_result['entries'] = entries + + # Save playlist_index before re-ordering + entries = [ + ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry) + for i, entry in enumerate(entries, 1) + if entry is not None] + n_entries = len(entries) + + if not playlistitems and (playliststart or playlistend): + playlistitems = list(range(playliststart, playliststart + n_entries)) + ie_result['requested_entries'] = playlistitems + + if self.params.get('allow_playlist_files', True): + ie_copy = { + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 
'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': 0, + } + ie_copy.update(dict(ie_result)) + + if self._write_info_json('playlist', ie_result, + self.prepare_filename(ie_copy, 'pl_infojson')) is None: + return + if self._write_description('playlist', ie_result, + self.prepare_filename(ie_copy, 'pl_description')) is None: + return + # TODO: This should be passed to ThumbnailsConvertor if necessary + self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries)) + failures = 0 + max_failures = self.params.get('skip_playlist_after_errors') or float('inf') + for i, entry_tuple in enumerate(entries, 1): + playlist_index, entry = entry_tuple + if 'playlist-index' in self.params.get('compat_opts', []): + playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1 + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries), + 'playlist_index': playlist_index, + 'playlist_autonumber': i, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + if self._match_entry(entry, incomplete=True) is not None: + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + if not entry_result: + failures += 1 + if failures >= max_failures: + self.report_error( + 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures)) + break + # TODO: skip failed (empty) entries? 
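+ # Note that a failed (falsy) entry_result is still appended here unless
+ # the max_failures break above triggered first - hence the TODO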
+ playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " + + OPERATORS = { + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge, + '=': operator.eq, + '!=': operator.ne, + } + operator_rex = re.compile(r'''(?x)\s* + (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s* + (?P<op>%s)(?P<none_inclusive>\s*\?)?\s* + (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s* + ''' % '|'.join(map(re.escape, OPERATORS.keys()))) + m = operator_rex.fullmatch(filter_spec) + if m: + try: + comparison_value = int(m.group('value')) + except ValueError: + comparison_value = parse_filesize(m.group('value')) + if comparison_value is None: + comparison_value = parse_filesize(m.group('value') + 'B') + if comparison_value is None: + raise ValueError( + 'Invalid value %r in format specification %r' % ( + m.group('value'), filter_spec)) + op = OPERATORS[m.group('op')] + + if not m: + STR_OPERATORS = { + '=': operator.eq, + '^=': lambda attr, value: attr.startswith(value), + '$=': lambda attr, value: attr.endswith(value), + '*=': lambda attr, value: value in attr, + } + str_operator_rex = re.compile(r'''(?x)\s* + (?P<key>[a-zA-Z0-9._-]+)\s* + (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* + (?P<value>[a-zA-Z0-9._-]+)\s* + ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) + m = str_operator_rex.fullmatch(filter_spec) + if m: + comparison_value = m.group('value') + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op(attr, value) + else: + op = str_op + + if not m: + raise SyntaxError('Invalid filter specification %r' % filter_spec) + + def _filter(f): + actual_value = f.get(m.group('key')) + if actual_value is None: + return m.group('none_inclusive') + return op(actual_value, comparison_value) + return _filter + + def _default_format_spec(self, info_dict, download=True): + + def can_merge(): + merger = FFmpegMergerPP(self) + return merger.available and merger.can_merge() + + prefer_best = ( + not self.params.get('simulate') + and download + and ( + not can_merge() + or info_dict.get('is_live', False) + or self.outtmpl_dict['default'] == '-')) + compat = ( + prefer_best + or self.params.get('allow_multiple_audio_streams', False) + or 'format-spec' in self.params.get('compat_opts', [])) + + return ( + 'best/bestvideo+bestaudio' if prefer_best + else 'bestvideo*+bestaudio/best' if not compat + else 'bestvideo+bestaudio/best') + + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) + + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + GROUP = 'GROUP' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + + allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False), + 'video': self.params.get('allow_multiple_video_streams', False)} + + check_formats = self.params.get('check_formats') + + def _parse_filter(tokens): + filter_parts = [] + for 
type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) + + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the surrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break + elif inside_merge and string in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) + selectors.append(current_selector) + current_selector = None + elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) + first_choice = current_selector + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) + elif string == '+': + if not current_selector: + raise syntax_error('Unexpected "+"', start) + selector_1 = current_selector + selector_2 = _parse_format_selection(tokens, inside_merge=True) + if not selector_2: + raise syntax_error('Expected a selector', start) + current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _merge(formats_pair): + format_1, 
format_2 = formats_pair + + formats_info = [] + formats_info.extend(format_1.get('requested_formats', (format_1,))) + formats_info.extend(format_2.get('requested_formats', (format_2,))) + + if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']: + get_no_more = {'video': False, 'audio': False} + for (i, fmt_info) in enumerate(formats_info): + if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none': + formats_info.pop(i) + continue + for aud_vid in ['audio', 'video']: + if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none': + if get_no_more[aud_vid]: + formats_info.pop(i) + break + get_no_more[aud_vid] = True + + if len(formats_info) == 1: + return formats_info[0] + + video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none'] + audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none'] + + the_only_video = video_fmts[0] if len(video_fmts) == 1 else None + the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None + + output_ext = self.params.get('merge_output_format') + if not output_ext: + if the_only_video: + output_ext = the_only_video['ext'] + elif the_only_audio and not video_fmts: + output_ext = the_only_audio['ext'] + else: + output_ext = 'mkv' + + filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info)) + + new_dict = { + 'requested_formats': formats_info, + 'format': '+'.join(filtered('format')), + 'format_id': '+'.join(filtered('format_id')), + 'ext': output_ext, + 'protocol': '+'.join(map(determine_protocol, formats_info)), + 'language': '+'.join(orderedSet(filtered('language'))), + 'format_note': '+'.join(orderedSet(filtered('format_note'))), + 'filesize_approx': sum(filtered('filesize', 'filesize_approx')), + 'tbr': sum(filtered('tbr', 'vbr', 'abr')), + } + + if the_only_video: + new_dict.update({ + 'width': the_only_video.get('width'), + 'height': the_only_video.get('height'), + 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video), + 'fps': the_only_video.get('fps'), + 'vcodec': the_only_video.get('vcodec'), + 'vbr': the_only_video.get('vbr'), + 'stretched_ratio': the_only_video.get('stretched_ratio'), + }) + + if the_only_audio: + new_dict.update({ + 'acodec': the_only_audio.get('acodec'), + 'abr': the_only_audio.get('abr'), + 'asr': the_only_audio.get('asr'), + }) + + return new_dict + + def _check_formats(formats): + if not check_formats: + yield from formats + return + for f in formats: + self.to_screen('[info] Testing format %s' % f['format_id']) + temp_file = tempfile.NamedTemporaryFile( + suffix='.tmp', delete=False, + dir=self.get_output_path('temp') or None) + temp_file.close() + try: + success, _ = self.dl(temp_file.name, f, test=True) + except (DownloadError, IOError, OSError, ValueError) + network_exceptions: + success = False + finally: + if os.path.exists(temp_file.name): + try: + os.remove(temp_file.name) + except OSError: + self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) + if success: + yield f + else: + self.to_screen('[info] Unable to download format %s. Skipping...' 
% f['format_id']) + + def _build_selector_function(selector): + if isinstance(selector, list): # , + fs = [_build_selector_function(s) for s in selector] + + def selector_function(ctx): + for f in fs: + yield from f(ctx) + return selector_function + + elif selector.type == GROUP: # () + selector_function = _build_selector_function(selector.selector) + + elif selector.type == PICKFIRST: # / + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(ctx): + for f in fs: + picked_formats = list(f(ctx)) + if picked_formats: + return picked_formats + return [] + + elif selector.type == MERGE: # + + selector_1, selector_2 = map(_build_selector_function, selector.selector) + + def selector_function(ctx): + for pair in itertools.product( + selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))): + yield _merge(pair) + + elif selector.type == SINGLE: # atom + format_spec = selector.selector or 'best' + + # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector + if format_spec == 'all': + def selector_function(ctx): + yield from _check_formats(ctx['formats']) + elif format_spec == 'mergeall': + def selector_function(ctx): + formats = list(_check_formats(ctx['formats'])) + if not formats: + return + merged_format = formats[-1] + for f in formats[-2::-1]: + merged_format = _merge((merged_format, f)) + yield merged_format + + else: + format_fallback, format_reverse, format_idx = False, True, 1 + mobj = re.match( + r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$', + format_spec) + if mobj is not None: + format_idx = int_or_none(mobj.group('n'), default=1) + format_reverse = mobj.group('bw')[0] == 'b' + format_type = (mobj.group('type') or [None])[0] + not_format_type = {'v': 'a', 'a': 'v'}.get(format_type) + format_modified = mobj.group('mod') is not None + + format_fallback = not format_type and not format_modified # for b, w + _filter_f = ( + (lambda f: f.get('%scodec' % format_type) != 'none') + if format_type and format_modified # bv*, ba*, wv*, wa* + else (lambda f: f.get('%scodec' % not_format_type) == 'none') + if format_type # bv, ba, wv, wa + else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none') + if not format_modified # b, w + else lambda f: True) # b*, w* + filter_f = lambda f: _filter_f(f) and ( + f.get('vcodec') != 'none' or f.get('acodec') != 'none') + else: + if format_spec in self._format_selection_exts['audio']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' + elif format_spec in self._format_selection_exts['video']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none' + elif format_spec in self._format_selection_exts['storyboards']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none' + else: + filter_f = lambda f: f.get('format_id') == format_spec # id + + def selector_function(ctx): + formats = list(ctx['formats']) + matches = list(filter(filter_f, formats)) if filter_f is not None else formats + if format_fallback and ctx['incomplete_formats'] and not matches: + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) best/worst will fallback to + # best/worst {video,audio}-only format + matches = formats + matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1])) + try: + yield matches[format_idx - 1] + except IndexError: + return + + filters = 
[self._build_format_filter(f) for f in selector.filters] + + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) + for _filter in filters: + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) + return final_selector + + stream = io.BytesIO(format_spec.encode('utf-8')) + try: + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) + return _build_selector_function(parsed_selector) + + def _calc_headers(self, info_dict): + res = std_headers.copy() + + add_headers = info_dict.get('http_headers') + if add_headers: + res.update(add_headers) + + cookies = self._calc_cookies(info_dict) + if cookies: + res['Cookie'] = cookies + + if 'X-Forwarded-For' not in res: + x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') + if x_forwarded_for_ip: + res['X-Forwarded-For'] = x_forwarded_for_ip + + return res + + def _calc_cookies(self, info_dict): + pr = sanitized_Request(info_dict['url']) + self.cookiejar.add_cookie_header(pr) + return pr.get_header('Cookie') + + def _sanitize_thumbnails(self, info_dict): + thumbnails = info_dict.get('thumbnails') + if thumbnails is None: + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] + if thumbnails: + thumbnails.sort(key=lambda t: ( + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', + t.get('url'))) + + def thumbnail_tester(): + if self.params.get('check_formats'): + test_all = True + to_screen = lambda msg: self.to_screen(f'[info] {msg}') + else: + test_all = False + to_screen = self.write_debug + + def test_thumbnail(t): + if not test_all and not t.get('_test_url'): + return True + to_screen('Testing thumbnail %s' % t['id']) + try: + self.urlopen(HEADRequest(t['url'])) + except network_exceptions as err: + to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' 
% ( + t['id'], t['url'], error_to_compat_str(err))) + return False + return True + + return test_thumbnail + + for i, t in enumerate(thumbnails): + if t.get('id') is None: + t['id'] = '%d' % i + if t.get('width') and t.get('height'): + t['resolution'] = '%dx%d' % (t['width'], t['height']) + t['url'] = sanitize_url(t['url']) + + if self.params.get('check_formats') is not False: + info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse() + else: + info_dict['thumbnails'] = thumbnails + + def process_video_result(self, info_dict, download=True): + assert info_dict.get('_type', 'video') == 'video' + + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result', + video_id=info_dict['id'], ie=info_dict['extractor']) + + def report_force_conversion(field, field_not, conversion): + self.report_warning( + '"%s" field is not %s - forcing %s conversion, there is an error in extractor' + % (field, field_not, conversion)) + + def sanitize_string_field(info, string_field): + field = info.get(string_field) + if field is None or isinstance(field, compat_str): + return + report_force_conversion(string_field, 'a string', 'string') + info[string_field] = compat_str(field) + + def sanitize_numeric_fields(info): + for numeric_field in self._NUMERIC_FIELDS: + field = info.get(numeric_field) + if field is None or isinstance(field, compat_numeric_types): + continue + report_force_conversion(numeric_field, 'numeric', 'int') + info[numeric_field] = int_or_none(field) + + sanitize_string_field(info_dict, 'id') + sanitize_numeric_fields(info_dict) + + if 'playlist' not in info_dict: + # It isn't part of a playlist + info_dict['playlist'] = None + info_dict['playlist_index'] = None + + self._sanitize_thumbnails(info_dict) + + thumbnail = info_dict.get('thumbnail') + thumbnails = info_dict.get('thumbnails') + if thumbnail: + info_dict['thumbnail'] = sanitize_url(thumbnail) + elif thumbnails: + info_dict['thumbnail'] = thumbnails[-1]['url'] + + if info_dict.get('display_id') is None and 'id' in info_dict: + info_dict['display_id'] = info_dict['id'] + + if info_dict.get('duration') is not None: + info_dict['duration_string'] = formatSeconds(info_dict['duration']) + + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass + + live_keys = ('is_live', 'was_live') + live_status = info_dict.get('live_status') + if live_status is None: + for key in live_keys: + if info_dict.get(key) is False: + continue + if info_dict.get(key): + live_status = key + break + if all(info_dict.get(key) is False for key in live_keys): + live_status = 'not_live' + if live_status: + info_dict['live_status'] = live_status + for key in live_keys: + if info_dict.get(key) is None: + info_dict[key] = (live_status == key) + + # Auto generate title fields corresponding to the *_number fields when missing + # in order to always have clean titles. This is very common for TV series. 
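+        # e.g. an entry carrying episode_number=3 but no 'episode' field
+        # ends up with info_dict['episode'] == 'Episode 3'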
+        for field in ('chapter', 'season', 'episode'):
+            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+
+        for cc_kind in ('subtitles', 'automatic_captions'):
+            cc = info_dict.get(cc_kind)
+            if cc:
+                for _, subtitle in cc.items():
+                    for subtitle_format in subtitle:
+                        if subtitle_format.get('url'):
+                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
+                        if subtitle_format.get('ext') is None:
+                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
+
+        automatic_captions = info_dict.get('automatic_captions')
+        subtitles = info_dict.get('subtitles')
+
+        info_dict['requested_subtitles'] = self.process_subtitles(
+            info_dict['id'], subtitles, automatic_captions)
+
+        # We now pick which formats have to be downloaded
+        if info_dict.get('formats') is None:
+            # There's only one format available
+            formats = [info_dict]
+        else:
+            formats = info_dict['formats']
+
+        info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
+        if not self.params.get('allow_unplayable_formats'):
+            formats = [f for f in formats if not f.get('has_drm')]
+
+        if not formats:
+            self.raise_no_formats(info_dict)
+
+        def is_wellformed(f):
+            url = f.get('url')
+            if not url:
+                self.report_warning(
+                    '"url" field is missing or empty - skipping format, '
+                    'there is an error in extractor')
+                return False
+            if isinstance(url, bytes):
+                sanitize_string_field(f, 'url')
+            return True
+
+        # Filter out malformed formats for better extraction robustness
+        formats = list(filter(is_wellformed, formats))
+
+        formats_dict = {}
+
+        # We check that all the formats have the format and format_id fields
+        for i, format in enumerate(formats):
+            sanitize_string_field(format, 'format_id')
+            sanitize_numeric_fields(format)
+            format['url'] = sanitize_url(format['url'])
+            if not format.get('format_id'):
+                format['format_id'] = compat_str(i)
+            else:
+                # Sanitize format_id from characters used in format selector expression
+                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        common_exts = set(itertools.chain(*self._format_selection_exts.values()))
+        for format_id, ambiguous_formats in formats_dict.items():
+            ambiguous_id = len(ambiguous_formats) > 1
+            for i, format in enumerate(ambiguous_formats):
+                if ambiguous_id:
+                    format['format_id'] = '%s-%d' % (format_id, i)
+                if format.get('ext') is None:
+                    format['ext'] = determine_ext(format['url']).lower()
+                # Ensure there is no conflict between id and ext in format selection
+                # See https://github.com/yt-dlp/yt-dlp/issues/1282
+                if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
+                    format['format_id'] = 'f%s' % format['format_id']
+
+        for i, format in enumerate(formats):
+            if format.get('format') is None:
+                format['format'] = '{id} - {res}{note}'.format(
+                    id=format['format_id'],
+                    res=self.format_resolution(format),
+                    note=format_field(format, 'format_note', ' (%s)'),
+                )
+            if format.get('protocol') is None:
+                format['protocol'] = determine_protocol(format)
+            if format.get('resolution') is None:
+                format['resolution'] = self.format_resolution(format, default=None)
+            # Add HTTP headers, so that external programs can use them from the
+            # json output
+            full_format_info = info_dict.copy()
+            full_format_info.update(format)
+
format['http_headers'] = self._calc_headers(full_format_info) + # Remove private housekeeping stuff + if '__x_forwarded_for_ip' in info_dict: + del info_dict['__x_forwarded_for_ip'] + + # TODO Central sorting goes here + + if not formats or formats[0] is not info_dict: + # only set the 'formats' fields if the original info_dict list them + # otherwise we end up with a circular reference, the first (and unique) + # element in the 'formats' field in info_dict is info_dict itself, + # which can't be exported to json + info_dict['formats'] = formats + + info_dict, _ = self.pre_process(info_dict) + + if self.params.get('list_thumbnails'): + self.list_thumbnails(info_dict) + if self.params.get('listformats'): + if not info_dict.get('formats') and not info_dict.get('url'): + self.to_screen('%s has no formats' % info_dict['id']) + else: + self.list_formats(info_dict) + if self.params.get('listsubtitles'): + if 'automatic_captions' in info_dict: + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') + list_only = self.params.get('simulate') is None and ( + self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + if list_only: + # Without this printing, -F --print-json will not work + self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) + return + + format_selector = self.format_selector + if format_selector is None: + req_format = self._default_format_spec(info_dict, download=download) + self.write_debug('Default format spec: %s' % req_format) + format_selector = self.build_format_selector(req_format) + + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/ytdl-org/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. + # This fixes incorrect format selection issue (see + # https://github.com/ytdl-org/youtube-dl/issues/10083). + incomplete_formats = ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) + if not formats_to_download: + if not self.params.get('ignore_no_formats_error'): + raise ExtractorError('Requested format is not available', expected=True, + video_id=info_dict['id'], ie=info_dict['extractor']) + else: + self.report_warning('Requested format is not available') + # Process what we can, even without any available formats. 
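+                # (process_info below can still write metadata files, thumbnails etc.)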
+ self.process_info(dict(info_dict)) + elif download: + self.to_screen( + '[info] %s: Downloading %d format(s): %s' % ( + info_dict['id'], len(formats_to_download), + ", ".join([f['format_id'] for f in formats_to_download]))) + for fmt in formats_to_download: + new_info = dict(info_dict) + # Save a reference to the original info_dict so that it can be modified in process_info if needed + new_info['__original_infodict'] = info_dict + new_info.update(fmt) + self.process_info(new_info) + # We update the info dict with the best quality format (backwards compatibility) + if formats_to_download: + info_dict.update(formats_to_download[-1]) + return info_dict + + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): + """Select the requested subtitles and their format""" + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None + + all_sub_langs = available_subs.keys() + if self.params.get('allsubtitles', False): + requested_langs = all_sub_langs + elif self.params.get('subtitleslangs', False): + # A list is used so that the order of languages will be the same as + # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041 + requested_langs = [] + for lang_re in self.params.get('subtitleslangs'): + if lang_re == 'all': + requested_langs.extend(all_sub_langs) + continue + discard = lang_re[0] == '-' + if discard: + lang_re = lang_re[1:] + current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs) + if discard: + for lang in current_langs: + while lang in requested_langs: + requested_langs.remove(lang) + else: + requested_langs.extend(current_langs) + requested_langs = orderedSet(requested_langs) + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(all_sub_langs)[0]] + if requested_langs: + self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs)) + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + + def __forced_printings(self, info_dict, filename, incomplete): + def print_mandatory(field, actual_field=None): + if actual_field is None: + actual_field = field + if (self.params.get('force%s' % field, False) + and (not incomplete or info_dict.get(actual_field) is not None)): + self.to_stdout(info_dict[actual_field]) + + def print_optional(field): + if (self.params.get('force%s' % field, False) + and info_dict.get(field) is not None): + self.to_stdout(info_dict[field]) + + info_dict = info_dict.copy() + if filename is not None: + info_dict['filename'] = filename + if 
info_dict.get('requested_formats') is not None: + # For RTMP URLs, also include the playpath + info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) + elif 'url' in info_dict: + info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '') + + if self.params.get('forceprint') or self.params.get('forcejson'): + self.post_extract(info_dict) + for tmpl in self.params.get('forceprint', []): + mobj = re.match(r'\w+(=?)$', tmpl) + if mobj and mobj.group(1): + tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s' + elif mobj: + tmpl = '%({})s'.format(tmpl) + self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict)) + + print_mandatory('title') + print_mandatory('id') + print_mandatory('url', 'urls') + print_optional('thumbnail') + print_optional('description') + print_optional('filename') + if self.params.get('forceduration') and info_dict.get('duration') is not None: + self.to_stdout(formatSeconds(info_dict['duration'])) + print_mandatory('format') + + if self.params.get('forcejson'): + self.to_stdout(json.dumps(self.sanitize_info(info_dict))) + + def dl(self, name, info, subtitle=False, test=False): + if not info.get('url'): + self.raise_no_formats(info, True) + + if test: + verbose = self.params.get('verbose') + params = { + 'test': True, + 'quiet': self.params.get('quiet') or not verbose, + 'verbose': verbose, + 'noprogress': not verbose, + 'nopart': True, + 'skip_unavailable_fragments': False, + 'keep_fragments': False, + 'overwrites': True, + '_no_ytdl_file': True, + } + else: + params = self.params + fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params) + if not test: + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']]) + self.write_debug('Invoking downloader on "%s"' % urls) + + new_info = copy.deepcopy(self._copy_infodict(info)) + if new_info.get('http_headers') is None: + new_info['http_headers'] = self._calc_headers(new_info) + return fd.download(name, new_info, subtitle) + + def process_info(self, info_dict): + """Process a single resolved IE result.""" + + assert info_dict.get('_type', 'video') == 'video' + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads >= int(max_downloads): + raise MaxDownloadsReached() + + # TODO: backward compatibility, to be removed + info_dict['fulltitle'] = info_dict['title'] + + if 'format' not in info_dict and 'ext' in info_dict: + info_dict['format'] = info_dict['ext'] + + if self._match_entry(info_dict) is not None: + return + + self.post_extract(info_dict) + self._num_downloads += 1 + + # info_dict['_filename'] needs to be set for backward compatibility + info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True) + temp_filename = self.prepare_filename(info_dict, 'temp') + files_to_move = {} + + # Forced printings + self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) + + if self.params.get('simulate'): + if self.params.get('force_write_download_archive', False): + self.record_download_archive(info_dict) + # Do nothing else if in simulate mode + return + + if full_filename is None: + return + if not self._ensure_dir_exists(encodeFilename(full_filename)): + return + if not self._ensure_dir_exists(encodeFilename(temp_filename)): + return + + if self._write_description('video', info_dict, + self.prepare_filename(info_dict, 'description')) is None: + return + + 
sub_files = self._write_subtitles(info_dict, temp_filename)
+        if sub_files is None:
+            return
+        files_to_move.update(dict(sub_files))
+
+        thumb_files = self._write_thumbnails(
+            'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
+        if thumb_files is None:
+            return
+        files_to_move.update(dict(thumb_files))
+
+        infofn = self.prepare_filename(info_dict, 'infojson')
+        _infojson_written = self._write_info_json('video', info_dict, infofn)
+        if _infojson_written:
+            info_dict['__infojson_filename'] = infofn
+        elif _infojson_written is None:
+            return
+
+        # Note: Annotations are deprecated
+        annofn = None
+        if self.params.get('writeannotations', False):
+            annofn = self.prepare_filename(info_dict, 'annotation')
+        if annofn:
+            if not self._ensure_dir_exists(encodeFilename(annofn)):
+                return
+            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
+                self.to_screen('[info] Video annotations are already present')
+            elif not info_dict.get('annotations'):
+                self.report_warning('There are no annotations to write.')
+            else:
+                try:
+                    self.to_screen('[info] Writing video annotations to: ' + annofn)
+                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+                        annofile.write(info_dict['annotations'])
+                except (KeyError, TypeError):
+                    self.report_warning('There are no annotations to write.')
+                except (OSError, IOError):
+                    self.report_error('Cannot write annotations file: ' + annofn)
+                    return
+
+        # Write internet shortcut files
+        url_link = webloc_link = desktop_link = False
+        if self.params.get('writelink', False):
+            if sys.platform == "darwin":  # macOS.
+                webloc_link = True
+            elif sys.platform.startswith("linux"):
+                desktop_link = True
+            else:  # if sys.platform in ['win32', 'cygwin']:
+                url_link = True
+        if self.params.get('writeurllink', False):
+            url_link = True
+        if self.params.get('writewebloclink', False):
+            webloc_link = True
+        if self.params.get('writedesktoplink', False):
+            desktop_link = True
+
+        if url_link or webloc_link or desktop_link:
+            if 'webpage_url' not in info_dict:
+                self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
+                return
+            ascii_url = iri_to_uri(info_dict['webpage_url'])
+
+        def _write_link_file(extension, template, newline, embed_filename):
+            linkfn = replace_extension(full_filename, extension, info_dict.get('ext'))
+            if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
+                self.to_screen('[info] Internet shortcut is already present')
+            else:
+                try:
+                    self.to_screen('[info] Writing internet shortcut to: ' + linkfn)
+                    with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile:
+                        template_vars = {'url': ascii_url}
+                        if embed_filename:
+                            template_vars['filename'] = linkfn[:-(len(extension) + 1)]
+                        linkfile.write(template % template_vars)
+                except (OSError, IOError):
+                    self.report_error('Cannot write internet shortcut ' + linkfn)
+                    return False
+            return True
+
+        if url_link:
+            if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False):
+                return
+        if webloc_link:
+            if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False):
+                return
+        if desktop_link:
+            if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True):
+                return
+
+        try:
+            info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+        except PostProcessingError as err:
+            self.report_error('Preprocessing: 
%s' % str(err)) + return + + must_record_download_archive = False + if self.params.get('skip_download', False): + info_dict['filepath'] = temp_filename + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + info_dict['__files_to_move'] = files_to_move + info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict) + else: + # Download + info_dict.setdefault('__postprocessors', []) + try: + + def existing_file(*filepaths): + ext = info_dict.get('ext') + final_ext = self.params.get('final_ext', ext) + existing_files = [] + for file in orderedSet(filepaths): + if final_ext != ext: + converted = replace_extension(file, final_ext, ext) + if os.path.exists(encodeFilename(converted)): + existing_files.append(converted) + if os.path.exists(encodeFilename(file)): + existing_files.append(file) + + if not existing_files or self.params.get('overwrites', False): + for file in orderedSet(existing_files): + self.report_file_delete(file) + os.remove(encodeFilename(file)) + return None + + info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:] + return existing_files[0] + + success = True + if info_dict.get('requested_formats') is not None: + + def compatible_formats(formats): + # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them. + video_formats = [format for format in formats if format.get('vcodec') != 'none'] + audio_formats = [format for format in formats if format.get('acodec') != 'none'] + if len(video_formats) > 2 or len(audio_formats) > 2: + return False + + # Check extension + exts = set(format.get('ext') for format in formats) + COMPATIBLE_EXTS = ( + set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')), + set(('webm',)), + ) + for ext_sets in COMPATIBLE_EXTS: + if ext_sets.issuperset(exts): + return True + # TODO: Check acodec/vcodec + return False + + requested_formats = info_dict['requested_formats'] + old_ext = info_dict['ext'] + if self.params.get('merge_output_format') is None: + if not compatible_formats(requested_formats): + info_dict['ext'] = 'mkv' + self.report_warning( + 'Requested formats are incompatible for merge and will be merged into mkv') + if (info_dict['ext'] == 'webm' + and info_dict.get('thumbnails') + # check with type instead of pp_key, __name__, or isinstance + # since we dont want any custom PPs to trigger this + and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): + info_dict['ext'] = 'mkv' + self.report_warning( + 'webm doesn\'t support embedding a thumbnail, mkv will be used') + new_ext = info_dict['ext'] + + def correct_ext(filename, ext=new_ext): + if filename == '-': + return filename + filename_real_ext = os.path.splitext(filename)[1][1:] + filename_wo_ext = ( + os.path.splitext(filename)[0] + if filename_real_ext in (old_ext, new_ext) + else filename) + return '%s.%s' % (filename_wo_ext, ext) + + # Ensure filename always has a correct extension for successful merge + full_filename = correct_ext(full_filename) + temp_filename = correct_ext(temp_filename) + dl_filename = existing_file(full_filename, temp_filename) + info_dict['__real_download'] = False + + if dl_filename is not None: + self.report_file_already_downloaded(dl_filename) + elif get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-'): + info_dict['url'] = '\n'.join(f['url'] for f in requested_formats) + success, real_download = self.dl(temp_filename, info_dict) + info_dict['__real_download'] = real_download + else: + downloaded = [] + merger = 
FFmpegMergerPP(self) + if self.params.get('allow_unplayable_formats'): + self.report_warning( + 'You have requested merging of multiple formats ' + 'while also allowing unplayable formats to be downloaded. ' + 'The formats won\'t be merged to prevent data corruption.') + elif not merger.available: + self.report_warning( + 'You have requested merging of multiple formats but ffmpeg is not installed. ' + 'The formats won\'t be merged.') + + if temp_filename == '-': + reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict) + else 'but the formats are incompatible for simultaneous download' if merger.available + else 'but ffmpeg is not installed') + self.report_warning( + f'You have requested downloading multiple formats to stdout {reason}. ' + 'The formats will be streamed one after the other') + fname = temp_filename + for f in requested_formats: + new_info = dict(info_dict) + del new_info['requested_formats'] + new_info.update(f) + if temp_filename != '-': + fname = prepend_extension( + correct_ext(temp_filename, new_info['ext']), + 'f%s' % f['format_id'], new_info['ext']) + if not self._ensure_dir_exists(fname): + return + f['filepath'] = fname + downloaded.append(fname) + partial_success, real_download = self.dl(fname, new_info) + info_dict['__real_download'] = info_dict['__real_download'] or real_download + success = success and partial_success + if merger.available and not self.params.get('allow_unplayable_formats'): + info_dict['__postprocessors'].append(merger) + info_dict['__files_to_merge'] = downloaded + # Even if there were no downloads, it is being merged only now + info_dict['__real_download'] = True + else: + for file in downloaded: + files_to_move[file] = None + else: + # Just a single file + dl_filename = existing_file(full_filename, temp_filename) + if dl_filename is None or dl_filename == temp_filename: + # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part. + # So we should try to resume the download + success, real_download = self.dl(temp_filename, info_dict) + info_dict['__real_download'] = real_download + else: + self.report_file_already_downloaded(dl_filename) + + dl_filename = dl_filename or temp_filename + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + + except network_exceptions as err: + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) + return + except (OSError, IOError) as err: + raise UnavailableVideoError(err) + except (ContentTooShortError, ) as err: + self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + return + + if success and full_filename != '-': + + def fixup(): + do_fixup = True + fixup_policy = self.params.get('fixup') + vid = info_dict['id'] + + if fixup_policy in ('ignore', 'never'): + return + elif fixup_policy == 'warn': + do_fixup = False + elif fixup_policy != 'force': + assert fixup_policy in ('detect_or_warn', None) + if not info_dict.get('__real_download'): + do_fixup = False + + def ffmpeg_fixup(cndn, msg, cls): + if not cndn: + return + if not do_fixup: + self.report_warning(f'{vid}: {msg}') + return + pp = cls(self) + if pp.available: + info_dict['__postprocessors'].append(pp) + else: + self.report_warning(f'{vid}: {msg}. 
Install ffmpeg to fix this automatically') + + stretched_ratio = info_dict.get('stretched_ratio') + ffmpeg_fixup( + stretched_ratio not in (1, None), + f'Non-uniform pixel ratio {stretched_ratio}', + FFmpegFixupStretchedPP) + + ffmpeg_fixup( + (info_dict.get('requested_formats') is None + and info_dict.get('container') == 'm4a_dash' + and info_dict.get('ext') == 'm4a'), + 'writing DASH m4a. Only some players support this container', + FFmpegFixupM4aPP) + + downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None + downloader = downloader.__name__ if downloader else None + ffmpeg_fixup(info_dict.get('requested_formats') is None and downloader == 'HlsFD', + 'malformed AAC bitstream detected', FFmpegFixupM3u8PP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP) + + fixup() + try: + info_dict = self.post_process(dl_filename, info_dict, files_to_move) + except PostProcessingError as err: + self.report_error('Postprocessing: %s' % str(err)) + return + try: + for ph in self._post_hooks: + ph(info_dict['filepath']) + except Exception as err: + self.report_error('post hooks: %s' % str(err)) + return + must_record_download_archive = True + + if must_record_download_archive or self.params.get('force_write_download_archive', False): + self.record_download_archive(info_dict) + max_downloads = self.params.get('max_downloads') + if max_downloads is not None and self._num_downloads >= int(max_downloads): + raise MaxDownloadsReached() + + def download(self, url_list): + """Download a given list of URLs.""" + outtmpl = self.outtmpl_dict['default'] + if (len(url_list) > 1 + and outtmpl != '-' + and '%' not in outtmpl + and self.params.get('max_downloads') != 1): + raise SameFileError(outtmpl) + + for url in url_list: + try: + # It also downloads the videos + res = self.extract_info( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) + except UnavailableVideoError: + self.report_error('unable to download video') + except MaxDownloadsReached: + self.to_screen('[info] Maximum number of downloads reached') + raise + except ExistingVideoReached: + self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing') + raise + except RejectedVideoReached: + self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject') + raise + else: + if self.params.get('dump_single_json', False): + self.post_extract(res) + self.to_stdout(json.dumps(self.sanitize_info(res))) + + return self._download_retcode + + def download_with_info_file(self, info_filename): + with contextlib.closing(fileinput.FileInput( + [info_filename], mode='r', + openhook=fileinput.hook_encoded('utf-8'))) as f: + # FileInput doesn't have a read method, we can't call json.load + info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True)) + try: + self.process_ie_result(info, download=True) + except (DownloadError, EntryNotInPlaylist, ThrottledDownload): + webpage_url = info.get('webpage_url') + if webpage_url is not None: + self.report_warning('The info failed to download, trying with "%s"' % webpage_url) + return self.download([webpage_url]) + else: + raise + return self._download_retcode + + @staticmethod + def sanitize_info(info_dict, remove_private_keys=False): + ''' Sanitize the infodict for 
converting to json '''
+        if info_dict is None:
+            return info_dict
+        info_dict.setdefault('epoch', int(time.time()))
+        remove_keys = {'__original_infodict'}  # Always remove this since this may contain a copy of the entire dict
+        keep_keys = {'_type'}  # Always keep this to facilitate load-info-json
+        if remove_private_keys:
+            remove_keys |= {
+                'requested_formats', 'requested_subtitles', 'requested_entries',
+                'filepath', 'entries', 'original_url', 'playlist_autonumber',
+            }
+            empty_values = (None, {}, [], set(), tuple())
+            reject = lambda k, v: k not in keep_keys and (
+                k.startswith('_') or k in remove_keys or v in empty_values)
+        else:
+            reject = lambda k, v: k in remove_keys
+        filter_fn = lambda obj: (
+            list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
+            else obj if not isinstance(obj, dict)
+            else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
+        return filter_fn(info_dict)
+
+    @staticmethod
+    def filter_requested_info(info_dict, actually_filter=True):
+        ''' Alias of sanitize_info for backward compatibility '''
+        return YoutubeDL.sanitize_info(info_dict, actually_filter)
+
+    def run_pp(self, pp, infodict):
+        files_to_delete = []
+        if '__files_to_move' not in infodict:
+            infodict['__files_to_move'] = {}
+        try:
+            files_to_delete, infodict = pp.run(infodict)
+        except PostProcessingError as e:
+            # Must be True and not 'only_download'
+            if self.params.get('ignoreerrors') is True:
+                self.report_error(e)
+                return infodict
+            raise
+
+        if not files_to_delete:
+            return infodict
+        if self.params.get('keepvideo', False):
+            for f in files_to_delete:
+                infodict['__files_to_move'].setdefault(f, '')
+        else:
+            for old_filename in set(files_to_delete):
+                self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
+                try:
+                    os.remove(encodeFilename(old_filename))
+                except (IOError, OSError):
+                    self.report_warning('Unable to remove downloaded original file')
+                if old_filename in infodict['__files_to_move']:
+                    del infodict['__files_to_move'][old_filename]
+        return infodict
+
+    @staticmethod
+    def post_extract(info_dict):
+        def actual_post_extract(info_dict):
+            if info_dict.get('_type') in ('playlist', 'multi_video'):
+                for video_dict in info_dict.get('entries', {}):
+                    actual_post_extract(video_dict or {})
+                return
+
+            post_extractor = info_dict.get('__post_extractor') or (lambda: {})
+            extra = post_extractor().items()
+            info_dict.update(extra)
+            info_dict.pop('__post_extractor', None)
+
+            original_infodict = info_dict.get('__original_infodict') or {}
+            original_infodict.update(extra)
+            original_infodict.pop('__post_extractor', None)
+
+        actual_post_extract(info_dict or {})
+
+    def pre_process(self, ie_info, key='pre_process', files_to_move=None):
+        info = dict(ie_info)
+        info['__files_to_move'] = files_to_move or {}
+        for pp in self._pps[key]:
+            info = self.run_pp(pp, info)
+        return info, info.pop('__files_to_move', None)
+
+    def post_process(self, filename, ie_info, files_to_move=None):
+        """Run all the postprocessors on the given file."""
+        info = dict(ie_info)
+        info['filepath'] = filename
+        info['__files_to_move'] = files_to_move or {}
+
+        for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
+            info = self.run_pp(pp, info)
+        info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
+        del info['__files_to_move']
+        for pp in self._pps['after_move']:
+            info = self.run_pp(pp, info)
+        return info
+
+    def _make_archive_id(self, info_dict):
+        video_id = info_dict.get('id')
+        if not video_id:
+            return
+        # Future-proof against 
any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist + if extractor is None: + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie_key, ie in self._ies.items(): + if ie.suitable(url): + extractor = ie_key + break + else: + return + return '%s %s' % (extractor.lower(), video_id) + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + + vid_id = self._make_archive_id(info_dict) + if not vid_id: + return False # Incomplete video information + + return vid_id in self.archive + + def record_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return + vid_id = self._make_archive_id(info_dict) + assert vid_id + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') + self.archive.add(vid_id) + + @staticmethod + def format_resolution(format, default='unknown'): + is_images = format.get('vcodec') == 'none' and format.get('acodec') == 'none' + if format.get('vcodec') == 'none' and format.get('acodec') != 'none': + return 'audio only' + if format.get('resolution') is not None: + return format['resolution'] + if format.get('width') and format.get('height'): + res = '%dx%d' % (format['width'], format['height']) + elif format.get('height'): + res = '%sp' % format['height'] + elif format.get('width'): + res = '%dx?' % format['width'] + elif is_images: + return 'images' + else: + return default + return f'{res} images' if is_images else res + + def _format_note(self, fdict): + res = '' + if fdict.get('ext') in ['f4f', 'f4m']: + res += '(unsupported) ' + if fdict.get('language'): + if res: + res += ' ' + res += '[%s] ' % fdict['language'] + if fdict.get('format_note') is not None: + res += fdict['format_note'] + ' ' + if fdict.get('tbr') is not None: + res += '%4dk ' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] + if (fdict.get('vcodec') is not None + and fdict.get('vcodec') != 'none'): + if res: + res += ', ' + res += fdict['vcodec'] + if fdict.get('vbr') is not None: + res += '@' + elif fdict.get('vbr') is not None and fdict.get('abr') is not None: + res += 'video@' + if fdict.get('vbr') is not None: + res += '%4dk' % fdict['vbr'] + if fdict.get('fps') is not None: + if res: + res += ', ' + res += '%sfps' % fdict['fps'] + if fdict.get('acodec') is not None: + if res: + res += ', ' + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += ', ' + res += 'audio' + if fdict.get('abr') is not None: + res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] + if fdict.get('filesize') is not None: + if res: + res += ', ' + res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + format_bytes(fdict['filesize_approx']) + return res + + def list_formats(self, info_dict): + formats = info_dict.get('formats', [info_dict]) + new_format = ( + 'list-formats' not in self.params.get('compat_opts', []) + and self.params.get('listformats_table', True) is not False) + if new_format: + table = [ + [ + format_field(f, 'format_id'), + format_field(f, 'ext'), + self.format_resolution(f), + 
format_field(f, 'fps', '%d'), + '|', + format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes), + format_field(f, 'tbr', '%4dk'), + shorten_protocol_name(f.get('protocol', '').replace("native", "n")), + '|', + format_field(f, 'vcodec', default='unknown').replace('none', ''), + format_field(f, 'vbr', '%4dk'), + format_field(f, 'acodec', default='unknown').replace('none', ''), + format_field(f, 'abr', '%3dk'), + format_field(f, 'asr', '%5dHz'), + ', '.join(filter(None, ( + 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '', + format_field(f, 'language', '[%s]'), + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + ))), + ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] + header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO', + '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO'] + else: + table = [ + [ + format_field(f, 'format_id'), + format_field(f, 'ext'), + self.format_resolution(f), + self._format_note(f)] + for f in formats + if f.get('preference') is None or f['preference'] >= -1000] + header_line = ['format code', 'extension', 'resolution', 'note'] + + self.to_screen( + '[info] Available formats for %s:' % info_dict['id']) + self.to_stdout(render_table( + header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format)) + + def list_thumbnails(self, info_dict): + thumbnails = list(info_dict.get('thumbnails')) + if not thumbnails: + self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) + return + + self.to_screen( + '[info] Thumbnails for %s:' % info_dict['id']) + self.to_stdout(render_table( + ['ID', 'width', 'height', 'URL'], + [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + + def list_subtitles(self, video_id, subtitles, name='subtitles'): + if not subtitles: + self.to_screen('%s has no %s' % (video_id, name)) + return + self.to_screen( + 'Available %s for %s:' % (name, video_id)) + + def _row(lang, formats): + exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats))) + if len(set(names)) == 1: + names = [] if names[0] == 'unknown' else names[:1] + return [lang, ', '.join(names), ', '.join(exts)] + + self.to_stdout(render_table( + ['Language', 'Name', 'Formats'], + [_row(lang, formats) for lang, formats in subtitles.items()], + hideEmpty=True)) + + def urlopen(self, req): + """ Start an HTTP download """ + if isinstance(req, compat_basestring): + req = sanitized_Request(req) + return self._opener.open(req, timeout=self._socket_timeout) + + def print_debug_header(self): + if not self.params.get('verbose'): + return + get_encoding = lambda stream: getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__) + encoding_str = ( + '[debug] Encodings: locale %s, fs %s, stdout %s, stderr %s, pref %s\n' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + get_encoding(self._screen_file), get_encoding(self._err_file), + self.get_encoding())) + + logger = self.params.get('logger') + if logger: + write_debug = lambda msg: logger.debug(f'[debug] {msg}') + write_debug(encoding_str) + else: + write_debug = lambda msg: self._write_string(f'[debug] {msg}') + write_string(encoding_str, encoding=None) + + source = detect_variant() + write_debug('yt-dlp version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})')) + if _LAZY_LOADER: + write_debug('Lazy 
loading extractors enabled\n') + if plugin_extractors or plugin_postprocessors: + write_debug('Plugins: %s\n' % [ + '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') + for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) + if self.params.get('compat_opts'): + write_debug('Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts'))) + try: + sp = subprocess.Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = process_communicate_or_kill(sp) + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_debug('Git HEAD: %s\n' % out) + except Exception: + try: + sys.exc_clear() + except Exception: + pass + + def python_implementation(): + impl_name = platform.python_implementation() + if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): + return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] + return impl_name + + write_debug('Python version %s (%s %s) - %s\n' % ( + platform.python_version(), + python_implementation(), + platform.architecture()[0], + platform_name())) + + exe_versions = FFmpegPostProcessor.get_versions(self) + exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() + exe_str = ', '.join( + f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v + ) or 'none' + write_debug('exe versions: %s\n' % exe_str) + + from .downloader.websocket import has_websockets + from .postprocessor.embedthumbnail import has_mutagen + from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE + + lib_str = ', '.join(sorted(filter(None, ( + compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], + has_websockets and 'websockets', + has_mutagen and 'mutagen', + SQLITE_AVAILABLE and 'sqlite', + KEYRING_AVAILABLE and 'keyring', + )))) or 'none' + write_debug('Optional libraries: %s\n' % lib_str) + write_debug('ANSI escape support: stdout = %s, stderr = %s\n' % ( + supports_terminal_sequences(self._screen_file), + supports_terminal_sequences(self._err_file))) + + proxy_map = {} + for handler in self._opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + write_debug('Proxy map: ' + compat_str(proxy_map) + '\n') + + if self.params.get('call_home', False): + ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') + write_debug('Public IP address: %s\n' % ipaddr) + return + latest_version = self.urlopen( + 'https://yt-dl.org/latest/version').read().decode('utf-8') + if version_tuple(latest_version) > version_tuple(__version__): + self.report_warning( + 'You are using an outdated version (newest version: %s)! ' + 'See https://yt-dl.org/update if you need help updating.' 
% + latest_version) + + def _setup_opener(self): + timeout_val = self.params.get('socket_timeout') + self._socket_timeout = 600 if timeout_val is None else float(timeout_val) + + opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser') + opts_cookiefile = self.params.get('cookiefile') + opts_proxy = self.params.get('proxy') + + self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self) + + cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) + if opts_proxy is not None: + if opts_proxy == '': + proxies = {} + else: + proxies = {'http': opts_proxy, 'https': opts_proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = PerRequestProxyHandler(proxies) + + debuglevel = 1 if self.params.get('debug_printtraffic') else 0 + https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) + ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + redirect_handler = YoutubeDLRedirectHandler() + data_handler = compat_urllib_request_DataHandler() + + # When passing our own FileHandler instance, build_opener won't add the + # default FileHandler and allows us to disable the file protocol, which + # can be used for malicious purposes (see + # https://github.com/ytdl-org/youtube-dl/issues/8227) + file_handler = compat_urllib_request.FileHandler() + + def file_open(*args, **kwargs): + raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons') + file_handler.file_open = file_open + + opener = compat_urllib_request.build_opener( + proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) + + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) + opener.addheaders = [] + self._opener = opener + + def encode(self, s): + if isinstance(s, bytes): + return s # Already encoded + + try: + return s.encode(self.get_encoding()) + except UnicodeEncodeError as err: + err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' 
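+            # re-raise the original UnicodeEncodeError with the extended hint attached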
+ raise + + def get_encoding(self): + encoding = self.params.get('encoding') + if encoding is None: + encoding = preferredencoding() + return encoding + + def _write_info_json(self, label, ie_result, infofn): + ''' Write infojson and returns True = written, False = skip, None = error ''' + if not self.params.get('writeinfojson'): + return False + elif not infofn: + self.write_debug(f'Skipping writing {label} infojson') + return False + elif not self._ensure_dir_exists(infofn): + return None + elif not self.params.get('overwrites', True) and os.path.exists(infofn): + self.to_screen(f'[info] {label.title()} metadata is already present') + else: + self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}') + try: + write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) + except (OSError, IOError): + self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') + return None + return True + + def _write_description(self, label, ie_result, descfn): + ''' Write description and returns True = written, False = skip, None = error ''' + if not self.params.get('writedescription'): + return False + elif not descfn: + self.write_debug(f'Skipping writing {label} description') + return False + elif not self._ensure_dir_exists(descfn): + return None + elif not self.params.get('overwrites', True) and os.path.exists(descfn): + self.to_screen(f'[info] {label.title()} description is already present') + elif ie_result.get('description') is None: + self.report_warning(f'There\'s no {label} description to write') + return False + else: + try: + self.to_screen(f'[info] Writing {label} description to: {descfn}') + with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + descfile.write(ie_result['description']) + except (OSError, IOError): + self.report_error(f'Cannot write {label} description file {descfn}') + return None + return True + + def _write_subtitles(self, info_dict, filename): + ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error''' + ret = [] + subtitles = info_dict.get('requested_subtitles') + if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + return ret + + sub_filename_base = self.prepare_filename(info_dict, 'subtitle') + if not sub_filename_base: + self.to_screen('[info] Skipping writing video subtitles') + return ret + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) + sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext')) + if not self.params.get('overwrites', True) and os.path.exists(sub_filename): + self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present') + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + continue + + self.to_screen(f'[info] Writing video subtitles to: {sub_filename}') + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/ytdl-org/youtube-dl/issues/10268 + with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + 
continue + except (OSError, IOError): + self.report_error(f'Cannot write video subtitles file {sub_filename}') + return None + + try: + sub_copy = sub_info.copy() + sub_copy.setdefault('http_headers', info_dict.get('http_headers')) + self.dl(sub_filename, sub_copy, subtitle=True) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err: + self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}') + continue + return ret + + def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): + ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) ''' + write_all = self.params.get('write_all_thumbnails', False) + thumbnails, ret = [], [] + if write_all or self.params.get('writethumbnail', False): + thumbnails = info_dict.get('thumbnails') or [] + multiple = write_all and len(thumbnails) > 1 + + if thumb_filename_base is None: + thumb_filename_base = filename + if thumbnails and not thumb_filename_base: + self.write_debug(f'Skipping writing {label} thumbnail') + return ret + + for t in thumbnails[::-1]: + thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') + thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '') + thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) + thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) + + if not self.params.get('overwrites', True) and os.path.exists(thumb_filename): + ret.append((thumb_filename, thumb_filename_final)) + t['filepath'] = thumb_filename + self.to_screen(f'[info] {thumb_display_id.title()} is already present') + else: + self.to_screen(f'[info] Downloading {thumb_display_id} ...') + try: + uf = self.urlopen(t['url']) + self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') + with open(encodeFilename(thumb_filename), 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + ret.append((thumb_filename, thumb_filename_final)) + t['filepath'] = thumb_filename + except network_exceptions as err: + self.report_warning(f'Unable to download {thumb_display_id}: {err}') + if ret and not write_all: + break + return ret diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py new file mode 100644 index 000000000..512627ebd --- /dev/null +++ b/yt_dlp/__init__.py @@ -0,0 +1,779 @@ +#!/usr/bin/python +# coding: utf-8 + +__license__ = 'CC0-1.0' + +import codecs +import io +import itertools +import os +import random +import re +import sys + +from .options import ( + parseOpts, +) +from .compat import ( + compat_getpass, + compat_shlex_quote, + workaround_optparse_bug9161, +) +from .cookies import SUPPORTED_BROWSERS +from .utils import ( + DateRange, + decodeOption, + DownloadError, + error_to_compat_str, + ExistingVideoReached, + expand_path, + match_filter_func, + MaxDownloadsReached, + preferredencoding, + read_batch_urls, + RejectedVideoReached, + render_table, + SameFileError, + setproctitle, + std_headers, + write_string, +) +from .downloader import ( + FileDownloader, +) +from .extractor import gen_extractors, list_extractors +from .extractor.common import InfoExtractor +from .extractor.adobepass import MSO_INFO +from .postprocessor import ( + FFmpegExtractAudioPP, + FFmpegSubtitlesConvertorPP, + FFmpegThumbnailsConvertorPP, + FFmpegVideoConvertorPP, + FFmpegVideoRemuxerPP, + MetadataFromFieldPP, + MetadataParserPP, +) +from .YoutubeDL import 
YoutubeDL
+
+
+def _real_main(argv=None):
+ # Compatibility fixes for Windows
+ if sys.platform == 'win32':
+ # https://github.com/ytdl-org/youtube-dl/issues/820
+ codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
+
+ workaround_optparse_bug9161()
+
+ setproctitle('yt-dlp')
+
+ parser, opts, args = parseOpts(argv)
+ warnings = []
+
+ # Set user agent
+ if opts.user_agent is not None:
+ std_headers['User-Agent'] = opts.user_agent
+
+ # Set referer
+ if opts.referer is not None:
+ std_headers['Referer'] = opts.referer
+
+ # Custom HTTP headers
+ std_headers.update(opts.headers)
+
+ # Dump user agent
+ if opts.dump_user_agent:
+ write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
+ sys.exit(0)
+
+ # Batch file verification
+ batch_urls = []
+ if opts.batchfile is not None:
+ try:
+ if opts.batchfile == '-':
+ batchfd = sys.stdin
+ else:
+ batchfd = io.open(
+ expand_path(opts.batchfile),
+ 'r', encoding='utf-8', errors='ignore')
+ batch_urls = read_batch_urls(batchfd)
+ if opts.verbose:
+ write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
+ except IOError:
+ sys.exit('ERROR: batch file %s could not be read' % opts.batchfile)
+ all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already stripped in read_batch_urls
+ _enc = preferredencoding()
+ all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
+
+ if opts.list_extractors:
+ for ie in list_extractors(opts.age_limit):
+ write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout)
+ matchedUrls = [url for url in all_urls if ie.suitable(url)]
+ for mu in matchedUrls:
+ write_string(' ' + mu + '\n', out=sys.stdout)
+ sys.exit(0)
+ if opts.list_extractor_descriptions:
+ for ie in list_extractors(opts.age_limit):
+ if not ie.working():
+ continue
+ desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
+ if desc is False:
+ continue
+ if hasattr(ie, 'SEARCH_KEY'):
+ _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
+ _COUNTS = ('', '5', '10', 'all')
+ desc += ' (Example: "%s%s:%s")' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
+ write_string(desc + '\n', out=sys.stdout)
+ sys.exit(0)
+ if opts.ap_list_mso:
+ table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()]
+ write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout)
+ sys.exit(0)
+
+ # Conflicting, missing and erroneous options
+ if opts.usenetrc and (opts.username is not None or opts.password is not None):
+ parser.error('using .netrc conflicts with giving username/password')
+ if opts.password is not None and opts.username is None:
+ parser.error('account username missing')
+ if opts.ap_password is not None and opts.ap_username is None:
+ parser.error('TV Provider account username missing')
+ if opts.autonumber_size is not None:
+ if opts.autonumber_size <= 0:
+ parser.error('auto number size must be positive')
+ if opts.autonumber_start is not None:
+ if opts.autonumber_start < 0:
+ parser.error('auto number start must be positive or 0')
+ if opts.username is not None and opts.password is None:
+ opts.password = compat_getpass('Type account password and press [Return]: ')
+ if opts.ap_username is not None and opts.ap_password is None:
+ opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ')
+ if
opts.ratelimit is not None: + numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) + if numeric_limit is None: + parser.error('invalid rate limit specified') + opts.ratelimit = numeric_limit + if opts.throttledratelimit is not None: + numeric_limit = FileDownloader.parse_bytes(opts.throttledratelimit) + if numeric_limit is None: + parser.error('invalid rate limit specified') + opts.throttledratelimit = numeric_limit + if opts.min_filesize is not None: + numeric_limit = FileDownloader.parse_bytes(opts.min_filesize) + if numeric_limit is None: + parser.error('invalid min_filesize specified') + opts.min_filesize = numeric_limit + if opts.max_filesize is not None: + numeric_limit = FileDownloader.parse_bytes(opts.max_filesize) + if numeric_limit is None: + parser.error('invalid max_filesize specified') + opts.max_filesize = numeric_limit + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.sleep_interval is None: + parser.error('min sleep interval must be specified, use --min-sleep-interval') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval + if opts.sleep_interval_subtitles is not None: + if opts.sleep_interval_subtitles < 0: + parser.error('subtitles sleep interval must be positive or 0') + if opts.sleep_interval_requests is not None: + if opts.sleep_interval_requests < 0: + parser.error('requests sleep interval must be positive or 0') + if opts.ap_mso and opts.ap_mso not in MSO_INFO: + parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') + if opts.overwrites: # --yes-overwrites implies --no-continue + opts.continue_dl = False + if opts.concurrent_fragment_downloads <= 0: + raise ValueError('Concurrent fragments must be positive') + + def parse_retries(retries, name=''): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') + else: + try: + parsed_retries = int(retries) + except (TypeError, ValueError): + parser.error('invalid %sretry count specified' % name) + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ') + if opts.extractor_retries is not None: + opts.extractor_retries = parse_retries(opts.extractor_retries, 'extractor ') + if opts.buffersize is not None: + numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) + if numeric_buffersize is None: + parser.error('invalid buffer size specified') + opts.buffersize = numeric_buffersize + if opts.http_chunk_size is not None: + numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) + if not numeric_chunksize: + parser.error('invalid http chunk size specified') + opts.http_chunk_size = numeric_chunksize + if opts.playliststart <= 0: + raise ValueError('Playlist start must be positive') + if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: + raise ValueError('Playlist end must be greater than playlist start') + if opts.extractaudio: + if opts.audioformat not in ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS): + parser.error('invalid audio format specified') + if opts.audioquality: + opts.audioquality = 
opts.audioquality.strip('k').strip('K')
+ if not opts.audioquality.isdigit():
+ parser.error('invalid audio quality specified')
+ if opts.recodevideo is not None:
+ opts.recodevideo = opts.recodevideo.replace(' ', '')
+ if not re.match(FFmpegVideoConvertorPP.FORMAT_RE, opts.recodevideo):
+ parser.error('invalid video recode format specified')
+ if opts.remuxvideo is not None:
+ opts.remuxvideo = opts.remuxvideo.replace(' ', '')
+ if not re.match(FFmpegVideoRemuxerPP.FORMAT_RE, opts.remuxvideo):
+ parser.error('invalid video remux format specified')
+ if opts.convertsubtitles is not None:
+ if opts.convertsubtitles not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS:
+ parser.error('invalid subtitle format specified')
+ if opts.convertthumbnails is not None:
+ if opts.convertthumbnails not in FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS:
+ parser.error('invalid thumbnail format specified')
+
+ if opts.cookiesfrombrowser is not None:
+ opts.cookiesfrombrowser = [
+ part.strip() or None for part in opts.cookiesfrombrowser.split(':', 1)]
+ if opts.cookiesfrombrowser[0].lower() not in SUPPORTED_BROWSERS:
+ parser.error('unsupported browser specified for cookies')
+
+ if opts.date is not None:
+ date = DateRange.day(opts.date)
+ else:
+ date = DateRange(opts.dateafter, opts.datebefore)
+
+ compat_opts = opts.compat_opts
+
+ def _unused_compat_opt(name):
+ if name not in compat_opts:
+ return False
+ compat_opts.discard(name)
+ compat_opts.update(['*%s' % name])
+ return True
+
+ def set_default_compat(compat_name, opt_name, default=True, remove_compat=True):
+ attr = getattr(opts, opt_name)
+ if compat_name in compat_opts:
+ if attr is None:
+ setattr(opts, opt_name, not default)
+ return True
+ else:
+ if remove_compat:
+ _unused_compat_opt(compat_name)
+ return False
+ elif attr is None:
+ setattr(opts, opt_name, default)
+ return None
+
+ set_default_compat('abort-on-error', 'ignoreerrors', 'only_download')
+ set_default_compat('no-playlist-metafiles', 'allow_playlist_files')
+ set_default_compat('no-clean-infojson', 'clean_infojson')
+ if 'format-sort' in compat_opts:
+ opts.format_sort.extend(InfoExtractor.FormatSort.ytdl_default)
+ _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False)
+ _audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False)
+ if _video_multistreams_set is False and _audio_multistreams_set is False:
+ _unused_compat_opt('multistreams')
+ outtmpl_default = opts.outtmpl.get('default')
+ if 'filename' in compat_opts:
+ if outtmpl_default is None:
+ outtmpl_default = '%(title)s-%(id)s.%(ext)s'
+ opts.outtmpl.update({'default': outtmpl_default})
+ else:
+ _unused_compat_opt('filename')
+
+ def validate_outtmpl(tmpl, msg):
+ err = YoutubeDL.validate_outtmpl(tmpl)
+ if err:
+ parser.error('invalid %s %r: %s' % (msg, tmpl, error_to_compat_str(err)))
+
+ for k, tmpl in opts.outtmpl.items():
+ validate_outtmpl(tmpl, f'{k} output template')
+ opts.forceprint = opts.forceprint or []
+ for tmpl in opts.forceprint or []:
+ validate_outtmpl(tmpl, 'print template')
+ validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title')
+ for k, tmpl in opts.progress_template.items():
+ k = f'{k[:-6]} console title' if '-title' in k else f'{k} progress'
+ validate_outtmpl(tmpl, f'{k} template')
+
+ if opts.extractaudio and not opts.keepvideo and opts.format is None:
+ opts.format = 'bestaudio/best'
+
+ if outtmpl_default is not None and not
os.path.splitext(outtmpl_default)[1] and opts.extractaudio: + parser.error('Cannot download a video and extract audio into the same' + ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' + ' template'.format(outtmpl_default)) + + for f in opts.format_sort: + if re.match(InfoExtractor.FormatSort.regex, f) is None: + parser.error('invalid format sort string "%s" specified' % f) + + def metadataparser_actions(f): + if isinstance(f, str): + cmd = '--parse-metadata %s' % compat_shlex_quote(f) + try: + actions = [MetadataFromFieldPP.to_action(f)] + except Exception as err: + parser.error(f'{cmd} is invalid; {err}') + else: + cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f)) + actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(',')) + + for action in actions: + try: + MetadataParserPP.validate_action(*action) + except Exception as err: + parser.error(f'{cmd} is invalid; {err}') + yield action + + if opts.parse_metadata is None: + opts.parse_metadata = [] + if opts.metafromtitle is not None: + opts.parse_metadata.append('title:%s' % opts.metafromtitle) + opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata))) + + any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json + any_printing = opts.print_json + download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive + + # If JSON is not printed anywhere, but comments are requested, save it to file + printing_json = opts.dumpjson or opts.print_json or opts.dump_single_json + if opts.getcomments and not printing_json: + opts.writeinfojson = True + + if opts.no_sponsorblock: + opts.sponsorblock_mark = set() + opts.sponsorblock_remove = set() + sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove + + if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: + opts.addchapters = True + opts.remove_chapters = opts.remove_chapters or [] + + def report_conflict(arg1, arg2): + warnings.append('%s is ignored since %s was given' % (arg2, arg1)) + + if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False: + if opts.sponskrub: + if opts.remove_chapters: + report_conflict('--remove-chapters', '--sponskrub') + if opts.sponsorblock_mark: + report_conflict('--sponsorblock-mark', '--sponskrub') + if opts.sponsorblock_remove: + report_conflict('--sponsorblock-remove', '--sponskrub') + opts.sponskrub = False + if opts.sponskrub_cut and opts.split_chapters and opts.sponskrub is not False: + report_conflict('--split-chapter', '--sponskrub-cut') + opts.sponskrub_cut = False + + if opts.remuxvideo and opts.recodevideo: + report_conflict('--recode-video', '--remux-video') + opts.remuxvideo = False + + if opts.allow_unplayable_formats: + if opts.extractaudio: + report_conflict('--allow-unplayable-formats', '--extract-audio') + opts.extractaudio = False + if opts.remuxvideo: + report_conflict('--allow-unplayable-formats', '--remux-video') + opts.remuxvideo = False + if opts.recodevideo: + report_conflict('--allow-unplayable-formats', '--recode-video') + opts.recodevideo = False + if opts.addmetadata: + report_conflict('--allow-unplayable-formats', '--add-metadata') + opts.addmetadata = False + if opts.embedsubtitles: + report_conflict('--allow-unplayable-formats', '--embed-subs') + opts.embedsubtitles = 
False + if opts.embedthumbnail: + report_conflict('--allow-unplayable-formats', '--embed-thumbnail') + opts.embedthumbnail = False + if opts.xattrs: + report_conflict('--allow-unplayable-formats', '--xattrs') + opts.xattrs = False + if opts.fixup and opts.fixup.lower() not in ('never', 'ignore'): + report_conflict('--allow-unplayable-formats', '--fixup') + opts.fixup = 'never' + if opts.remove_chapters: + report_conflict('--allow-unplayable-formats', '--remove-chapters') + opts.remove_chapters = [] + if opts.sponsorblock_remove: + report_conflict('--allow-unplayable-formats', '--sponsorblock-remove') + opts.sponsorblock_remove = set() + if opts.sponskrub: + report_conflict('--allow-unplayable-formats', '--sponskrub') + opts.sponskrub = False + + # PostProcessors + postprocessors = list(opts.add_postprocessors) + if sponsorblock_query: + postprocessors.append({ + 'key': 'SponsorBlock', + 'categories': sponsorblock_query, + 'api': opts.sponsorblock_api, + # Run this immediately after extraction is complete + 'when': 'pre_process' + }) + if opts.parse_metadata: + postprocessors.append({ + 'key': 'MetadataParser', + 'actions': opts.parse_metadata, + # Run this immediately after extraction is complete + 'when': 'pre_process' + }) + if opts.convertsubtitles: + postprocessors.append({ + 'key': 'FFmpegSubtitlesConvertor', + 'format': opts.convertsubtitles, + # Run this before the actual video download + 'when': 'before_dl' + }) + if opts.convertthumbnails: + postprocessors.append({ + 'key': 'FFmpegThumbnailsConvertor', + 'format': opts.convertthumbnails, + # Run this before the actual video download + 'when': 'before_dl' + }) + # Must be after all other before_dl + if opts.exec_before_dl_cmd: + postprocessors.append({ + 'key': 'Exec', + 'exec_cmd': opts.exec_before_dl_cmd, + 'when': 'before_dl' + }) + if opts.extractaudio: + postprocessors.append({ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': opts.audioformat, + 'preferredquality': opts.audioquality, + 'nopostoverwrites': opts.nopostoverwrites, + }) + if opts.remuxvideo: + postprocessors.append({ + 'key': 'FFmpegVideoRemuxer', + 'preferedformat': opts.remuxvideo, + }) + if opts.recodevideo: + postprocessors.append({ + 'key': 'FFmpegVideoConvertor', + 'preferedformat': opts.recodevideo, + }) + # If ModifyChapters is going to remove chapters, subtitles must already be in the container. + if opts.embedsubtitles: + already_have_subtitle = opts.writesubtitles and 'no-keep-subs' not in compat_opts + postprocessors.append({ + 'key': 'FFmpegEmbedSubtitle', + # already_have_subtitle = True prevents the file from being deleted after embedding + 'already_have_subtitle': already_have_subtitle + }) + if not opts.writeautomaticsub and 'no-keep-subs' not in compat_opts: + opts.writesubtitles = True + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + # this was the old behaviour if only --all-sub was given. 
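+ # (Added example for clarity: with this compatibility shim, a bare
+ # `yt-dlp --all-subs URL` behaves like `yt-dlp --all-subs --write-subs URL`,
+ # i.e. the subtitle files are still written to disk as in youtube-dl.)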
+ if opts.allsubtitles and not opts.writeautomaticsub: + opts.writesubtitles = True + # ModifyChapters must run before FFmpegMetadataPP + remove_chapters_patterns = [] + for regex in opts.remove_chapters: + try: + remove_chapters_patterns.append(re.compile(regex)) + except re.error as err: + parser.error(f'invalid --remove-chapters regex {regex!r} - {err}') + if opts.remove_chapters or sponsorblock_query: + postprocessors.append({ + 'key': 'ModifyChapters', + 'remove_chapters_patterns': remove_chapters_patterns, + 'remove_sponsor_segments': opts.sponsorblock_remove, + 'sponsorblock_chapter_title': opts.sponsorblock_chapter_title, + 'force_keyframes': opts.force_keyframes_at_cuts + }) + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. + if opts.addmetadata or opts.addchapters: + postprocessors.append({ + 'key': 'FFmpegMetadata', + 'add_chapters': opts.addchapters, + 'add_metadata': opts.addmetadata, + }) + # Note: Deprecated + # This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment + # but must be below EmbedSubtitle and FFmpegMetadata + # See https://github.com/yt-dlp/yt-dlp/issues/204 , https://github.com/faissaloo/SponSkrub/issues/29 + # If opts.sponskrub is None, sponskrub is used, but it silently fails if the executable can't be found + if opts.sponskrub is not False: + postprocessors.append({ + 'key': 'SponSkrub', + 'path': opts.sponskrub_path, + 'args': opts.sponskrub_args, + 'cut': opts.sponskrub_cut, + 'force': opts.sponskrub_force, + 'ignoreerror': opts.sponskrub is None, + }) + if opts.embedthumbnail: + already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails + postprocessors.append({ + 'key': 'EmbedThumbnail', + # already_have_thumbnail = True prevents the file from being deleted after embedding + 'already_have_thumbnail': already_have_thumbnail + }) + if not already_have_thumbnail: + opts.writethumbnail = True + opts.outtmpl['pl_thumbnail'] = '' + if opts.split_chapters: + postprocessors.append({ + 'key': 'FFmpegSplitChapters', + 'force_keyframes': opts.force_keyframes_at_cuts, + }) + # XAttrMetadataPP should be run after post-processors that may change file contents + if opts.xattrs: + postprocessors.append({'key': 'XAttrMetadata'}) + # Exec must be the last PP + if opts.exec_cmd: + postprocessors.append({ + 'key': 'Exec', + 'exec_cmd': opts.exec_cmd, + # Run this only after the files have been moved to their final locations + 'when': 'after_move' + }) + + def report_args_compat(arg, name): + warnings.append('%s given without specifying name. 
The arguments will be given to all %s' % (arg, name)) + + if 'default' in opts.external_downloader_args: + report_args_compat('--downloader-args', 'external downloaders') + + if 'default-compat' in opts.postprocessor_args and 'default' not in opts.postprocessor_args: + report_args_compat('--post-processor-args', 'post-processors') + opts.postprocessor_args.setdefault('sponskrub', []) + opts.postprocessor_args['default'] = opts.postprocessor_args['default-compat'] + + final_ext = ( + opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS + else opts.remuxvideo if opts.remuxvideo in FFmpegVideoRemuxerPP.SUPPORTED_EXTS + else opts.audioformat if (opts.extractaudio and opts.audioformat != 'best') + else None) + + match_filter = ( + None if opts.match_filter is None + else match_filter_func(opts.match_filter)) + + ydl_opts = { + 'usenetrc': opts.usenetrc, + 'netrc_location': opts.netrc_location, + 'username': opts.username, + 'password': opts.password, + 'twofactor': opts.twofactor, + 'videopassword': opts.videopassword, + 'ap_mso': opts.ap_mso, + 'ap_username': opts.ap_username, + 'ap_password': opts.ap_password, + 'quiet': (opts.quiet or any_getting or any_printing), + 'no_warnings': opts.no_warnings, + 'forceurl': opts.geturl, + 'forcetitle': opts.gettitle, + 'forceid': opts.getid, + 'forcethumbnail': opts.getthumbnail, + 'forcedescription': opts.getdescription, + 'forceduration': opts.getduration, + 'forcefilename': opts.getfilename, + 'forceformat': opts.getformat, + 'forceprint': opts.forceprint, + 'forcejson': opts.dumpjson or opts.print_json, + 'dump_single_json': opts.dump_single_json, + 'force_write_download_archive': opts.force_write_download_archive, + 'simulate': (any_getting or None) if opts.simulate is None else opts.simulate, + 'skip_download': opts.skip_download, + 'format': opts.format, + 'allow_unplayable_formats': opts.allow_unplayable_formats, + 'ignore_no_formats_error': opts.ignore_no_formats_error, + 'format_sort': opts.format_sort, + 'format_sort_force': opts.format_sort_force, + 'allow_multiple_video_streams': opts.allow_multiple_video_streams, + 'allow_multiple_audio_streams': opts.allow_multiple_audio_streams, + 'check_formats': opts.check_formats, + 'listformats': opts.listformats, + 'listformats_table': opts.listformats_table, + 'outtmpl': opts.outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, + 'paths': opts.paths, + 'autonumber_size': opts.autonumber_size, + 'autonumber_start': opts.autonumber_start, + 'restrictfilenames': opts.restrictfilenames, + 'windowsfilenames': opts.windowsfilenames, + 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, + 'ratelimit': opts.ratelimit, + 'throttledratelimit': opts.throttledratelimit, + 'overwrites': opts.overwrites, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, + 'extractor_retries': opts.extractor_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, + 'keep_fragments': opts.keep_fragments, + 'concurrent_fragment_downloads': opts.concurrent_fragment_downloads, + 'buffersize': opts.buffersize, + 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, + 'continuedl': opts.continue_dl, + 'noprogress': opts.quiet if opts.noprogress is None else opts.noprogress, + 'progress_with_newline': opts.progress_with_newline, + 'progress_template': opts.progress_template, + 'playliststart': opts.playliststart, + 'playlistend': opts.playlistend, + 'playlistreverse': opts.playlist_reverse, 
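+ # (Added note: as with every entry in this dict, the key is the YoutubeDL
+ # parameter name and the value comes from the parsed CLI option, e.g.
+ # --playlist-reverse populates opts.playlist_reverse above.)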
+ 'playlistrandom': opts.playlist_random, + 'noplaylist': opts.noplaylist, + 'logtostderr': outtmpl_default == '-', + 'consoletitle': opts.consoletitle, + 'nopart': opts.nopart, + 'updatetime': opts.updatetime, + 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, + 'writeinfojson': opts.writeinfojson, + 'allow_playlist_files': opts.allow_playlist_files, + 'clean_infojson': opts.clean_infojson, + 'getcomments': opts.getcomments, + 'writethumbnail': opts.writethumbnail, + 'write_all_thumbnails': opts.write_all_thumbnails, + 'writelink': opts.writelink, + 'writeurllink': opts.writeurllink, + 'writewebloclink': opts.writewebloclink, + 'writedesktoplink': opts.writedesktoplink, + 'writesubtitles': opts.writesubtitles, + 'writeautomaticsub': opts.writeautomaticsub, + 'allsubtitles': opts.allsubtitles, + 'listsubtitles': opts.listsubtitles, + 'subtitlesformat': opts.subtitlesformat, + 'subtitleslangs': opts.subtitleslangs, + 'matchtitle': decodeOption(opts.matchtitle), + 'rejecttitle': decodeOption(opts.rejecttitle), + 'max_downloads': opts.max_downloads, + 'prefer_free_formats': opts.prefer_free_formats, + 'trim_file_name': opts.trim_file_name, + 'verbose': opts.verbose, + 'dump_intermediate_pages': opts.dump_intermediate_pages, + 'write_pages': opts.write_pages, + 'test': opts.test, + 'keepvideo': opts.keepvideo, + 'min_filesize': opts.min_filesize, + 'max_filesize': opts.max_filesize, + 'min_views': opts.min_views, + 'max_views': opts.max_views, + 'daterange': date, + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, + 'age_limit': opts.age_limit, + 'download_archive': download_archive_fn, + 'break_on_existing': opts.break_on_existing, + 'break_on_reject': opts.break_on_reject, + 'skip_playlist_after_errors': opts.skip_playlist_after_errors, + 'cookiefile': opts.cookiefile, + 'cookiesfrombrowser': opts.cookiesfrombrowser, + 'nocheckcertificate': opts.no_check_certificate, + 'prefer_insecure': opts.prefer_insecure, + 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, + 'bidi_workaround': opts.bidi_workaround, + 'debug_printtraffic': opts.debug_printtraffic, + 'prefer_ffmpeg': opts.prefer_ffmpeg, + 'include_ads': opts.include_ads, + 'default_search': opts.default_search, + 'dynamic_mpd': opts.dynamic_mpd, + 'extractor_args': opts.extractor_args, + 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'youtube_include_hls_manifest': opts.youtube_include_hls_manifest, + 'encoding': opts.encoding, + 'extract_flat': opts.extract_flat, + 'mark_watched': opts.mark_watched, + 'merge_output_format': opts.merge_output_format, + 'final_ext': final_ext, + 'postprocessors': postprocessors, + 'fixup': opts.fixup, + 'source_address': opts.source_address, + 'call_home': opts.call_home, + 'sleep_interval_requests': opts.sleep_interval_requests, + 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, + 'sleep_interval_subtitles': opts.sleep_interval_subtitles, + 'external_downloader': opts.external_downloader, + 'list_thumbnails': opts.list_thumbnails, + 'playlist_items': opts.playlist_items, + 'xattr_set_filesize': opts.xattr_set_filesize, + 'match_filter': match_filter, + 'no_color': opts.no_color, + 'ffmpeg_location': opts.ffmpeg_location, + 'hls_prefer_native': opts.hls_prefer_native, + 'hls_use_mpegts': opts.hls_use_mpegts, + 'hls_split_discontinuity': opts.hls_split_discontinuity, + 'external_downloader_args': opts.external_downloader_args, + 'postprocessor_args': 
opts.postprocessor_args,
+ 'cn_verification_proxy': opts.cn_verification_proxy,
+ 'geo_verification_proxy': opts.geo_verification_proxy,
+ 'geo_bypass': opts.geo_bypass,
+ 'geo_bypass_country': opts.geo_bypass_country,
+ 'geo_bypass_ip_block': opts.geo_bypass_ip_block,
+ 'warnings': warnings,
+ 'compat_opts': compat_opts,
+ }
+
+ with YoutubeDL(ydl_opts) as ydl:
+ actual_use = len(all_urls) or opts.load_info_filename
+
+ # Remove cache dir
+ if opts.rm_cachedir:
+ ydl.cache.remove()
+
+ # Maybe do nothing
+ if not actual_use:
+ ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv)
+ parser.error(
+ 'You must provide at least one URL.\n'
+ 'Type yt-dlp --help to see a list of all options.')
+
+ try:
+ if opts.load_info_filename is not None:
+ retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename))
+ else:
+ retcode = ydl.download(all_urls)
+ except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached):
+ ydl.to_screen('Aborting remaining downloads')
+ retcode = 101
+
+ sys.exit(retcode)
+
+
+def main(argv=None):
+ try:
+ _real_main(argv)
+ except DownloadError:
+ sys.exit(1)
+ except SameFileError:
+ sys.exit('ERROR: fixed output name but more than one file to download')
+ except KeyboardInterrupt:
+ sys.exit('\nERROR: Interrupted by user')
+ except BrokenPipeError as err:
+ # https://docs.python.org/3/library/signal.html#note-on-sigpipe
+ devnull = os.open(os.devnull, os.O_WRONLY)
+ os.dup2(devnull, sys.stdout.fileno())
+ sys.exit(f'\nERROR: {err}')
+
+
+__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/yt_dlp/__main__.py b/yt_dlp/__main__.py new file mode 100644 index 000000000..c9f41473d --- /dev/null +++ b/yt_dlp/__main__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3
+from __future__ import unicode_literals
+
+# Execute with
+# $ python yt_dlp/__main__.py
+# $ python -m yt_dlp
+
+import sys
+
+if __package__ is None and not hasattr(sys, 'frozen'):
+ # direct call of __main__.py
+ import os.path
+ path = os.path.realpath(os.path.abspath(__file__))
+ sys.path.insert(0, os.path.dirname(os.path.dirname(path)))
+
+import yt_dlp
+
+if __name__ == '__main__':
+ yt_dlp.main() diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py new file mode 100644 index 000000000..60cdeb74e --- /dev/null +++ b/yt_dlp/aes.py @@ -0,0 +1,468 @@ +from __future__ import unicode_literals
+
+from math import ceil
+
+from .compat import compat_b64decode, compat_pycrypto_AES
+from .utils import bytes_to_intlist, intlist_to_bytes
+
+
+if compat_pycrypto_AES:
+ def aes_cbc_decrypt_bytes(data, key, iv):
+ """ Decrypt bytes with AES-CBC using pycryptodome """
+ return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_CBC, iv).decrypt(data)
+
+ def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce):
+ """ Decrypt bytes with AES-GCM using pycryptodome """
+ return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag)
+
+else:
+ def aes_cbc_decrypt_bytes(data, key, iv):
+ """ Decrypt bytes with AES-CBC using native implementation since pycryptodome is unavailable """
+ return intlist_to_bytes(aes_cbc_decrypt(*map(bytes_to_intlist, (data, key, iv))))
+
+ def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce):
+ """ Decrypt bytes with AES-GCM using native implementation since pycryptodome is unavailable """
+ return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce))))
+
+
+BLOCK_SIZE_BYTES = 16
+
+
+def aes_ctr_decrypt(data, key, iv):
+ """
+ Decrypt with aes in
counter mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} decrypted data + """ + return aes_ctr_encrypt(data, key, iv) + + +def aes_ctr_encrypt(data, key, iv): + """ + Encrypt with aes in counter mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + counter = iter_vector(iv) + + encrypted_data = [] + for i in range(block_count): + counter_block = next(counter) + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + cipher_counter_block = aes_encrypt(counter_block, expanded_key) + encrypted_data += xor(block, cipher_counter_block) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_cbc_encrypt(data, key, iv): + """ + Encrypt with aes in CBC mode. 
Using PKCS#7 padding
+
+ @param {int[]} data cleartext
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv 16-Byte IV
+ @returns {int[]} encrypted data
+ """
+ expanded_key = key_expansion(key)
+ block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+ encrypted_data = []
+ previous_cipher_block = iv
+ for i in range(block_count):
+ block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ remaining_length = BLOCK_SIZE_BYTES - len(block)
+ block += [remaining_length] * remaining_length
+ mixed_block = xor(block, previous_cipher_block)
+
+ encrypted_block = aes_encrypt(mixed_block, expanded_key)
+ encrypted_data += encrypted_block
+
+ previous_cipher_block = encrypted_block
+
+ return encrypted_data
+
+
+def aes_gcm_decrypt_and_verify(data, key, tag, nonce):
+ """
+ Decrypt with aes in GCM mode and check authenticity using the tag
+
+ @param {int[]} data cipher
+ @param {int[]} key 16-Byte cipher key
+ @param {int[]} tag authentication tag
+ @param {int[]} nonce IV (recommended 12-Byte)
+ @returns {int[]} decrypted data
+ """
+
+ # XXX: check aes, gcm param
+
+ hash_subkey = aes_encrypt([0] * BLOCK_SIZE_BYTES, key_expansion(key))
+
+ if len(nonce) == 12:
+ j0 = nonce + [0, 0, 0, 1]
+ else:
+ fill = (BLOCK_SIZE_BYTES - (len(nonce) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES + 8
+ ghash_in = nonce + [0] * fill + bytes_to_intlist((8 * len(nonce)).to_bytes(8, 'big'))
+ j0 = ghash(hash_subkey, ghash_in)
+
+ # TODO: add nonce support to aes_ctr_decrypt
+
+ # nonce_ctr = j0[:12]
+ iv_ctr = inc(j0)
+
+ decrypted_data = aes_ctr_decrypt(data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr)))
+ pad_len = len(data) // 16 * 16
+ s_tag = ghash(
+ hash_subkey,
+ data
+ + [0] * (BLOCK_SIZE_BYTES - len(data) + pad_len) # pad
+ + bytes_to_intlist((0 * 8).to_bytes(8, 'big') # length of associated data
+ + ((len(data) * 8).to_bytes(8, 'big'))) # length of data
+ )
+
+ if tag != aes_ctr_encrypt(s_tag, key, j0):
+ raise ValueError("Mismatching authentication tag")
+
+ return decrypted_data
+
+
+def aes_encrypt(data, expanded_key):
+ """
+ Encrypt one block with aes
+
+ @param {int[]} data 16-Byte state
+ @param {int[]} expanded_key 176/208/240-Byte expanded key
+ @returns {int[]} 16-Byte cipher
+ """
+ rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+ data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
+ for i in range(1, rounds + 1):
+ data = sub_bytes(data)
+ data = shift_rows(data)
+ if i != rounds:
+ data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX))
+ data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES])
+
+ return data
+
+
+def aes_decrypt(data, expanded_key):
+ """
+ Decrypt one block with aes
+
+ @param {int[]} data 16-Byte cipher
+ @param {int[]} expanded_key 176/208/240-Byte expanded key
+ @returns {int[]} 16-Byte state
+ """
+ rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+ for i in range(rounds, 0, -1):
+ data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES])
+ if i != rounds:
+ data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX_INV))
+ data = shift_rows_inv(data)
+ data = sub_bytes_inv(data)
+ data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
+
+ return data
+
+
+def aes_decrypt_text(data, password, key_size_bytes):
+ """
+ Decrypt text
+ - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter
+ - The cipher key is retrieved by encrypting the first 16 Byte of 'password'
+ with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's)
+ - Mode of operation is
'counter' + + @param {str} data Base64 encoded string + @param {str,unicode} password Password (will be encoded with utf-8) + @param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit + @returns {str} Decrypted data + """ + NONCE_LENGTH_BYTES = 8 + + data = bytes_to_intlist(compat_b64decode(data)) + password = bytes_to_intlist(password.encode('utf-8')) + + key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) + key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) + + nonce = data[:NONCE_LENGTH_BYTES] + cipher = data[NONCE_LENGTH_BYTES:] + + decrypted_data = aes_ctr_decrypt(cipher, key, nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)) + plaintext = intlist_to_bytes(decrypted_data) + + return plaintext + + +RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) +SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) +SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 
0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) +MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1), + (0x1, 0x2, 0x3, 0x1), + (0x1, 0x1, 0x2, 0x3), + (0x3, 0x1, 0x1, 0x2)) +MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9), + (0x9, 0xE, 0xB, 0xD), + (0xD, 0x9, 0xE, 0xB), + (0xB, 0xD, 0x9, 0xE)) +RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) +RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 
0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) + + +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + + +def iter_vector(iv): + while True: + yield iv + iv = inc(iv) + + +def sub_bytes(data): + return [SBOX[x] for x in data] + + +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + + +def rotate(data): + return data[1:] + [data[0]] + + +def key_schedule_core(data, rcon_iteration): + data = rotate(data) + data = sub_bytes(data) + data[0] = data[0] ^ RCON[rcon_iteration] + + return data + + +def xor(data1, data2): + return [x ^ y for x, y in zip(data1, data2)] + + +def iter_mix_columns(data, matrix): + for i in (0, 4, 8, 12): + for row in matrix: + mixed = 0 + for j in range(4): + # xor is (+) and (-) + mixed ^= (0 if data[i:i + 4][j] == 0 or row[j] == 0 else + RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[data[i + j]] + RIJNDAEL_LOG_TABLE[row[j]]) % 0xFF]) + yield mixed + + +def shift_rows(data): + return [data[((column + row) & 0b11) * 4 + row] for column in range(4) for row in range(4)] + + +def shift_rows_inv(data): + return [data[((column - row) & 0b11) * 4 + row] for column in range(4) for row in range(4)] + + +def shift_block(data): + data_shifted = [] + + bit = 0 + for n in data: + if bit: + n |= 0x100 + bit = n & 1 + n >>= 1 + data_shifted.append(n) + + return data_shifted + + +def inc(data): + data = data[:] # copy + for i in range(len(data) - 1, -1, -1): + if data[i] == 255: + data[i] = 0 + else: + data[i] = data[i] + 1 + break + return data + + +def block_product(block_x, block_y): + # NIST SP 800-38D, Algorithm 1 + + if len(block_x) != BLOCK_SIZE_BYTES or len(block_y) != BLOCK_SIZE_BYTES: + raise ValueError("Length of blocks need to be %d bytes" % BLOCK_SIZE_BYTES) + + block_r = [0xE1] + [0] * (BLOCK_SIZE_BYTES - 1) + block_v = block_y[:] + block_z = [0] * BLOCK_SIZE_BYTES + + for i in block_x: + for bit in range(7, -1, -1): + if i & (1 << bit): + block_z = xor(block_z, block_v) + + do_xor = block_v[-1] & 1 + block_v = shift_block(block_v) + if do_xor: + block_v = xor(block_v, block_r) + + return block_z + + +def ghash(subkey, data): + # NIST SP 800-38D, Algorithm 2 + + if len(data) % BLOCK_SIZE_BYTES: + raise ValueError("Length of data should be %d bytes" % BLOCK_SIZE_BYTES) + + last_y = [0] * BLOCK_SIZE_BYTES + for i in range(0, len(data), 
BLOCK_SIZE_BYTES): + block = data[i : i + BLOCK_SIZE_BYTES] # noqa: E203 + last_y = block_product(xor(last_y, block), subkey) + + return last_y + + +__all__ = [ + 'aes_ctr_decrypt', + 'aes_cbc_decrypt', + 'aes_cbc_decrypt_bytes', + 'aes_decrypt_text', + 'aes_encrypt', + 'aes_gcm_decrypt_and_verify', + 'aes_gcm_decrypt_and_verify_bytes', + 'key_expansion' +] diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py new file mode 100644 index 000000000..e5cb193bc --- /dev/null +++ b/yt_dlp/cache.py @@ -0,0 +1,98 @@ +from __future__ import unicode_literals + +import errno +import io +import json +import os +import re +import shutil +import traceback + +from .compat import compat_getenv +from .utils import ( + expand_path, + write_json_file, +) + + +class Cache(object): + def __init__(self, ydl): + self._ydl = ydl + + def _get_root_dir(self): + res = self._ydl.params.get('cachedir') + if res is None: + cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') + res = os.path.join(cache_root, 'yt-dlp') + return expand_path(res) + + def _get_cache_fn(self, section, key, dtype): + assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ + 'invalid section %r' % section + assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key + return os.path.join( + self._get_root_dir(), section, '%s.%s' % (key, dtype)) + + @property + def enabled(self): + return self._ydl.params.get('cachedir') is not False + + def store(self, section, key, data, dtype='json'): + assert dtype in ('json',) + + if not self.enabled: + return + + fn = self._get_cache_fn(section, key, dtype) + try: + try: + os.makedirs(os.path.dirname(fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + self._ydl.write_debug(f'Saving {section}.{key} to cache') + write_json_file(data, fn) + except Exception: + tb = traceback.format_exc() + self._ydl.report_warning( + 'Writing cache to %r failed: %s' % (fn, tb)) + + def load(self, section, key, dtype='json', default=None): + assert dtype in ('json',) + + if not self.enabled: + return default + + cache_fn = self._get_cache_fn(section, key, dtype) + try: + try: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + self._ydl.write_debug(f'Loading {section}.{key} from cache') + return json.load(cachef) + except ValueError: + try: + file_size = os.path.getsize(cache_fn) + except (OSError, IOError) as oe: + file_size = str(oe) + self._ydl.report_warning( + 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) + except IOError: + pass # No cache available + + return default + + def remove(self): + if not self.enabled: + self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)') + return + + cachedir = self._get_root_dir() + if not any((term in cachedir) for term in ('cache', 'tmp')): + raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir) + + self._ydl.to_screen( + 'Removing cache dir %s .' 
% cachedir, skip_eol=True) + if os.path.exists(cachedir): + self._ydl.to_screen('.', skip_eol=True) + shutil.rmtree(cachedir) + self._ydl.to_screen('.') diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py new file mode 100644 index 000000000..b107b2114 --- /dev/null +++ b/yt_dlp/compat.py @@ -0,0 +1,292 @@ +# coding: utf-8 + +import asyncio +import base64 +import ctypes +import getpass +import html +import html.parser +import http +import http.client +import http.cookiejar +import http.cookies +import http.server +import itertools +import optparse +import os +import re +import shlex +import shutil +import socket +import struct +import sys +import tokenize +import urllib +import xml.etree.ElementTree as etree +from subprocess import DEVNULL + + +# HTMLParseError has been deprecated in Python 3.3 and removed in +# Python 3.5. Introducing dummy exception for Python >3.5 for compatible +# and uniform cross-version exception handling +class compat_HTMLParseError(Exception): + pass + + +# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE +# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines +def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + +class _TreeBuilder(etree.TreeBuilder): + def doctype(self, name, pubid, system): + pass + + +def compat_etree_fromstring(text): + return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) + + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': + def compat_shlex_quote(s): + return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') +else: + from shlex import quote as compat_shlex_quote + + +def compat_ord(c): + if type(c) is int: + return c + else: + return ord(c) + + +def compat_setenv(key, value, env=os.environ): + env[key] = value + + +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) + def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return path +else: + compat_realpath = os.path.realpath + + +def compat_print(s): + assert isinstance(s, compat_str) + print(s) + + +# Fix https://github.com/ytdl-org/youtube-dl/issues/4223 +# See http://bugs.python.org/issue9161 for what is broken +def workaround_optparse_bug9161(): + op = optparse.OptionParser() + og = optparse.OptionGroup(op, 'foo') + try: + og.add_option('-t') + except TypeError: + real_add_option = optparse.OptionGroup.add_option + + def _compat_add_option(self, *args, **kwargs): + enc = lambda v: ( + v.encode('ascii', 'replace') if isinstance(v, compat_str) + else v) + bargs = [enc(a) for a in args] + bkwargs = dict( + (k, enc(v)) for k, v in kwargs.items()) + return real_add_option(self, *bargs, **bkwargs) + optparse.OptionGroup.add_option = _compat_add_option + + +try: + compat_Pattern = re.Pattern +except AttributeError: + compat_Pattern = type(re.compile('')) + + +try: + compat_Match = re.Match +except AttributeError: + compat_Match = type(re.compile('').match('')) + + +try: + compat_asyncio_run = asyncio.run # >= 3.7 +except AttributeError: + def compat_asyncio_run(coro): + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(coro) + + asyncio.run = compat_asyncio_run + + +# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl +# See 
https://github.com/yt-dlp/yt-dlp/issues/792 +# https://docs.python.org/3/library/os.path.html#os.path.expanduser +if compat_os_name in ('nt', 'ce') and 'HOME' in os.environ: + _userhome = os.environ['HOME'] + + def compat_expanduser(path): + if not path.startswith('~'): + return path + i = path.replace('\\', '/', 1).find('/') # ~user + if i < 0: + i = len(path) + userhome = os.path.join(os.path.dirname(_userhome), path[1:i]) if i > 1 else _userhome + return userhome + path[i:] +else: + compat_expanduser = os.path.expanduser + + +try: + from Cryptodome.Cipher import AES as compat_pycrypto_AES +except ImportError: + try: + from Crypto.Cipher import AES as compat_pycrypto_AES + except ImportError: + compat_pycrypto_AES = None + + +def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 + if compat_os_name != 'nt': + return + os.system('') + + +# Deprecated + +compat_basestring = str +compat_chr = chr +compat_input = input +compat_integer_types = (int, ) +compat_kwargs = lambda kwargs: kwargs +compat_numeric_types = (int, float, complex) +compat_str = str +compat_xpath = lambda xpath: xpath +compat_zip = zip + +compat_HTMLParser = html.parser.HTMLParser +compat_HTTPError = urllib.error.HTTPError +compat_Struct = struct.Struct +compat_b64decode = base64.b64decode +compat_cookiejar = http.cookiejar +compat_cookiejar_Cookie = compat_cookiejar.Cookie +compat_cookies = http.cookies +compat_cookies_SimpleCookie = compat_cookies.SimpleCookie +compat_etree_Element = etree.Element +compat_etree_register_namespace = etree.register_namespace +compat_get_terminal_size = shutil.get_terminal_size +compat_getenv = os.getenv +compat_getpass = getpass.getpass +compat_html_entities = html.entities +compat_html_entities_html5 = compat_html_entities.html5 +compat_http_client = http.client +compat_http_server = http.server +compat_itertools_count = itertools.count +compat_parse_qs = urllib.parse.parse_qs +compat_shlex_split = shlex.split +compat_socket_create_connection = socket.create_connection +compat_struct_pack = struct.pack +compat_struct_unpack = struct.unpack +compat_subprocess_get_DEVNULL = lambda: DEVNULL +compat_tokenize_tokenize = tokenize.tokenize +compat_urllib_error = urllib.error +compat_urllib_parse = urllib.parse +compat_urllib_parse_quote = urllib.parse.quote +compat_urllib_parse_quote_plus = urllib.parse.quote_plus +compat_urllib_parse_unquote = urllib.parse.unquote +compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus +compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes +compat_urllib_parse_urlencode = urllib.parse.urlencode +compat_urllib_parse_urlparse = urllib.parse.urlparse +compat_urllib_parse_urlunparse = urllib.parse.urlunparse +compat_urllib_request = urllib.request +compat_urllib_request_DataHandler = urllib.request.DataHandler +compat_urllib_response = urllib.response +compat_urlparse = urllib.parse +compat_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = etree.ParseError + + +# Set public objects + +__all__ = [ + 'compat_HTMLParseError', + 'compat_HTMLParser', + 'compat_HTTPError', + 'compat_Match', + 'compat_Pattern', + 'compat_Struct', + 'compat_asyncio_run', + 'compat_b64decode', + 'compat_basestring', + 'compat_chr', + 'compat_cookiejar', + 'compat_cookiejar_Cookie', + 'compat_cookies', + 'compat_cookies_SimpleCookie', + 'compat_ctypes_WINFUNCTYPE', + 'compat_etree_Element', + 'compat_etree_fromstring', + 'compat_etree_register_namespace', + 'compat_expanduser', + 'compat_get_terminal_size', + 
'compat_getenv', + 'compat_getpass', + 'compat_html_entities', + 'compat_html_entities_html5', + 'compat_http_client', + 'compat_http_server', + 'compat_input', + 'compat_integer_types', + 'compat_itertools_count', + 'compat_kwargs', + 'compat_numeric_types', + 'compat_ord', + 'compat_os_name', + 'compat_parse_qs', + 'compat_print', + 'compat_pycrypto_AES', + 'compat_realpath', + 'compat_setenv', + 'compat_shlex_quote', + 'compat_shlex_split', + 'compat_socket_create_connection', + 'compat_str', + 'compat_struct_pack', + 'compat_struct_unpack', + 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', + 'compat_urllib_error', + 'compat_urllib_parse', + 'compat_urllib_parse_quote', + 'compat_urllib_parse_quote_plus', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', + 'compat_urllib_parse_urlparse', + 'compat_urllib_parse_urlunparse', + 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', + 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', + 'compat_xpath', + 'compat_zip', + 'windows_enable_vt_mode', + 'workaround_optparse_bug9161', +] diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py new file mode 100644 index 000000000..049ec9fb1 --- /dev/null +++ b/yt_dlp/cookies.py @@ -0,0 +1,745 @@ +import ctypes +import json +import os +import shutil +import struct +import subprocess +import sys +import tempfile +from datetime import datetime, timedelta, timezone +from hashlib import pbkdf2_hmac + +from .aes import aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes +from .compat import ( + compat_b64decode, + compat_cookiejar_Cookie, +) +from .utils import ( + bug_reports_message, + expand_path, + process_communicate_or_kill, + YoutubeDLCookieJar, +) + +try: + import sqlite3 + SQLITE_AVAILABLE = True +except ImportError: + # although sqlite3 is part of the standard library, it is possible to compile python without + # sqlite support. See: https://github.com/yt-dlp/yt-dlp/issues/544 + SQLITE_AVAILABLE = False + + +try: + import keyring + KEYRING_AVAILABLE = True + KEYRING_UNAVAILABLE_REASON = f'due to unknown reasons{bug_reports_message()}' +except ImportError: + KEYRING_AVAILABLE = False + KEYRING_UNAVAILABLE_REASON = ( + 'as the `keyring` module is not installed. ' + 'Please install by running `python3 -m pip install keyring`. 
' + 'Depending on your platform, additional packages may be required ' + 'to access the keyring; see https://pypi.org/project/keyring') +except Exception as _err: + KEYRING_AVAILABLE = False + KEYRING_UNAVAILABLE_REASON = 'as the `keyring` module could not be initialized: %s' % _err + + +CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} +SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} + + +class YDLLogger: + def __init__(self, ydl=None): + self._ydl = ydl + + def debug(self, message): + if self._ydl: + self._ydl.write_debug(message) + + def info(self, message): + if self._ydl: + self._ydl.to_screen(f'[Cookies] {message}') + + def warning(self, message, only_once=False): + if self._ydl: + self._ydl.report_warning(message, only_once) + + def error(self, message): + if self._ydl: + self._ydl.report_error(message) + + +def load_cookies(cookie_file, browser_specification, ydl): + cookie_jars = [] + if browser_specification is not None: + browser_name, profile = _parse_browser_specification(*browser_specification) + cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl))) + + if cookie_file is not None: + cookie_file = expand_path(cookie_file) + jar = YoutubeDLCookieJar(cookie_file) + if os.access(cookie_file, os.R_OK): + jar.load(ignore_discard=True, ignore_expires=True) + cookie_jars.append(jar) + + return _merge_cookie_jars(cookie_jars) + + +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger()): + if browser_name == 'firefox': + return _extract_firefox_cookies(profile, logger) + elif browser_name == 'safari': + return _extract_safari_cookies(profile, logger) + elif browser_name in CHROMIUM_BASED_BROWSERS: + return _extract_chrome_cookies(browser_name, profile, logger) + else: + raise ValueError('unknown browser: {}'.format(browser_name)) + + +def _extract_firefox_cookies(profile, logger): + logger.info('Extracting cookies from firefox') + if not SQLITE_AVAILABLE: + logger.warning('Cannot extract cookies from firefox without sqlite3 support. 
' + 'Please use a python interpreter compiled with sqlite3 support') + return YoutubeDLCookieJar() + + if profile is None: + search_root = _firefox_browser_dir() + elif _is_path(profile): + search_root = profile + else: + search_root = os.path.join(_firefox_browser_dir(), profile) + + cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite') + if cookie_database_path is None: + raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root)) + logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) + + with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + cursor = None + try: + cursor = _open_database_copy(cookie_database_path, tmpdir) + cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') + jar = YoutubeDLCookieJar() + for host, name, value, path, expiry, is_secure in cursor.fetchall(): + cookie = compat_cookiejar_Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + logger.info('Extracted {} cookies from firefox'.format(len(jar))) + return jar + finally: + if cursor is not None: + cursor.connection.close() + + +def _firefox_browser_dir(): + if sys.platform in ('linux', 'linux2'): + return os.path.expanduser('~/.mozilla/firefox') + elif sys.platform == 'win32': + return os.path.expandvars(r'%APPDATA%\Mozilla\Firefox\Profiles') + elif sys.platform == 'darwin': + return os.path.expanduser('~/Library/Application Support/Firefox') + else: + raise ValueError('unsupported platform: {}'.format(sys.platform)) + + +def _get_chromium_based_browser_settings(browser_name): + # https://chromium.googlesource.com/chromium/src/+/HEAD/docs/user_data_dir.md + if sys.platform in ('linux', 'linux2'): + config = _config_home() + browser_dir = { + 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(config, 'google-chrome'), + 'chromium': os.path.join(config, 'chromium'), + 'edge': os.path.join(config, 'microsoft-edge'), + 'opera': os.path.join(config, 'opera'), + 'vivaldi': os.path.join(config, 'vivaldi'), + }[browser_name] + + elif sys.platform == 'win32': + appdata_local = os.path.expandvars('%LOCALAPPDATA%') + appdata_roaming = os.path.expandvars('%APPDATA%') + browser_dir = { + 'brave': os.path.join(appdata_local, r'BraveSoftware\Brave-Browser\User Data'), + 'chrome': os.path.join(appdata_local, r'Google\Chrome\User Data'), + 'chromium': os.path.join(appdata_local, r'Chromium\User Data'), + 'edge': os.path.join(appdata_local, r'Microsoft\Edge\User Data'), + 'opera': os.path.join(appdata_roaming, r'Opera Software\Opera Stable'), + 'vivaldi': os.path.join(appdata_local, r'Vivaldi\User Data'), + }[browser_name] + + elif sys.platform == 'darwin': + appdata = os.path.expanduser('~/Library/Application Support') + browser_dir = { + 'brave': os.path.join(appdata, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(appdata, 'Google/Chrome'), + 'chromium': os.path.join(appdata, 'Chromium'), + 'edge': os.path.join(appdata, 'Microsoft Edge'), + 'opera': os.path.join(appdata, 'com.operasoftware.Opera'), + 'vivaldi': os.path.join(appdata, 'Vivaldi'), + }[browser_name] + + else: + raise ValueError('unsupported platform: {}'.format(sys.platform)) + + # Linux keyring names can be determined by snooping on 
dbus while opening the browser in KDE: + # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" + keyring_name = { + 'brave': 'Brave', + 'chrome': 'Chrome', + 'chromium': 'Chromium', + 'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium', + 'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium', + 'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome', + }[browser_name] + + browsers_without_profiles = {'opera'} + + return { + 'browser_dir': browser_dir, + 'keyring_name': keyring_name, + 'supports_profiles': browser_name not in browsers_without_profiles + } + + +def _extract_chrome_cookies(browser_name, profile, logger): + logger.info('Extracting cookies from {}'.format(browser_name)) + + if not SQLITE_AVAILABLE: + logger.warning(('Cannot extract cookies from {} without sqlite3 support. ' + 'Please use a python interpreter compiled with sqlite3 support').format(browser_name)) + return YoutubeDLCookieJar() + + config = _get_chromium_based_browser_settings(browser_name) + + if profile is None: + search_root = config['browser_dir'] + elif _is_path(profile): + search_root = profile + config['browser_dir'] = os.path.dirname(profile) if config['supports_profiles'] else profile + else: + if config['supports_profiles']: + search_root = os.path.join(config['browser_dir'], profile) + else: + logger.error('{} does not support profiles'.format(browser_name)) + search_root = config['browser_dir'] + + cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies') + if cookie_database_path is None: + raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root)) + logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) + + decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger) + + with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + cursor = None + try: + cursor = _open_database_copy(cookie_database_path, tmpdir) + cursor.connection.text_factory = bytes + column_names = _get_column_names(cursor, 'cookies') + secure_column = 'is_secure' if 'is_secure' in column_names else 'secure' + cursor.execute('SELECT host_key, name, value, encrypted_value, path, ' + 'expires_utc, {} FROM cookies'.format(secure_column)) + jar = YoutubeDLCookieJar() + failed_cookies = 0 + for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall(): + host_key = host_key.decode('utf-8') + name = name.decode('utf-8') + value = value.decode('utf-8') + path = path.decode('utf-8') + + if not value and encrypted_value: + value = decryptor.decrypt(encrypted_value) + if value is None: + failed_cookies += 1 + continue + + cookie = compat_cookiejar_Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + if failed_cookies > 0: + failed_message = ' ({} could not be decrypted)'.format(failed_cookies) + else: + failed_message = '' + logger.info('Extracted {} cookies from {}{}'.format(len(jar), browser_name, failed_message)) + return jar + finally: + if cursor is not None: + cursor.connection.close() + + +class ChromeCookieDecryptor: + """ + Overview: + + Linux: + - cookies are either v10 or v11 + - v10: AES-CBC encrypted with a fixed key + - v11: AES-CBC encrypted with an OS 
protected key (keyring)
+        - v11 keys can be stored in various places depending on the active desktop environment [2]
+
+        Mac:
+        - cookies are either v10 or not v10
+        - v10: AES-CBC encrypted with an OS protected key (keyring) and more key derivation iterations than linux
+        - not v10: 'old data' stored as plaintext
+
+        Windows:
+        - cookies are either v10 or not v10
+        - v10: AES-GCM encrypted with a key which is encrypted with DPAPI
+        - not v10: encrypted with DPAPI
+
+    Sources:
+    - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/
+    - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_linux.cc
+        - KeyStorageLinux::CreateService
+    """
+
+    def decrypt(self, encrypted_value):
+        raise NotImplementedError
+
+
+def get_cookie_decryptor(browser_root, browser_keyring_name, logger):
+    if sys.platform in ('linux', 'linux2'):
+        return LinuxChromeCookieDecryptor(browser_keyring_name, logger)
+    elif sys.platform == 'darwin':
+        return MacChromeCookieDecryptor(browser_keyring_name, logger)
+    elif sys.platform == 'win32':
+        return WindowsChromeCookieDecryptor(browser_root, logger)
+    else:
+        raise NotImplementedError('Chrome cookie decryption is not supported '
+                                  'on this platform: {}'.format(sys.platform))
+
+
+class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
+    def __init__(self, browser_keyring_name, logger):
+        self._logger = logger
+        self._v10_key = self.derive_key(b'peanuts')
+        if KEYRING_AVAILABLE:
+            self._v11_key = self.derive_key(_get_linux_keyring_password(browser_keyring_name))
+        else:
+            self._v11_key = None
+
+    @staticmethod
+    def derive_key(password):
+        # values from
+        # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc
+        return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16)
+
+    def decrypt(self, encrypted_value):
+        version = encrypted_value[:3]
+        ciphertext = encrypted_value[3:]
+
+        if version == b'v10':
+            return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger)
+
+        elif version == b'v11':
+            if self._v11_key is None:
+                self._logger.warning(f'cannot decrypt cookie {KEYRING_UNAVAILABLE_REASON}', only_once=True)
+                return None
+            return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger)
+
+        else:
+            return None
+
+
+class MacChromeCookieDecryptor(ChromeCookieDecryptor):
+    def __init__(self, browser_keyring_name, logger):
+        self._logger = logger
+        password = _get_mac_keyring_password(browser_keyring_name, logger)
+        self._v10_key = None if password is None else self.derive_key(password)
+
+    @staticmethod
+    def derive_key(password):
+        # values from
+        # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm
+        return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16)
+
+    def decrypt(self, encrypted_value):
+        version = encrypted_value[:3]
+        ciphertext = encrypted_value[3:]
+
+        if version == b'v10':
+            if self._v10_key is None:
+                self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True)
+                return None
+
+            return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger)
+
+        else:
+            # other prefixes are considered 'old data' which were stored as plaintext
+            # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm
+            return encrypted_value
+
+
+class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
+    def __init__(self, browser_root, logger):
+        self._logger = logger
+        self._v10_key = 
_get_windows_v10_key(browser_root, logger) + + def decrypt(self, encrypted_value): + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b'v10': + if self._v10_key is None: + self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) + return None + + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # kNonceLength + nonce_length = 96 // 8 + # boringssl + # EVP_AEAD_AES_GCM_TAG_LEN + authentication_tag_length = 16 + + raw_ciphertext = ciphertext + nonce = raw_ciphertext[:nonce_length] + ciphertext = raw_ciphertext[nonce_length:-authentication_tag_length] + authentication_tag = raw_ciphertext[-authentication_tag_length:] + + return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger) + + else: + # any other prefix means the data is DPAPI encrypted + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8') + + +def _extract_safari_cookies(profile, logger): + if profile is not None: + logger.error('safari does not support profiles') + if sys.platform != 'darwin': + raise ValueError('unsupported platform: {}'.format(sys.platform)) + + cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') + + if not os.path.isfile(cookies_path): + raise FileNotFoundError('could not find safari cookies database') + + with open(cookies_path, 'rb') as f: + cookies_data = f.read() + + jar = parse_safari_cookies(cookies_data, logger=logger) + logger.info('Extracted {} cookies from safari'.format(len(jar))) + return jar + + +class ParserError(Exception): + pass + + +class DataParser: + def __init__(self, data, logger): + self._data = data + self.cursor = 0 + self._logger = logger + + def read_bytes(self, num_bytes): + if num_bytes < 0: + raise ParserError('invalid read of {} bytes'.format(num_bytes)) + end = self.cursor + num_bytes + if end > len(self._data): + raise ParserError('reached end of input') + data = self._data[self.cursor:end] + self.cursor = end + return data + + def expect_bytes(self, expected_value, message): + value = self.read_bytes(len(expected_value)) + if value != expected_value: + raise ParserError('unexpected value: {} != {} ({})'.format(value, expected_value, message)) + + def read_uint(self, big_endian=False): + data_format = '>I' if big_endian else '<I' + return struct.unpack(data_format, self.read_bytes(4))[0] + + def read_double(self, big_endian=False): + data_format = '>d' if big_endian else '<d' + return struct.unpack(data_format, self.read_bytes(8))[0] + + def read_cstring(self): + buffer = [] + while True: + c = self.read_bytes(1) + if c == b'\x00': + return b''.join(buffer).decode('utf-8') + else: + buffer.append(c) + + def skip(self, num_bytes, description='unknown'): + if num_bytes > 0: + self._logger.debug('skipping {} bytes ({}): {}'.format( + num_bytes, description, self.read_bytes(num_bytes))) + elif num_bytes < 0: + raise ParserError('invalid skip of {} bytes'.format(num_bytes)) + + def skip_to(self, offset, description='unknown'): + self.skip(offset - self.cursor, description) + + def skip_to_end(self, description='unknown'): + self.skip_to(len(self._data), description) + + +def _mac_absolute_time_to_posix(timestamp): + return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=timestamp)).timestamp()) + + +def _parse_safari_cookies_header(data, logger): + p = 
DataParser(data, logger) + p.expect_bytes(b'cook', 'database signature') + number_of_pages = p.read_uint(big_endian=True) + page_sizes = [p.read_uint(big_endian=True) for _ in range(number_of_pages)] + return page_sizes, p.cursor + + +def _parse_safari_cookies_page(data, jar, logger): + p = DataParser(data, logger) + p.expect_bytes(b'\x00\x00\x01\x00', 'page signature') + number_of_cookies = p.read_uint() + record_offsets = [p.read_uint() for _ in range(number_of_cookies)] + if number_of_cookies == 0: + logger.debug('a cookies page of size {} has no cookies'.format(len(data))) + return + + p.skip_to(record_offsets[0], 'unknown page header field') + + for record_offset in record_offsets: + p.skip_to(record_offset, 'space between records') + record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger) + p.read_bytes(record_length) + p.skip_to_end('space in between pages') + + +def _parse_safari_cookies_record(data, jar, logger): + p = DataParser(data, logger) + record_size = p.read_uint() + p.skip(4, 'unknown record field 1') + flags = p.read_uint() + is_secure = bool(flags & 0x0001) + p.skip(4, 'unknown record field 2') + domain_offset = p.read_uint() + name_offset = p.read_uint() + path_offset = p.read_uint() + value_offset = p.read_uint() + p.skip(8, 'unknown record field 3') + expiration_date = _mac_absolute_time_to_posix(p.read_double()) + _creation_date = _mac_absolute_time_to_posix(p.read_double()) # noqa: F841 + + try: + p.skip_to(domain_offset) + domain = p.read_cstring() + + p.skip_to(name_offset) + name = p.read_cstring() + + p.skip_to(path_offset) + path = p.read_cstring() + + p.skip_to(value_offset) + value = p.read_cstring() + except UnicodeDecodeError: + logger.warning('failed to parse Safari cookie because UTF-8 decoding failed', only_once=True) + return record_size + + p.skip_to(record_size, 'space at the end of the record') + + cookie = compat_cookiejar_Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=domain, domain_specified=bool(domain), domain_initial_dot=domain.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expiration_date, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + return record_size + + +def parse_safari_cookies(data, jar=None, logger=YDLLogger()): + """ + References: + - https://github.com/libyal/dtformats/blob/main/documentation/Safari%20Cookies.asciidoc + - this data appears to be out of date but the important parts of the database structure is the same + - there are a few bytes here and there which are skipped during parsing + """ + if jar is None: + jar = YoutubeDLCookieJar() + page_sizes, body_start = _parse_safari_cookies_header(data, logger) + p = DataParser(data[body_start:], logger) + for page_size in page_sizes: + _parse_safari_cookies_page(p.read_bytes(page_size), jar, logger) + p.skip_to_end('footer') + return jar + + +def _get_linux_keyring_password(browser_keyring_name): + password = keyring.get_password('{} Keys'.format(browser_keyring_name), + '{} Safe Storage'.format(browser_keyring_name)) + if password is None: + # this sometimes occurs in KDE because chrome does not check hasEntry and instead + # just tries to read the value (which kwallet returns "") whereas keyring checks hasEntry + # to verify this: + # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" + # while starting chrome. 
+ # this may be a bug as the intended behaviour is to generate a random password and store + # it, but that doesn't matter here. + password = '' + return password.encode('utf-8') + + +def _get_mac_keyring_password(browser_keyring_name, logger): + if KEYRING_AVAILABLE: + logger.debug('using keyring to obtain password') + password = keyring.get_password('{} Safe Storage'.format(browser_keyring_name), browser_keyring_name) + return password.encode('utf-8') + else: + logger.debug('using find-generic-password to obtain password') + proc = subprocess.Popen(['security', 'find-generic-password', + '-w', # write password to stdout + '-a', browser_keyring_name, # match 'account' + '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + try: + stdout, stderr = process_communicate_or_kill(proc) + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except BaseException as e: + logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') + return None + + +def _get_windows_v10_key(browser_root, logger): + path = _find_most_recently_used_file(browser_root, 'Local State') + if path is None: + logger.error('could not find local state file') + return None + with open(path, 'r') as f: + data = json.load(f) + try: + base64_key = data['os_crypt']['encrypted_key'] + except KeyError: + logger.error('no encrypted key in Local State') + return None + encrypted_key = compat_b64decode(base64_key) + prefix = b'DPAPI' + if not encrypted_key.startswith(prefix): + logger.error('invalid key') + return None + return _decrypt_windows_dpapi(encrypted_key[len(prefix):], logger) + + +def pbkdf2_sha1(password, salt, iterations, key_length): + return pbkdf2_hmac('sha1', password, salt, iterations, key_length) + + +def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): + plaintext = aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector) + padding_length = plaintext[-1] + try: + return plaintext[:-padding_length].decode('utf-8') + except UnicodeDecodeError: + logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) + return None + + +def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): + try: + plaintext = aes_gcm_decrypt_and_verify_bytes(ciphertext, key, authentication_tag, nonce) + except ValueError: + logger.warning('failed to decrypt cookie (AES-GCM) because the MAC check failed. Possibly the key is wrong?', only_once=True) + return None + + try: + return plaintext.decode('utf-8') + except UnicodeDecodeError: + logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) + return None + + +def _decrypt_windows_dpapi(ciphertext, logger): + """ + References: + - https://docs.microsoft.com/en-us/windows/win32/api/dpapi/nf-dpapi-cryptunprotectdata + """ + from ctypes.wintypes import DWORD + + class DATA_BLOB(ctypes.Structure): + _fields_ = [('cbData', DWORD), + ('pbData', ctypes.POINTER(ctypes.c_char))] + + buffer = ctypes.create_string_buffer(ciphertext) + blob_in = DATA_BLOB(ctypes.sizeof(buffer), buffer) + blob_out = DATA_BLOB() + ret = ctypes.windll.crypt32.CryptUnprotectData( + ctypes.byref(blob_in), # pDataIn + None, # ppszDataDescr: human readable description of pDataIn + None, # pOptionalEntropy: salt? 
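+        #   (Chromium itself appears to protect the key without optional
+        #   entropy, so passing None here matches its behaviour)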
+ None, # pvReserved: must be NULL + None, # pPromptStruct: information about prompts to display + 0, # dwFlags + ctypes.byref(blob_out) # pDataOut + ) + if not ret: + logger.warning('failed to decrypt with DPAPI', only_once=True) + return None + + result = ctypes.string_at(blob_out.pbData, blob_out.cbData) + ctypes.windll.kernel32.LocalFree(blob_out.pbData) + return result + + +def _config_home(): + return os.environ.get('XDG_CONFIG_HOME', os.path.expanduser('~/.config')) + + +def _open_database_copy(database_path, tmpdir): + # cannot open sqlite databases if they are already in use (e.g. by the browser) + database_copy_path = os.path.join(tmpdir, 'temporary.sqlite') + shutil.copy(database_path, database_copy_path) + conn = sqlite3.connect(database_copy_path) + return conn.cursor() + + +def _get_column_names(cursor, table_name): + table_info = cursor.execute('PRAGMA table_info({})'.format(table_name)).fetchall() + return [row[1].decode('utf-8') for row in table_info] + + +def _find_most_recently_used_file(root, filename): + # if there are multiple browser profiles, take the most recently used one + paths = [] + for root, dirs, files in os.walk(root): + for file in files: + if file == filename: + paths.append(os.path.join(root, file)) + return None if not paths else max(paths, key=lambda path: os.lstat(path).st_mtime) + + +def _merge_cookie_jars(jars): + output_jar = YoutubeDLCookieJar() + for jar in jars: + for cookie in jar: + output_jar.set_cookie(cookie) + if jar.filename is not None: + output_jar.filename = jar.filename + return output_jar + + +def _is_path(value): + return os.path.sep in value + + +def _parse_browser_specification(browser_name, profile=None): + browser_name = browser_name.lower() + if browser_name not in SUPPORTED_BROWSERS: + raise ValueError(f'unsupported browser: "{browser_name}"') + if profile is not None and _is_path(profile): + profile = os.path.expanduser(profile) + return browser_name, profile diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py new file mode 100644 index 000000000..2449c7411 --- /dev/null +++ b/yt_dlp/downloader/__init__.py @@ -0,0 +1,126 @@ +from __future__ import unicode_literals + +from ..compat import compat_str +from ..utils import ( + determine_protocol, + NO_DEFAULT +) + + +def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None, to_stdout=False): + info_dict['protocol'] = determine_protocol(info_dict) + info_copy = info_dict.copy() + info_copy['to_stdout'] = to_stdout + + downloaders = [_get_suitable_downloader(info_copy, proto, params, default) + for proto in (protocol or info_copy['protocol']).split('+')] + if set(downloaders) == {FFmpegFD} and FFmpegFD.can_merge_formats(info_copy, params): + return FFmpegFD + elif len(downloaders) == 1: + return downloaders[0] + return None + + +# Some of these require get_suitable_downloader +from .common import FileDownloader +from .dash import DashSegmentsFD +from .f4m import F4mFD +from .hls import HlsFD +from .http import HttpFD +from .rtmp import RtmpFD +from .rtsp import RtspFD +from .ism import IsmFD +from .mhtml import MhtmlFD +from .niconico import NiconicoDmcFD +from .websocket import WebSocketFragmentFD +from .youtube_live_chat import YoutubeLiveChatFD +from .external import ( + get_external_downloader, + FFmpegFD, +) + +PROTOCOL_MAP = { + 'rtmp': RtmpFD, + 'rtmp_ffmpeg': FFmpegFD, + 'm3u8_native': HlsFD, + 'm3u8': FFmpegFD, + 'mms': RtspFD, + 'rtsp': RtspFD, + 'f4m': F4mFD, + 'http_dash_segments': DashSegmentsFD, + 'ism': IsmFD, + 
'mhtml': MhtmlFD, + 'niconico_dmc': NiconicoDmcFD, + 'websocket_frag': WebSocketFragmentFD, + 'youtube_live_chat': YoutubeLiveChatFD, + 'youtube_live_chat_replay': YoutubeLiveChatFD, +} + + +def shorten_protocol_name(proto, simplify=False): + short_protocol_names = { + 'm3u8_native': 'm3u8_n', + 'rtmp_ffmpeg': 'rtmp_f', + 'http_dash_segments': 'dash', + 'niconico_dmc': 'dmc', + 'websocket_frag': 'WSfrag', + } + if simplify: + short_protocol_names.update({ + 'https': 'http', + 'ftps': 'ftp', + 'm3u8_native': 'm3u8', + 'rtmp_ffmpeg': 'rtmp', + 'm3u8_frag_urls': 'm3u8', + 'dash_frag_urls': 'dash', + }) + return short_protocol_names.get(proto, proto) + + +def _get_suitable_downloader(info_dict, protocol, params, default): + """Get the downloader class that can handle the info dict.""" + if default is NO_DEFAULT: + default = HttpFD + + # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): + # return FFmpegFD + + info_dict['protocol'] = protocol + downloaders = params.get('external_downloader') + external_downloader = ( + downloaders if isinstance(downloaders, compat_str) or downloaders is None + else downloaders.get(shorten_protocol_name(protocol, True), downloaders.get('default'))) + + if external_downloader is None: + if info_dict['to_stdout'] and FFmpegFD.can_merge_formats(info_dict, params): + return FFmpegFD + elif external_downloader.lower() != 'native': + ed = get_external_downloader(external_downloader) + if ed.can_download(info_dict, external_downloader): + return ed + + if protocol == 'http_dash_segments': + if info_dict.get('is_live') and (external_downloader or '').lower() != 'native': + return FFmpegFD + + if protocol in ('m3u8', 'm3u8_native'): + if info_dict.get('is_live'): + return FFmpegFD + elif (external_downloader or '').lower() == 'native': + return HlsFD + elif get_suitable_downloader( + info_dict, params, None, protocol='m3u8_frag_urls', to_stdout=info_dict['to_stdout']): + return HlsFD + elif params.get('hls_prefer_native') is True: + return HlsFD + elif params.get('hls_prefer_native') is False: + return FFmpegFD + + return PROTOCOL_MAP.get(protocol, default) + + +__all__ = [ + 'FileDownloader', + 'get_suitable_downloader', + 'shorten_protocol_name', +] diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py new file mode 100644 index 000000000..9081794db --- /dev/null +++ b/yt_dlp/downloader/common.py @@ -0,0 +1,428 @@ +from __future__ import division, unicode_literals + +import os +import re +import time +import random + +from ..utils import ( + decodeArgument, + encodeFilename, + error_to_compat_str, + format_bytes, + shell_quote, + timeconvert, +) +from ..minicurses import ( + MultilineLogger, + MultilinePrinter, + QuietMultilinePrinter, + BreaklineStatusPrinter +) + + +class FileDownloader(object): + """File Downloader class. + + File downloader objects are the ones responsible of downloading the + actual video file and writing it to disk. + + File downloaders accept a lot of parameters. In order not to saturate + the object constructor with arguments, it receives a dictionary of + options instead. + + Available options: + + verbose: Print additional info to stdout. + quiet: Do not print messages to stdout. + ratelimit: Download speed limit, in bytes/sec. + throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) + retries: Number of times to retry for HTTP error 5xx + buffersize: Size of download buffer in bytes. 
+ noresizebuffer: Do not automatically resize the download buffer. + continuedl: Try to continue downloads if possible. + noprogress: Do not print the progress bar. + nopart: Do not use temporary .part files. + updatetime: Use the Last-modified header to set output file timestamps. + test: Download only first bytes to test the downloader. + min_filesize: Skip files smaller than this size + max_filesize: Skip files larger than this size + xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. + external_downloader_args: A dictionary of downloader keys (in lower case) + and a list of additional command-line arguments for the + executable. Use 'default' as the name for arguments to be + passed to all downloaders. For compatibility with youtube-dl, + a single list of args can also be used + hls_use_mpegts: Use the mpegts container for HLS videos. + http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be + useful for bypassing bandwidth throttling imposed by + a webserver (experimental) + progress_template: See YoutubeDL.py + + Subclasses of this one must re-define the real_download method. + """ + + _TEST_FILE_SIZE = 10241 + params = None + + def __init__(self, ydl, params): + """Create a FileDownloader object with the given options.""" + self.ydl = ydl + self._progress_hooks = [] + self.params = params + self._prepare_multiline_status() + self.add_progress_hook(self.report_progress) + + @staticmethod + def format_seconds(seconds): + (mins, secs) = divmod(seconds, 60) + (hours, mins) = divmod(mins, 60) + if hours > 99: + return '--:--:--' + if hours == 0: + return '%02d:%02d' % (mins, secs) + else: + return '%02d:%02d:%02d' % (hours, mins, secs) + + @staticmethod + def calc_percent(byte_counter, data_len): + if data_len is None: + return None + return float(byte_counter) / float(data_len) * 100.0 + + @staticmethod + def format_percent(percent): + if percent is None: + return '---.-%' + return '%6s' % ('%3.1f%%' % percent) + + @staticmethod + def calc_eta(start, now, total, current): + if total is None: + return None + if now is None: + now = time.time() + dif = now - start + if current == 0 or dif < 0.001: # One millisecond + return None + rate = float(current) / dif + return int((float(total) - float(current)) / rate) + + @staticmethod + def format_eta(eta): + if eta is None: + return '--:--' + return FileDownloader.format_seconds(eta) + + @staticmethod + def calc_speed(start, now, bytes): + dif = now - start + if bytes == 0 or dif < 0.001: # One millisecond + return None + return float(bytes) / dif + + @staticmethod + def format_speed(speed): + if speed is None: + return '%10s' % '---b/s' + return '%10s' % ('%s/s' % format_bytes(speed)) + + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + + @staticmethod + def best_block_size(elapsed_time, bytes): + new_min = max(bytes / 2.0, 1.0) + new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB + if elapsed_time < 0.001: + return int(new_max) + rate = bytes / elapsed_time + if rate > new_max: + return int(new_max) + if rate < new_min: + return int(new_min) + return int(rate) + + @staticmethod + def parse_bytes(bytestr): + """Parse a string indicating a byte quantity into an integer.""" + matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) + if matchobj is None: + return None + number = float(matchobj.group(1)) + multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) + return int(round(number * multiplier)) + + 
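+    # Worked examples for the pure static helpers above (hand-computed,
+    # illustrative only):
+    #   parse_bytes('10.5M') -> 11010048  (10.5 * 1024 ** 2)
+    #   format_seconds(3725) -> '01:02:05'
+    #   calc_eta(start=0, now=10, total=100, current=25) -> 30  (75 bytes left at 2.5 B/s)
+    #   best_block_size(0.5, 1024) -> 2048  (rate 1024 / 0.5 B/s, clamped to [bytes / 2, 4 MiB])
+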
def to_screen(self, *args, **kargs): + self.ydl.to_stdout(*args, quiet=self.params.get('quiet'), **kargs) + + def to_stderr(self, message): + self.ydl.to_stderr(message) + + def to_console_title(self, message): + self.ydl.to_console_title(message) + + def trouble(self, *args, **kargs): + self.ydl.trouble(*args, **kargs) + + def report_warning(self, *args, **kargs): + self.ydl.report_warning(*args, **kargs) + + def report_error(self, *args, **kargs): + self.ydl.report_error(*args, **kargs) + + def write_debug(self, *args, **kargs): + self.ydl.write_debug(*args, **kargs) + + def slow_down(self, start_time, now, byte_counter): + """Sleep if the download speed is over the rate limit.""" + rate_limit = self.params.get('ratelimit') + if rate_limit is None or byte_counter == 0: + return + if now is None: + now = time.time() + elapsed = now - start_time + if elapsed <= 0.0: + return + speed = float(byte_counter) / elapsed + if speed > rate_limit: + sleep_time = float(byte_counter) / rate_limit - elapsed + if sleep_time > 0: + time.sleep(sleep_time) + + def temp_name(self, filename): + """Returns a temporary filename for the given filename.""" + if self.params.get('nopart', False) or filename == '-' or \ + (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): + return filename + return filename + '.part' + + def undo_temp_name(self, filename): + if filename.endswith('.part'): + return filename[:-len('.part')] + return filename + + def ytdl_filename(self, filename): + return filename + '.ytdl' + + def try_rename(self, old_filename, new_filename): + if old_filename == new_filename: + return + try: + os.replace(old_filename, new_filename) + except (IOError, OSError) as err: + self.report_error(f'unable to rename file: {err}') + + def try_utime(self, filename, last_modified_hdr): + """Try to set the last-modified time of the given file.""" + if last_modified_hdr is None: + return + if not os.path.isfile(encodeFilename(filename)): + return + timestr = last_modified_hdr + if timestr is None: + return + filetime = timeconvert(timestr) + if filetime is None: + return filetime + # Ignore obviously invalid dates + if filetime == 0: + return + try: + os.utime(filename, (time.time(), filetime)) + except Exception: + pass + return filetime + + def report_destination(self, filename): + """Report destination filename.""" + self.to_screen('[download] Destination: ' + filename) + + def _prepare_multiline_status(self, lines=1): + if self.params.get('noprogress'): + self._multiline = QuietMultilinePrinter() + elif self.ydl.params.get('logger'): + self._multiline = MultilineLogger(self.ydl.params['logger'], lines) + elif self.params.get('progress_with_newline'): + self._multiline = BreaklineStatusPrinter(self.ydl._screen_file, lines) + else: + self._multiline = MultilinePrinter(self.ydl._screen_file, lines, not self.params.get('quiet')) + + def _finish_multiline_status(self): + self._multiline.end() + + def _report_progress_status(self, s): + progress_dict = s.copy() + progress_dict.pop('info_dict') + progress_dict = {'info': s['info_dict'], 'progress': progress_dict} + + progress_template = self.params.get('progress_template', {}) + self._multiline.print_at_line(self.ydl.evaluate_outtmpl( + progress_template.get('download') or '[download] %(progress._default_template)s', + progress_dict), s.get('progress_idx') or 0) + self.to_console_title(self.ydl.evaluate_outtmpl( + progress_template.get('download-title') or 'yt-dlp %(progress._default_template)s', + progress_dict)) + + def 
report_progress(self, s):
+        if s['status'] == 'finished':
+            if self.params.get('noprogress'):
+                self.to_screen('[download] Download completed')
+            msg_template = '100%%'
+            if s.get('total_bytes') is not None:
+                s['_total_bytes_str'] = format_bytes(s['total_bytes'])
+                msg_template += ' of %(_total_bytes_str)s'
+            if s.get('elapsed') is not None:
+                s['_elapsed_str'] = self.format_seconds(s['elapsed'])
+                msg_template += ' in %(_elapsed_str)s'
+            s['_percent_str'] = self.format_percent(100)
+            s['_default_template'] = msg_template % s
+            self._report_progress_status(s)
+            return
+
+        if s['status'] != 'downloading':
+            return
+
+        if s.get('eta') is not None:
+            s['_eta_str'] = self.format_eta(s['eta'])
+        else:
+            s['_eta_str'] = 'Unknown ETA'
+
+        if s.get('total_bytes') and s.get('downloaded_bytes') is not None:
+            s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes'])
+        elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None:
+            s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate'])
+        else:
+            if s.get('downloaded_bytes') == 0:
+                s['_percent_str'] = self.format_percent(0)
+            else:
+                s['_percent_str'] = 'Unknown %'
+
+        if s.get('speed') is not None:
+            s['_speed_str'] = self.format_speed(s['speed'])
+        else:
+            s['_speed_str'] = 'Unknown speed'
+
+        if s.get('total_bytes') is not None:
+            s['_total_bytes_str'] = format_bytes(s['total_bytes'])
+            msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s'
+        elif s.get('total_bytes_estimate') is not None:
+            s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate'])
+            msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s'
+        else:
+            if s.get('downloaded_bytes') is not None:
+                s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes'])
+                if s.get('elapsed'):
+                    s['_elapsed_str'] = self.format_seconds(s['elapsed'])
+                    msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)'
+                else:
+                    msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s'
+            else:
+                msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s'
+        s['_default_template'] = msg_template % s
+        self._report_progress_status(s)
+
+    def report_resuming_byte(self, resume_len):
+        """Report attempt to resume at given byte."""
+        self.to_screen('[download] Resuming download at byte %s' % resume_len)
+
+    def report_retry(self, err, count, retries):
+        """Report retry in case of HTTP error 5xx"""
+        self.to_screen(
+            '[download] Got server HTTP error: %s. Retrying (attempt %d of %s) ...'
+            % (error_to_compat_str(err), count, self.format_retries(retries)))
+
+    def report_file_already_downloaded(self, *args, **kwargs):
+        """Report file has already been fully downloaded."""
+        return self.ydl.report_file_already_downloaded(*args, **kwargs)
+
+    def report_unable_to_resume(self):
+        """Report it was impossible to resume download."""
+        self.to_screen('[download] Unable to resume')
+
+    @staticmethod
+    def supports_manifest(manifest):
+        """ Whether the downloader can download the fragments from the manifest.
+        Redefine in subclasses if needed. 
""" + pass + + def download(self, filename, info_dict, subtitle=False): + """Download to a filename using the info from info_dict + Return True on success and False otherwise + """ + + nooverwrites_and_exists = ( + not self.params.get('overwrites', True) + and os.path.exists(encodeFilename(filename)) + ) + + if not hasattr(filename, 'write'): + continuedl_and_exists = ( + self.params.get('continuedl', True) + and os.path.isfile(encodeFilename(filename)) + and not self.params.get('nopart', False) + ) + + # Check file already present + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): + self.report_file_already_downloaded(filename) + self._hook_progress({ + 'filename': filename, + 'status': 'finished', + 'total_bytes': os.path.getsize(encodeFilename(filename)), + }, info_dict) + return True, False + + if subtitle is False: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + self.to_screen( + '[download] Sleeping %s seconds ...' % ( + int(sleep_interval) if sleep_interval.is_integer() + else '%.2f' % sleep_interval)) + time.sleep(sleep_interval) + else: + sleep_interval_sub = 0 + if type(self.params.get('sleep_interval_subtitles')) is int: + sleep_interval_sub = self.params.get('sleep_interval_subtitles') + if sleep_interval_sub > 0: + self.to_screen( + '[download] Sleeping %s seconds ...' % ( + sleep_interval_sub)) + time.sleep(sleep_interval_sub) + ret = self.real_download(filename, info_dict) + self._finish_multiline_status() + return ret, True + + def real_download(self, filename, info_dict): + """Real download process. Redefine in subclasses.""" + raise NotImplementedError('This method must be implemented by subclasses') + + def _hook_progress(self, status, info_dict): + if not self._progress_hooks: + return + status['info_dict'] = info_dict + # youtube-dl passes the same status object to all the hooks. + # Some third party scripts seems to be relying on this. + # So keep this behavior if possible + for ph in self._progress_hooks: + ph(status) + + def add_progress_hook(self, ph): + # See YoutubeDl.py (search for progress_hooks) for a description of + # this interface + self._progress_hooks.append(ph) + + def _debug_cmd(self, args, exe=None): + if not self.params.get('verbose', False): + return + + str_args = [decodeArgument(a) for a in args] + + if exe is None: + exe = os.path.basename(str_args[0]) + + self.write_debug('%s command line: %s' % (exe, shell_quote(str_args))) diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py new file mode 100644 index 000000000..6444ad692 --- /dev/null +++ b/yt_dlp/downloader/dash.py @@ -0,0 +1,62 @@ +from __future__ import unicode_literals + +from ..downloader import get_suitable_downloader +from .fragment import FragmentFD + +from ..utils import urljoin + + +class DashSegmentsFD(FragmentFD): + """ + Download segments in a DASH manifest. 
External downloaders can take over + the fragment downloads by supporting the 'dash_frag_urls' protocol + """ + + FD_NAME = 'dashsegments' + + def real_download(self, filename, info_dict): + if info_dict.get('is_live'): + self.report_error('Live DASH videos are not supported') + + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + + real_downloader = get_suitable_downloader( + info_dict, self.params, None, protocol='dash_frag_urls', to_stdout=(filename == '-')) + + ctx = { + 'filename': filename, + 'total_frags': len(fragments), + } + + if real_downloader: + self._prepare_external_frag_download(ctx) + else: + self._prepare_and_start_frag_download(ctx, info_dict) + + fragments_to_download = [] + frag_index = 0 + for i, fragment in enumerate(fragments): + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + + fragments_to_download.append({ + 'frag_index': frag_index, + 'index': i, + 'url': fragment_url, + }) + + if real_downloader: + self.to_screen( + '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + info_dict['fragments'] = fragments_to_download + fd = real_downloader(self.ydl, self.params) + return fd.real_download(filename, info_dict) + + return self.download_and_append_fragments(ctx, fragments_to_download, info_dict) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py new file mode 100644 index 000000000..40b9dcfe3 --- /dev/null +++ b/yt_dlp/downloader/external.py @@ -0,0 +1,519 @@ +from __future__ import unicode_literals + +import os.path +import re +import subprocess +import sys +import time + +from .fragment import FragmentFD +from ..compat import ( + compat_setenv, + compat_str, +) +from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS +from ..utils import ( + cli_option, + cli_valueless_option, + cli_bool_option, + _configuration_args, + encodeFilename, + encodeArgument, + handle_youtubedl_headers, + check_executable, + is_outdated_version, + process_communicate_or_kill, + sanitize_open, +) + + +class ExternalFD(FragmentFD): + SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps') + can_download_to_stdout = False + + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + try: + started = time.time() + retval = self._call_downloader(tmpfilename, info_dict) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + # Live stream downloading cancellation should be considered as + # correct and expected termination thus all postprocessing + # should take place + retval = 0 + self.to_screen('[%s] Interrupted by user' % self.get_basename()) + + if retval == 0: + status = { + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + } + if filename != '-': + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) + self.try_rename(tmpfilename, filename) + status.update({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + }) + self._hook_progress(status, info_dict) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % ( + self.get_basename(), retval)) + return False + + @classmethod + def 
get_basename(cls): + return cls.__name__[:-2].lower() + + @property + def exe(self): + return self.get_basename() + + @classmethod + def available(cls, path=None): + path = check_executable(path or cls.get_basename(), [cls.AVAILABLE_OPT]) + if path: + cls.exe = path + return path + return False + + @classmethod + def supports(cls, info_dict): + return ( + (cls.can_download_to_stdout or not info_dict.get('to_stdout')) + and info_dict['protocol'] in cls.SUPPORTED_PROTOCOLS) + + @classmethod + def can_download(cls, info_dict, path=None): + return cls.available(path) and cls.supports(info_dict) + + def _option(self, command_option, param): + return cli_option(self.params, command_option, param) + + def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): + return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + + def _valueless_option(self, command_option, param, expected_value=True): + return cli_valueless_option(self.params, command_option, param, expected_value) + + def _configuration_args(self, keys=None, *args, **kwargs): + return _configuration_args( + self.get_basename(), self.params.get('external_downloader_args'), self.get_basename(), + keys, *args, **kwargs) + + def _call_downloader(self, tmpfilename, info_dict): + """ Either overwrite this or implement _make_cmd """ + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + if 'fragments' not in info_dict: + p = subprocess.Popen( + cmd, stderr=subprocess.PIPE) + _, stderr = process_communicate_or_kill(p) + if p.returncode != 0: + self.to_stderr(stderr.decode('utf-8', 'replace')) + return p.returncode + + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + count = 0 + while count <= fragment_retries: + p = subprocess.Popen( + cmd, stderr=subprocess.PIPE) + _, stderr = process_communicate_or_kill(p) + if p.returncode == 0: + break + # TODO: Decide whether to retry based on error code + # https://aria2.github.io/manual/en/html/aria2c.html#exit-status + self.to_stderr(stderr.decode('utf-8', 'replace')) + count += 1 + if count <= fragment_retries: + self.to_screen( + '[%s] Got error. Retrying fragments (attempt %d of %s)...' + % (self.get_basename(), count, self.format_retries(fragment_retries))) + if count > fragment_retries: + if not skip_unavailable_fragments: + self.report_error('Giving up after %s fragment retries' % fragment_retries) + return -1 + + decrypt_fragment = self.decrypter(info_dict) + dest, _ = sanitize_open(tmpfilename, 'wb') + for frag_index, fragment in enumerate(info_dict['fragments']): + fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) + try: + src, _ = sanitize_open(fragment_filename, 'rb') + except IOError: + if skip_unavailable_fragments and frag_index > 1: + self.to_screen('[%s] Skipping fragment %d ...' 
% (self.get_basename(), frag_index)) + continue + self.report_error('Unable to open fragment %d' % frag_index) + return -1 + dest.write(decrypt_fragment(fragment, src.read())) + src.close() + if not self.params.get('keep_fragments', False): + os.remove(encodeFilename(fragment_filename)) + dest.close() + os.remove(encodeFilename('%s.frag.urls' % tmpfilename)) + return 0 + + +class CurlFD(ExternalFD): + AVAILABLE_OPT = '-V' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '--location', '-o', tmpfilename] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--retry', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '2147483647' + cmd += retry + cmd += self._option('--max-filesize', 'max_filesize') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--insecure', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + def _call_downloader(self, tmpfilename, info_dict): + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + # curl writes the progress to stderr so don't capture it. + p = subprocess.Popen(cmd) + process_communicate_or_kill(p) + return p.returncode + + +class AxelFD(ExternalFD): + AVAILABLE_OPT = '-V' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class WgetFD(ExternalFD): + AVAILABLE_OPT = '--version' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--tries', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '0' + cmd += retry + cmd += self._option('--bind-address', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class Aria2cFD(ExternalFD): + AVAILABLE_OPT = '-v' + SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'dash_frag_urls', 'm3u8_frag_urls') + + @staticmethod + def supports_manifest(manifest): + UNSUPPORTED_FEATURES = [ + r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [1] + # 1. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + ] + check_results = (not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + return all(check_results) + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-c', + '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', + '--file-allocation=none', '-x16', '-j16', '-s16'] + if 'fragments' in info_dict: + cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true'] + else: + cmd += ['--min-split-size', '1M'] + + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--max-overall-download-limit', 'ratelimit') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--all-proxy', 'proxy') + cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') + cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') + cmd += self._configuration_args() + + # aria2c strips out spaces from the beginning/end of filenames and paths. + # We work around this issue by adding a "./" to the beginning of the + # filename and relative path, and adding a "/" at the end of the path. + # See: https://github.com/yt-dlp/yt-dlp/issues/276 + # https://github.com/ytdl-org/youtube-dl/issues/20312 + # https://github.com/aria2/aria2/issues/1373 + dn = os.path.dirname(tmpfilename) + if dn: + if not os.path.isabs(dn): + dn = '.%s%s' % (os.path.sep, dn) + cmd += ['--dir', dn + os.path.sep] + if 'fragments' not in info_dict: + cmd += ['--out', '.%s%s' % (os.path.sep, os.path.basename(tmpfilename))] + cmd += ['--auto-file-renaming=false'] + + if 'fragments' in info_dict: + cmd += ['--file-allocation=none', '--uri-selector=inorder'] + url_list_file = '%s.frag.urls' % tmpfilename + url_list = [] + for frag_index, fragment in enumerate(info_dict['fragments']): + fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index) + url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename)) + stream, _ = sanitize_open(url_list_file, 'wb') + stream.write('\n'.join(url_list).encode('utf-8')) + stream.close() + cmd += ['-i', url_list_file] + else: + cmd += ['--', info_dict['url']] + return cmd + + +class HttpieFD(ExternalFD): + AVAILABLE_OPT = '--version' + + @classmethod + def available(cls, path=None): + return ExternalFD.available(cls, path or 'http') + + def _make_cmd(self, tmpfilename, info_dict): + cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] + + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] + return cmd + + +class FFmpegFD(ExternalFD): + SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'm3u8', 'm3u8_native', 'rtsp', 'rtmp', 'rtmp_ffmpeg', 'mms', 'http_dash_segments') + can_download_to_stdout = True + + @classmethod + def available(cls, path=None): + # TODO: Fix path for ffmpeg + # Fixme: This may be wrong when --ffmpeg-location is used + return FFmpegPostProcessor().available + + @classmethod + def supports(cls, info_dict): + return all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')) + + def on_process_started(self, proc, stdin): + """ Override this in subclasses """ + pass + + @classmethod + def can_merge_formats(cls, info_dict, params): + return ( + info_dict.get('requested_formats') + and info_dict.get('protocol') + and not 
params.get('allow_unplayable_formats') + and 'no-direct-merge' not in params.get('compat_opts', []) + and cls.can_download(info_dict)) + + def _call_downloader(self, tmpfilename, info_dict): + urls = [f['url'] for f in info_dict.get('requested_formats', [])] or [info_dict['url']] + ffpp = FFmpegPostProcessor(downloader=self) + if not ffpp.available: + self.report_error('m3u8 download detected but ffmpeg could not be found. Please install') + return False + ffpp.check_version() + + args = [ffpp.executable, '-y'] + + for log_level in ('quiet', 'verbose'): + if self.params.get(log_level, False): + args += ['-loglevel', log_level] + break + if not self.params.get('verbose'): + args += ['-hide_banner'] + + args += info_dict.get('_ffmpeg_args', []) + + # This option exists only for compatibility. Extractors should use `_ffmpeg_args` instead + seekable = info_dict.get('_seekable') + if seekable is not None: + # setting -seekable prevents ffmpeg from guessing if the server + # supports seeking(by adding the header `Range: bytes=0-`), which + # can cause problems in some cases + # https://github.com/ytdl-org/youtube-dl/issues/11800#issuecomment-275037127 + # http://trac.ffmpeg.org/ticket/6125#comment:10 + args += ['-seekable', '1' if seekable else '0'] + + # start_time = info_dict.get('start_time') or 0 + # if start_time: + # args += ['-ss', compat_str(start_time)] + # end_time = info_dict.get('end_time') + # if end_time: + # args += ['-t', compat_str(end_time - start_time)] + + if info_dict.get('http_headers') is not None and re.match(r'^https?://', urls[0]): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 'http://%s' % proxy + + if proxy.startswith('socks'): + self.report_warning( + '%s does not support SOCKS proxies. Downloading is likely to fail. ' + 'Consider adding --hls-prefer-native to your command.' 
% self.get_basename()) + + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + compat_setenv('HTTP_PROXY', proxy, env=env) + compat_setenv('http_proxy', proxy, env=env) + + protocol = info_dict.get('protocol') + + if protocol == 'rtmp': + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) + conn = info_dict.get('rtmp_conn') + if player_url is not None: + args += ['-rtmp_swfverify', player_url] + if page_url is not None: + args += ['-rtmp_pageurl', page_url] + if app is not None: + args += ['-rtmp_app', app] + if play_path is not None: + args += ['-rtmp_playpath', play_path] + if tc_url is not None: + args += ['-rtmp_tcurl', tc_url] + if flash_version is not None: + args += ['-rtmp_flashver', flash_version] + if live: + args += ['-rtmp_live', 'live'] + if isinstance(conn, list): + for entry in conn: + args += ['-rtmp_conn', entry] + elif isinstance(conn, compat_str): + args += ['-rtmp_conn', conn] + + for i, url in enumerate(urls): + args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url] + + args += ['-c', 'copy'] + if info_dict.get('requested_formats') or protocol == 'http_dash_segments': + for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): + stream_number = fmt.get('manifest_stream_number', 0) + a_or_v = 'a' if fmt.get('acodec') != 'none' else 'v' + args.extend(['-map', f'{i}:{a_or_v}:{stream_number}']) + + if self.params.get('test', False): + args += ['-fs', compat_str(self._TEST_FILE_SIZE)] + + ext = info_dict['ext'] + if protocol in ('m3u8', 'm3u8_native'): + use_mpegts = (tmpfilename == '-') or self.params.get('hls_use_mpegts') + if use_mpegts is None: + use_mpegts = info_dict.get('is_live') + if use_mpegts: + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4'] + if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + args += ['-bsf:a', 'aac_adtstoasc'] + elif protocol == 'rtmp': + args += ['-f', 'flv'] + elif ext == 'mp4' and tmpfilename == '-': + args += ['-f', 'mpegts'] + else: + args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] + + args += self._configuration_args(('_o1', '_o', '')) + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + self._debug_cmd(args) + + proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) + if url in ('-', 'pipe:'): + self.on_process_started(proc, proc.stdin) + try: + retval = proc.wait() + except BaseException as e: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). 
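+            # In effect, this emulates pressing "q" in an interactive ffmpeg
+            # session. A minimal sketch of the idea (illustration only; the
+            # helper below handles the details and error cases):
+            #   proc.stdin.write(b'q')  # ask ffmpeg to finalize the output file
+            #   proc.wait()             # then let it exit on its own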
+ if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): + process_communicate_or_kill(proc, b'q') + else: + proc.kill() + proc.wait() + raise + return retval + + +class AVconvFD(FFmpegFD): + pass + + +_BY_NAME = dict( + (klass.get_basename(), klass) + for name, klass in globals().items() + if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') +) + + +def list_external_downloaders(): + return sorted(_BY_NAME.keys()) + + +def get_external_downloader(external_downloader): + """ Given the name of the executable, see whether we support the given + downloader . """ + # Drop .exe extension on Windows + bn = os.path.splitext(os.path.basename(external_downloader))[0] + return _BY_NAME.get(bn) diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py new file mode 100644 index 000000000..9da2776d9 --- /dev/null +++ b/yt_dlp/downloader/f4m.py @@ -0,0 +1,439 @@ +from __future__ import division, unicode_literals + +import io +import itertools +import time + +from .fragment import FragmentFD +from ..compat import ( + compat_b64decode, + compat_etree_fromstring, + compat_urlparse, + compat_urllib_error, + compat_urllib_parse_urlparse, + compat_struct_pack, + compat_struct_unpack, +) +from ..utils import ( + fix_xml_ampersands, + xpath_text, +) + + +class DataTruncatedError(Exception): + pass + + +class FlvReader(io.BytesIO): + """ + Reader for Flv files + The file format is documented in https://www.adobe.com/devnet/f4v.html + """ + + def read_bytes(self, n): + data = self.read(n) + if len(data) < n: + raise DataTruncatedError( + 'FlvReader error: need %d bytes while only %d bytes got' % ( + n, len(data))) + return data + + # Utility functions for reading numbers and strings + def read_unsigned_long_long(self): + return compat_struct_unpack('!Q', self.read_bytes(8))[0] + + def read_unsigned_int(self): + return compat_struct_unpack('!I', self.read_bytes(4))[0] + + def read_unsigned_char(self): + return compat_struct_unpack('!B', self.read_bytes(1))[0] + + def read_string(self): + res = b'' + while True: + char = self.read_bytes(1) + if char == b'\x00': + break + res += char + return res + + def read_box_info(self): + """ + Read a box and return the info as a tuple: (box_size, box_type, box_data) + """ + real_size = size = self.read_unsigned_int() + box_type = self.read_bytes(4) + header_end = 8 + if size == 1: + real_size = self.read_unsigned_long_long() + header_end = 16 + return real_size, box_type, self.read_bytes(real_size - header_end) + + def read_asrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + quality_entry_count = self.read_unsigned_char() + # QualityEntryCount + for i in range(quality_entry_count): + self.read_string() + + segment_run_count = self.read_unsigned_int() + segments = [] + for i in range(segment_run_count): + first_segment = self.read_unsigned_int() + fragments_per_segment = self.read_unsigned_int() + segments.append((first_segment, fragments_per_segment)) + + return { + 'segment_run': segments, + } + + def read_afrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + # time scale + self.read_unsigned_int() + + quality_entry_count = self.read_unsigned_char() + # QualitySegmentUrlModifiers + for i in range(quality_entry_count): + self.read_string() + + fragments_count = self.read_unsigned_int() + fragments = [] + for i in range(fragments_count): + first = self.read_unsigned_int() + first_ts = self.read_unsigned_long_long() + duration = self.read_unsigned_int() 
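+            # Per the Adobe F4V bootstrap format referenced in the class
+            # docstring, DURATION == 0 is a special case: the entry carries one
+            # extra DiscontinuityIndicator byte and marks a gap or end-of-stream
+            # rather than a real fragment.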
+ if duration == 0: + discontinuity_indicator = self.read_unsigned_char() + else: + discontinuity_indicator = None + fragments.append({ + 'first': first, + 'ts': first_ts, + 'duration': duration, + 'discontinuity_indicator': discontinuity_indicator, + }) + + return { + 'fragments': fragments, + } + + def read_abst(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + + self.read_unsigned_int() # BootstrapinfoVersion + # Profile,Live,Update,Reserved + flags = self.read_unsigned_char() + live = flags & 0x20 != 0 + # time scale + self.read_unsigned_int() + # CurrentMediaTime + self.read_unsigned_long_long() + # SmpteTimeCodeOffset + self.read_unsigned_long_long() + + self.read_string() # MovieIdentifier + server_count = self.read_unsigned_char() + # ServerEntryTable + for i in range(server_count): + self.read_string() + quality_count = self.read_unsigned_char() + # QualityEntryTable + for i in range(quality_count): + self.read_string() + # DrmData + self.read_string() + # MetaData + self.read_string() + + segments_count = self.read_unsigned_char() + segments = [] + for i in range(segments_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'asrt' + segment = FlvReader(box_data).read_asrt() + segments.append(segment) + fragments_run_count = self.read_unsigned_char() + fragments = [] + for i in range(fragments_run_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'afrt' + fragments.append(FlvReader(box_data).read_afrt()) + + return { + 'segments': segments, + 'fragments': fragments, + 'live': live, + } + + def read_bootstrap_info(self): + total_size, box_type, box_data = self.read_box_info() + assert box_type == b'abst' + return FlvReader(box_data).read_abst() + + +def read_bootstrap_info(bootstrap_bytes): + return FlvReader(bootstrap_bytes).read_bootstrap_info() + + +def build_fragments_list(boot_info): + """ Return a list of (segment, fragment) for each fragment in the video """ + res = [] + segment_run_table = boot_info['segments'][0] + fragment_run_entry_table = boot_info['fragments'][0]['fragments'] + first_frag_number = fragment_run_entry_table[0]['first'] + fragments_counter = itertools.count(first_frag_number) + for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. 
It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 + for _ in range(fragments_count): + res.append((segment, next(fragments_counter))) + + if boot_info['live']: + res = res[-2:] + + return res + + +def write_unsigned_int(stream, val): + stream.write(compat_struct_pack('!I', val)) + + +def write_unsigned_int_24(stream, val): + stream.write(compat_struct_pack('!I', val)[1:]) + + +def write_flv_header(stream): + """Writes the FLV header to stream""" + # FLV header + stream.write(b'FLV\x01') + stream.write(b'\x05') + stream.write(b'\x00\x00\x00\x09') + stream.write(b'\x00\x00\x00\x00') + + +def write_metadata_tag(stream, metadata): + """Writes optional metadata tag to stream""" + SCRIPT_TAG = b'\x12' + FLV_TAG_HEADER_LEN = 11 + + if metadata: + stream.write(SCRIPT_TAG) + write_unsigned_int_24(stream, len(metadata)) + stream.write(b'\x00\x00\x00\x00\x00\x00\x00') + stream.write(metadata) + write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) + + +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib + and 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url + + +class F4mFD(FragmentFD): + """ + A downloader for f4m manifests or AdobeHDS. + """ + + FD_NAME = 'f4m' + + def _get_unencrypted_media(self, doc): + media = doc.findall(_add_ns('media')) + if not media: + self.report_error('No media found') + if not self.params.get('allow_unplayable_formats'): + for e in (doc.findall(_add_ns('drmAdditionalHeader')) + + doc.findall(_add_ns('drmAdditionalHeaderSet'))): + # If id attribute is missing it's valid for all media nodes + # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute + if 'id' not in e.attrib: + self.report_error('Missing ID in f4m DRM') + media = remove_encrypted_media(media) + if not media: + self.report_error('Unsupported DRM') + return media + + def _get_bootstrap_from_url(self, bootstrap_url): + bootstrap = self.ydl.urlopen(bootstrap_url).read() + return read_bootstrap_info(bootstrap) + + def _update_live_fragments(self, bootstrap_url, latest_fragment): + fragments_list = [] + retries = 30 + while (not fragments_list) and (retries > 0): + boot_info = self._get_bootstrap_from_url(bootstrap_url) + fragments_list = build_fragments_list(boot_info) + fragments_list = [f for f in fragments_list if f[1] > latest_fragment] + if not fragments_list: + # Retry after a while + time.sleep(5.0) + retries -= 1 + + if not fragments_list: + self.report_error('Failed to update fragments') + + return fragments_list + + def _parse_bootstrap_node(self, node, base_url): + # Sometimes non empty inline bootstrap info can be specified along + # with bootstrap url attribute (e.g. dummy inline bootstrap info + # contains whitespace characters in [1]). We will prefer bootstrap + # url over inline bootstrap info when present. + # 1. 
http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m + bootstrap_url = node.get('url') + if bootstrap_url: + bootstrap_url = compat_urlparse.urljoin( + base_url, bootstrap_url) + boot_info = self._get_bootstrap_from_url(bootstrap_url) + else: + bootstrap_url = None + bootstrap = compat_b64decode(node.text) + boot_info = read_bootstrap_info(bootstrap) + return boot_info, bootstrap_url + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + requested_bitrate = info_dict.get('tbr') + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 + # and https://github.com/ytdl-org/youtube-dl/issues/7823) + manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() + + doc = compat_etree_fromstring(manifest) + formats = [(int(f.attrib.get('bitrate', -1)), f) + for f in self._get_unencrypted_media(doc)] + if requested_bitrate is None or len(formats) == 1: + # get the best format + formats = sorted(formats, key=lambda f: f[0]) + rate, media = formats[-1] + else: + rate, media = list(filter( + lambda f: int(f[0]) == requested_bitrate, formats))[0] + + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) + bootstrap_node = doc.find(_add_ns('bootstrapInfo')) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) + live = boot_info['live'] + metadata_node = media.find(_add_ns('metadata')) + if metadata_node is not None: + metadata = compat_b64decode(metadata_node.text) + else: + metadata = None + + fragments_list = build_fragments_list(boot_info) + test = self.params.get('test', False) + if test: + # We only download the first fragment + fragments_list = fragments_list[:1] + total_frags = len(fragments_list) + # For some akamai manifests we'll need to add a query to the fragment url + akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) + + ctx = { + 'filename': filename, + 'total_frags': total_frags, + 'live': live, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] + + if ctx['complete_frags_downloaded_bytes'] == 0: + write_flv_header(dest_stream) + if not live: + write_metadata_tag(dest_stream, metadata) + + base_url_parsed = compat_urllib_parse_urlparse(base_url) + + self._start_frag_download(ctx, info_dict) + + frag_index = 0 + while fragments_list: + seg_i, frag_i = fragments_list.pop(0) + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + name = 'Seg%d-Frag%d' % (seg_i, frag_i) + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) + if akamai_pv: + query.append(akamai_pv.strip(';')) + if info_dict.get('extra_param_to_segment_url'): + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) + try: + success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + if not success: + return False + reader = FlvReader(down_data) + while True: + try: + _, box_type, box_data = reader.read_box_info() + except DataTruncatedError: + if test: + # In tests, segments may be truncated, and thus + # 
FlvReader may not be able to parse the whole + # chunk. If so, write the segment as is + # See https://github.com/ytdl-org/youtube-dl/issues/9214 + dest_stream.write(down_data) + break + raise + if box_type == b'mdat': + self._append_fragment(ctx, box_data) + break + except (compat_urllib_error.HTTPError, ) as err: + if live and (err.code == 404 or err.code == 410): + # We didn't keep up with the live window. Continue + # with the next available fragment. + msg = 'Fragment %d unavailable' % frag_i + self.report_warning(msg) + fragments_list = [] + else: + raise + + if not fragments_list and not test and live and bootstrap_url: + fragments_list = self._update_live_fragments(bootstrap_url, frag_i) + total_frags += len(fragments_list) + if fragments_list and (fragments_list[0][1] > frag_i + 1): + msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) + self.report_warning(msg) + + self._finish_frag_download(ctx, info_dict) + + return True diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py new file mode 100644 index 000000000..d0eaede7e --- /dev/null +++ b/yt_dlp/downloader/fragment.py @@ -0,0 +1,486 @@ +from __future__ import division, unicode_literals + +import os +import time +import json +from math import ceil + +try: + import concurrent.futures + can_threaded_download = True +except ImportError: + can_threaded_download = False + +from .common import FileDownloader +from .http import HttpFD +from ..aes import aes_cbc_decrypt_bytes +from ..compat import ( + compat_urllib_error, + compat_struct_pack, +) +from ..utils import ( + DownloadError, + error_to_compat_str, + encodeFilename, + sanitize_open, + sanitized_Request, +) + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH + and hlsnative only) + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative only) + keep_fragments: Keep downloaded fragments on disk after downloading is + finished + _no_ytdl_file: Don't use .ytdl file + + For each incomplete fragment download yt-dlp keeps on disk a special + bookkeeping file with download state and metadata (in future such files will + be used for any incomplete download handled by yt-dlp). This file is + used to properly handle resuming, check download file consistency and detect + potential errors. The file has a .ytdl extension and represents a standard + JSON file of the following format: + + extractor: + Dictionary of extractor related data. TBD. + + downloader: + Dictionary of downloader related data. May contain following data: + current_fragment: + Dictionary with current (being downloaded) fragment data: + index: 0-based index of current fragment among all fragments + fragment_count: + Total count of fragments + + This feature is experimental and file format may change in future. + """ + + def report_retry_fragment(self, err, frag_index, count, retries): + self.to_screen( + '\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...' + % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) + + def report_skip_fragment(self, frag_index): + self.to_screen('[download] Skipping fragment %d ...' 
% frag_index) + + def _prepare_url(self, info_dict, url): + headers = info_dict.get('http_headers') + return sanitized_Request(url, None, headers) if headers else url + + def _prepare_and_start_frag_download(self, ctx, info_dict): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx, info_dict) + + def __do_ytdl_file(self, ctx): + return not ctx['live'] and not ctx['tmpfilename'] == '-' and not self.params.get('_no_ytdl_file') + + def _read_ytdl_file(self, ctx): + assert 'ytdl_corrupt' not in ctx + stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') + try: + ytdl_data = json.loads(stream.read()) + ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index'] + if 'extra_state' in ytdl_data['downloader']: + ctx['extra_state'] = ytdl_data['downloader']['extra_state'] + except Exception: + ctx['ytdl_corrupt'] = True + finally: + stream.close() + + def _write_ytdl_file(self, ctx): + frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') + try: + downloader = { + 'current_fragment': { + 'index': ctx['fragment_index'], + }, + } + if 'extra_state' in ctx: + downloader['extra_state'] = ctx['extra_state'] + if ctx.get('fragment_count') is not None: + downloader['fragment_count'] = ctx['fragment_count'] + frag_index_stream.write(json.dumps({'downloader': downloader})) + finally: + frag_index_stream.close() + + def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_data=None): + fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) + fragment_info_dict = { + 'url': frag_url, + 'http_headers': headers or info_dict.get('http_headers'), + 'request_data': request_data, + 'ctx_id': ctx.get('ctx_id'), + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) + if not success: + return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') + ctx['fragment_filename_sanitized'] = fragment_filename + return True, self._read_fragment(ctx) + + def _read_fragment(self, ctx): + down, frag_sanitized = sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + ctx['fragment_filename_sanitized'] = frag_sanitized + frag_content = down.read() + down.close() + return frag_content + + def _append_fragment(self, ctx, frag_content): + try: + ctx['dest_stream'].write(frag_content) + ctx['dest_stream'].flush() + finally: + if self.__do_ytdl_file(ctx): + self._write_ytdl_file(ctx) + if not self.params.get('keep_fragments', False): + os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) + del ctx['fragment_filename_sanitized'] + + def _prepare_frag_download(self, ctx): + if 'live' not in ctx: + ctx['live'] = False + if not ctx['live']: + total_frags_str = '%d' % ctx['total_frags'] + ad_frags = ctx.get('ad_frags', 0) + if ad_frags: + total_frags_str += ' (not including %d ad)' % ad_frags + else: + total_frags_str = 'unknown (live)' + self.to_screen( + '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader( + self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'ratelimit': self.params.get('ratelimit'), + 'retries': self.params.get('retries', 0), + 'nopart': self.params.get('nopart', False), + 'test': self.params.get('test', False), + } + ) + tmpfilename = self.temp_name(ctx['filename']) + open_mode = 'wb' + resume_len = 0 + + # Establish possible resume length + if os.path.isfile(encodeFilename(tmpfilename)): + open_mode = 'ab' + 
resume_len = os.path.getsize(encodeFilename(tmpfilename)) + + # Should be initialized before ytdl file check + ctx.update({ + 'tmpfilename': tmpfilename, + 'fragment_index': 0, + }) + + if self.__do_ytdl_file(ctx): + if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + self._read_ytdl_file(ctx) + is_corrupt = ctx.get('ytdl_corrupt') is True + is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 + if is_corrupt or is_inconsistent: + message = ( + '.ytdl file is corrupt' if is_corrupt else + 'Inconsistent state of incomplete fragment download') + self.report_warning( + '%s. Restarting from the beginning ...' % message) + ctx['fragment_index'] = resume_len = 0 + if 'ytdl_corrupt' in ctx: + del ctx['ytdl_corrupt'] + self._write_ytdl_file(ctx) + else: + self._write_ytdl_file(ctx) + assert ctx['fragment_index'] == 0 + + dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) + + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + # Total complete fragments downloaded so far in bytes + 'complete_frags_downloaded_bytes': resume_len, + }) + + def _start_frag_download(self, ctx, info_dict): + resume_len = ctx['complete_frags_downloaded_bytes'] + total_frags = ctx['total_frags'] + ctx_id = ctx.get('ctx_id') + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'status': 'downloading', + 'downloaded_bytes': resume_len, + 'fragment_index': ctx['fragment_index'], + 'fragment_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + + start = time.time() + ctx.update({ + 'started': start, + # Amount of fragment's bytes downloaded by the time of the previous + # frag progress hook invocation + 'prev_frag_downloaded_bytes': 0, + }) + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + if ctx_id is not None and s.get('ctx_id') != ctx_id: + return + + state['max_progress'] = ctx.get('max_progress') + state['progress_idx'] = ctx.get('progress_idx') + + time_now = time.time() + state['elapsed'] = time_now - start + frag_total_bytes = s.get('total_bytes') or 0 + s['fragment_info_dict'] = s.pop('info_dict', {}) + if not ctx['live']: + estimated_size = ( + (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) + / (state['fragment_index'] + 1) * total_frags) + state['total_bytes_estimate'] = estimated_size + + if s['status'] == 'finished': + state['fragment_index'] += 1 + ctx['fragment_index'] = state['fragment_index'] + state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] + ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['prev_frag_downloaded_bytes'] = 0 + else: + frag_downloaded_bytes = s['downloaded_bytes'] + state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] + if not ctx['live']: + state['eta'] = self.calc_eta( + start, time_now, estimated_size - resume_len, + state['downloaded_bytes'] - resume_len) + state['speed'] = s.get('speed') or ctx.get('speed') + ctx['speed'] = state['speed'] + ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes + self._hook_progress(state, info_dict) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return start + + def _finish_frag_download(self, ctx, info_dict): + ctx['dest_stream'].close() + if self.__do_ytdl_file(ctx): + ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) + if os.path.isfile(ytdl_filename): + os.remove(ytdl_filename) + elapsed = time.time() - 
ctx['started'] + + if ctx['tmpfilename'] == '-': + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + else: + self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass + downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + + self._hook_progress({ + 'downloaded_bytes': downloaded_bytes, + 'total_bytes': downloaded_bytes, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + 'ctx_id': ctx.get('ctx_id'), + 'max_progress': ctx.get('max_progress'), + 'progress_idx': ctx.get('progress_idx'), + }, info_dict) + + def _prepare_external_frag_download(self, ctx): + if 'live' not in ctx: + ctx['live'] = False + if not ctx['live']: + total_frags_str = '%d' % ctx['total_frags'] + ad_frags = ctx.get('ad_frags', 0) + if ad_frags: + total_frags_str += ' (not including %d ad)' % ad_frags + else: + total_frags_str = 'unknown (live)' + self.to_screen( + '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) + + tmpfilename = self.temp_name(ctx['filename']) + + # Should be initialized before ytdl file check + ctx.update({ + 'tmpfilename': tmpfilename, + 'fragment_index': 0, + }) + + def decrypter(self, info_dict): + _key_cache = {} + + def _get_key(url): + if url not in _key_cache: + _key_cache[url] = self.ydl.urlopen(self._prepare_url(info_dict, url)).read() + return _key_cache[url] + + def decrypt_fragment(fragment, frag_content): + decrypt_info = fragment.get('decrypt_info') + if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': + return frag_content + iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', fragment['media_sequence']) + decrypt_info['KEY'] = decrypt_info.get('KEY') or _get_key(info_dict.get('_decryption_key_url') or decrypt_info['URI']) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if self.params.get('test', False): + return frag_content + padding_len = 16 - (len(frag_content) % 16) + decrypted_data = aes_cbc_decrypt_bytes(frag_content + bytes([padding_len] * padding_len), decrypt_info['KEY'], iv) + return decrypted_data[:-decrypted_data[-1]] + + return decrypt_fragment + + def download_and_append_fragments_multiple(self, *args, pack_func=None, finish_func=None): + ''' + @params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ... 
+ all args must be either tuple or list + ''' + max_progress = len(args) + if max_progress == 1: + return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) + max_workers = self.params.get('concurrent_fragment_downloads', max_progress) + self._prepare_multiline_status(max_progress) + + def thread_func(idx, ctx, fragments, info_dict, tpe): + ctx['max_progress'] = max_progress + ctx['progress_idx'] = idx + return self.download_and_append_fragments(ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, tpe=tpe) + + class FTPE(concurrent.futures.ThreadPoolExecutor): + # has to stop this or it's going to wait on the worker thread itself + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + spins = [] + for idx, (ctx, fragments, info_dict) in enumerate(args): + tpe = FTPE(ceil(max_workers / max_progress)) + job = tpe.submit(thread_func, idx, ctx, fragments, info_dict, tpe) + spins.append((tpe, job)) + + result = True + for tpe, job in spins: + try: + result = result and job.result() + finally: + tpe.shutdown(wait=True) + return result + + def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, tpe=None): + fragment_retries = self.params.get('fragment_retries', 0) + is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True) + if not pack_func: + pack_func = lambda frag_content, _: frag_content + + def download_fragment(fragment, ctx): + frag_index = ctx['fragment_index'] = fragment['frag_index'] + headers = info_dict.get('http_headers', {}).copy() + byte_range = fragment.get('byte_range') + if byte_range: + headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) + + # Never skip the first fragment + fatal = is_fatal(fragment.get('index') or (frag_index - 1)) + count, frag_content = 0, None + while count <= fragment_retries: + try: + success, frag_content = self._download_fragment(ctx, fragment['url'], info_dict, headers) + if not success: + return False, frag_index + break + except compat_urllib_error.HTTPError as err: + # Unavailable (possibly temporary) fragments may be served. + # First we try to retry then either skip or abort. + # See https://github.com/ytdl-org/youtube-dl/issues/10165, + # https://github.com/ytdl-org/youtube-dl/issues/10448). 
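+                    # Illustrative numbers: with fragment_retries=3, a fragment
+                    # that keeps returning HTTP 404 is attempted 4 times in
+                    # total, then skipped when skip_unavailable_fragments is
+                    # enabled (the default) and this is not the first fragment;
+                    # otherwise the whole download is aborted.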
+ count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + except DownloadError: + # Don't retry fragment if error occurred during HTTP downloading + # itself since it has own retry settings + if not fatal: + break + raise + + if count > fragment_retries: + if not fatal: + return False, frag_index + ctx['dest_stream'].close() + self.report_error('Giving up after %s fragment retries' % fragment_retries) + return False, frag_index + return frag_content, frag_index + + def append_fragment(frag_content, frag_index, ctx): + if not frag_content: + if not is_fatal(frag_index - 1): + self.report_skip_fragment(frag_index) + return True + else: + ctx['dest_stream'].close() + self.report_error( + 'fragment %s not found, unable to continue' % frag_index) + return False + self._append_fragment(ctx, pack_func(frag_content, frag_index)) + return True + + decrypt_fragment = self.decrypter(info_dict) + + max_workers = self.params.get('concurrent_fragment_downloads', 1) + if can_threaded_download and max_workers > 1: + + def _download_fragment(fragment): + ctx_copy = ctx.copy() + frag_content, frag_index = download_fragment(fragment, ctx_copy) + return fragment, frag_content, frag_index, ctx_copy.get('fragment_filename_sanitized') + + self.report_warning('The download speed shown is only of one thread. This is a known issue and patches are welcome') + with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: + for fragment, frag_content, frag_index, frag_filename in pool.map(_download_fragment, fragments): + ctx['fragment_filename_sanitized'] = frag_filename + ctx['fragment_index'] = frag_index + result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) + if not result: + return False + else: + for fragment in fragments: + frag_content, frag_index = download_fragment(fragment, ctx) + result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) + if not result: + return False + + if finish_func is not None: + ctx['dest_stream'].write(finish_func()) + ctx['dest_stream'].flush() + self._finish_frag_download(ctx, info_dict) + return True diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py new file mode 100644 index 000000000..61312c5ba --- /dev/null +++ b/yt_dlp/downloader/hls.py @@ -0,0 +1,349 @@ +from __future__ import unicode_literals + +import re +import io +import binascii + +from ..downloader import get_suitable_downloader +from .fragment import FragmentFD +from .external import FFmpegFD + +from ..compat import ( + compat_pycrypto_AES, + compat_urlparse, +) +from ..utils import ( + parse_m3u8_attributes, + update_url_query, + bug_reports_message, +) +from .. import webvtt + + +class HlsFD(FragmentFD): + """ + Download segments in a m3u8 manifest. External downloaders can take over + the fragment downloads by supporting the 'm3u8_frag_urls' protocol and + re-defining 'supports_manifest' function + """ + + FD_NAME = 'hlsnative' + + @staticmethod + def can_download(manifest, info_dict, allow_unplayable_formats=False): + UNSUPPORTED_FEATURES = [ + # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + + # Live streams heuristic does not always work (e.g. 
geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + + # This heuristic also is not correct since segments may not be appended as well. + # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite + # no segments will definitely be appended to the end of the playlist. + # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # # event media playlists [4] + # r'#EXT-X-MAP:', # media initialization [5] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 + ] + if not allow_unplayable_formats: + UNSUPPORTED_FEATURES += [ + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] + ] + + def check_results(): + yield not info_dict.get('is_live') + for feature in UNSUPPORTED_FEATURES: + yield not re.search(feature, manifest) + return all(check_results()) + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + s = urlh.read().decode('utf-8', 'ignore') + + can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None + if can_download and not compat_pycrypto_AES and '#EXT-X-KEY:METHOD=AES-128' in s: + if FFmpegFD.available(): + can_download, message = False, 'The stream has AES-128 encryption and pycryptodomex is not available' + else: + message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodomex are available; ' + 'Decryption will be performed natively, but will be extremely slow') + if not can_download: + message = message or 'Unsupported features have been detected' + fd = FFmpegFD(self.ydl, self.params) + self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}') + return fd.real_download(filename, info_dict) + elif message: + self.report_warning(message) + + is_webvtt = info_dict['ext'] == 'vtt' + if is_webvtt: + real_downloader = None # Packing the fragments is not currently supported for external downloader + else: + real_downloader = get_suitable_downloader( + info_dict, self.params, None, protocol='m3u8_frag_urls', to_stdout=(filename == '-')) + if real_downloader and not real_downloader.supports_manifest(s): + real_downloader = None + if real_downloader: + self.to_screen( + '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + + def is_ad_fragment_start(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + + def is_ad_fragment_end(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) + + fragments = [] + + media_frags = 0 + ad_frags = 0 + ad_frag_next = False + for line in s.splitlines(): + line = line.strip() + if not line: + continue + if line.startswith('#'): + if is_ad_fragment_start(line): + ad_frag_next = True 
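+                # e.g. an "#ANVATO-SEGMENT-INFO" line with "type=ad", or an
+                # "#UPLYNK-SEGMENT" line ending in ",ad", opens an ad break;
+                # the matching "type=master" / ",segment" marker closes it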
+ elif is_ad_fragment_end(line): + ad_frag_next = False + continue + if ad_frag_next: + ad_frags += 1 + continue + media_frags += 1 + + ctx = { + 'filename': filename, + 'total_frags': media_frags, + 'ad_frags': ad_frags, + } + + if real_downloader: + self._prepare_external_frag_download(ctx) + else: + self._prepare_and_start_frag_download(ctx, info_dict) + + extra_state = ctx.setdefault('extra_state', {}) + + format_index = info_dict.get('format_index') + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) + i = 0 + media_sequence = 0 + decrypt_info = {'METHOD': 'NONE'} + byte_range = {} + discontinuity_count = 0 + frag_index = 0 + ad_frag_next = False + for line in s.splitlines(): + line = line.strip() + if line: + if not line.startswith('#'): + if format_index and discontinuity_count != format_index: + continue + if ad_frag_next: + continue + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + frag_url = ( + line + if re.match(r'^https?://', line) + else compat_urlparse.urljoin(man_url, line)) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + + fragments.append({ + 'frag_index': frag_index, + 'url': frag_url, + 'decrypt_info': decrypt_info, + 'byte_range': byte_range, + 'media_sequence': media_sequence, + }) + media_sequence += 1 + + elif line.startswith('#EXT-X-MAP'): + if format_index and discontinuity_count != format_index: + continue + if frag_index > 0: + self.report_error( + 'Initialization fragment found after media fragments, unable to download') + return False + frag_index += 1 + map_info = parse_m3u8_attributes(line[11:]) + frag_url = ( + map_info.get('URI') + if re.match(r'^https?://', map_info.get('URI')) + else compat_urlparse.urljoin(man_url, map_info.get('URI'))) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + + fragments.append({ + 'frag_index': frag_index, + 'url': frag_url, + 'decrypt_info': decrypt_info, + 'byte_range': byte_range, + 'media_sequence': media_sequence + }) + media_sequence += 1 + + if map_info.get('BYTERANGE'): + splitted_byte_range = map_info.get('BYTERANGE').split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + + elif line.startswith('#EXT-X-KEY'): + decrypt_url = decrypt_info.get('URI') + decrypt_info = parse_m3u8_attributes(line[11:]) + if decrypt_info['METHOD'] == 'AES-128': + if 'IV' in decrypt_info: + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) + if not re.match(r'^https?://', decrypt_info['URI']): + decrypt_info['URI'] = compat_urlparse.urljoin( + man_url, decrypt_info['URI']) + if extra_query: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) + if decrypt_url != decrypt_info['URI']: + decrypt_info['KEY'] = None + + elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): + media_sequence = int(line[22:]) + elif line.startswith('#EXT-X-BYTERANGE'): + splitted_byte_range = line[17:].split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + elif is_ad_fragment_start(line): + ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False + elif 
line.startswith('#EXT-X-DISCONTINUITY'): + discontinuity_count += 1 + i += 1 + + # We only download the first fragment during the test + if self.params.get('test', False): + fragments = [fragments[0] if fragments else None] + + if real_downloader: + info_dict['fragments'] = fragments + fd = real_downloader(self.ydl, self.params) + # TODO: Make progress updates work without hooking twice + # for ph in self._progress_hooks: + # fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + + if is_webvtt: + def pack_fragment(frag_content, frag_index): + output = io.StringIO() + adjust = 0 + overflow = False + mpegts_last = None + for block in webvtt.parse_fragment(frag_content): + if isinstance(block, webvtt.CueBlock): + extra_state['webvtt_mpegts_last'] = mpegts_last + if overflow: + extra_state['webvtt_mpegts_adjust'] += 1 + overflow = False + block.start += adjust + block.end += adjust + + dedup_window = extra_state.setdefault('webvtt_dedup_window', []) + + ready = [] + + i = 0 + is_new = True + while i < len(dedup_window): + wcue = dedup_window[i] + wblock = webvtt.CueBlock.from_json(wcue) + i += 1 + if wblock.hinges(block): + wcue['end'] = block.end + is_new = False + continue + if wblock == block: + is_new = False + continue + if wblock.end > block.start: + continue + ready.append(wblock) + i -= 1 + del dedup_window[i] + + if is_new: + dedup_window.append(block.as_json) + for block in ready: + block.write_into(output) + + # we only emit cues once they fall out of the duplicate window + continue + elif isinstance(block, webvtt.Magic): + # take care of MPEG PES timestamp overflow + if block.mpegts is None: + block.mpegts = 0 + extra_state.setdefault('webvtt_mpegts_adjust', 0) + block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33 + if block.mpegts < extra_state.get('webvtt_mpegts_last', 0): + overflow = True + block.mpegts += 1 << 33 + mpegts_last = block.mpegts + + if frag_index == 1: + extra_state['webvtt_mpegts'] = block.mpegts or 0 + extra_state['webvtt_local'] = block.local or 0 + # XXX: block.local = block.mpegts = None ? 
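+                        # For later fragments, the offset applied to cues is
+                        # computed below as (delta mpegts - delta local) against
+                        # fragment 1. Assuming the webvtt module keeps both on
+                        # the 90 kHz MPEG-TS clock, a difference of 90000 ticks
+                        # shifts the cues by exactly one second.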
+ else: + if block.mpegts is not None and block.local is not None: + adjust = ( + (block.mpegts - extra_state.get('webvtt_mpegts', 0)) + - (block.local - extra_state.get('webvtt_local', 0)) + ) + continue + elif isinstance(block, webvtt.HeaderBlock): + if frag_index != 1: + # XXX: this should probably be silent as well + # or verify that all segments contain the same data + self.report_warning(bug_reports_message( + 'Discarding a %s block found in the middle of the stream; ' + 'if the subtitles display incorrectly,' + % (type(block).__name__))) + continue + block.write_into(output) + + return output.getvalue().encode('utf-8') + + def fin_fragments(): + dedup_window = extra_state.get('webvtt_dedup_window') + if not dedup_window: + return b'' + + output = io.StringIO() + for cue in dedup_window: + webvtt.CueBlock.from_json(cue).write_into(output) + + return output.getvalue().encode('utf-8') + + self.download_and_append_fragments( + ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments) + else: + return self.download_and_append_fragments(ctx, fragments, info_dict) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py new file mode 100644 index 000000000..2e95bb9d1 --- /dev/null +++ b/yt_dlp/downloader/http.py @@ -0,0 +1,385 @@ +from __future__ import unicode_literals + +import errno +import os +import socket +import time +import random +import re + +from .common import FileDownloader +from ..compat import ( + compat_str, + compat_urllib_error, +) +from ..utils import ( + ContentTooShortError, + encodeFilename, + int_or_none, + sanitize_open, + sanitized_Request, + ThrottledDownload, + write_xattr, + XAttrMetadataError, + XAttrUnavailableError, +) + + +class HttpFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + request_data = info_dict.get('request_data', None) + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None + + # Do not include the Accept-Encoding header + headers = {'Youtubedl-no-compression': 'True'} + add_headers = info_dict.get('http_headers') + if add_headers: + headers.update(add_headers) + + is_test = self.params.get('test', False) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + self.params.get('http_chunk_size') + or info_dict.get('downloader_options', {}).get('http_chunk_size') + or 0) + + ctx.open_mode = 'wb' + ctx.resume_len = 0 + ctx.data_len = None + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() + ctx.chunk_size = None + throttle_start = None + + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize( + encodeFilename(ctx.tmpfilename)) + + ctx.is_resume = ctx.resume_len > 0 + + count = 0 + retries = self.params.get('retries', 0) + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + class NextFragment(Exception): + pass + + def set_range(req, start, end): + range_header = 'bytes=%d-' % start + if end: + range_header += compat_str(end) + req.add_header('Range', range_header) + + def establish_connection(): + ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) + if not is_test and chunk_size else chunk_size) + if ctx.resume_len > 0: + 
range_start = ctx.resume_len + if ctx.is_resume: + self.report_resuming_byte(ctx.resume_len) + ctx.open_mode = 'ab' + elif ctx.chunk_size > 0: + range_start = 0 + else: + range_start = None + ctx.is_resume = False + range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None + if range_end and ctx.data_len is not None and range_end >= ctx.data_len: + range_end = ctx.data_len - 1 + has_range = range_start is not None + ctx.has_range = has_range + request = sanitized_Request(url, request_data, headers) + if has_range: + set_range(request, range_start, range_end) + # Establish connection + try: + try: + ctx.data = self.ydl.urlopen(request) + except (compat_urllib_error.URLError, ) as err: + # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): + raise RetryDownload(err) + raise err + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. This is due to a webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite of requested Range (see + # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) + if has_range: + content_range = ctx.data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) + # Content-Range is present and matches requested Range, resume is possible + if content_range_m: + if range_start == int(content_range_m.group(1)): + content_range_end = int_or_none(content_range_m.group(2)) + content_len = int_or_none(content_range_m.group(3)) + accept_content_len = ( + # Non-chunked download + not ctx.chunk_size + # Chunked download and requested piece or + # its part is promised to be served + or content_range_end == range_end + or content_len < range_end) + if accept_content_len: + ctx.data_len = content_len + return + # Content-Range is either not present or invalid. Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) + return + except (compat_urllib_error.HTTPError, ) as err: + if err.code == 416: + # Unable to resume (requested range not satisfiable) + try: + # Open the connection again without the range header + ctx.data = self.ydl.urlopen( + sanitized_Request(url, request_data, headers)) + content_length = ctx.data.info()['Content-Length'] + except (compat_urllib_error.HTTPError, ) as err: + if err.code < 500 or err.code >= 600: + raise + else: + # Examine the reported length + if (content_length is not None + and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): + # The file had already been fully downloaded. + # Explanation to the above condition: in issue #175 it was revealed that + # YouTube sometimes adds or removes a few bytes from the end of the file, + # changing the file size slightly and causing problems for some users. So + # I decided to implement a suggested change and consider the file + # completely downloaded if the file size differs less than 100 bytes from + # the one in the hard drive. 
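+                            # Worked example: resume_len of 10_000_050 with a
+                            # reported Content-Length of 10_000_000 falls inside
+                            # the +/-100 byte window and is treated as complete.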
+ self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) + self._hook_progress({ + 'filename': ctx.filename, + 'status': 'finished', + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, + }, info_dict) + raise SucceedDownload() + else: + # The length does not match, we start the download over + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + elif err.code < 500 or err.code >= 600: + # Unexpected HTTP error + raise + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: + # Connection reset is no problem, just retry + raise + raise RetryDownload(err) + + def download(): + nonlocal throttle_start + data_len = ctx.data.info().get('Content-length', None) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) + return False + + byte_counter = 0 + ctx.resume_len + block_size = ctx.block_size + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + to_stdout = ctx.tmpfilename == '-' + if ctx.stream is not None: + if not to_stdout: + ctx.stream.close() + ctx.stream = None + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + # SSLError on python 2 (inherits socket.error) may have + # no errno set but this error message + if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out': + retry(e) + raise + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: + try: + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False + + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', 
str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if ctx.data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': ctx.data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - ctx.start_time, + 'ctx_id': info_dict.get('ctx_id'), + }, info_dict) + + if data_len is not None and byte_counter == data_len: + break + + if speed and speed < (self.params.get('throttledratelimit') or 0): + # The speed must stay below the limit for 3 seconds + # This prevents raising error when the speed temporarily goes down + if throttle_start is None: + throttle_start = now + elif now - throttle_start > 3: + if ctx.stream is not None and ctx.tmpfilename != '-': + ctx.stream.close() + raise ThrottledDownload() + elif speed: + throttle_start = None + + if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + ctx.resume_len = byte_counter + # ctx.block_size = block_size + raise NextFragment() + + if ctx.stream is None: + self.to_stderr('\n') + self.report_error('Did not get any data blocks') + return False + if ctx.tmpfilename != '-': + ctx.stream.close() + + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err + + self.try_rename(ctx.tmpfilename, ctx.filename) + + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - ctx.start_time, + 'ctx_id': info_dict.get('ctx_id'), + }, info_dict) + + return True + + while count <= retries: + try: + establish_connection() + return download() + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + else: + self.to_screen(f'[download] Got server HTTP error: {e.source_error}') + continue + except NextFragment: + continue + except SucceedDownload: + return True + + self.report_error('giving up after %s retries' % retries) + return False diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py new file mode 100644 index 000000000..09516abe5 --- /dev/null +++ b/yt_dlp/downloader/ism.py @@ -0,0 +1,289 @@ +from __future__ import unicode_literals + +import time +import binascii +import io + +from .fragment import FragmentFD +from ..compat import ( + compat_Struct, + compat_urllib_error, +) + + +u8 = compat_Struct('>B') +u88 = compat_Struct('>Bx') +u16 = 
compat_Struct('>H') +u1616 = compat_Struct('>Hxx') +u32 = compat_Struct('>I') +u64 = compat_Struct('>Q') + +s88 = compat_Struct('>bx') +s16 = compat_Struct('>h') +s1616 = compat_Struct('>hxx') +s32 = compat_Struct('>i') + +unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) + +TRACK_ENABLED = 0x1 +TRACK_IN_MOVIE = 0x2 +TRACK_IN_PREVIEW = 0x4 + +SELF_CONTAINED = 0x1 + + +def box(box_type, payload): + return u32.pack(8 + len(payload)) + box_type + payload + + +def full_box(box_type, version, flags, payload): + return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload) + + +def write_piff_header(stream, params): + track_id = params['track_id'] + fourcc = params['fourcc'] + duration = params['duration'] + timescale = params.get('timescale', 10000000) + language = params.get('language', 'und') + height = params.get('height', 0) + width = params.get('width', 0) + stream_type = params['stream_type'] + creation_time = modification_time = int(time.time()) + + ftyp_payload = b'isml' # major brand + ftyp_payload += u32.pack(1) # minor version + ftyp_payload += b'piff' + b'iso2' # compatible brands + stream.write(box(b'ftyp', ftyp_payload)) # File Type Box + + mvhd_payload = u64.pack(creation_time) + mvhd_payload += u64.pack(modification_time) + mvhd_payload += u32.pack(timescale) + mvhd_payload += u64.pack(duration) + mvhd_payload += s1616.pack(1) # rate + mvhd_payload += s88.pack(1) # volume + mvhd_payload += u16.pack(0) # reserved + mvhd_payload += u32.pack(0) * 2 # reserved + mvhd_payload += unity_matrix + mvhd_payload += u32.pack(0) * 6 # pre defined + mvhd_payload += u32.pack(0xffffffff) # next track id + moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload) # Movie Header Box + + tkhd_payload = u64.pack(creation_time) + tkhd_payload += u64.pack(modification_time) + tkhd_payload += u32.pack(track_id) # track id + tkhd_payload += u32.pack(0) # reserved + tkhd_payload += u64.pack(duration) + tkhd_payload += u32.pack(0) * 2 # reserved + tkhd_payload += s16.pack(0) # layer + tkhd_payload += s16.pack(0) # alternate group + tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0) # volume + tkhd_payload += u16.pack(0) # reserved + tkhd_payload += unity_matrix + tkhd_payload += u1616.pack(width) + tkhd_payload += u1616.pack(height) + trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload) # Track Header Box + + mdhd_payload = u64.pack(creation_time) + mdhd_payload += u64.pack(modification_time) + mdhd_payload += u32.pack(timescale) + mdhd_payload += u64.pack(duration) + mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60)) + mdhd_payload += u16.pack(0) # pre defined + mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box + + hdlr_payload = u32.pack(0) # pre defined + if stream_type == 'audio': # handler type + hdlr_payload += b'soun' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'SoundHandler\0' # name + elif stream_type == 'video': + hdlr_payload += b'vide' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'VideoHandler\0' # name + elif stream_type == 'text': + hdlr_payload += b'subt' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'SubtitleHandler\0' # name + else: + assert False + mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box + + if stream_type == 'audio': + smhd_payload = s88.pack(0) # balance + smhd_payload += u16.pack(0) # reserved + 
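# One stream-specific media header is emitted per track, as mandated
# by ISO/IEC 14496-12: 'smhd' for audio, 'vmhd' for video (written
# with flags=1, which the spec requires) and 'sthd' for subtitles.
# The chosen header becomes the first child of 'minf', which is later
# nested into 'mdia' -> 'trak' -> 'moov'.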
media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header + elif stream_type == 'video': + vmhd_payload = u16.pack(0) # graphics mode + vmhd_payload += u16.pack(0) * 3 # opcolor + media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header + elif stream_type == 'text': + media_header_box = full_box(b'sthd', 0, 0, b'') # Subtitle Media Header + else: + assert False + minf_payload = media_header_box + + dref_payload = u32.pack(1) # entry count + dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'') # Data Entry URL Box + dinf_payload = full_box(b'dref', 0, 0, dref_payload) # Data Reference Box + minf_payload += box(b'dinf', dinf_payload) # Data Information Box + + stsd_payload = u32.pack(1) # entry count + + sample_entry_payload = u8.pack(0) * 6 # reserved + sample_entry_payload += u16.pack(1) # data reference index + if stream_type == 'audio': + sample_entry_payload += u32.pack(0) * 2 # reserved + sample_entry_payload += u16.pack(params.get('channels', 2)) + sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u1616.pack(params['sampling_rate']) + + if fourcc == 'AACL': + sample_entry_box = box(b'mp4a', sample_entry_payload) + elif stream_type == 'video': + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u32.pack(0) * 3 # pre defined + sample_entry_payload += u16.pack(width) + sample_entry_payload += u16.pack(height) + sample_entry_payload += u1616.pack(0x48) # horiz resolution 72 dpi + sample_entry_payload += u1616.pack(0x48) # vert resolution 72 dpi + sample_entry_payload += u32.pack(0) # reserved + sample_entry_payload += u16.pack(1) # frame count + sample_entry_payload += u8.pack(0) * 32 # compressor name + sample_entry_payload += u16.pack(0x18) # depth + sample_entry_payload += s16.pack(-1) # pre defined + + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) + if fourcc in ('H264', 'AVC1'): + sps, pps = codec_private_data.split(u32.pack(1))[1:] + avcc_payload = u8.pack(1) # configuration version + avcc_payload += sps[1:4] # avc profile indication + profile compatibility + avc level indication + avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete representation (1) + reserved (11111) + length size minus one + avcc_payload += u8.pack(1) # reserved (0) + number of sps (0000001) + avcc_payload += u16.pack(len(sps)) + avcc_payload += sps + avcc_payload += u8.pack(1) # number of pps + avcc_payload += u16.pack(len(pps)) + avcc_payload += pps + sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record + sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry + else: + assert False + elif stream_type == 'text': + if fourcc == 'TTML': + sample_entry_payload += b'http://www.w3.org/ns/ttml\0' # namespace + sample_entry_payload += b'\0' # schema location + sample_entry_payload += b'\0' # auxilary mime types(??) 
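The 'stpp' subtitle sample entry being assembled here, like every other structure in this writer, is plain ISO BMFF built from the box()/full_box() helpers defined at the top of the file. A minimal standalone sketch of the layout those helpers produce, together with the mdhd language packing used earlier in this function (plain struct.Struct stands in for compat_Struct; the assertions are only sanity checks):

import struct

u16 = struct.Struct('>H')
u32 = struct.Struct('>I')

def box(box_type, payload):
    # ISO BMFF box: 32-bit big-endian size (the 8-byte header counts
    # toward it), a FourCC type code, then the raw payload.
    return u32.pack(8 + len(payload)) + box_type + payload

def full_box(box_type, version, flags, payload):
    # A FullBox additionally carries a 1-byte version and 24-bit flags.
    return box(box_type, struct.pack('>B', version) + u32.pack(flags)[1:] + payload)

def pack_language(code):
    # mdhd packs three ISO 639-2 letters into 15 bits, 5 bits per
    # letter, biased by 0x60 so that 'a' maps to 1.
    a, b, c = (ord(ch) - 0x60 for ch in code)
    return u16.pack((a << 10) | (b << 5) | c)

assert len(box(b'free', b'')) == 8
assert pack_language('und') == u16.pack(0x55C4)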
+ sample_entry_box = box(b'stpp', sample_entry_payload) + else: + assert False + else: + assert False + stsd_payload += sample_entry_box + + stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box + + stts_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stts', 0, 0, stts_payload) # Decoding Time to Sample Box + + stsc_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stsc', 0, 0, stsc_payload) # Sample To Chunk Box + + stco_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stco', 0, 0, stco_payload) # Chunk Offset Box + + minf_payload += box(b'stbl', stbl_payload) # Sample Table Box + + mdia_payload += box(b'minf', minf_payload) # Media Information Box + + trak_payload += box(b'mdia', mdia_payload) # Media Box + + moov_payload += box(b'trak', trak_payload) # Track Box + + mehd_payload = u64.pack(duration) + mvex_payload = full_box(b'mehd', 1, 0, mehd_payload) # Movie Extends Header Box + + trex_payload = u32.pack(track_id) # track id + trex_payload += u32.pack(1) # default sample description index + trex_payload += u32.pack(0) # default sample duration + trex_payload += u32.pack(0) # default sample size + trex_payload += u32.pack(0) # default sample flags + mvex_payload += full_box(b'trex', 0, 0, trex_payload) # Track Extends Box + + moov_payload += box(b'mvex', mvex_payload) # Movie Extends Box + stream.write(box(b'moov', moov_payload)) # Movie Box + + +def extract_box_data(data, box_sequence): + data_reader = io.BytesIO(data) + while True: + box_size = u32.unpack(data_reader.read(4))[0] + box_type = data_reader.read(4) + if box_type == box_sequence[0]: + box_data = data_reader.read(box_size - 8) + if len(box_sequence) == 1: + return box_data + return extract_box_data(box_data, box_sequence[1:]) + data_reader.seek(box_size - 8, 1) + + +class IsmFD(FragmentFD): + """ + Download segments in a ISM manifest + """ + + FD_NAME = 'ism' + + def real_download(self, filename, info_dict): + segments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + + ctx = { + 'filename': filename, + 'total_frags': len(segments), + } + + self._prepare_and_start_frag_download(ctx, info_dict) + + extra_state = ctx.setdefault('extra_state', { + 'ism_track_written': False, + }) + + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + frag_index = 0 + for i, segment in enumerate(segments): + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + count = 0 + while count <= fragment_retries: + try: + success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) + if not success: + return False + if not extra_state['ism_track_written']: + tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) + info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] + write_piff_header(ctx['dest_stream'], info_dict['_download_params']) + extra_state['ism_track_written'] = True + self._append_fragment(ctx, frag_content) + break + except compat_urllib_error.HTTPError as err: + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + self.report_skip_fragment(frag_index) + continue + self.report_error('giving up after %s fragment retries' % fragment_retries) + return False + + self._finish_frag_download(ctx, info_dict) + + return True diff --git 
a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py new file mode 100644 index 000000000..b75db18a8 --- /dev/null +++ b/yt_dlp/downloader/mhtml.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import io +import quopri +import re +import uuid + +from .fragment import FragmentFD +from ..utils import ( + escapeHTML, + formatSeconds, + srt_subtitles_timecode, + urljoin, +) +from ..version import __version__ as YT_DLP_VERSION + + +class MhtmlFD(FragmentFD): + FD_NAME = 'mhtml' + + _STYLESHEET = """\ +html, body { + margin: 0; + padding: 0; + height: 100vh; +} + +html { + overflow-y: scroll; + scroll-snap-type: y mandatory; +} + +body { + scroll-snap-type: y mandatory; + display: flex; + flex-flow: column; +} + +body > figure { + max-width: 100vw; + max-height: 100vh; + scroll-snap-align: center; +} + +body > figure > figcaption { + text-align: center; + height: 2.5em; +} + +body > figure > img { + display: block; + margin: auto; + max-width: 100%; + max-height: calc(100vh - 5em); +} +""" + _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET) + _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET) + + @staticmethod + def _escape_mime(s): + return '=?utf-8?Q?' + (b''.join( + bytes((b,)) if b >= 0x20 else b'=%02X' % b + for b in quopri.encodestring(s.encode('utf-8'), header=True) + )).decode('us-ascii') + '?=' + + def _gen_cid(self, i, fragment, frag_boundary): + return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary) + + def _gen_stub(self, *, fragments, frag_boundary, title): + output = io.StringIO() + + output.write(( + '<!DOCTYPE html>' + '<html>' + '<head>' + '' '<meta name="generator" content="yt-dlp {version}">' + '' '<title>{title}</title>' + '' '<style>{styles}</style>' + '<body>' + ).format( + version=escapeHTML(YT_DLP_VERSION), + styles=self._STYLESHEET, + title=escapeHTML(title) + )) + + t0 = 0 + for i, frag in enumerate(fragments): + output.write('<figure>') + try: + t1 = t0 + frag['duration'] + output.write(( + '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>' + ).format( + num=i + 1, + t0=srt_subtitles_timecode(t0), + t1=srt_subtitles_timecode(t1), + duration=formatSeconds(frag['duration'], msec=True) + )) + except (KeyError, ValueError, TypeError): + t1 = None + output.write(( + '<figcaption>Slide #{num}</figcaption>' + ).format(num=i + 1)) + output.write('<img src="cid:{cid}">'.format( + cid=self._gen_cid(i, frag, frag_boundary))) + output.write('</figure>') + t0 = t1 + + return output.getvalue() + + def real_download(self, filename, info_dict): + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + title = info_dict['title'] + origin = info_dict['webpage_url'] + + ctx = { + 'filename': filename, + 'total_frags': len(fragments), + } + + self._prepare_and_start_frag_download(ctx, info_dict) + + extra_state = ctx.setdefault('extra_state', { + 'header_written': False, + 'mime_boundary': str(uuid.uuid4()).replace('-', ''), + }) + + frag_boundary = extra_state['mime_boundary'] + + if not extra_state['header_written']: + stub = self._gen_stub( + fragments=fragments, + frag_boundary=frag_boundary, + title=title + ) + + ctx['dest_stream'].write(( + 'MIME-Version: 1.0\r\n' + 'From: <nowhere@yt-dlp.github.io.invalid>\r\n' + 'To: <nowhere@yt-dlp.github.io.invalid>\r\n' + 'Subject: {title}\r\n' + 'Content-type: multipart/related; ' + '' 'boundary="{boundary}"; ' + '' 
'type="text/html"\r\n' + 'X.yt-dlp.Origin: {origin}\r\n' + '\r\n' + '--{boundary}\r\n' + 'Content-Type: text/html; charset=utf-8\r\n' + 'Content-Length: {length}\r\n' + '\r\n' + '{stub}\r\n' + ).format( + origin=origin, + boundary=frag_boundary, + length=len(stub), + title=self._escape_mime(title), + stub=stub + ).encode('utf-8')) + extra_state['header_written'] = True + + for i, fragment in enumerate(fragments): + if (i + 1) <= ctx['fragment_index']: + continue + + fragment_url = urljoin(fragment_base_url, fragment['path']) + success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) + if not success: + continue + + mime_type = b'image/jpeg' + if frag_content.startswith(b'\x89PNG\r\n\x1a\n'): + mime_type = b'image/png' + if frag_content.startswith((b'GIF87a', b'GIF89a')): + mime_type = b'image/gif' + if frag_content.startswith(b'RIFF') and frag_content[8:12] == 'WEBP': + mime_type = b'image/webp' + + frag_header = io.BytesIO() + frag_header.write( + b'--%b\r\n' % frag_boundary.encode('us-ascii')) + frag_header.write( + b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii')) + frag_header.write( + b'Content-type: %b\r\n' % mime_type) + frag_header.write( + b'Content-length: %u\r\n' % len(frag_content)) + frag_header.write( + b'Content-location: %b\r\n' % fragment_url.encode('us-ascii')) + frag_header.write( + b'X.yt-dlp.Duration: %f\r\n' % fragment['duration']) + frag_header.write(b'\r\n') + self._append_fragment( + ctx, frag_header.getvalue() + frag_content + b'\r\n') + + ctx['dest_stream'].write( + b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii')) + self._finish_frag_download(ctx, info_dict) + return True diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py new file mode 100644 index 000000000..521dfece3 --- /dev/null +++ b/yt_dlp/downloader/niconico.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import threading + +from .common import FileDownloader +from ..downloader import get_suitable_downloader +from ..extractor.niconico import NiconicoIE +from ..utils import sanitized_Request + + +class NiconicoDmcFD(FileDownloader): + """ Downloading niconico douga from DMC with heartbeat """ + + FD_NAME = 'niconico_dmc' + + def real_download(self, filename, info_dict): + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + + ie = NiconicoIE(self.ydl) + info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) + + fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) + + success = download_complete = False + timer = [None] + heartbeat_lock = threading.Lock() + heartbeat_url = heartbeat_info_dict['url'] + heartbeat_data = heartbeat_info_dict['data'].encode() + heartbeat_interval = heartbeat_info_dict.get('interval', 30) + + request = sanitized_Request(heartbeat_url, heartbeat_data) + + def heartbeat(): + try: + self.ydl.urlopen(request).read() + except Exception: + self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) + + with heartbeat_lock: + if not download_complete: + timer[0] = threading.Timer(heartbeat_interval, heartbeat) + timer[0].start() + + heartbeat_info_dict['ping']() + self.to_screen('[%s] Heartbeat with %d second interval ...' 
% (self.FD_NAME, heartbeat_interval)) + try: + heartbeat() + if type(fd).__name__ == 'HlsFD': + info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) + success = fd.real_download(filename, info_dict) + finally: + if heartbeat_lock: + with heartbeat_lock: + timer[0].cancel() + download_complete = True + return success diff --git a/yt_dlp/downloader/rtmp.py b/yt_dlp/downloader/rtmp.py new file mode 100644 index 000000000..6dca64725 --- /dev/null +++ b/yt_dlp/downloader/rtmp.py @@ -0,0 +1,216 @@ +from __future__ import unicode_literals + +import os +import re +import subprocess +import time + +from .common import FileDownloader +from ..compat import compat_str +from ..utils import ( + check_executable, + encodeFilename, + encodeArgument, + get_exe_version, +) + + +def rtmpdump_version(): + return get_exe_version( + 'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)') + + +class RtmpFD(FileDownloader): + def real_download(self, filename, info_dict): + def run_rtmpdump(args): + start = time.time() + resume_percent = None + resume_downloaded_data_len = None + proc = subprocess.Popen(args, stderr=subprocess.PIPE) + cursor_in_new_line = True + proc_stderr_closed = False + try: + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len + time_now = time.time() + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'eta': eta, + 'elapsed': time_now - start, + 'speed': speed, + }, info_dict) + cursor_in_new_line = False + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }, info_dict) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + if not cursor_in_new_line: + self.to_screen('') + return proc.wait() + except BaseException: # Including KeyboardInterrupt + proc.kill() + proc.wait() + raise + + url = info_dict['url'] + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) 
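rtmpdump signals its outcome through exit codes, and the resume loop below keys off them: 2 (incomplete) means the connection dropped but resuming may work, while repeated runs that add no bytes are abandoned. A condensed, illustrative decision function, not part of the diff itself (the real loop re-runs rtmpdump with --resume and compares the partial file's size between runs; the RD_* values mirror the constants defined below):

RD_SUCCESS, RD_FAILED, RD_INCOMPLETE, RD_NO_CONNECT = 0, 1, 2, 3

def resume_action(retval, prevsize, cursize, is_test=False, is_live=False):
    # Returns 'accept' (keep the file), 'retry' (re-run with --resume)
    # or 'give_up' (report an error).
    if retval == RD_SUCCESS or (is_test and retval == RD_INCOMPLETE):
        return 'accept'  # --stop 1 test runs are expected to be partial
    if retval not in (RD_INCOMPLETE, RD_FAILED) or is_test or is_live:
        return 'give_up'  # covers RD_NO_CONNECT; live streams are never resumed
    if prevsize == cursize and retval == RD_FAILED:
        return 'give_up'  # a retry that added no bytes and still failed
    if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
        return 'accept'  # some streams abort near 100%; keep what was saved
    return 'retry'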
+ conn = info_dict.get('rtmp_conn') + protocol = info_dict.get('rtmp_protocol') + real_time = info_dict.get('rtmp_real_time', False) + no_resume = info_dict.get('no_resume', False) + continue_dl = self.params.get('continuedl', True) + + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + test = self.params.get('test', False) + + # Check for rtmpdump first + if not check_executable('rtmpdump', ['-h']): + self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install') + return False + + # Download using rtmpdump. rtmpdump returns exit code 2 when + # the connection was interrupted and resuming appears to be + # possible. This is part of rtmpdump's normal usage, AFAIK. + basic_args = [ + 'rtmpdump', '--verbose', '-r', url, + '-o', tmpfilename] + if player_url is not None: + basic_args += ['--swfVfy', player_url] + if page_url is not None: + basic_args += ['--pageUrl', page_url] + if app is not None: + basic_args += ['--app', app] + if play_path is not None: + basic_args += ['--playpath', play_path] + if tc_url is not None: + basic_args += ['--tcUrl', tc_url] + if test: + basic_args += ['--stop', '1'] + if flash_version is not None: + basic_args += ['--flashVer', flash_version] + if live: + basic_args += ['--live'] + if isinstance(conn, list): + for entry in conn: + basic_args += ['--conn', entry] + elif isinstance(conn, compat_str): + basic_args += ['--conn', conn] + if protocol is not None: + basic_args += ['--protocol', protocol] + if real_time: + basic_args += ['--realtime'] + + args = basic_args + if not no_resume and continue_dl and not live: + args += ['--resume'] + if not live and continue_dl: + args += ['--skip', '1'] + + args = [encodeArgument(a) for a in args] + + self._debug_cmd(args, exe='rtmpdump') + + RD_SUCCESS = 0 + RD_FAILED = 1 + RD_INCOMPLETE = 2 + RD_NO_CONNECT = 3 + + started = time.time() + + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') + + if retval == RD_NO_CONNECT: + self.report_error('[rtmpdump] Could not connect to RTMP server.') + return False + + while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: + prevsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) + time.sleep(5.0) # This seems to be needed + args = basic_args + ['--resume'] + if retval == RD_FAILED: + args += ['--skip', '1'] + args = [encodeArgument(a) for a in args] + retval = run_rtmpdump(args) + cursize = os.path.getsize(encodeFilename(tmpfilename)) + if prevsize == cursize and retval == RD_FAILED: + break + # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those + if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024: + self.to_screen('[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') + retval = RD_SUCCESS + break + if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + }, info_dict) + return True + else: + self.to_stderr('\n') + self.report_error('rtmpdump exited with code %d' % retval) + return False diff --git a/yt_dlp/downloader/rtsp.py b/yt_dlp/downloader/rtsp.py new file mode 100644 index 000000000..7815d59d9 --- /dev/null +++ b/yt_dlp/downloader/rtsp.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import os +import subprocess + +from .common import FileDownloader +from ..utils import ( + check_executable, + encodeFilename, +) + + +class RtspFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + if check_executable('mplayer', ['-h']): + args = [ + 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', + '-dumpstream', '-dumpfile', tmpfilename, url] + elif check_executable('mpv', ['-h']): + args = [ + 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] + else: + self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install one') + return False + + self._debug_cmd(args) + + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }, info_dict) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % (args[0], retval)) + return False diff --git a/yt_dlp/downloader/websocket.py b/yt_dlp/downloader/websocket.py new file mode 100644 index 000000000..088222046 --- /dev/null +++ b/yt_dlp/downloader/websocket.py @@ -0,0 +1,59 @@ +import os +import signal +import asyncio +import threading + +try: + import websockets + has_websockets = True +except ImportError: + has_websockets = False + +from .common import FileDownloader +from .external import FFmpegFD + + +class FFmpegSinkFD(FileDownloader): + """ A sink to ffmpeg for downloading fragments in any form """ + + def real_download(self, filename, info_dict): + info_copy = info_dict.copy() + info_copy['url'] = '-' + + async def call_conn(proc, stdin): + try: + await self.real_connection(stdin, info_dict) + except (BrokenPipeError, OSError): + pass + finally: + try: + stdin.flush() + stdin.close() + except OSError: + pass + os.kill(os.getpid(), signal.SIGINT) + + class FFmpegStdinFD(FFmpegFD): + @classmethod + def get_basename(cls): + return FFmpegFD.get_basename() + + def on_process_started(self, proc, stdin): + thread = threading.Thread(target=asyncio.run, daemon=True, args=(call_conn(proc, stdin), )) + thread.start() + + return FFmpegStdinFD(self.ydl, self.params or {}).download(filename, info_copy) + + async def real_connection(self, sink, info_dict): + """ Override this in subclasses """ + raise NotImplementedError('This method must be implemented by subclasses') + + +class WebSocketFragmentFD(FFmpegSinkFD): + async def real_connection(self, sink, 
info_dict): + async with websockets.connect(info_dict['url'], extra_headers=info_dict.get('http_headers', {})) as ws: + while True: + recv = await ws.recv() + if isinstance(recv, str): + recv = recv.encode('utf8') + sink.write(recv) diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py new file mode 100644 index 000000000..ef4205edc --- /dev/null +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -0,0 +1,236 @@ +from __future__ import division, unicode_literals + +import json +import time + +from .fragment import FragmentFD +from ..compat import compat_urllib_error +from ..utils import ( + try_get, + dict_get, + int_or_none, + RegexNotFoundError, +) +from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE + + +class YoutubeLiveChatFD(FragmentFD): + """ Downloads YouTube live chats fragment by fragment """ + + FD_NAME = 'youtube_live_chat' + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + self.to_screen('[%s] Downloading live chat' % self.FD_NAME) + + fragment_retries = self.params.get('fragment_retries', 0) + test = self.params.get('test', False) + + ctx = { + 'filename': filename, + 'live': True, + 'total_frags': None, + } + + ie = YT_BaseIE(self.ydl) + + start_time = int(time.time() * 1000) + + def dl_fragment(url, data=None, headers=None): + http_headers = info_dict.get('http_headers', {}) + if headers: + http_headers = http_headers.copy() + http_headers.update(headers) + return self._download_fragment(ctx, url, info_dict, http_headers, data) + + def parse_actions_replay(live_chat_continuation): + offset = continuation_id = click_tracking_params = None + processed_fragment = bytearray() + for action in live_chat_continuation.get('actions', []): + if 'replayChatItemAction' in action: + replay_chat_item_action = action['replayChatItemAction'] + offset = int(replay_chat_item_action['videoOffsetTimeMsec']) + processed_fragment.extend( + json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') + if offset is not None: + continuation = try_get( + live_chat_continuation, + lambda x: x['continuations'][0]['liveChatReplayContinuationData'], dict) + if continuation: + continuation_id = continuation.get('continuation') + click_tracking_params = continuation.get('clickTrackingParams') + self._append_fragment(ctx, processed_fragment) + return continuation_id, offset, click_tracking_params + + def try_refresh_replay_beginning(live_chat_continuation): + # choose the second option that contains the unfiltered live chat replay + refresh_continuation = try_get( + live_chat_continuation, + lambda x: x['header']['liveChatHeaderRenderer']['viewSelector']['sortFilterSubMenuRenderer']['subMenuItems'][1]['continuation']['reloadContinuationData'], dict) + if refresh_continuation: + # no data yet but required to call _append_fragment + self._append_fragment(ctx, b'') + refresh_continuation_id = refresh_continuation.get('continuation') + offset = 0 + click_tracking_params = refresh_continuation.get('trackingParams') + return refresh_continuation_id, offset, click_tracking_params + return parse_actions_replay(live_chat_continuation) + + live_offset = 0 + + def parse_actions_live(live_chat_continuation): + nonlocal live_offset + continuation_id = click_tracking_params = None + processed_fragment = bytearray() + for action in live_chat_continuation.get('actions', []): + timestamp = self.parse_live_timestamp(action) + if timestamp is not None: + live_offset = timestamp - start_time + # compatibility with replay format + 
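# Live actions carry no videoOffsetTimeMsec of their own, so each one
# is wrapped below in a synthetic replayChatItemAction whose offset is
# the wall-clock delta from download start. That keeps the written
# JSON lines in the same shape as the replay protocol; 'isLive' marks
# the synthesized entries.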
pseudo_action = { + 'replayChatItemAction': {'actions': [action]}, + 'videoOffsetTimeMsec': str(live_offset), + 'isLive': True, + } + processed_fragment.extend( + json.dumps(pseudo_action, ensure_ascii=False).encode('utf-8') + b'\n') + continuation_data_getters = [ + lambda x: x['continuations'][0]['invalidationContinuationData'], + lambda x: x['continuations'][0]['timedContinuationData'], + ] + continuation_data = try_get(live_chat_continuation, continuation_data_getters, dict) + if continuation_data: + continuation_id = continuation_data.get('continuation') + click_tracking_params = continuation_data.get('clickTrackingParams') + timeout_ms = int_or_none(continuation_data.get('timeoutMs')) + if timeout_ms is not None: + time.sleep(timeout_ms / 1000) + self._append_fragment(ctx, processed_fragment) + return continuation_id, live_offset, click_tracking_params + + def download_and_parse_fragment(url, frag_index, request_data=None, headers=None): + count = 0 + while count <= fragment_retries: + try: + success, raw_fragment = dl_fragment(url, request_data, headers) + if not success: + return False, None, None, None + try: + data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) + except RegexNotFoundError: + data = None + if not data: + data = json.loads(raw_fragment) + live_chat_continuation = try_get( + data, + lambda x: x['continuationContents']['liveChatContinuation'], dict) or {} + if info_dict['protocol'] == 'youtube_live_chat_replay': + if frag_index == 1: + continuation_id, offset, click_tracking_params = try_refresh_replay_beginning(live_chat_continuation) + else: + continuation_id, offset, click_tracking_params = parse_actions_replay(live_chat_continuation) + elif info_dict['protocol'] == 'youtube_live_chat': + continuation_id, offset, click_tracking_params = parse_actions_live(live_chat_continuation) + return True, continuation_id, offset, click_tracking_params + except compat_urllib_error.HTTPError as err: + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) + return False, None, None, None + + self._prepare_and_start_frag_download(ctx, info_dict) + + success, raw_fragment = dl_fragment(info_dict['url']) + if not success: + return False + try: + data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) + except RegexNotFoundError: + return False + continuation_id = try_get( + data, + lambda x: x['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']) + # no data yet but required to call _append_fragment + self._append_fragment(ctx, b'') + + ytcfg = ie.extract_ytcfg(video_id, raw_fragment.decode('utf-8', 'replace')) + + if not ytcfg: + return False + api_key = try_get(ytcfg, lambda x: x['INNERTUBE_API_KEY']) + innertube_context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT']) + if not api_key or not innertube_context: + return False + visitor_data = try_get(innertube_context, lambda x: x['client']['visitorData'], str) + if info_dict['protocol'] == 'youtube_live_chat_replay': + url = 'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat_replay?key=' + api_key + chat_page_url = 'https://www.youtube.com/live_chat_replay?continuation=' + continuation_id + elif info_dict['protocol'] == 'youtube_live_chat': + url = 
'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat?key=' + api_key + chat_page_url = 'https://www.youtube.com/live_chat?continuation=' + continuation_id + + frag_index = offset = 0 + click_tracking_params = None + while continuation_id is not None: + frag_index += 1 + request_data = { + 'context': innertube_context, + 'continuation': continuation_id, + } + if frag_index > 1: + request_data['currentPlayerState'] = {'playerOffsetMs': str(max(offset - 5000, 0))} + if click_tracking_params: + request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params} + headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) + headers.update({'content-type': 'application/json'}) + fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n' + success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( + url, frag_index, fragment_request_data, headers) + else: + success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( + chat_page_url, frag_index) + if not success: + return False + if test: + break + + self._finish_frag_download(ctx, info_dict) + return True + + @staticmethod + def parse_live_timestamp(action): + action_content = dict_get( + action, + ['addChatItemAction', 'addLiveChatTickerItemAction', 'addBannerToLiveChatCommand']) + if not isinstance(action_content, dict): + return None + item = dict_get(action_content, ['item', 'bannerRenderer']) + if not isinstance(item, dict): + return None + renderer = dict_get(item, [ + # text + 'liveChatTextMessageRenderer', 'liveChatPaidMessageRenderer', + 'liveChatMembershipItemRenderer', 'liveChatPaidStickerRenderer', + # ticker + 'liveChatTickerPaidMessageItemRenderer', + 'liveChatTickerSponsorItemRenderer', + # banner + 'liveChatBannerRenderer', + ]) + if not isinstance(renderer, dict): + return None + parent_item_getters = [ + lambda x: x['showItemEndpoint']['showLiveChatItemEndpoint']['renderer'], + lambda x: x['contents'], + ] + parent_item = try_get(renderer, parent_item_getters, dict) + if parent_item: + renderer = dict_get(parent_item, [ + 'liveChatTextMessageRenderer', 'liveChatPaidMessageRenderer', + 'liveChatMembershipItemRenderer', 'liveChatPaidStickerRenderer', + ]) + if not isinstance(renderer, dict): + return None + return int_or_none(renderer.get('timestampUsec'), 1000) diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py new file mode 100644 index 000000000..198c4ae17 --- /dev/null +++ b/yt_dlp/extractor/__init__.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +from ..utils import load_plugins + +try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True + _PLUGIN_CLASSES = {} +except ImportError: + _LAZY_LOADER = False + +if not _LAZY_LOADER: + from .extractors import * + _ALL_CLASSES = [ + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) + + _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) + _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES + + +def gen_extractor_classes(): + """ Return a list of supported extractors. + The order does matter; the first extractor matched is the one handling the URL. + """ + return _ALL_CLASSES + + +def gen_extractors(): + """ Return a list of an instance of every supported extractor. + The order does matter; the first extractor matched is the one handling the URL. 
+ """ + return [klass() for klass in gen_extractor_classes()] + + +def list_extractors(age_limit): + """ + Return a list of extractors that are suitable for the given age, + sorted by extractor ID. + """ + + return sorted( + filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), + key=lambda ie: ie.IE_NAME.lower()) + + +def get_info_extractor(ie_name): + """Returns the info extractor class with the given ie_name""" + return globals()[ie_name + 'IE'] diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py new file mode 100644 index 000000000..3e202168e --- /dev/null +++ b/yt_dlp/extractor/abc.py @@ -0,0 +1,257 @@ +from __future__ import unicode_literals + +import hashlib +import hmac +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + int_or_none, + parse_iso8601, + str_or_none, + try_get, + unescapeHTML, + update_url_query, +) + + +class ABCIE(InfoExtractor): + IE_NAME = 'abc.net.au' + _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/(?:news|btn)/(?:[^/]+/){1,4}(?P<id>\d{5,})' + + _TESTS = [{ + 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', + 'md5': 'cb3dd03b18455a661071ee1e28344d9f', + 'info_dict': { + 'id': '5868334', + 'ext': 'mp4', + 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', + 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', + }, + 'skip': 'this video has expired', + }, { + 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', + 'md5': '4ebd61bdc82d9a8b722f64f1f4b4d121', + 'info_dict': { + 'id': 'NvqvPeNZsHU', + 'ext': 'mp4', + 'upload_date': '20150816', + 'uploader': 'ABC News (Australia)', + 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". 
Read more here: http://ab.co/1Mwc6ef', + 'uploader_id': 'NewsOnABC', + 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', + }, + 'add_ie': ['Youtube'], + 'skip': 'Not accessible from Travis CI server', + }, { + 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', + 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', + 'info_dict': { + 'id': '6880080', + 'ext': 'mp3', + 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + }, + }, { + 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', + 'only_matching': True, + }, { + 'url': 'https://www.abc.net.au/btn/classroom/wwi-centenary/10527914', + 'info_dict': { + 'id': '10527914', + 'ext': 'mp4', + 'title': 'WWI Centenary', + 'description': 'md5:c2379ec0ca84072e86b446e536954546', + } + }, { + 'url': 'https://www.abc.net.au/news/programs/the-world/2020-06-10/black-lives-matter-protests-spawn-support-for/12342074', + 'info_dict': { + 'id': '12342074', + 'ext': 'mp4', + 'title': 'Black Lives Matter protests spawn support for Papuans in Indonesia', + 'description': 'md5:2961a17dc53abc558589ccd0fb8edd6f', + } + }, { + 'url': 'https://www.abc.net.au/btn/newsbreak/btn-newsbreak-20200814/12560476', + 'info_dict': { + 'id': 'tDL8Ld4dK_8', + 'ext': 'mp4', + 'title': 'Fortnite Banned From Apple and Google App Stores', + 'description': 'md5:a6df3f36ce8f816b74af4bd6462f5651', + 'upload_date': '20200813', + 'uploader': 'Behind the News', + 'uploader_id': 'behindthenews', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + mobj = re.search(r'<a\s+href="(?P<url>[^"]+)"\s+data-duration="\d+"\s+title="Download audio directly">', webpage) + if mobj: + urls_info = mobj.groupdict() + youtube = False + video = False + else: + mobj = re.search(r'<a href="(?P<url>http://www\.youtube\.com/watch\?v=[^"]+)"><span><strong>External Link:</strong>', + webpage) + if mobj is None: + mobj = re.search(r'<iframe width="100%" src="(?P<url>//www\.youtube-nocookie\.com/embed/[^?"]+)', webpage) + if mobj: + urls_info = mobj.groupdict() + youtube = True + video = True + + if mobj is None: + mobj = re.search(r'(?P<type>)"sources": (?P<json_data>\[[^\]]+\]),', webpage) + if mobj is None: + mobj = re.search( + r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', + webpage) + if mobj is None: + expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) + if expired: + raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True) + raise ExtractorError('Unable to extract video urls') + + urls_info = self._parse_json( + mobj.group('json_data'), video_id, transform_source=js_to_json) + youtube = mobj.group('type') == 'YouTube' + video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4' + + if not isinstance(urls_info, list): + urls_info = [urls_info] + + if youtube: + return self.playlist_result([ + self.url_result(url_info['url']) for url_info in urls_info]) + + formats = [] + for url_info in urls_info: + height = int_or_none(url_info.get('height')) + bitrate = int_or_none(url_info.get('bitrate')) + width = int_or_none(url_info.get('width')) + format_id = None + mobj = re.search(r'_(?:(?P<height>\d+)|(?P<bitrate>\d+)k)\.mp4$', url_info['url']) + if mobj: + height_from_url = mobj.group('height') + if height_from_url: + height = height or 
int_or_none(height_from_url) + width = width or int_or_none(url_info.get('label')) + else: + bitrate = bitrate or int_or_none(mobj.group('bitrate')) + format_id = str_or_none(url_info.get('label')) + formats.append({ + 'url': url_info['url'], + 'vcodec': url_info.get('codec') if video else 'none', + 'width': width, + 'height': height, + 'tbr': bitrate, + 'filesize': int_or_none(url_info.get('filesize')), + 'format_id': format_id + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } + + +class ABCIViewIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)' + _GEO_COUNTRIES = ['AU'] + + # ABC iview programs are normally available for 14 days only. + _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', + 'md5': '67715ce3c78426b11ba167d875ac6abf', + 'info_dict': { + 'id': 'LE1927H001S00', + 'ext': 'mp4', + 'title': "Series 11 Ep 1", + 'series': "Gruen", + 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', + 'upload_date': '20190925', + 'uploader_id': 'abc1', + 'timestamp': 1569445289, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) + + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( + int(time.time()), house_number) + sig = hmac.new( + b'android.content.res.Resources', + path.encode('utf-8'), hashlib.sha256).hexdigest() + token = self._download_webpage( + 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) + + def tokenize_url(url, token): + return update_url_query(url, { + 'hdnea': token, + }) + + for sd in ('720', 'sd', 'sd-low'): + sd_url = try_get( + stream, lambda x: x['streams']['hls'][sd], compat_str) + if not sd_url: + continue + formats = self._extract_m3u8_formats( + tokenize_url(sd_url, token), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if formats: + break + self._sort_formats(formats) + + subtitles = {} + src_vtt = stream.get('captions', {}).get('src-vtt') + if src_vtt: + subtitles['en'] = [{ + 'url': src_vtt, + 'ext': 'vtt', + }] + + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + + return { + 'id': video_id, + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), + 'duration': int_or_none(video_params.get('eventDuration')), + 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), + 'series': unescapeHTML(video_params.get('seriesTitle')), + 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, + 'uploader_id': video_params.get('channel'), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git 
a/yt_dlp/extractor/abcnews.py b/yt_dlp/extractor/abcnews.py new file mode 100644 index 000000000..296b8cec1 --- /dev/null +++ b/yt_dlp/extractor/abcnews.py @@ -0,0 +1,157 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .amp import AMPIE +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) + + +class AbcNewsVideoIE(AMPIE): + IE_NAME = 'abcnews:video' + _VALID_URL = r'''(?x) + https?:// + (?: + abcnews\.go\.com/ + (?: + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= + )| + fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ + ) + (?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', + 'info_dict': { + 'id': '20411932', + 'ext': 'mp4', + 'display_id': 'week-exclusive-irans-foreign-minister-zarif', + 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif', + 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', + 'duration': 180, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abcnews.go.com/video/embed?id=46979033', + 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', + 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('display_id') + video_id = mobj.group('id') + info_dict = self._extract_feed_info( + 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + return info_dict + + +class AbcNewsIE(InfoExtractor): + IE_NAME = 'abcnews' + _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' + + _TESTS = [{ + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', + 'info_dict': { + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', + }, + 'playlist_count': 5, + }, { + 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', + 'info_dict': { + 'id': '38897857', + 'ext': 'mp4', + 'title': 'Justin Timberlake Drops Hints For Secret Single', + 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', + 'upload_date': '20160505', + 'timestamp': 1462442280, + }, + 'params': { + # m3u8 download + 'skip_download': True, + # The embedded YouTube video is blocked due to copyright issues + 'playlist_items': '1', + }, + 'add_ie': ['AbcNewsVideo'], + }, { + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, + }] + + def _real_extract(self, url): + story_id = 
self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} + + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } + + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } + + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) diff --git a/yt_dlp/extractor/abcotvs.py b/yt_dlp/extractor/abcotvs.py new file mode 100644 index 000000000..5bff46634 --- /dev/null +++ b/yt_dlp/extractor/abcotvs.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + dict_get, + int_or_none, + try_get, +) + + +class ABCOTVSIE(InfoExtractor): + IE_NAME = 'abcotvs' + IE_DESC = 'ABC Owned Television Stations' + _VALID_URL = r'https?://(?P<site>abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P<display_id>[^/]+))?/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', + 'info_dict': { + 'id': '472548', + 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', + 'ext': 'mp4', + 'title': 'East Bay museum celebrates synthesized music', + 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1421118520, + 'upload_date': '20150113', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://abc7news.com/472581', + 'only_matching': True, + }, + { + 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', + 'only_matching': True, + }, + ] + _SITE_MAP = { + '6abc': 'wpvi', + 'abc11': 'wtvd', + 'abc13': 'ktrk', + 'abc30': 'kfsn', + 'abc7': 'kabc', + 'abc7chicago': 'wls', + 'abc7news': 'kgo', + 'abc7ny': 'wabc', + } + + def _real_extract(self, url): + site, display_id, video_id = self._match_valid_url(url).groups() + display_id = display_id or video_id + station = self._SITE_MAP[site] + + data = self._download_json( + 'https://api.abcotvs.com/v2/content', display_id, query={ + 'id': video_id, + 'key': 'otv.web.%s.story' % station, + 'station': station, + })['data'] + video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data + video_id = 
compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) + title = video.get('title') or video['linkText'] + + formats = [] + m3u8_url = video.get('m3u8') + if m3u8_url: + formats = self._extract_m3u8_formats( + video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'abr': 128, + 'format_id': 'https', + 'height': 360, + 'url': mp4_url, + 'width': 640, + }) + self._sort_formats(formats) + + image = video.get('image') or {} + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': dict_get(video, ('description', 'caption'), try_get(video, lambda x: x['meta']['description'])), + 'thumbnail': dict_get(image, ('source', 'dynamicSource')), + 'timestamp': int_or_none(video.get('date')), + 'duration': int_or_none(video.get('length')), + 'formats': formats, + } + + +class ABCOTVSClipsIE(InfoExtractor): + IE_NAME = 'abcotvs:clips' + _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P<id>\d+)' + _TEST = { + 'url': 'https://clips.abcotvs.com/kabc/video/214814', + 'info_dict': { + 'id': '214814', + 'ext': 'mp4', + 'title': 'SpaceX launch pad explosion destroys rocket, satellite', + 'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b', + 'upload_date': '20160901', + 'timestamp': 1472756695, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0] + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['videoURL'].split('?')[0], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailURL'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('pubDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/academicearth.py b/yt_dlp/extractor/academicearth.py index 34095501c..34095501c 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/yt_dlp/extractor/academicearth.py diff --git a/yt_dlp/extractor/acast.py b/yt_dlp/extractor/acast.py new file mode 100644 index 000000000..63587c5cf --- /dev/null +++ b/yt_dlp/extractor/acast.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + clean_html, + clean_podcast_url, + int_or_none, + parse_iso8601, +) + + +class ACastBaseIE(InfoExtractor): + def _extract_episode(self, episode, show_info): + title = episode['title'] + info = { + 'id': episode['id'], + 'display_id': episode.get('episodeUrl'), + 'url': clean_podcast_url(episode['url']), + 'title': title, + 'description': clean_html(episode.get('description') or episode.get('summary')), + 'thumbnail': episode.get('image'), + 'timestamp': parse_iso8601(episode.get('publishDate')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(episode.get('contentLength')), + 'season_number': int_or_none(episode.get('season')), + 'episode': title, + 'episode_number': int_or_none(episode.get('episode')), + } + info.update(show_info) + return info + + def _extract_show_info(self, show): + return { + 'creator': show.get('author'), + 'series': show.get('title'), + } + + def _call_api(self, path, video_id, query=None): + return self._download_json( + 
'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) + + +class ACastIE(ACastBaseIE): + IE_NAME = 'acast' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:embed|www)\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P<channel>[^/]+)/(?P<id>[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', + 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', + 'info_dict': { + 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', + 'ext': 'mp3', + 'title': '2. Raggarmordet - Röster ur det förflutna', + 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', + 'timestamp': 1477346700, + 'upload_date': '20161024', + 'duration': 2766, + 'creator': 'Anton Berg & Martin Johnson', + 'series': 'Spår', + 'episode': '2. Raggarmordet - Röster ur det förflutna', + } + }, { + 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel, display_id = self._match_valid_url(url).groups() + episode = self._call_api( + '%s/episodes/%s' % (channel, display_id), + display_id, {'showInfo': 'true'}) + return self._extract_episode( + episode, self._extract_show_info(episode.get('show') or {})) + + +class ACastChannelIE(ACastBaseIE): + IE_NAME = 'acast:channel' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P<id>[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/todayinfocus', + 'info_dict': { + 'id': '4efc5294-5385-4847-98bd-519799ce5786', + 'title': 'Today in Focus', + 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', + }, + 'playlist_mincount': 200, + }, { + 'url': 'http://play.acast.com/s/ft-banking-weekly', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) + + def _real_extract(self, url): + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) diff --git a/youtube_dl/extractor/adn.py b/yt_dlp/extractor/adn.py index a55ebbcbd..a55ebbcbd 100644 --- a/youtube_dl/extractor/adn.py +++ b/yt_dlp/extractor/adn.py diff --git a/youtube_dl/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py index 728549eb9..728549eb9 100644 --- a/youtube_dl/extractor/adobeconnect.py +++ b/yt_dlp/extractor/adobeconnect.py diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py new file mode 100644 index 000000000..9378c33cd --- /dev/null +++ b/yt_dlp/extractor/adobepass.py @@ -0,0 +1,1715 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +import time +import xml.etree.ElementTree as etree + +from .common import InfoExtractor +from ..compat import ( + compat_kwargs, + compat_urlparse, + compat_getpass +) +from ..utils import ( + unescapeHTML, + urlencode_postdata, + unified_timestamp, + ExtractorError, + NO_DEFAULT, +) + + +MSO_INFO = { + 'DTV': { + 'name': 'DIRECTV', + 'username_field': 'username', + 'password_field': 'password', + }, + 'ATT': { 
+ 'name': 'AT&T U-verse', + 'username_field': 'userid', + 'password_field': 'password', + }, + 'ATTOTT': { + 'name': 'DIRECTV NOW', + 'username_field': 'email', + 'password_field': 'loginpassword', + }, + 'RCN': { + 'name': 'RCN', + 'username_field': 'UserName', + 'password_field': 'UserPassword', + }, + 'Rogers': { + 'name': 'Rogers', + 'username_field': 'UserName', + 'password_field': 'UserPassword', + }, + 'Comcast_SSO': { + 'name': 'Comcast XFINITY', + 'username_field': 'user', + 'password_field': 'passwd', + }, + 'TWC': { + 'name': 'Time Warner Cable | Spectrum', + 'username_field': 'Ecom_User_ID', + 'password_field': 'Ecom_Password', + }, + 'Brighthouse': { + 'name': 'Bright House Networks | Spectrum', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, + 'Charter_Direct': { + 'name': 'Charter Spectrum', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, + 'Spectrum': { + 'name': 'Spectrum', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, + 'Philo': { + 'name': 'Philo', + 'username_field': 'ident' + }, + 'Verizon': { + 'name': 'Verizon FiOS', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, + 'Cablevision': { + 'name': 'Optimum/Cablevision', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, + 'thr030': { + 'name': '3 Rivers Communications' + }, + 'com140': { + 'name': 'Access Montana' + }, + 'acecommunications': { + 'name': 'AcenTek' + }, + 'acm010': { + 'name': 'Acme Communications' + }, + 'ada020': { + 'name': 'Adams Cable Service' + }, + 'alb020': { + 'name': 'Albany Mutual Telephone' + }, + 'algona': { + 'name': 'Algona Municipal Utilities' + }, + 'allwest': { + 'name': 'All West Communications' + }, + 'all025': { + 'name': 'Allen\'s Communications' + }, + 'spl010': { + 'name': 'Alliance Communications' + }, + 'all070': { + 'name': 'ALLO Communications' + }, + 'alpine': { + 'name': 'Alpine Communications' + }, + 'hun015': { + 'name': 'American Broadband' + }, + 'nwc010': { + 'name': 'American Broadband Missouri' + }, + 'com130-02': { + 'name': 'American Community Networks' + }, + 'com130-01': { + 'name': 'American Warrior Networks' + }, + 'tom020': { + 'name': 'Amherst Telephone/Tomorrow Valley' + }, + 'tvc020': { + 'name': 'Andycable' + }, + 'arkwest': { + 'name': 'Arkwest Communications' + }, + 'art030': { + 'name': 'Arthur Mutual Telephone Company' + }, + 'arvig': { + 'name': 'Arvig' + }, + 'nttcash010': { + 'name': 'Ashland Home Net' + }, + 'astound': { + 'name': 'Astound (now Wave)' + }, + 'dix030': { + 'name': 'ATC Broadband' + }, + 'ara010': { + 'name': 'ATC Communications' + }, + 'she030-02': { + 'name': 'Ayersville Communications' + }, + 'baldwin': { + 'name': 'Baldwin Lightstream' + }, + 'bal040': { + 'name': 'Ballard TV' + }, + 'cit025': { + 'name': 'Bardstown Cable TV' + }, + 'bay030': { + 'name': 'Bay Country Communications' + }, + 'tel095': { + 'name': 'Beaver Creek Cooperative Telephone' + }, + 'bea020': { + 'name': 'Beaver Valley Cable' + }, + 'bee010': { + 'name': 'Bee Line Cable' + }, + 'wir030': { + 'name': 'Beehive Broadband' + }, + 'bra020': { + 'name': 'BELD' + }, + 'bel020': { + 'name': 'Bellevue Municipal Cable' + }, + 'vol040-01': { + 'name': 'Ben Lomand Connect / BLTV' + }, + 'bev010': { + 'name': 'BEVCOMM' + }, + 'big020': { + 'name': 'Big Sandy Broadband' + }, + 'ble020': { + 'name': 'Bledsoe Telephone Cooperative' + }, + 'bvt010': { + 'name': 'Blue Valley Tele-Communications' + }, + 'bra050': { + 'name': 'Brandenburg Telephone Co.' 
+ }, + 'bte010': { + 'name': 'Bristol Tennessee Essential Services' + }, + 'annearundel': { + 'name': 'Broadstripe' + }, + 'btc010': { + 'name': 'BTC Communications' + }, + 'btc040': { + 'name': 'BTC Vision - Nahunta' + }, + 'bul010': { + 'name': 'Bulloch Telephone Cooperative' + }, + 'but010': { + 'name': 'Butler-Bremer Communications' + }, + 'tel160-csp': { + 'name': 'C Spire SNAP' + }, + 'csicable': { + 'name': 'Cable Services Inc.' + }, + 'cableamerica': { + 'name': 'CableAmerica' + }, + 'cab038': { + 'name': 'CableSouth Media 3' + }, + 'weh010-camtel': { + 'name': 'Cam-Tel Company' + }, + 'car030': { + 'name': 'Cameron Communications' + }, + 'canbytel': { + 'name': 'Canby Telcom' + }, + 'crt020': { + 'name': 'CapRock Tv' + }, + 'car050': { + 'name': 'Carnegie Cable' + }, + 'cas': { + 'name': 'CAS Cable' + }, + 'casscomm': { + 'name': 'CASSCOMM' + }, + 'mid180-02': { + 'name': 'Catalina Broadband Solutions' + }, + 'cccomm': { + 'name': 'CC Communications' + }, + 'nttccde010': { + 'name': 'CDE Lightband' + }, + 'cfunet': { + 'name': 'Cedar Falls Utilities' + }, + 'dem010-01': { + 'name': 'Celect-Bloomer Telephone Area' + }, + 'dem010-02': { + 'name': 'Celect-Bruce Telephone Area' + }, + 'dem010-03': { + 'name': 'Celect-Citizens Connected Area' + }, + 'dem010-04': { + 'name': 'Celect-Elmwood/Spring Valley Area' + }, + 'dem010-06': { + 'name': 'Celect-Mosaic Telecom' + }, + 'dem010-05': { + 'name': 'Celect-West WI Telephone Area' + }, + 'net010-02': { + 'name': 'Cellcom/Nsight Telservices' + }, + 'cen100': { + 'name': 'CentraCom' + }, + 'nttccst010': { + 'name': 'Central Scott / CSTV' + }, + 'cha035': { + 'name': 'Chaparral CableVision' + }, + 'cha050': { + 'name': 'Chariton Valley Communication Corporation, Inc.' + }, + 'cha060': { + 'name': 'Chatmoss Cablevision' + }, + 'nttcche010': { + 'name': 'Cherokee Communications' + }, + 'che050': { + 'name': 'Chesapeake Bay Communications' + }, + 'cimtel': { + 'name': 'Cim-Tel Cable, LLC.' + }, + 'cit180': { + 'name': 'Citizens Cablevision - Floyd, VA' + }, + 'cit210': { + 'name': 'Citizens Cablevision, Inc.' + }, + 'cit040': { + 'name': 'Citizens Fiber' + }, + 'cit250': { + 'name': 'Citizens Mutual' + }, + 'war040': { + 'name': 'Citizens Telephone Corporation' + }, + 'wat025': { + 'name': 'City Of Monroe' + }, + 'wadsworth': { + 'name': 'CityLink' + }, + 'nor100': { + 'name': 'CL Tel' + }, + 'cla010': { + 'name': 'Clarence Telephone and Cedar Communications' + }, + 'ser060': { + 'name': 'Clear Choice Communications' + }, + 'tac020': { + 'name': 'Click! Cable TV' + }, + 'war020': { + 'name': 'CLICK1.NET' + }, + 'cml010': { + 'name': 'CML Telephone Cooperative Association' + }, + 'cns': { + 'name': 'CNS' + }, + 'com160': { + 'name': 'Co-Mo Connect' + }, + 'coa020': { + 'name': 'Coast Communications' + }, + 'coa030': { + 'name': 'Coaxial Cable TV' + }, + 'mid055': { + 'name': 'Cobalt TV (Mid-State Community TV)' + }, + 'col070': { + 'name': 'Columbia Power & Water Systems' + }, + 'col080': { + 'name': 'Columbus Telephone' + }, + 'nor105': { + 'name': 'Communications 1 Cablevision, Inc.' 
+ }, + 'com150': { + 'name': 'Community Cable & Broadband' + }, + 'com020': { + 'name': 'Community Communications Company' + }, + 'coy010': { + 'name': 'commZoom' + }, + 'com025': { + 'name': 'Complete Communication Services' + }, + 'cat020': { + 'name': 'Comporium' + }, + 'com071': { + 'name': 'ComSouth Telesys' + }, + 'consolidatedcable': { + 'name': 'Consolidated' + }, + 'conwaycorp': { + 'name': 'Conway Corporation' + }, + 'coo050': { + 'name': 'Coon Valley Telecommunications Inc' + }, + 'coo080': { + 'name': 'Cooperative Telephone Company' + }, + 'cpt010': { + 'name': 'CP-TEL' + }, + 'cra010': { + 'name': 'Craw-Kan Telephone' + }, + 'crestview': { + 'name': 'Crestview Cable Communications' + }, + 'cross': { + 'name': 'Cross TV' + }, + 'cro030': { + 'name': 'Crosslake Communications' + }, + 'ctc040': { + 'name': 'CTC - Brainerd MN' + }, + 'phe030': { + 'name': 'CTV-Beam - East Alabama' + }, + 'cun010': { + 'name': 'Cunningham Telephone & Cable' + }, + 'dpc010': { + 'name': 'D & P Communications' + }, + 'dak030': { + 'name': 'Dakota Central Telecommunications' + }, + 'nttcdel010': { + 'name': 'Delcambre Telephone LLC' + }, + 'tel160-del': { + 'name': 'Delta Telephone Company' + }, + 'sal040': { + 'name': 'DiamondNet' + }, + 'ind060-dc': { + 'name': 'Direct Communications' + }, + 'doy010': { + 'name': 'Doylestown Cable TV' + }, + 'dic010': { + 'name': 'DRN' + }, + 'dtc020': { + 'name': 'DTC' + }, + 'dtc010': { + 'name': 'DTC Cable (Delhi)' + }, + 'dum010': { + 'name': 'Dumont Telephone Company' + }, + 'dun010': { + 'name': 'Dunkerton Telephone Cooperative' + }, + 'cci010': { + 'name': 'Duo County Telecom' + }, + 'eagle': { + 'name': 'Eagle Communications' + }, + 'weh010-east': { + 'name': 'East Arkansas Cable TV' + }, + 'eatel': { + 'name': 'EATEL Video, LLC' + }, + 'ell010': { + 'name': 'ECTA' + }, + 'emerytelcom': { + 'name': 'Emery Telcom Video LLC' + }, + 'nor200': { + 'name': 'Empire Access' + }, + 'endeavor': { + 'name': 'Endeavor Communications' + }, + 'sun045': { + 'name': 'Enhanced Telecommunications Corporation' + }, + 'mid030': { + 'name': 'enTouch' + }, + 'epb020': { + 'name': 'EPB Smartnet' + }, + 'jea010': { + 'name': 'EPlus Broadband' + }, + 'com065': { + 'name': 'ETC' + }, + 'ete010': { + 'name': 'Etex Communications' + }, + 'fbc-tele': { + 'name': 'F&B Communications' + }, + 'fal010': { + 'name': 'Falcon Broadband' + }, + 'fam010': { + 'name': 'FamilyView CableVision' + }, + 'far020': { + 'name': 'Farmers Mutual Telephone Company' + }, + 'fay010': { + 'name': 'Fayetteville Public Utilities' + }, + 'sal060': { + 'name': 'fibrant' + }, + 'fid010': { + 'name': 'Fidelity Communications' + }, + 'for030': { + 'name': 'FJ Communications' + }, + 'fli020': { + 'name': 'Flint River Communications' + }, + 'far030': { + 'name': 'FMT - Jesup' + }, + 'foo010': { + 'name': 'Foothills Communications' + }, + 'for080': { + 'name': 'Forsyth CableNet' + }, + 'fbcomm': { + 'name': 'Frankfort Plant Board' + }, + 'tel160-fra': { + 'name': 'Franklin Telephone Company' + }, + 'nttcftc010': { + 'name': 'FTC' + }, + 'fullchannel': { + 'name': 'Full Channel, Inc.' + }, + 'gar040': { + 'name': 'Gardonville Cooperative Telephone Association' + }, + 'gbt010': { + 'name': 'GBT Communications, Inc.' + }, + 'tec010': { + 'name': 'Genuine Telecom' + }, + 'clr010': { + 'name': 'Giant Communications' + }, + 'gla010': { + 'name': 'Glasgow EPB' + }, + 'gle010': { + 'name': 'Glenwood Telecommunications' + }, + 'gra060': { + 'name': 'GLW Broadband Inc.' 
+ }, + 'goldenwest': { + 'name': 'Golden West Cablevision' + }, + 'vis030': { + 'name': 'Grantsburg Telcom' + }, + 'gpcom': { + 'name': 'Great Plains Communications' + }, + 'gri010': { + 'name': 'Gridley Cable Inc' + }, + 'hbc010': { + 'name': 'H&B Cable Services' + }, + 'hae010': { + 'name': 'Haefele TV Inc.' + }, + 'htc010': { + 'name': 'Halstad Telephone Company' + }, + 'har005': { + 'name': 'Harlan Municipal Utilities' + }, + 'har020': { + 'name': 'Hart Communications' + }, + 'ced010': { + 'name': 'Hartelco TV' + }, + 'hea040': { + 'name': 'Heart of Iowa Communications Cooperative' + }, + 'htc020': { + 'name': 'Hickory Telephone Company' + }, + 'nttchig010': { + 'name': 'Highland Communication Services' + }, + 'hig030': { + 'name': 'Highland Media' + }, + 'spc010': { + 'name': 'Hilliary Communications' + }, + 'hin020': { + 'name': 'Hinton CATV Co.' + }, + 'hometel': { + 'name': 'HomeTel Entertainment, Inc.' + }, + 'hoodcanal': { + 'name': 'Hood Canal Communications' + }, + 'weh010-hope': { + 'name': 'Hope - Prescott Cable TV' + }, + 'horizoncable': { + 'name': 'Horizon Cable TV, Inc.' + }, + 'hor040': { + 'name': 'Horizon Chillicothe Telephone' + }, + 'htc030': { + 'name': 'HTC Communications Co. - IL' + }, + 'htccomm': { + 'name': 'HTC Communications, Inc. - IA' + }, + 'wal005': { + 'name': 'Huxley Communications' + }, + 'imon': { + 'name': 'ImOn Communications' + }, + 'ind040': { + 'name': 'Independence Telecommunications' + }, + 'rrc010': { + 'name': 'Inland Networks' + }, + 'stc020': { + 'name': 'Innovative Cable TV St Croix' + }, + 'car100': { + 'name': 'Innovative Cable TV St Thomas-St John' + }, + 'icc010': { + 'name': 'Inside Connect Cable' + }, + 'int100': { + 'name': 'Integra Telecom' + }, + 'int050': { + 'name': 'Interstate Telecommunications Coop' + }, + 'irv010': { + 'name': 'Irvine Cable' + }, + 'k2c010': { + 'name': 'K2 Communications' + }, + 'kal010': { + 'name': 'Kalida Telephone Company, Inc.' + }, + 'kal030': { + 'name': 'Kalona Cooperative Telephone Company' + }, + 'kmt010': { + 'name': 'KMTelecom' + }, + 'kpu010': { + 'name': 'KPU Telecommunications' + }, + 'kuh010': { + 'name': 'Kuhn Communications, Inc.' + }, + 'lak130': { + 'name': 'Lakeland Communications' + }, + 'lan010': { + 'name': 'Langco' + }, + 'lau020': { + 'name': 'Laurel Highland Total Communications, Inc.' + }, + 'leh010': { + 'name': 'Lehigh Valley Cooperative Telephone' + }, + 'bra010': { + 'name': 'Limestone Cable/Bracken Cable' + }, + 'loc020': { + 'name': 'LISCO' + }, + 'lit020': { + 'name': 'Litestream' + }, + 'tel140': { + 'name': 'LivCom' + }, + 'loc010': { + 'name': 'LocalTel Communications' + }, + 'weh010-longview': { + 'name': 'Longview - Kilgore Cable TV' + }, + 'lon030': { + 'name': 'Lonsdale Video Ventures, LLC' + }, + 'lns010': { + 'name': 'Lost Nation-Elwood Telephone Co.' + }, + 'nttclpc010': { + 'name': 'LPC Connect' + }, + 'lumos': { + 'name': 'Lumos Networks' + }, + 'madison': { + 'name': 'Madison Communications' + }, + 'mad030': { + 'name': 'Madison County Cable Inc.' + }, + 'nttcmah010': { + 'name': 'Mahaska Communication Group' + }, + 'mar010': { + 'name': 'Marne & Elk Horn Telephone Company' + }, + 'mcc040': { + 'name': 'McClure Telephone Co.' + }, + 'mctv': { + 'name': 'MCTV' + }, + 'merrimac': { + 'name': 'Merrimac Communications Ltd.' 
+ }, + 'metronet': { + 'name': 'Metronet' + }, + 'mhtc': { + 'name': 'MHTC' + }, + 'midhudson': { + 'name': 'Mid-Hudson Cable' + }, + 'midrivers': { + 'name': 'Mid-Rivers Communications' + }, + 'mid045': { + 'name': 'Midstate Communications' + }, + 'mil080': { + 'name': 'Milford Communications' + }, + 'min030': { + 'name': 'MINET' + }, + 'nttcmin010': { + 'name': 'Minford TV' + }, + 'san040-02': { + 'name': 'Mitchell Telecom' + }, + 'mlg010': { + 'name': 'MLGC' + }, + 'mon060': { + 'name': 'Mon-Cre TVE' + }, + 'mou110': { + 'name': 'Mountain Telephone' + }, + 'mou050': { + 'name': 'Mountain Village Cable' + }, + 'mtacomm': { + 'name': 'MTA Communications, LLC' + }, + 'mtc010': { + 'name': 'MTC Cable' + }, + 'med040': { + 'name': 'MTC Technologies' + }, + 'man060': { + 'name': 'MTCC' + }, + 'mtc030': { + 'name': 'MTCO Communications' + }, + 'mul050': { + 'name': 'Mulberry Telecommunications' + }, + 'mur010': { + 'name': 'Murray Electric System' + }, + 'musfiber': { + 'name': 'MUS FiberNET' + }, + 'mpw': { + 'name': 'Muscatine Power & Water' + }, + 'nttcsli010': { + 'name': 'myEVTV.com' + }, + 'nor115': { + 'name': 'NCC' + }, + 'nor260': { + 'name': 'NDTC' + }, + 'nctc': { + 'name': 'Nebraska Central Telecom, Inc.' + }, + 'nel020': { + 'name': 'Nelsonville TV Cable' + }, + 'nem010': { + 'name': 'Nemont' + }, + 'new075': { + 'name': 'New Hope Telephone Cooperative' + }, + 'nor240': { + 'name': 'NICP' + }, + 'cic010': { + 'name': 'NineStar Connect' + }, + 'nktelco': { + 'name': 'NKTelco' + }, + 'nortex': { + 'name': 'Nortex Communications' + }, + 'nor140': { + 'name': 'North Central Telephone Cooperative' + }, + 'nor030': { + 'name': 'Northland Communications' + }, + 'nor075': { + 'name': 'Northwest Communications' + }, + 'nor125': { + 'name': 'Norwood Light Broadband' + }, + 'net010': { + 'name': 'Nsight Telservices' + }, + 'dur010': { + 'name': 'Ntec' + }, + 'nts010': { + 'name': 'NTS Communications' + }, + 'new045': { + 'name': 'NU-Telecom' + }, + 'nulink': { + 'name': 'NuLink' + }, + 'jam030': { + 'name': 'NVC' + }, + 'far035': { + 'name': 'OmniTel Communications' + }, + 'onesource': { + 'name': 'OneSource Communications' + }, + 'cit230': { + 'name': 'Opelika Power Services' + }, + 'daltonutilities': { + 'name': 'OptiLink' + }, + 'mid140': { + 'name': 'OPTURA' + }, + 'ote010': { + 'name': 'OTEC Communication Company' + }, + 'cci020': { + 'name': 'Packerland Broadband' + }, + 'pan010': { + 'name': 'Panora Telco/Guthrie Center Communications' + }, + 'otter': { + 'name': 'Park Region Telephone & Otter Tail Telcom' + }, + 'mid050': { + 'name': 'Partner Communications Cooperative' + }, + 'fib010': { + 'name': 'Pathway' + }, + 'paulbunyan': { + 'name': 'Paul Bunyan Communications' + }, + 'pem020': { + 'name': 'Pembroke Telephone Company' + }, + 'mck010': { + 'name': 'Peoples Rural Telephone Cooperative' + }, + 'pul010': { + 'name': 'PES Energize' + }, + 'phi010': { + 'name': 'Philippi Communications System' + }, + 'phonoscope': { + 'name': 'Phonoscope Cable' + }, + 'pin070': { + 'name': 'Pine Belt Communications, Inc.' + }, + 'weh010-pine': { + 'name': 'Pine Bluff Cable TV' + }, + 'pin060': { + 'name': 'Pineland Telephone Cooperative' + }, + 'cam010': { + 'name': 'Pinpoint Communications' + }, + 'pio060': { + 'name': 'Pioneer Broadband' + }, + 'pioncomm': { + 'name': 'Pioneer Communications' + }, + 'pioneer': { + 'name': 'Pioneer DTV' + }, + 'pla020': { + 'name': 'Plant TiftNet, Inc.' 
+ }, + 'par010': { + 'name': 'PLWC' + }, + 'pro035': { + 'name': 'PMT' + }, + 'vik011': { + 'name': 'Polar Cablevision' + }, + 'pottawatomie': { + 'name': 'Pottawatomie Telephone Co.' + }, + 'premiercomm': { + 'name': 'Premier Communications' + }, + 'psc010': { + 'name': 'PSC' + }, + 'pan020': { + 'name': 'PTCI' + }, + 'qco010': { + 'name': 'QCOL' + }, + 'qua010': { + 'name': 'Quality Cablevision' + }, + 'rad010': { + 'name': 'Radcliffe Telephone Company' + }, + 'car040': { + 'name': 'Rainbow Communications' + }, + 'rai030': { + 'name': 'Rainier Connect' + }, + 'ral010': { + 'name': 'Ralls Technologies' + }, + 'rct010': { + 'name': 'RC Technologies' + }, + 'red040': { + 'name': 'Red River Communications' + }, + 'ree010': { + 'name': 'Reedsburg Utility Commission' + }, + 'mol010': { + 'name': 'Reliance Connects- Oregon' + }, + 'res020': { + 'name': 'Reserve Telecommunications' + }, + 'weh010-resort': { + 'name': 'Resort TV Cable' + }, + 'rld010': { + 'name': 'Richland Grant Telephone Cooperative, Inc.' + }, + 'riv030': { + 'name': 'River Valley Telecommunications Coop' + }, + 'rockportcable': { + 'name': 'Rock Port Cablevision' + }, + 'rsf010': { + 'name': 'RS Fiber' + }, + 'rtc': { + 'name': 'RTC Communication Corp' + }, + 'res040': { + 'name': 'RTC-Reservation Telephone Coop.' + }, + 'rte010': { + 'name': 'RTEC Communications' + }, + 'stc010': { + 'name': 'S&T' + }, + 'san020': { + 'name': 'San Bruno Cable TV' + }, + 'san040-01': { + 'name': 'Santel' + }, + 'sav010': { + 'name': 'SCI Broadband-Savage Communications Inc.' + }, + 'sco050': { + 'name': 'Scottsboro Electric Power Board' + }, + 'scr010': { + 'name': 'Scranton Telephone Company' + }, + 'selco': { + 'name': 'SELCO' + }, + 'she010': { + 'name': 'Shentel' + }, + 'she030': { + 'name': 'Sherwood Mutual Telephone Association, Inc.' + }, + 'ind060-ssc': { + 'name': 'Silver Star Communications' + }, + 'sjoberg': { + 'name': 'Sjoberg\'s Inc.' + }, + 'sou025': { + 'name': 'SKT' + }, + 'sky050': { + 'name': 'SkyBest TV' + }, + 'nttcsmi010': { + 'name': 'Smithville Communications' + }, + 'woo010': { + 'name': 'Solarus' + }, + 'sou075': { + 'name': 'South Central Rural Telephone Cooperative' + }, + 'sou065': { + 'name': 'South Holt Cablevision, Inc.' + }, + 'sou035': { + 'name': 'South Slope Cooperative Communications' + }, + 'spa020': { + 'name': 'Spanish Fork Community Network' + }, + 'spe010': { + 'name': 'Spencer Municipal Utilities' + }, + 'spi005': { + 'name': 'Spillway Communications, Inc.' + }, + 'srt010': { + 'name': 'SRT' + }, + 'cccsmc010': { + 'name': 'St. Maarten Cable TV' + }, + 'sta025': { + 'name': 'Star Communications' + }, + 'sco020': { + 'name': 'STE' + }, + 'uin010': { + 'name': 'STRATA Networks' + }, + 'sum010': { + 'name': 'Sumner Cable TV' + }, + 'pie010': { + 'name': 'Surry TV/PCSI TV' + }, + 'swa010': { + 'name': 'Swayzee Communications' + }, + 'sweetwater': { + 'name': 'Sweetwater Cable Television Co' + }, + 'weh010-talequah': { + 'name': 'Tahlequah Cable TV' + }, + 'tct': { + 'name': 'TCT' + }, + 'tel050': { + 'name': 'Tele-Media Company' + }, + 'com050': { + 'name': 'The Community Agency' + }, + 'thr020': { + 'name': 'Three River' + }, + 'cab140': { + 'name': 'Town & Country Technologies' + }, + 'tra010': { + 'name': 'Trans-Video' + }, + 'tre010': { + 'name': 'Trenton TV Cable Company' + }, + 'tcc': { + 'name': 'Tri County Communications Cooperative' + }, + 'tri025': { + 'name': 'TriCounty Telecom' + }, + 'tri110': { + 'name': 'TrioTel Communications, Inc.' + }, + 'tro010': { + 'name': 'Troy Cablevision, Inc.' 
+ }, + 'tsc': { + 'name': 'TSC' + }, + 'cit220': { + 'name': 'Tullahoma Utilities Board' + }, + 'tvc030': { + 'name': 'TV Cable of Rensselaer' + }, + 'tvc015': { + 'name': 'TVC Cable' + }, + 'cab180': { + 'name': 'TVision' + }, + 'twi040': { + 'name': 'Twin Lakes' + }, + 'tvtinc': { + 'name': 'Twin Valley' + }, + 'uis010': { + 'name': 'Union Telephone Company' + }, + 'uni110': { + 'name': 'United Communications - TN' + }, + 'uni120': { + 'name': 'United Services' + }, + 'uss020': { + 'name': 'US Sonet' + }, + 'cab060': { + 'name': 'USA Communications' + }, + 'she005': { + 'name': 'USA Communications/Shellsburg, IA' + }, + 'val040': { + 'name': 'Valley TeleCom Group' + }, + 'val025': { + 'name': 'Valley Telecommunications' + }, + 'val030': { + 'name': 'Valparaiso Broadband' + }, + 'cla050': { + 'name': 'Vast Broadband' + }, + 'sul015': { + 'name': 'Venture Communications Cooperative, Inc.' + }, + 'ver025': { + 'name': 'Vernon Communications Co-op' + }, + 'weh010-vicksburg': { + 'name': 'Vicksburg Video' + }, + 'vis070': { + 'name': 'Vision Communications' + }, + 'volcanotel': { + 'name': 'Volcano Vision, Inc.' + }, + 'vol040-02': { + 'name': 'VolFirst / BLTV' + }, + 'ver070': { + 'name': 'VTel' + }, + 'nttcvtx010': { + 'name': 'VTX1' + }, + 'bci010-02': { + 'name': 'Vyve Broadband' + }, + 'wab020': { + 'name': 'Wabash Mutual Telephone' + }, + 'waitsfield': { + 'name': 'Waitsfield Cable' + }, + 'wal010': { + 'name': 'Walnut Communications' + }, + 'wavebroadband': { + 'name': 'Wave' + }, + 'wav030': { + 'name': 'Waverly Communications Utility' + }, + 'wbi010': { + 'name': 'WBI' + }, + 'web020': { + 'name': 'Webster-Calhoun Cooperative Telephone Association' + }, + 'wes005': { + 'name': 'West Alabama TV Cable' + }, + 'carolinata': { + 'name': 'West Carolina Communications' + }, + 'wct010': { + 'name': 'West Central Telephone Association' + }, + 'wes110': { + 'name': 'West River Cooperative Telephone Company' + }, + 'ani030': { + 'name': 'WesTel Systems' + }, + 'westianet': { + 'name': 'Western Iowa Networks' + }, + 'nttcwhi010': { + 'name': 'Whidbey Telecom' + }, + 'weh010-white': { + 'name': 'White County Cable TV' + }, + 'wes130': { + 'name': 'Wiatel' + }, + 'wik010': { + 'name': 'Wiktel' + }, + 'wil070': { + 'name': 'Wilkes Communications, Inc./RiverStreet Networks' + }, + 'wil015': { + 'name': 'Wilson Communications' + }, + 'win010': { + 'name': 'Windomnet/SMBS' + }, + 'win090': { + 'name': 'Windstream Cable TV' + }, + 'wcta': { + 'name': 'Winnebago Cooperative Telecom Association' + }, + 'wtc010': { + 'name': 'WTC' + }, + 'wil040': { + 'name': 'WTC Communications, Inc.' 
+ }, + 'wya010': { + 'name': 'Wyandotte Cable' + }, + 'hin020-02': { + 'name': 'X-Stream Services' + }, + 'xit010': { + 'name': 'XIT Communications' + }, + 'yel010': { + 'name': 'Yelcot Communications' + }, + 'mid180-01': { + 'name': 'yondoo' + }, + 'cou060': { + 'name': 'Zito Media' + }, + 'slingtv': { + 'name': 'Sling TV', + 'username_field': 'username', + 'password_field': 'password', + }, +} + + +class AdobePassIE(InfoExtractor): + _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' + _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + _MVPD_CACHE = 'ap-mvpd' + + _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' + + def _download_webpage_handle(self, *args, **kwargs): + headers = self.geo_verification_headers() + headers.update(kwargs.get('headers', {})) + kwargs['headers'] = headers + return super(AdobePassIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + + @staticmethod + def _get_mvpd_resource(provider_id, title, guid, rating): + channel = etree.Element('channel') + channel_title = etree.SubElement(channel, 'title') + channel_title.text = provider_id + item = etree.SubElement(channel, 'item') + resource_title = etree.SubElement(item, 'title') + resource_title.text = title + resource_guid = etree.SubElement(item, 'guid') + resource_guid.text = guid + resource_rating = etree.SubElement(item, 'media:rating') + resource_rating.attrib = {'scheme': 'urn:v-chip'} + resource_rating.text = rating + return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>' + + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) + + def is_expired(token, date_ele): + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) + return token_expires and token_expires <= int(time.time()) + + def post_form(form_page_res, note, data={}): + form_page, urlh = form_page_res + post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') + if not re.match(r'https?://', post_url): + post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) + form_data = self._hidden_inputs(form_page) + form_data.update(data) + return self._download_webpage_handle( + post_url, video_id, note, data=urlencode_postdata(form_data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + def raise_mvpd_required(): + raise ExtractorError( + 'This video is only available for users of participating TV providers. 
'
+                'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier '
+                'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True)
+
+        def extract_redirect_url(html, url=None, fatal=False):
+            # TODO: eliminate code duplication with generic extractor and move
+            # redirection code into _download_webpage_handle
+            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
+            redirect_url = self._search_regex(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
+                html, 'meta refresh redirect',
+                default=NO_DEFAULT if fatal else None, fatal=fatal)
+            if not redirect_url:
+                return None
+            if url:
+                redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url))
+            return redirect_url
+
+        mvpd_headers = {
+            'ap_42': 'anonymous',
+            'ap_11': 'Linux i686',
+            'ap_z': self._USER_AGENT,
+            'User-Agent': self._USER_AGENT,
+        }
+
+        guid = xml_text(resource, 'guid') if '<' in resource else resource
+        count = 0
+        while count < 2:
+            requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {}
+            authn_token = requestor_info.get('authn_token')
+            if authn_token and is_expired(authn_token, 'simpleTokenExpires'):
+                authn_token = None
+            if not authn_token:
+                # TODO add support for other TV Providers
+                mso_id = self.get_param('ap_mso')
+                if not mso_id:
+                    raise_mvpd_required()
+                username, password = self._get_login_info('ap_username', 'ap_password', mso_id)
+                if not username or not password:
+                    raise_mvpd_required()
+                mso_info = MSO_INFO[mso_id]
+
+                provider_redirect_page_res = self._download_webpage_handle(
+                    self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
+                    'Downloading Provider Redirect Page', query={
+                        'noflash': 'true',
+                        'mso_id': mso_id,
+                        'requestor_id': requestor_id,
+                        'no_iframe': 'false',
+                        'domain_name': 'adobe.com',
+                        'redirect_url': url,
+                    })
+
+                if mso_id == 'Comcast_SSO':
+                    # Comcast page flow varies by video site and whether you
+                    # are on Comcast's network.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    if 'automatically signing you in' in provider_redirect_page:
+                        oauth_redirect_url = self._html_search_regex(
+                            r'window\.location\s*=\s*[\'"]([^\'"]+)',
+                            provider_redirect_page, 'oauth redirect')
+                        self._download_webpage(
+                            oauth_redirect_url, video_id, 'Confirming auto login')
+                    elif 'automatically signed in with' in provider_redirect_page:
+                        # Comcast seems to be rolling out a new way of automatically signing in customers
+                        oauth_redirect_url = self._html_search_regex(
+                            r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
+                            'oauth redirect (signed)')
+                        # Just need to process the request. No useful data comes back
+                        self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
+                    else:
+                        if '<form name="signin"' in provider_redirect_page:
+                            provider_login_page_res = provider_redirect_page_res
+                        elif 'http-equiv="refresh"' in provider_redirect_page:
+                            oauth_redirect_url = extract_redirect_url(
+                                provider_redirect_page, fatal=True)
+                            provider_login_page_res = self._download_webpage_handle(
+                                oauth_redirect_url, video_id,
+                                self._DOWNLOADING_LOGIN_PAGE)
+                        else:
+                            provider_login_page_res = post_form(
+                                provider_redirect_page_res,
+                                self._DOWNLOADING_LOGIN_PAGE)
+
+                        mvpd_confirm_page_res = post_form(
+                            provider_login_page_res, 'Logging in', {
+                                mso_info['username_field']: username,
+                                mso_info['password_field']: password,
+                            })
+                        mvpd_confirm_page, urlh = mvpd_confirm_page_res
+                        if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page:
+                            post_form(mvpd_confirm_page_res, 'Confirming Login')
+                elif mso_id == 'Philo':
+                    # Philo has a very unusual authentication method
+                    self._download_webpage(
+                        'https://idp.philo.com/auth/init/login_code', video_id, 'Requesting auth code', data=urlencode_postdata({
+                            'ident': username,
+                            'device': 'web',
+                            'send_confirm_link': False,
+                            'send_token': True
+                        }))
+                    philo_code = compat_getpass('Type the auth code you have received [Return]: ')
+                    self._download_webpage(
+                        'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({
+                            'token': philo_code
+                        }))
+                    mvpd_confirm_page_res = self._download_webpage_handle('https://idp.philo.com/idp/submit', video_id, 'Confirming Philo Login')
+                    post_form(mvpd_confirm_page_res, 'Confirming Login')
+                elif mso_id == 'Verizon':
+                    # In general, if you're connecting from a Verizon-assigned IP,
+                    # you will not actually pass your credentials.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    # From a non-Verizon IP, this still gave 'Please wait', but N==Y was noticed; will need to try from a Verizon IP
+                    if 'Please wait ...' in provider_redirect_page and '\'N\'== "Y"' not in provider_redirect_page:
+                        saml_redirect_url = self._html_search_regex(
+                            r'self\.parent\.location=(["\'])(?P<url>.+?)\1',
+                            provider_redirect_page,
+                            'SAML Redirect URL', group='url')
+                        saml_login_page = self._download_webpage(
+                            saml_redirect_url, video_id,
+                            'Downloading SAML Login Page')
+                    elif 'Verizon FiOS - sign in' in provider_redirect_page:
+                        # FXNetworks from non-Verizon IP
+                        saml_login_page_res = post_form(
+                            provider_redirect_page_res, 'Logging in', {
+                                mso_info['username_field']: username,
+                                mso_info['password_field']: password,
+                            })
+                        saml_login_page, urlh = saml_login_page_res
+                        if 'Please try again.' in saml_login_page:
+                            raise ExtractorError(
+                                'We\'re sorry, but either the User ID or Password entered is not correct.')
+                    else:
+                        # ABC from non-Verizon IP
+                        saml_redirect_url = self._html_search_regex(
+                            r'var\surl\s*=\s*(["\'])(?P<url>.+?)\1',
+                            provider_redirect_page,
+                            'SAML Redirect URL', group='url')
+                        saml_redirect_url = saml_redirect_url.replace(r'\/', '/')
+                        saml_redirect_url = saml_redirect_url.replace(r'\-', '-')
+                        saml_redirect_url = saml_redirect_url.replace(r'\x26', '&')
+                        saml_login_page = self._download_webpage(
+                            saml_redirect_url, video_id,
+                            'Downloading SAML Login Page')
+                    saml_login_page, urlh = post_form(
+                        [saml_login_page, saml_redirect_url], 'Logging in', {
+                            mso_info['username_field']: username,
+                            mso_info['password_field']: password,
+                        })
+                    if 'Please try again.' in saml_login_page:
+                        raise ExtractorError(
+                            'Failed to log in, incorrect User ID or Password.')
+                    saml_login_url = self._search_regex(
+                        r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1',
+                        saml_login_page, 'SAML Login URL', group='url')
+                    saml_response_json = self._download_json(
+                        saml_login_url, video_id, 'Downloading SAML Response',
+                        headers={'Content-Type': 'text/xml'})
+                    self._download_webpage(
+                        saml_response_json['targetValue'], video_id,
+                        'Confirming Login', data=urlencode_postdata({
+                            'SAMLResponse': saml_response_json['SAMLResponse'],
+                            'RelayState': saml_response_json['RelayState']
+                        }), headers={
+                            'Content-Type': 'application/x-www-form-urlencoded'
+                        })
+                elif mso_id == 'Spectrum':
+                    # Spectrum's login form is dynamically loaded via JS, so we need to hardcode
+                    # the flow as a one-off implementation.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    provider_login_page_res = post_form(
+                        provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)
+                    saml_login_page, urlh = provider_login_page_res
+                    relay_state = self._search_regex(
+                        r'RelayState\s*=\s*"(?P<relay>.+?)";',
+                        saml_login_page, 'RelayState', group='relay')
+                    saml_request = self._search_regex(
+                        r'SAMLRequest\s*=\s*"(?P<saml_request>.+?)";',
+                        saml_login_page, 'SAMLRequest', group='saml_request')
+                    login_json = {
+                        mso_info['username_field']: username,
+                        mso_info['password_field']: password,
+                        'RelayState': relay_state,
+                        'SAMLRequest': saml_request,
+                    }
+                    saml_response_json = self._download_json(
+                        'https://tveauthn.spectrum.net/tveauthentication/api/v1/manualAuth', video_id,
+                        'Downloading SAML Response',
+                        data=json.dumps(login_json).encode(),
+                        headers={
+                            'Content-Type': 'application/json',
+                            'Accept': 'application/json',
+                        })
+                    self._download_webpage(
+                        saml_response_json['SAMLRedirectUri'], video_id,
+                        'Confirming Login', data=urlencode_postdata({
+                            'SAMLResponse': saml_response_json['SAMLResponse'],
+                            'RelayState': relay_state,
+                        }), headers={
+                            'Content-Type': 'application/x-www-form-urlencoded'
+                        })
+                elif mso_id == 'slingtv':
+                    # SlingTV has a meta-refresh based authentication, but also
+                    # looks at the tab history to count the number of times the
+                    # browser has been on a page.
+
+                    first_bookend_page, urlh = provider_redirect_page_res
+
+                    hidden_data = self._hidden_inputs(first_bookend_page)
+                    hidden_data['history'] = 1
+
+                    provider_login_page_res = self._download_webpage_handle(
+                        urlh.geturl(), video_id, 'Sending first bookend',
+                        query=hidden_data)
+
+                    provider_association_redirect, urlh = post_form(
+                        provider_login_page_res, 'Logging in', {
+                            mso_info['username_field']: username,
+                            mso_info['password_field']: password
+                        })
+
+                    provider_refresh_redirect_url = extract_redirect_url(
+                        provider_association_redirect, url=urlh.geturl())
+
+                    last_bookend_page, urlh = self._download_webpage_handle(
+                        provider_refresh_redirect_url, video_id,
+                        'Downloading Auth Association Redirect Page')
+                    hidden_data = self._hidden_inputs(last_bookend_page)
+                    hidden_data['history'] = 3
+
+                    mvpd_confirm_page_res = self._download_webpage_handle(
+                        urlh.geturl(), video_id, 'Sending final bookend',
+                        query=hidden_data)
+
+                    post_form(mvpd_confirm_page_res, 'Confirming Login')
+                else:
+                    # Some providers (e.g. DIRECTV NOW) have another meta refresh
+                    # based redirect that should be followed.
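Several of the provider branches above, including this default one, lean on the `extract_redirect_url` closure defined near the top of `_extract_mvpd_auth`. As a rough, self-contained sketch of what that meta-refresh parsing boils down to (the regex and the `follow_meta_refresh` name below are illustrative simplifications, not the extractor's exact code):

```python
import re
from html import unescape
from urllib.parse import urljoin

def follow_meta_refresh(html_page, base_url=None):
    # Pull the target out of a <meta http-equiv="refresh" content="N; url=...">
    # tag. The real helper uses a stricter lookahead-based regex and yt-dlp's
    # unescapeHTML/compat_urlparse rather than the stdlib equivalents.
    m = re.search(
        r'(?i)<meta[^>]+http-equiv=["\']?refresh["\']?[^>]*?'
        r'content=["\']?\d*\s*;\s*url=\'?([^\'">]+)', html_page)
    if not m:
        return None
    redirect = unescape(m.group(1))
    return urljoin(base_url, redirect) if base_url else redirect

# follow_meta_refresh('<meta http-equiv="refresh" content="0; url=/next">',
#                     'https://example.com/a')  ->  'https://example.com/next'
```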
+ provider_redirect_page, urlh = provider_redirect_page_res + provider_refresh_redirect_url = extract_redirect_url( + provider_redirect_page, url=urlh.geturl()) + if provider_refresh_redirect_url: + provider_redirect_page_res = self._download_webpage_handle( + provider_refresh_redirect_url, video_id, + 'Downloading Provider Redirect Page (meta refresh)') + provider_login_page_res = post_form( + provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) + form_data = { + mso_info.get('username_field', 'username'): username, + mso_info.get('password_field', 'password'): password + } + if mso_id == 'Cablevision': + form_data['_eventId_proceed'] = '' + mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) + if mso_id != 'Rogers': + post_form(mvpd_confirm_page_res, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + if '<pendingLogout' in session: + self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + count += 1 + continue + authn_token = unescapeHTML(xml_text(session, 'authnToken')) + requestor_info['authn_token'] = authn_token + self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) + + authz_token = requestor_info.get(guid) + if authz_token and is_expired(authz_token, 'simpleTokenTTL'): + authz_token = None + if not authz_token: + authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, + 'Retrieving Authorization Token', data=urlencode_postdata({ + 'resource_id': resource, + 'requestor_id': requestor_id, + 'authentication_token': authn_token, + 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), + 'userMeta': '1', + }), headers=mvpd_headers) + if '<pendingLogout' in authorize: + self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + count += 1 + continue + if '<error' in authorize: + raise ExtractorError(xml_text(authorize, 'details'), expected=True) + authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) + requestor_info[guid] = authz_token + self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) + + mvpd_headers.update({ + 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), + 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), + }) + + short_authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', + video_id, 'Retrieving Media Token', data=urlencode_postdata({ + 'authz_token': authz_token, + 'requestor_id': requestor_id, + 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), + 'hashed_guid': 'false', + }), headers=mvpd_headers) + if '<pendingLogout' in short_authorize: + self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + count += 1 + continue + return short_authorize diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py new file mode 100644 index 000000000..12b819206 --- /dev/null +++ b/yt_dlp/extractor/adobetv.py @@ -0,0 +1,288 @@ +from __future__ import unicode_literals + +import functools +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + ISO639Utils, + OnDemandPagedList, + parse_duration, + str_or_none, + str_to_int, + unified_strdate, +) + + +class AdobeTVBaseIE(InfoExtractor): + def _call_api(self, path, video_id, query, note=None): + return self._download_json( + 
'http://tv.adobe.com/api/v4/' + path, + video_id, note, query=query)['data'] + + def _parse_subtitles(self, video_data, url_key): + subtitles = {} + for translation in video_data.get('translations', []): + vtt_path = translation.get(url_key) + if not vtt_path: + continue + lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) + subtitles.setdefault(lang, []).append({ + 'ext': 'vtt', + 'url': vtt_path, + }) + return subtitles + + def _parse_video_data(self, video_data): + video_id = compat_str(video_data['id']) + title = video_data['title'] + + s3_extracted = False + formats = [] + for source in video_data.get('videos', []): + source_url = source.get('url') + if not source_url: + continue + f = { + 'format_id': source.get('quality_level'), + 'fps': int_or_none(source.get('frame_rate')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('video_data_rate')), + 'width': int_or_none(source.get('width')), + 'url': source_url, + } + original_filename = source.get('original_filename') + if original_filename: + if not (f.get('height') and f.get('width')): + mobj = re.search(r'_(\d+)x(\d+)', original_filename) + if mobj: + f.update({ + 'height': int(mobj.group(2)), + 'width': int(mobj.group(1)), + }) + if original_filename.startswith('s3://') and not s3_extracted: + formats.append({ + 'format_id': 'original', + 'quality': 1, + 'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'), + }) + s3_extracted = True + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'upload_date': unified_strdate(video_data.get('start_date')), + 'duration': parse_duration(video_data.get('duration')), + 'view_count': str_to_int(video_data.get('playcount')), + 'formats': formats, + 'subtitles': self._parse_subtitles(video_data, 'vtt'), + } + + +class AdobeTVEmbedIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:embed' + _VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P<id>\d+)' + _TEST = { + 'url': 'https://tv.adobe.com/embed/22/4153', + 'md5': 'c8c0461bf04d54574fc2b4d07ac6783a', + 'info_dict': { + 'id': '4153', + 'ext': 'flv', + 'title': 'Creating Graphics Optimized for BlackBerry', + 'description': 'md5:eac6e8dced38bdaae51cd94447927459', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20091109', + 'duration': 377, + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'episode/' + video_id, video_id, {'disclosure': 'standard'})[0] + return self._parse_video_data(video_data) + + +class AdobeTVIE(AdobeTVBaseIE): + IE_NAME = 'adobetv' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)' + + _TEST = { + 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', + 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', + 'info_dict': { + 'id': '10981', + 'ext': 'mp4', + 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', + 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20110914', + 'duration': 60, + 'view_count': int, + }, + } + + def _real_extract(self, url): + language, show_urlname, urlname = self._match_valid_url(url).groups() + if not language: + language = 'en' + + video_data = self._call_api( + 
'episode/get', urlname, { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + 'urlname': urlname, + })[0] + return self._parse_video_data(video_data) + + +class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): + _PAGE_SIZE = 25 + + def _fetch_page(self, display_id, query, page): + page += 1 + query['page'] = page + for element_data in self._call_api( + self._RESOURCE, display_id, query, 'Download Page %d' % page): + yield self._process_data(element_data) + + def _extract_playlist_entries(self, display_id, query): + return OnDemandPagedList(functools.partial( + self._fetch_page, display_id, query), self._PAGE_SIZE) + + +class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:show' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)' + + _TEST = { + 'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', + 'info_dict': { + 'id': '36', + 'title': 'The Complete Picture with Julieanne Kost', + 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', + }, + 'playlist_mincount': 136, + } + _RESOURCE = 'episode' + _process_data = AdobeTVBaseIE._parse_video_data + + def _real_extract(self, url): + language, show_urlname = self._match_valid_url(url).groups() + if not language: + language = 'en' + query = { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + } + + show_data = self._call_api( + 'show/get', show_urlname, query)[0] + + return self.playlist_result( + self._extract_playlist_entries(show_urlname, query), + str_or_none(show_data.get('id')), + show_data.get('show_name'), + show_data.get('show_description')) + + +class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:channel' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?' 
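`AdobeTVShowIE` and `AdobeTVChannelIE` both inherit the paging logic of `AdobeTVPlaylistBaseIE` above: `_fetch_page` is wrapped in an `OnDemandPagedList`, so the 25-item API pages are only requested as the playlist entries are actually consumed. A toy sketch of that lazy-pagination idea (the generator and the fake `fetch_page` data are illustrative stand-ins, not the real `yt_dlp.utils.OnDemandPagedList`):

```python
import functools

def paged_entries(pagefunc, page_size):
    # Fetch pages lazily, stopping after the first short (or empty) page.
    page = 0
    while True:
        chunk = list(pagefunc(page))
        yield from chunk
        if len(chunk) < page_size:
            return
        page += 1

def fetch_page(show_urlname, query, page):
    # The real _fetch_page calls self._call_api(...) with query['page'] set
    # to page + 1; this fake API returns two pages of episode slugs.
    fake_api = {0: ['episode-1', 'episode-2'], 1: ['episode-3']}
    return fake_api.get(page, [])

entries = paged_entries(
    functools.partial(fetch_page, 'my-show', {'language': 'en'}), page_size=2)
print(list(entries))  # ['episode-1', 'episode-2', 'episode-3']
```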
+ + _TEST = { + 'url': 'http://tv.adobe.com/channel/development', + 'info_dict': { + 'id': 'development', + }, + 'playlist_mincount': 96, + } + _RESOURCE = 'show' + + def _process_data(self, show_data): + return self.url_result( + show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id'))) + + def _real_extract(self, url): + language, channel_urlname, category_urlname = self._match_valid_url(url).groups() + if not language: + language = 'en' + query = { + 'channel_urlname': channel_urlname, + 'language': language, + } + if category_urlname: + query['category_urlname'] = category_urlname + + return self.playlist_result( + self._extract_playlist_entries(channel_urlname, query), + channel_urlname) + + +class AdobeTVVideoIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:video' + _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' + + _TEST = { + # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners + 'url': 'https://video.tv.adobe.com/v/2456/', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) + title = video_data['title'] + + formats = [] + sources = video_data.get('sources') or [] + for source in sources: + source_src = source.get('src') + if not source_src: + continue + formats.append({ + 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), + 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'height': int_or_none(source.get('height') or None), + 'tbr': int_or_none(source.get('bitrate') or None), + 'width': int_or_none(source.get('width') or None), + 'url': source_src, + }) + self._sort_formats(formats) + + # For both metadata and downloaded files the duration varies among + # formats. I just pick the max one + duration = max(filter(None, [ + float_or_none(source.get('duration'), scale=1000) + for source in sources])) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('video', {}).get('poster'), + 'duration': duration, + 'subtitles': self._parse_subtitles(video_data, 'vttPath'), + } diff --git a/yt_dlp/extractor/adultswim.py b/yt_dlp/extractor/adultswim.py new file mode 100644 index 000000000..c97cfc161 --- /dev/null +++ b/yt_dlp/extractor/adultswim.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .turner import TurnerBaseIE +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + parse_age_limit, + parse_iso8601, + strip_or_none, + try_get, +) + + +class AdultSwimIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?' 
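Unlike the other Adobe TV extractors, `AdobeTVVideoIE` above takes its metadata from an inline `var bridge = ...;` assignment in the page rather than from the v4 API. A condensed, self-contained version of that scrape-then-parse pattern (plain `re`/`json` stand in for the `_search_regex`/`_parse_json` helpers, and the sample HTML is invented):

```python
import json
import re

def parse_bridge_object(webpage):
    # Grab the inline JS object assigned to `bridge` and parse it as JSON.
    # Like the extractor, this assumes the object contains no literal ';'.
    m = re.search(r'var\s+bridge\s*=\s*([^;]+);', webpage)
    if not m:
        raise ValueError('bridged data not found')
    return json.loads(m.group(1))

html = '<script>var bridge = {"title": "Demo", "sources": []};</script>'
print(parse_bridge_object(html)['title'])  # Demo
```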
+ + _TESTS = [{ + 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', + 'info_dict': { + 'id': 'rQxZvXQ4ROaSOqq-or2Mow', + 'ext': 'mp4', + 'title': 'Rick and Morty - Pilot', + 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', + 'timestamp': 1543294800, + 'upload_date': '20181127', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', + 'info_dict': { + 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'ext': 'mp4', + 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', + 'upload_date': '20080124', + 'timestamp': 1201150800, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': '404 Not Found', + }, { + 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', + 'info_dict': { + 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', + 'ext': 'mp4', + 'title': 'Decker - Inside Decker: A New Hero', + 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', + 'timestamp': 1469480460, + 'upload_date': '20160725', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.adultswim.com/videos/attack-on-titan', + 'info_dict': { + 'id': 'attack-on-titan', + 'title': 'Attack on Titan', + 'description': 'md5:41caa9416906d90711e31dc00cb7db7e', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.adultswim.com/videos/streams/williams-stream', + 'info_dict': { + 'id': 'd8DEBj7QRfetLsRgFnGEyg', + 'ext': 'mp4', + 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'original programming', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': '404 Not Found', + }] + + def _real_extract(self, url): + show_path, episode_path = self._match_valid_url(url).groups() + display_id = episode_path or show_path + query = '''query { + getShowBySlug(slug:"%s") { + %%s + } +}''' % show_path + if episode_path: + query = query % '''title + getVideoBySlug(slug:"%s") { + _id + auth + description + duration + episodeNumber + launchDate + mediaID + seasonNumber + poster + title + tvRating + }''' % episode_path + ['getVideoBySlug'] + else: + query = query % '''metaDescription + title + videos(first:1000,sort:["episode_number"]) { + edges { + node { + _id + slug + } + } + }''' + show_data = self._download_json( + 'https://www.adultswim.com/api/search', display_id, + data=json.dumps({'query': query}).encode(), + headers={'Content-Type': 'application/json'})['data']['getShowBySlug'] + if episode_path: + video_data = show_data['getVideoBySlug'] + video_id = video_data['_id'] + episode_title = title = video_data['title'] + series = show_data.get('title') + if series: + title = '%s - %s' % (series, title) + info = { + 'id': video_id, + 'title': title, + 'description': strip_or_none(video_data.get('description')), + 'duration': float_or_none(video_data.get('duration')), + 'formats': [], + 'subtitles': {}, + 'age_limit': parse_age_limit(video_data.get('tvRating')), + 'thumbnail': 
video_data.get('poster'), + 'timestamp': parse_iso8601(video_data.get('launchDate')), + 'series': series, + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode': episode_title, + 'episode_number': int_or_none(video_data.get('episodeNumber')), + } + + auth = video_data.get('auth') + media_id = video_data.get('mediaID') + if media_id: + info.update(self._extract_ngtv_info(media_id, { + # CDN_TOKEN_APP_ID from: + # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js + 'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE', + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': auth, + })) + + if not auth: + extract_data = self._download_json( + 'https://www.adultswim.com/api/shows/v1/videos/' + video_id, + video_id, query={'fields': 'stream'}, fatal=False) or {} + assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or [] + for asset in assets: + asset_url = asset.get('url') + if not asset_url: + continue + ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) + if ext == 'm3u8': + info['formats'].extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + continue + # info['formats'].extend(self._extract_f4m_formats( + # asset_url, video_id, f4m_id='hds', fatal=False)) + elif ext in ('scc', 'ttml', 'vtt'): + info['subtitles'].setdefault('en', []).append({ + 'url': asset_url, + }) + self._sort_formats(info['formats']) + + return info + else: + entries = [] + for edge in show_data.get('videos', {}).get('edges', []): + video = edge.get('node') or {} + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('_id'))) + return self.playlist_result( + entries, show_path, show_data.get('title'), + strip_or_none(show_data.get('metaDescription'))) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py new file mode 100644 index 000000000..8025de5a3 --- /dev/null +++ b/yt_dlp/extractor/aenetworks.py @@ -0,0 +1,341 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .theplatform import ThePlatformIE +from ..utils import ( + ExtractorError, + GeoRestrictedError, + int_or_none, + update_url_query, + urlencode_postdata, +) + + +class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? 
+ (?P<domain> + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' + _THEPLATFORM_KEY = '43jXaGRQud' + _THEPLATFORM_SECRET = 'S10BPXHMlb' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), + } + + def _extract_aen_smil(self, smil_url, video_id, auth=None): + query = {'mbr': 'true'} + if auth: + query['auth'] = auth + TP_SMIL_QUERY = [{ + 'assetTypes': 'high_video_ak', + 'switch': 'hls_high_ak' + }, { + 'assetTypes': 'high_video_s3' + }, { + 'assetTypes': 'high_video_s3', + 'switch': 'hls_high_fastly', + }] + formats = [] + subtitles = {} + last_e = None + for q in TP_SMIL_QUERY: + q.update(query) + m_url = update_url_query(smil_url, q) + m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) + except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise + last_e = e + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + + +class AENetworksIE(AENetworksBaseIE): + IE_NAME = 'aenetworks' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id> + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' + _TESTS = [{ + 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'info_dict': { + 'id': '22253814', + 'ext': 'mp4', + 'title': 'Winter is Coming', + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + 'skip': 'This 
video is only available for users of participating TV providers.', + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', + 'info_dict': { + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', + 'only_matching': True + }, { + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', + 'only_matching': True + }, { + 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', + 'only_matching': True + }, { + 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', + 'only_matching': True + }, { + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', + 'only_matching': True + }] + + def _real_extract(self, url): + domain, canonical = self._match_valid_url(url).groups() + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) + + +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = self._match_valid_url(url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... 
on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SERIES1574', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 150, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item + + +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P<id>[\w+-]+?)-video' + _TESTS = [{ + 'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video', + 'info_dict': { + 'id': '40700995724', + 'ext': 'mp4', + 'title': "History of Valentine’s Day", + 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)' + _TESTS = [] + + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = self._search_regex( + r'<phoenix-iframe[^>]+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py new file mode 100644 index 000000000..063872b4f --- /dev/null +++ b/yt_dlp/extractor/afreecatv.py @@ -0,0 +1,382 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_xpath +from ..utils import ( + date_from_str, + determine_ext, + ExtractorError, + int_or_none, + unified_strdate, + url_or_none, + urlencode_postdata, + xpath_text, +) + + +class AfreecaTVIE(InfoExtractor): + IE_NAME = 'afreecatv' + IE_DESC = 'afreecatv.com' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? 
+ (?: + /app/(?:index|read_ucc_bbs)\.cgi| + /player/[Pp]layer\.(?:swf|html) + )\?.*?\bnTitleNo=| + vod\.afreecatv\.com/PLAYER/STATION/ + ) + (?P<id>\d+) + ''' + _NETRC_MACHINE = 'afreecatv' + _TESTS = [{ + 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', + 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', + 'info_dict': { + 'id': '36164052', + 'ext': 'mp4', + 'title': '데일리 에이프릴 요정들의 시상식!', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160503', + }, + 'skip': 'Video is gone', + }, { + 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', + 'info_dict': { + 'id': '36153164', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + }, + 'playlist_count': 2, + 'playlist': [{ + 'md5': 'd8b7c174568da61d774ef0203159bf97', + 'info_dict': { + 'id': '36153164_1', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'upload_date': '20160502', + }, + }, { + 'md5': '58f2ce7f6044e34439ab2d50612ab02b', + 'info_dict': { + 'id': '36153164_2', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'upload_date': '20160502', + }, + }], + 'skip': 'Video is gone', + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', + 'info_dict': { + 'id': '18650793', + 'ext': 'mp4', + 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': '윈아디', + 'uploader_id': 'badkids', + 'duration': 107, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', + 'info_dict': { + 'id': '10481652', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'duration': 6492, + }, + 'playlist_count': 2, + 'playlist': [{ + 'md5': 'd8b7c174568da61d774ef0203159bf97', + 'info_dict': { + 'id': '20160502_c4c62b9d_174361386_1', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160502', + 'duration': 3601, + }, + }, { + 'md5': '58f2ce7f6044e34439ab2d50612ab02b', + 'info_dict': { + 'id': '20160502_39e739bb_174361386_2', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' 
(part 2)", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160502', + 'duration': 2891, + }, + }], + 'params': { + 'skip_download': True, + }, + }, { + # non standard key + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', + 'info_dict': { + 'id': '20170411_BE689A0E_190960999_1_2_h', + 'ext': 'mp4', + 'title': '혼자사는여자집', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': '♥이슬이', + 'uploader_id': 'dasl8121', + 'upload_date': '20170411', + 'duration': 213, + }, + 'params': { + 'skip_download': True, + }, + }, { + # PARTIAL_ADULT + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', + 'info_dict': { + 'id': '20180327_27901457_202289533_1', + 'ext': 'mp4', + 'title': '[생]빨개요♥ (part 1)', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': '[SA]서아', + 'uploader_id': 'bjdyrksu', + 'upload_date': '20180327', + 'duration': 3601, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['adult content'], + }, { + 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', + 'only_matching': True, + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', + 'only_matching': True, + }] + + @staticmethod + def parse_video_key(key): + video_key = {} + m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key) + if m: + video_key['upload_date'] = m.group('upload_date') + video_key['part'] = int(m.group('part')) + return video_key + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_form = { + 'szWork': 'login', + 'szType': 'json', + 'szUid': username, + 'szPassword': password, + 'isSaveId': 'false', + 'szScriptVar': 'oLoginRet', + 'szAction': '', + } + + response = self._download_json( + 'https://login.afreecatv.com/app/LoginAction.php', None, + 'Logging in', data=urlencode_postdata(login_form)) + + _ERRORS = { + -4: 'Your account has been suspended due to a violation of our terms and policies.', + -5: 'https://member.afreecatv.com/app/user_delete_progress.php', + -6: 'https://login.afreecatv.com/membership/changeMember.php', + -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.afreecatv.com/app/pop_login_block.php', + -11: 'https://login.afreecatv.com/afreeca/second_login.php', + -12: 'https://member.afreecatv.com/app/user_security.php', + 0: 'The username does not exist or you have entered the wrong password.', + -1: 'The username does not exist or you have entered the wrong password.', + -3: 'You have entered your username/password incorrectly.', + -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', + -32008: 'You have failed to log in. 
Please contact our Help Center.', + } + + result = int_or_none(response.get('RESULT')) + if result != 1: + error = _ERRORS.get(result, 'You have failed to log in.') + raise ExtractorError( + 'Unable to login: %s said: %s' % (self.IE_NAME, error), + expected=True) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if re.search(r'alert\(["\']This video has been deleted', webpage): + raise ExtractorError( + 'Video %s has been deleted' % video_id, expected=True) + + station_id = self._search_regex( + r'nStationNo\s*=\s*(\d+)', webpage, 'station') + bbs_id = self._search_regex( + r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') + video_id = self._search_regex( + r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) + + partial_view = False + adult_view = False + for _ in range(2): + query = { + 'nTitleNo': video_id, + 'nStationNo': station_id, + 'nBbsNo': bbs_id, + } + if partial_view: + query['partialView'] = 'SKIP_ADULT' + if adult_view: + query['adultView'] = 'ADULT_VIEW' + video_xml = self._download_xml( + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, 'Downloading video info XML%s' + % (' (skipping adult)' if partial_view else ''), + video_id, headers={ + 'Referer': url, + }, query=query) + + flag = xpath_text(video_xml, './track/flag', 'flag', default=None) + if flag and flag == 'SUCCEED': + break + if flag == 'PARTIAL_ADULT': + self.report_warning( + 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' + 'Only content suitable for all ages will be downloaded. ' + 'Provide account credentials if you wish to download restricted content.') + partial_view = True + continue + elif flag == 'ADULT': + if not adult_view: + adult_view = True + continue + error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' 
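+                # Note: the enclosing for-loop makes at most two requests; reaching
+                # this assignment means adultView was already sent on the retry and
+                # the flag is still ADULT, so the age-restriction message above is
+                # raised through the ExtractorError below.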
+ else: + error = flag + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + else: + raise ExtractorError('Unable to download video info') + + video_element = video_xml.findall(compat_xpath('./track/video'))[-1] + if video_element is None or video_element.text is None: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + video_url = video_element.text.strip() + + title = xpath_text(video_xml, './track/title', 'title', fatal=True) + + uploader = xpath_text(video_xml, './track/nickname', 'uploader') + uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') + duration = int_or_none(xpath_text( + video_xml, './track/duration', 'duration')) + thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') + + common_entry = { + 'uploader': uploader, + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + } + + info = common_entry.copy() + info.update({ + 'id': video_id, + 'title': title, + 'duration': duration, + }) + + if not video_url: + entries = [] + file_elements = video_element.findall(compat_xpath('./file')) + one = len(file_elements) == 1 + for file_num, file_element in enumerate(file_elements, start=1): + file_url = url_or_none(file_element.text) + if not file_url: + continue + key = file_element.get('key', '') + upload_date = unified_strdate(self._search_regex( + r'^(\d{8})_', key, 'upload date', default=None)) + if upload_date is not None: + # sometimes the upload date isn't included in the file name + # instead, another random ID is, which may parse as a valid + # date but be wildly out of a reasonable range + parsed_date = date_from_str(upload_date) + if parsed_date.year < 2000 or parsed_date.year >= 2100: + upload_date = None + file_duration = int_or_none(file_element.get('duration')) + format_id = key if key else '%s_%s' % (video_id, file_num) + if determine_ext(file_url) == 'm3u8': + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', + note='Downloading part %d m3u8 information' % file_num) + else: + formats = [{ + 'url': file_url, + 'format_id': 'http', + }] + if not formats and not self.get_param('ignore_no_formats'): + continue + self._sort_formats(formats) + file_info = common_entry.copy() + file_info.update({ + 'id': format_id, + 'title': title if one else '%s (part %d)' % (title, file_num), + 'upload_date': upload_date, + 'duration': file_duration, + 'formats': formats, + }) + entries.append(file_info) + entries_info = info.copy() + entries_info.update({ + '_type': 'multi_video', + 'entries': entries, + }) + return entries_info + + info = { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'thumbnail': thumbnail, + } + + if determine_ext(video_url) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + app, playpath = video_url.split('mp4:') + info.update({ + 'url': app, + 'ext': 'flv', + 'play_path': 'mp4:' + playpath, + 'rtmp_live': True, # downloading won't end without this + }) + + return info diff --git a/youtube_dl/extractor/airmozilla.py b/yt_dlp/extractor/airmozilla.py index 9e38136b4..9e38136b4 100644 --- a/youtube_dl/extractor/airmozilla.py +++ b/yt_dlp/extractor/airmozilla.py diff --git a/youtube_dl/extractor/aliexpress.py b/yt_dlp/extractor/aliexpress.py index 6f241e683..6f241e683 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/yt_dlp/extractor/aliexpress.py diff 
--git a/yt_dlp/extractor/aljazeera.py b/yt_dlp/extractor/aljazeera.py new file mode 100644 index 000000000..e829b45e4 --- /dev/null +++ b/yt_dlp/extractor/aljazeera.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor + + +class AlJazeeraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' + + _TESTS = [{ + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', + 'info_dict': { + 'id': '3792260579001', + 'ext': 'mp4', + 'title': 'The Slum - Episode 1: Deliverance', + 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', + 'uploader_id': '665003303001', + 'timestamp': 1411116829, + 'upload_date': '20140919', + }, + 'add_ie': ['BrightcoveNew'], + 'skip': 'Not accessible from Travis CI server', + }, { + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + + def _real_extract(self, url): + post_type, name = self._match_valid_url(url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'ArchipelagoSingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/allocine.py b/yt_dlp/extractor/allocine.py index cd533acfc..cd533acfc 100644 --- a/youtube_dl/extractor/allocine.py +++ b/yt_dlp/extractor/allocine.py diff --git a/youtube_dl/extractor/alphaporno.py b/yt_dlp/extractor/alphaporno.py index 3a6d99f6b..3a6d99f6b 100644 --- a/youtube_dl/extractor/alphaporno.py +++ b/yt_dlp/extractor/alphaporno.py diff --git a/yt_dlp/extractor/alura.py b/yt_dlp/extractor/alura.py new file mode 100644 index 000000000..f5325de2f --- /dev/null +++ b/yt_dlp/extractor/alura.py @@ -0,0 +1,179 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_urlparse, +) + +from ..utils import ( + urlencode_postdata, + urljoin, + int_or_none, + clean_html, + ExtractorError +) + + +class AluraIE(InfoExtractor): + _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)' + _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' + _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video' + _NETRC_MACHINE = 'alura' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095', + 'info_dict': { + 'id': '60095', + 'ext': 'mp4', + 'title': 'Referências, ref-set e alter' + }, + 'skip': 'Requires alura account credentials'}, + { + # URL without video + 'url': 
'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098',
+            'only_matching': True},
+        {
+            'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219',
+            'only_matching': True}
+    ]
+
+    def _real_extract(self, url):
+        course, video_id = self._match_valid_url(url).group('course_name', 'id')
+        video_url = self._VIDEO_URL % (course, video_id)
+
+        video_dict = self._download_json(video_url, video_id, 'Searching for videos')
+
+        if video_dict:
+            webpage = self._download_webpage(url, video_id)
+            video_title = clean_html(self._search_regex(
+                r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)',
+                webpage, 'title', group='title'))
+
+            formats = []
+            for video_obj in video_dict:
+                video_url_m3u8 = video_obj.get('link')
+                video_format = self._extract_m3u8_formats(
+                    video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False)
+                for f in video_format:
+                    m = re.search(r'^[\w \W]*-(?P<res>\w*)\.mp4[\W \w]*', f['url'])
+                    if m:
+                        if not f.get('height'):
+                            f['height'] = int('720' if m.group('res') == 'hd' else '480')
+                formats.extend(video_format)
+
+            self._sort_formats(formats)
+
+            return {
+                'id': video_id,
+                'title': video_title,
+                'formats': formats,
+            }
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login popup')
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'href=[\"|\']?/signout[\"|\']',
+                r'>Logout<'))
+
+        # already logged in
+        if is_logged(login_page):
+            return
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password,
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page,
+            'post url', default=self._LOGIN_URL, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in',
+            data=urlencode_postdata(login_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        if not is_logged(response):
+            error = self._html_search_regex(
+                r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class AluraCourseIE(AluraIE):
+    _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)'
+    _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
+    _NETRC_MACHINE = 'aluracourse'
+    _TESTS = [{
+        'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if AluraIE.suitable(url) else super(AluraCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_path = self._match_id(url)
+        webpage = self._download_webpage(url, course_path)
+
+        course_title = self._search_regex(
+            r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage,
+            'course title', default=course_path, group='course_title')
+
+        entries = []
+        if webpage:
+            for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage):
+                page_url = urljoin(url, path)
+                section_path =
self._download_webpage(page_url, course_path) + for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path): + chapter = clean_html( + self._search_regex( + r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)', + section_path, + 'chapter', + group='chapter')) + + chapter_number = int_or_none( + self._search_regex( + r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>', + section_path, + 'chapter number', + group='chapter_number')) + video_url = urljoin(url, path_video) + + entry = { + '_type': 'url_transparent', + 'id': self._match_id(video_url), + 'url': video_url, + 'id_key': self.ie_key(), + 'chapter': chapter, + 'chapter_number': chapter_number + } + entries.append(entry) + return self.playlist_result(entries, course_path, course_title) diff --git a/youtube_dl/extractor/amara.py b/yt_dlp/extractor/amara.py index 61d469574..61d469574 100644 --- a/youtube_dl/extractor/amara.py +++ b/yt_dlp/extractor/amara.py diff --git a/yt_dlp/extractor/amcnetworks.py b/yt_dlp/extractor/amcnetworks.py new file mode 100644 index 000000000..e38e215d3 --- /dev/null +++ b/yt_dlp/extractor/amcnetworks.py @@ -0,0 +1,150 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .theplatform import ThePlatformIE +from ..utils import ( + int_or_none, + parse_age_limit, + try_get, + update_url_query, +) + + +class AMCNetworksIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', + 'info_dict': { + 'id': '4Lq1dzOnZGt0', + 'ext': 'mp4', + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! 
All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, + 'uploader': 'AMCN', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', + 'only_matching': True, + }, { + 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', + 'only_matching': True, + }, { + 'url': 'http://www.ifc.com/movies/chaos', + 'only_matching': True, + }, { + 'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', + 'only_matching': True, + }, { + 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', + 'only_matching': True, + }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } + + def _real_extract(self, url): + site, display_id = self._match_valid_url(url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + page_data = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' + % (requestor_id.lower(), display_id), display_id)['data'] + properties = page_data.get('properties') or {} + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + + video_player_count = 0 + try: + for v in page_data['children']: + if v.get('type') == 'video-player': + releasePid = v['properties']['currentVideo']['meta']['releasePid'] + tp_path = 'M_UwQC/' + releasePid + media_url = 'https://link.theplatform.com/s/' + tp_path + video_player_count += 1 + except KeyError: + pass + if video_player_count > 1: + self.report_warning( + 'The JSON data has %d video players. Only one will be extracted' % video_player_count) + + # Fall back to videoPid if releasePid not found. + # TODO: Fall back to videoPid if releasePid manifest uses DRM. 
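+        # Both branches build a ThePlatform SMIL link of the form
+        #   https://link.theplatform.com/s/M_UwQC/<releasePid>        (video-player found)
+        #   https://link.theplatform.com/s/M_UwQC/media/<videoPid>    (fallback below)
+        # where 'M_UwQC' appears to be AMCN's fixed ThePlatform account path; the
+        # mbr/manifest (and, for TVE-gated videos, 'auth') parameters are appended
+        # to it via update_url_query further down.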
+ if not video_player_count: + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + video_id = theplatform_metadata['pid'] + title = theplatform_metadata['title'] + rating = try_get( + theplatform_metadata, lambda x: x['ratings'][0]['rating']) + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): + resource = self._get_mvpd_resource( + requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + media_url = update_url_query(media_url, query) + formats, subtitles = self._extract_theplatform_smil( + media_url, video_id) + self._sort_formats(formats) + + thumbnails = [] + thumbnail_urls = [properties.get('imageDesktop')] + if 'thumbnail' in info: + thumbnail_urls.append(info.pop('thumbnail')) + for thumbnail_url in thumbnail_urls: + if not thumbnail_url: + continue + mobj = re.search(r'(\d+)x(\d+)', thumbnail_url) + thumbnails.append({ + 'url': thumbnail_url, + 'width': int(mobj.group(1)) if mobj else None, + 'height': int(mobj.group(2)) if mobj else None, + }) + + info.update({ + 'age_limit': parse_age_limit(rating), + 'formats': formats, + 'id': video_id, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + }) + ns_keys = theplatform_metadata.get('$xmlns', {}).keys() + if ns_keys: + ns = list(ns_keys)[0] + episode = theplatform_metadata.get(ns + '$episodeTitle') or None + episode_number = int_or_none( + theplatform_metadata.get(ns + '$episode')) + season_number = int_or_none( + theplatform_metadata.get(ns + '$season')) + series = theplatform_metadata.get(ns + '$show') or None + info.update({ + 'episode': episode, + 'episode_number': episode_number, + 'season_number': season_number, + 'series': series, + }) + return info diff --git a/yt_dlp/extractor/americastestkitchen.py b/yt_dlp/extractor/americastestkitchen.py new file mode 100644 index 000000000..6e6099a03 --- /dev/null +++ b/yt_dlp/extractor/americastestkitchen.py @@ -0,0 +1,158 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + try_get, + unified_strdate, + unified_timestamp, +) + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', + 'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '5b400b9ee338f922cb06450c', + 'title': 'Japanese Suppers', + 'ext': 'mp4', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', + 'thumbnail': r're:^https?://', + 'timestamp': 1523318400, + 'upload_date': '20180410', + 'release_date': '20180410', + 'series': "America's Test Kitchen", + 'season_number': 18, + 'episode': 'Japanese Suppers', + 'episode_number': 15, + }, + 'params': { + 'skip_download': True, + }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 
'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, + }] + + def _real_extract(self, url): + resource_type, video_id = self._match_valid_url(url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' + + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} + + return { + '_type': 'url_transparent', + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], + 'ie_key': 'Zype', + 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), + 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), + } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season_1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season_12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + show_name, season_number = self._match_valid_url(url).groups() + season_number = int(season_number) + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + season = 'Season %d' % season_number + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'attributesToHighlight': '', + 'hitsPerPage': 1000, + }) + + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': 
unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } + + return self.playlist_result( + entries(), 'season_%d' % season_number, season) diff --git a/youtube_dl/extractor/amp.py b/yt_dlp/extractor/amp.py index 24c684cad..24c684cad 100644 --- a/youtube_dl/extractor/amp.py +++ b/yt_dlp/extractor/amp.py diff --git a/yt_dlp/extractor/animelab.py b/yt_dlp/extractor/animelab.py new file mode 100644 index 000000000..4fb7ee424 --- /dev/null +++ b/yt_dlp/extractor/animelab.py @@ -0,0 +1,285 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + urlencode_postdata, + int_or_none, + str_or_none, + determine_ext, +) + +from ..compat import compat_HTTPError + + +class AnimeLabBaseIE(InfoExtractor): + _LOGIN_REQUIRED = True + _LOGIN_URL = 'https://www.animelab.com/login' + _NETRC_MACHINE = 'animelab' + + def _login(self): + def is_logged_in(login_webpage): + return 'Sign In' not in login_webpage + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + # Check if already logged in + if is_logged_in(login_page): + return + + (username, password) = self._get_login_info() + if username is None and self._LOGIN_REQUIRED: + self.raise_login_required('Login is required to access any AnimeLab content') + + login_form = { + 'email': username, + 'password': password, + } + + try: + response = self._download_webpage( + self._LOGIN_URL, None, 'Logging in', 'Wrong login info', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) + else: + raise + + # if login was successful + if is_logged_in(response): + return + + raise ExtractorError('Unable to login (cannot verify if logged in)') + + def _real_initialize(self): + self._login() + + +class AnimeLabIE(AnimeLabBaseIE): + _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)' + + # the following tests require authentication, but a free account will suffice + # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file + # or you can set 'username' and 'password' there + # the tests also select a specific format so that the same video is downloaded + # regardless of whether the user is premium or not (needs testing on a premium account) + _TEST = { + 'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42', + 'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f', + 'info_dict': { + 'id': '383', + 'ext': 'mp4', + 'display_id': 'fullmetal-alchemist-brotherhood-episode-42', + 'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive', + 'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4', + 'series': 'Fullmetal Alchemist: Brotherhood', + 'episode': 'Signs of a Counteroffensive', + 'episode_number': 42, + 'duration': 1469, + 'season': 'Season 1', + 'season_number': 1, + 'season_id': '38', + }, + 'params': { + 'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]', + }, + 'skip': 'All AnimeLab content requires authentication', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + # unfortunately we can get different URLs for the same formats + # e.g. 
if we are using a "free" account so no dubs available + # (so _remove_duplicate_formats is not effective) + # so we use a dictionary as a workaround + formats = {} + for language_option_url in ('https://www.animelab.com/player/%s/subtitles', + 'https://www.animelab.com/player/%s/dubbed'): + actual_url = language_option_url % display_id + webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url) + + video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id) + position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position')) + + raw_data = video_collection[position]['videoEntry'] + + video_id = str_or_none(raw_data['id']) + + # create a title from many sources (while grabbing other info) + # TODO use more fallback sources to get some of these + series = raw_data.get('showTitle') + video_type = raw_data.get('videoEntryType', {}).get('name') + episode_number = raw_data.get('episodeNumber') + episode_name = raw_data.get('name') + + title_parts = (series, video_type, episode_number, episode_name) + if None not in title_parts: + title = '%s - %s %s - %s' % title_parts + else: + title = episode_name + + description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None) + + duration = int_or_none(raw_data.get('duration')) + + thumbnail_data = raw_data.get('images', []) + thumbnails = [] + for thumbnail in thumbnail_data: + for instance in thumbnail['imageInstances']: + image_data = instance.get('imageInfo', {}) + thumbnails.append({ + 'id': str_or_none(image_data.get('id')), + 'url': image_data.get('fullPath'), + 'width': image_data.get('width'), + 'height': image_data.get('height'), + }) + + season_data = raw_data.get('season', {}) or {} + season = str_or_none(season_data.get('name')) + season_number = int_or_none(season_data.get('seasonNumber')) + season_id = str_or_none(season_data.get('id')) + + for video_data in raw_data['videoList']: + current_video_list = {} + current_video_list['language'] = video_data.get('language', {}).get('languageCode') + + is_hardsubbed = video_data.get('hardSubbed') + + for video_instance in video_data['videoInstances']: + httpurl = video_instance.get('httpUrl') + url = httpurl if httpurl else video_instance.get('rtmpUrl') + if url is None: + # this video format is unavailable to the user (not premium etc.) 
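+                    # (an instance that carries neither httpUrl nor rtmpUrl is presumably
+                    # a placeholder for a quality gated to another account tier, so it is
+                    # skipped rather than emitted as a URL-less format)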
+ continue + + current_format = current_video_list.copy() + + format_id_parts = [] + + format_id_parts.append(str_or_none(video_instance.get('id'))) + + if is_hardsubbed is not None: + if is_hardsubbed: + format_id_parts.append('yeshardsubbed') + else: + format_id_parts.append('nothardsubbed') + + format_id_parts.append(current_format['language']) + + format_id = '_'.join([x for x in format_id_parts if x is not None]) + + ext = determine_ext(url) + if ext == 'm3u8': + for format_ in self._extract_m3u8_formats( + url, video_id, m3u8_id=format_id, fatal=False): + formats[format_['format_id']] = format_ + continue + elif ext == 'mpd': + for format_ in self._extract_mpd_formats( + url, video_id, mpd_id=format_id, fatal=False): + formats[format_['format_id']] = format_ + continue + + current_format['url'] = url + quality_data = video_instance.get('videoQuality') + if quality_data: + quality = quality_data.get('name') or quality_data.get('description') + else: + quality = None + + height = None + if quality: + height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None)) + + if height is None: + self.report_warning('Could not get height of video') + else: + current_format['height'] = height + current_format['format_id'] = format_id + + formats[current_format['format_id']] = current_format + + formats = list(formats.values()) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'series': series, + 'episode': episode_name, + 'episode_number': int_or_none(episode_number), + 'thumbnails': thumbnails, + 'duration': duration, + 'formats': formats, + 'season': season, + 'season_number': season_number, + 'season_id': season_id, + } + + +class AnimeLabShowsIE(AnimeLabBaseIE): + _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://www.animelab.com/shows/attack-on-titan', + 'info_dict': { + 'id': '45', + 'title': 'Attack on Titan', + 'description': 'md5:989d95a2677e9309368d5cf39ba91469', + }, + 'playlist_count': 59, + 'skip': 'All AnimeLab content requires authentication', + } + + def _real_extract(self, url): + _BASE_URL = 'http://www.animelab.com' + _SHOWS_API_URL = '/api/videoentries/show/videos/' + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id, 'Downloading requested URL') + + show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data') + show_data = self._parse_json(show_data_str, display_id) + + show_id = str_or_none(show_data.get('id')) + title = show_data.get('name') + description = show_data.get('shortSynopsis') or show_data.get('longSynopsis') + + entries = [] + for season in show_data['seasons']: + season_id = season['id'] + get_data = urlencode_postdata({ + 'seasonId': season_id, + 'limit': 1000, + }) + # despite using urlencode_postdata, we are sending a GET request + target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" 
+ get_data.decode('utf-8') + response = self._download_webpage( + target_url, + None, 'Season id %s' % season_id) + + season_data = self._parse_json(response, display_id) + + for video_data in season_data['list']: + entries.append(self.url_result( + _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab', + str_or_none(video_data.get('id')), video_data.get('name') + )) + + return { + '_type': 'playlist', + 'id': show_id, + 'title': title, + 'description': description, + 'entries': entries, + } + +# TODO implement myqueue diff --git a/youtube_dl/extractor/animeondemand.py b/yt_dlp/extractor/animeondemand.py index 54e097d2f..54e097d2f 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/yt_dlp/extractor/animeondemand.py diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py new file mode 100644 index 000000000..d688e2c5b --- /dev/null +++ b/yt_dlp/extractor/anvato.py @@ -0,0 +1,398 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import hashlib +import json +import random +import re +import time + +from .common import InfoExtractor +from ..aes import aes_encrypt +from ..compat import compat_str +from ..utils import ( + bytes_to_intlist, + determine_ext, + intlist_to_bytes, + int_or_none, + strip_jsonp, + unescapeHTML, + unsmuggle_url, +) + +# This import causes a ModuleNotFoundError on some systems for unknown reason. +# See issues: +# https://github.com/yt-dlp/yt-dlp/issues/35 +# https://github.com/ytdl-org/youtube-dl/issues/27449 +# https://github.com/animelover1984/youtube-dl/issues/17 +try: + from .anvato_token_generator import NFLTokenGenerator +except ImportError: + NFLTokenGenerator = None + + +def md5_text(s): + if not isinstance(s, compat_str): + s = compat_str(s) + return hashlib.md5(s.encode('utf-8')).hexdigest() + + +class AnvatoIE(InfoExtractor): + _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + + # Copied from anvplayer.min.js + _ANVACK_TABLE = { + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA', + 'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP', + 'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv', + 'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7', + 'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR', + 'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg', + 'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto', + 'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY', + 'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh', + 'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK', + 'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D', + 'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad', + 
'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp', + 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih', + 'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR', + 'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW', + 'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su', + 'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q', + 'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5', + 'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3', + 'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI', + 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s', + 'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz', + 'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg', + 'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x', + 'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH', + 'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX', + 'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc', + 'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK', + 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7', + 'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C', + 'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e', + 'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1', + 'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re', + 'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51', + 'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho', + 'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9', + 'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH', + 'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F', + 'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo', + 'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR', + 'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa', + 
'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk', + 'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ', + 'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ', + 'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m', + 'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b', + 'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3', + 'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK', + 'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', + 'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', + 'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F', + 'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx', + 'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ', + 'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH', + 'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm', + 'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt', + 'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl', + 'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b', + 'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV', + 'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg', + 'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk', + 'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT', + 'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa', + 'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv', + 'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k', + 'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI', + 'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr', + 'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw', + 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K', + 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH', + 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK', + 
'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu', + 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', + 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', + 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK', + 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n', + 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD', + 'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk', + 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', + 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', + 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z', + 'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B', + 'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj', + 'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l', + '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P', + 'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A', + 'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V', + 'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z', + 'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9', + 'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e', + 'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D', + 'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d', + 'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ', + 'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V', + 'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe', + 'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP', + '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV', + 'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v', + 'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q', + 'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV', + 'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r', + 'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR', + 'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0', + 'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl', + 'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923', + '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P', + '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa', + '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V', + 'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5', + 'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ', + 'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye', + 'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 
'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o', + 'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e', + 'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z', + 'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R', + '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29', + 'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q', + 'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp', + 'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze', + '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ', + '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa', + '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ', + 'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL', + 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo', + 'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV', + '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa', + 'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y', + '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P', + 'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO', + 'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr', + '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy', + 'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn', + '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj', + 'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29', + 'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V', + 'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5', + 'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy', + 'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e', + '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y', + 'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0', + 'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy', + 'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV', + 'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K', + 'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23', + 'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR', + 'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R', + 'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ', + 'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L', + 'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR', + } + + _MCP_TO_ACCESS_KEY_TABLE = { + 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', + 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', + 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', + 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', + 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', + 'hearst': 
'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', + 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', + 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' + } + + _TOKEN_GENERATORS = { + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + } + + _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' + + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' + + _TESTS = [{ + # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 + 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', + 'info_dict': { + 'id': '4465496', + 'ext': 'mp4', + 'title': 'VIDEO: Humpback whale breaches right next to NH boat', + 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', + 'duration': 22, + 'timestamp': 1534855680, + 'upload_date': '20180821', + 'uploader': 'ANV', + }, + 'params': { + 'skip_download': True, + }, + }, { + # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ + 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', + 'only_matching': True, + }] + + def __init__(self, *args, **kwargs): + super(AnvatoIE, self).__init__(*args, **kwargs) + self.__server_time = None + + def _server_time(self, access_key, video_id): + if self.__server_time is not None: + return self.__server_time + + self.__server_time = int(self._download_json( + self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, + note='Fetching server time')['server_time']) + + return self.__server_time + + def _api_prefix(self, access_key): + return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') + + def _get_video_json(self, access_key, video_id): + # See et() in anvplayer.min.js, which is an alias of getVideoJSON() + video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + server_time = self._server_time(access_key, video_id) + input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + + auth_secret = intlist_to_bytes(aes_encrypt( + bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) + + video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + anvrid = md5_text(time.time() * 1000 * random.random())[:30] + api = { + 'anvrid': anvrid, + 'anvts': server_time, + } + if self._TOKEN_GENERATORS.get(access_key) is not None: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + else: + api['anvstk'] = md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))) + + return self._download_json( + video_data_url, video_id, transform_source=strip_jsonp, + data=json.dumps({'api': api}).encode('utf-8')) + + def _get_anvato_videos(self, access_key, video_id): + video_data = self._get_video_json(access_key, video_id) + + formats = [] + for published_url in video_data['published_urls']: + video_url = published_url['embed_url'] + media_format = published_url.get('format') + ext = determine_ext(video_url) + + if ext == 'smil' or media_format == 'smil': + formats.extend(self._extract_smil_formats(video_url, video_id)) + continue + + tbr = int_or_none(published_url.get('kbps')) + a_format = { + 'url': video_url, + 'format_id': 
('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), + 'tbr': tbr if tbr != 0 else None, + } + + if media_format == 'm3u8' and tbr is not None: + a_format.update({ + 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'ext': 'mp4', + }) + elif media_format == 'm3u8-variant' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + elif ext == 'mp3' or media_format == 'mp3': + a_format['vcodec'] = 'none' + else: + a_format.update({ + 'width': int_or_none(published_url.get('width')), + 'height': int_or_none(published_url.get('height')), + }) + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + a_caption = { + 'url': caption['url'], + 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None + } + subtitles.setdefault(caption['language'], []).append(a_caption) + + return { + 'id': video_id, + 'formats': formats, + 'title': video_data.get('def_title'), + 'description': video_data.get('def_description'), + 'tags': video_data.get('def_tags', '').split(','), + 'categories': video_data.get('categories'), + 'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'), + 'timestamp': int_or_none(video_data.get( + 'ts_published') or video_data.get('ts_added')), + 'uploader': video_data.get('mcp_id'), + 'duration': int_or_none(video_data.get('duration')), + 'subtitles': subtitles, + } + + @staticmethod + def _extract_urls(ie, webpage, video_id): + entries = [] + for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): + anvplayer_data = ie._parse_json( + mobj.group('anvp'), video_id, transform_source=unescapeHTML, + fatal=False) + if not anvplayer_data: + continue + video = anvplayer_data.get('video') + if not isinstance(video, compat_str) or not video.isdigit(): + continue + access_key = anvplayer_data.get('accessKey') + if not access_key: + mcp = anvplayer_data.get('mcp') + if mcp: + access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( + mcp.lower()) + if not access_key: + continue + entries.append(ie.url_result( + 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), + video_id=video)) + return entries + + def _extract_anvato_videos(self, webpage, video_id): + anvplayer_data = self._parse_json( + self._html_search_regex( + self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), + video_id) + return self._get_anvato_videos( + anvplayer_data['accessKey'], anvplayer_data['video']) + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + + mobj = self._match_valid_url(url) + access_key, video_id = mobj.group('access_key_or_mcp', 'id') + if access_key not in self._ANVACK_TABLE: + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( + access_key) or access_key + return self._get_anvato_videos(access_key, video_id) diff --git a/yt_dlp/extractor/anvato_token_generator/__init__.py b/yt_dlp/extractor/anvato_token_generator/__init__.py new file mode 100644 index 000000000..6e223db9f --- /dev/null +++ b/yt_dlp/extractor/anvato_token_generator/__init__.py @@ -0,0 +1,7 @@ +from __future__ import unicode_literals + +from .nfl import NFLTokenGenerator + +__all__ = [ + 'NFLTokenGenerator', +] diff --git a/yt_dlp/extractor/anvato_token_generator/common.py b/yt_dlp/extractor/anvato_token_generator/common.py new file mode 100644 index 000000000..b959a903b --- /dev/null +++ 
b/yt_dlp/extractor/anvato_token_generator/common.py @@ -0,0 +1,6 @@ +from __future__ import unicode_literals + + +class TokenGenerator: + def generate(self, anvack, mcp_id): + raise NotImplementedError('This method must be implemented by subclasses') diff --git a/yt_dlp/extractor/anvato_token_generator/nfl.py b/yt_dlp/extractor/anvato_token_generator/nfl.py new file mode 100644 index 000000000..97a2b245f --- /dev/null +++ b/yt_dlp/extractor/anvato_token_generator/nfl.py @@ -0,0 +1,30 @@ +from __future__ import unicode_literals + +import json + +from .common import TokenGenerator + + +class NFLTokenGenerator(TokenGenerator): + _AUTHORIZATION = None + + def generate(ie, anvack, mcp_id): + if not NFLTokenGenerator._AUTHORIZATION: + reroute = ie._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, + data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}) + NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) + return ie._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token + } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': NFLTokenGenerator._AUTHORIZATION, + 'Content-Type': 'application/json', + })['data']['viewer']['mediaToken']['token'] diff --git a/yt_dlp/extractor/aol.py b/yt_dlp/extractor/aol.py new file mode 100644 index 000000000..4766a2c77 --- /dev/null +++ b/yt_dlp/extractor/aol.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .yahoo import YahooIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + url_or_none, +) + + +class AolIE(YahooIE): + IE_NAME = 'aol.com' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' + + _TESTS = [{ + # video with 5min ID + 'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/', + 'md5': '18ef68f48740e86ae94b98da815eec42', + 'info_dict': { + 'id': '518167793', + 'ext': 'mp4', + 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', + 'timestamp': 1395405060, + 'upload_date': '20140321', + 'uploader': 'Newsy Studio', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # video with vidible ID + 'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', + 'info_dict': { + 'id': '5707d6b8e4b090497b04f706', + 'ext': 'mp4', + 'title': 'Netflix is Raising Rates', + 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. 
Veuer’s Carly Figueroa has more.', + 'upload_date': '20160408', + 'timestamp': 1460123280, + 'uploader': 'Veuer', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/', + 'only_matching': True, + }, { + 'url': 'aol-video:5707d6b8e4b090497b04f706', + 'only_matching': True, + }, { + 'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', + 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') + + response = self._download_json( + 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, + video_id)['response'] + if response['statusText'] != 'Ok': + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) + + video_data = response['data'] + formats = [] + m3u8_url = url_or_none(video_data.get('videoMasterPlaylist')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for rendition in video_data.get('renditions', []): + video_url = url_or_none(rendition.get('url')) + if not video_url: + continue + ext = rendition.get('format') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + f = { + 'url': video_url, + 'format_id': rendition.get('quality'), + } + mobj = re.search(r'(\d+)x(\d+)', video_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + else: + qs = parse_qs(video_url) + f.update({ + 'width': int_or_none(qs.get('w', [None])[0]), + 'height': int_or_none(qs.get('h', [None])[0]), + }) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('publishDate')), + 'view_count': int_or_none(video_data.get('views')), + 'description': video_data.get('description'), + 'uploader': video_data.get('videoOwner'), + 'formats': formats, + } diff --git a/yt_dlp/extractor/apa.py b/yt_dlp/extractor/apa.py new file mode 100644 index 000000000..1736cdf56 --- /dev/null +++ b/yt_dlp/extractor/apa.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + url_or_none, +) + + +class APAIE(InfoExtractor): + _VALID_URL = 
r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'ext': 'mp4', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, base_url = mobj.group('id', 'base_url') + + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') + + formats = [] + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) + if not source_url: + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) + formats.append({ + 'url': source_url, + 'format_id': format_id, + 'height': height, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/yt_dlp/extractor/aparat.py b/yt_dlp/extractor/aparat.py new file mode 100644 index 000000000..da06a3cac --- /dev/null +++ b/yt_dlp/extractor/aparat.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + get_element_by_id, + int_or_none, + merge_dicts, + mimetype2ext, + url_or_none, +) + + +class AparatIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://www.aparat.com/v/wP8On', + 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', + 'info_dict': { + 'id': 'wP8On', + 'ext': 'mp4', + 'title': 'تیم گلکسی 11 - زومیت', + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, + }, + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Provides more 
metadata + webpage = self._download_webpage(url, video_id, fatal=False) + + if not webpage: + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) + + options = self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) + + formats = [] + for sources in (options.get('multiSRC') or []): + for item in sources: + if not isinstance(item, dict): + continue + file_url = url_or_none(item.get('src')) + if not file_url: + continue + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats(formats) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = get_element_by_id('videoTitle', webpage) or \ + self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) + + return merge_dicts(info, { + 'id': video_id, + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(options.get('duration')), + 'formats': formats, + }) diff --git a/youtube_dl/extractor/appleconnect.py b/yt_dlp/extractor/appleconnect.py index 494f8330c..494f8330c 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/yt_dlp/extractor/appleconnect.py diff --git a/youtube_dl/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py index 6a74de758..6a74de758 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py diff --git a/yt_dlp/extractor/appletrailers.py b/yt_dlp/extractor/appletrailers.py new file mode 100644 index 000000000..8140e332b --- /dev/null +++ b/yt_dlp/extractor/appletrailers.py @@ -0,0 +1,283 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class AppleTrailersIE(InfoExtractor): + IE_NAME = 'appletrailers' + _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TESTS = [{ + 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', + 'info_dict': { + 'id': '5111', + 'title': 'Man of Steel', + }, + 'playlist': [ + { + 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8', + 'info_dict': { + 'id': 'manofsteel-trailer4', + 'ext': 'mov', + 'duration': 111, + 'title': 'Trailer 4', + 'upload_date': '20130523', + 'uploader_id': 'wb', + }, + }, + { + 'md5': 'b8017b7131b721fb4e8d6f49e1df908c', + 'info_dict': { + 'id': 'manofsteel-trailer3', + 'ext': 'mov', + 'duration': 182, + 'title': 'Trailer 3', + 'upload_date': '20130417', + 'uploader_id': 'wb', + }, + }, + { + 'md5': 'd0f1e1150989b9924679b441f3404d48', + 'info_dict': { + 'id': 'manofsteel-trailer', + 'ext': 'mov', + 'duration': 148, + 'title': 'Trailer', + 'upload_date': '20121212', + 'uploader_id': 'wb', + }, + }, + { + 'md5': '5fe08795b943eb2e757fa95cb6def1cb', + 'info_dict': { + 'id': 'manofsteel-teaser', + 'ext': 'mov', + 'duration': 93, + 'title': 'Teaser', + 'upload_date': '20120721', + 'uploader_id': 'wb', + }, + }, + ] + }, 
{ + 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', + 'info_dict': { + 'id': '4489', + 'title': 'Blackthorn', + }, + 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json + 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', + 'info_dict': { + 'id': '15881', + 'title': 'Kung Fu Panda 3', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }, { + 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/', + 'only_matching': True, + }] + + _JSON_RE = r'iTunes.playURL\((.*?)\);' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + webpage = self._download_webpage(url, movie) + film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') + film_data = self._download_json( + 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, + film_id, fatal=False) + + if film_data: + entries = [] + for clip in film_data.get('clips', []): + clip_title = clip['title'] + + formats = [] + for version, version_data in clip.get('versions', {}).items(): + for size, size_data in version_data.get('sizes', {}).items(): + src = size_data.get('src') + if not src: + continue + formats.append({ + 'format_id': '%s-%s' % (version, size), + 'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src), + 'width': int_or_none(size_data.get('width')), + 'height': int_or_none(size_data.get('height')), + 'language': version[:2], + }) + self._sort_formats(formats) + + entries.append({ + 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), + 'formats': formats, + 'title': clip_title, + 'thumbnail': clip.get('screen') or clip.get('thumb'), + 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), + 'upload_date': unified_strdate(clip.get('posted')), + 'uploader_id': uploader_id, + }) + + page_data = film_data.get('page', {}) + return self.playlist_result(entries, film_id, page_data.get('movie_title')) + + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') + + def fix_html(s): + s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) + s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s) + # The ' in the onClick attributes are not escaped, so the JSON could not be + # parsed on pages like: http://trailers.apple.com/trailers/wb/gravity/ + + def _clean_json(m): + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') + s = re.sub(self._JSON_RE, _clean_json, s) + s = '<html>%s</html>' % s + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) + + playlist = [] + for li in doc.findall('./div/ul/li'): + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, 'trailer info') + trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue + title = trailer_info['title'] + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') + + runtime = trailer_info['runtime'] + m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + 
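# Illustrative sketch of the derivation above (the URL is hypothetical, not taken + # from a real feed): a first_url such as + # 'http://trailers.apple.com/movies/wb/manofsteel/manofsteel-trailer4_h480p.mov' + # gives split('/')[-1] -> 'manofsteel-trailer4_h480p.mov', and rpartition('_')[0] + # then yields trailer_id 'manofsteel-trailer4', which names the per-clip settings JSON below. + 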
settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') + + formats = [] + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'format': format['type'], + 'width': int_or_none(format['width']), + 'height': int_or_none(format['height']), + }) + + self._sort_formats(formats) + + playlist.append({ + '_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'http_headers': { + 'User-Agent': 'QuickTime compatible (yt-dlp)', + }, + }) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } + + +class AppleTrailersSectionIE(InfoExtractor): + IE_NAME = 'appletrailers:section' + _SECTIONS = { + 'justadded': { + 'feed_path': 'just_added', + 'title': 'Just Added', + }, + 'exclusive': { + 'feed_path': 'exclusive', + 'title': 'Exclusive', + }, + 'justhd': { + 'feed_path': 'just_hd', + 'title': 'Just HD', + }, + 'mostpopular': { + 'feed_path': 'most_pop', + 'title': 'Most Popular', + }, + 'moviestudios': { + 'feed_path': 'studios', + 'title': 'Movie Studios', + }, + } + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) + _TESTS = [{ + 'url': 'http://trailers.apple.com/#section=justadded', + 'info_dict': { + 'title': 'Just Added', + 'id': 'justadded', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=exclusive', + 'info_dict': { + 'title': 'Exclusive', + 'id': 'exclusive', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=justhd', + 'info_dict': { + 'title': 'Just HD', + 'id': 'justhd', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=mostpopular', + 'info_dict': { + 'title': 'Most Popular', + 'id': 'mostpopular', + }, + 'playlist_mincount': 30, + }, { + 'url': 'http://trailers.apple.com/#section=moviestudios', + 'info_dict': { + 'title': 'Movie Studios', + 'id': 'moviestudios', + }, + 'playlist_mincount': 80, + }] + + def _real_extract(self, url): + section = self._match_id(url) + section_data = self._download_json( + 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], + section) + entries = [ + self.url_result('http://trailers.apple.com' + e['location']) + for e in section_data] + return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py new file mode 100644 index 000000000..d90fcb13a --- /dev/null +++ b/yt_dlp/extractor/archiveorg.py @@ -0,0 +1,418 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, + compat_HTTPError +) +from ..utils import ( + clean_html, + determine_ext, + dict_get, + extract_attributes, + ExtractorError, + HEADRequest, + int_or_none, + KNOWN_EXTENSIONS, + merge_dicts, + mimetype2ext, + parse_duration, + parse_qs, + RegexNotFoundError, + str_to_int, + str_or_none, + try_get, + unified_strdate, + unified_timestamp, +) + + +class ArchiveOrgIE(InfoExtractor): + IE_NAME = 
'archive.org' + IE_DESC = 'archive.org video and audio' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$' + _TESTS = [{ + 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'md5': '8af1d4cf447933ed3c7f4871162602db', + 'info_dict': { + 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'ext': 'ogv', + 'title': '1968 Demo - FJCC Conference Presentation Reel #1', + 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', + 'release_date': '19681210', + 'timestamp': 1268695290, + 'upload_date': '20100315', + 'creator': 'SRI International', + 'uploader': 'laura@archive.org', + }, + }, { + 'url': 'https://archive.org/details/Cops1922', + 'md5': '0869000b4ce265e8ca62738b336b268a', + 'info_dict': { + 'id': 'Cops1922', + 'ext': 'mp4', + 'title': 'Buster Keaton\'s "Cops" (1922)', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'uploader': 'yorkmba99@hotmail.com', + 'timestamp': 1387699629, + 'upload_date': "20131222", + }, + }, { + 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'only_matching': True, + }, { + 'url': 'https://archive.org/details/Election_Ads', + 'md5': '284180e857160cf866358700bab668a3', + 'info_dict': { + 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg', + 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', + 'ext': 'mp4', + }, + }, { + 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', + 'md5': '7915213ef02559b5501fe630e1a53f59', + 'info_dict': { + 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', + 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', + 'ext': 'mp4', + 'timestamp': 1205588045, + 'uploader': 'mikedavisstripmaster@yahoo.com', + 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon', + 'upload_date': '20080315', + }, + }, { + 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16', + 'md5': '7d07ffb42aba6537c28e053efa4b54c9', + 'info_dict': { + 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac', + 'title': 'Turning', + 'ext': 'flac', + }, + }, { + 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac', + 'md5': 'a07cd8c6ab4ee1560f8a0021717130f3', + 'info_dict': { + 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac', + 'title': 'Deal', + 'ext': 'flac', + 'timestamp': 1205895624, + 'uploader': 'mvernon54@yahoo.com', + 'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0', + 'upload_date': '20080319', + 'location': 'Barton Hall - Cornell University', + }, + }, { + 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik', + 'md5': '7cb019baa9b332e82ea7c10403acd180', + 'info_dict': { + 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3', + 'title': 'Bells Of Rostov', + 'ext': 'mp3', + }, + }, { + 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3', + 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3', + 'info_dict': { + 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. 
Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3', + 'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)', + 'ext': 'mp3', + 'timestamp': 1569662587, + 'uploader': 'associate-joygen-odiongan@archive.org', + 'description': 'md5:012b2d668ae753be36896f343d12a236', + 'upload_date': '20190928', + }, + }] + + @staticmethod + def _playlist_data(webpage): + element = re.findall(r'''(?xs) + <input + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s+class=['"]?js-play8-playlist['"]? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s*/> + ''', webpage)[0] + + return json.loads(extract_attributes(element)['value']) + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote_plus(self._match_id(url)) + identifier, entry_id = (video_id.split('/', 1) + [None])[:2] + + # Archive.org metadata API doesn't clearly demarcate playlist entries + # or subtitle tracks, so we get them from the embeddable player. + embed_page = self._download_webpage( + 'https://archive.org/embed/' + identifier, identifier) + playlist = self._playlist_data(embed_page) + + entries = {} + for p in playlist: + # If the user specified a playlist entry in the URL, ignore the + # rest of the playlist. + if entry_id and p['orig'] != entry_id: + continue + + entries[p['orig']] = { + 'formats': [], + 'thumbnails': [], + 'artist': p.get('artist'), + 'track': p.get('title'), + 'subtitles': {}} + + for track in p.get('tracks', []): + if track['kind'] != 'subtitles': + continue + + # Store subtitle tracks under the 'subtitles' key, in the + # {label: [{'url': ...}]} layout the rest of the code expects. + entries[p['orig']]['subtitles'].setdefault(track['label'], []).append({ + 'url': 'https://archive.org/' + track['file'].lstrip('/')}) + + metadata = self._download_json( + 'http://archive.org/metadata/' + identifier, identifier) + m = metadata['metadata'] + identifier = m['identifier'] + + info = { + 'id': identifier, + 'title': m['title'], + 'description': clean_html(m.get('description')), + 'uploader': dict_get(m, ['uploader', 'adder']), + 'creator': m.get('creator'), + 'license': m.get('licenseurl'), + 'release_date': unified_strdate(m.get('date')), + 'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])), + 'webpage_url': 'https://archive.org/details/' + identifier, + 'location': m.get('venue'), + 'release_year': int_or_none(m.get('year'))} + + for f in metadata['files']: + if f['name'] in entries: + entries[f['name']] = merge_dicts(entries[f['name']], { + 'id': identifier + '/' + f['name'], + 'title': f.get('title') or f['name'], + 'display_id': f['name'], + 'description': clean_html(f.get('description')), + 'creator': f.get('creator'), + 'duration': parse_duration(f.get('length')), + 'track_number': int_or_none(f.get('track')), + 'album': f.get('album'), + 'discnumber': int_or_none(f.get('disc')), + 'release_year': int_or_none(f.get('year'))}) + entry = entries[f['name']] + elif f.get('original') in entries: + entry = entries[f['original']] + else: + continue + + if f.get('format') == 'Thumbnail': + entry['thumbnails'].append({ + 'id': f['name'], + 'url': 'https://archive.org/download/' + identifier + '/' + f['name'], + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'filesize': int_or_none(f.get('size'))}) + + extension = (f['name'].rsplit('.', 1) + [None])[1] + if extension in KNOWN_EXTENSIONS: + entry['formats'].append({ + 'url': 'https://archive.org/download/' + identifier + '/' + f['name'], + 'format': f.get('format'), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'filesize': int_or_none(f.get('size')), + 'protocol': 'https'})
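+ # A (hypothetical) metadata['files'] entry handled by the loop above might look + # like {'name': 'clip.mp4', 'format': 'h.264', 'width': '640', 'height': '480', + # 'size': '7710341'}; the archive.org metadata API tends to return numeric + # fields as strings, which is why int_or_none() is used when reading them.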
+ + # Sort available formats by filesize + for entry in entries.values(): + entry['formats'] = sorted(entry['formats'], key=lambda x: x.get('filesize', -1)) + + if len(entries) == 1: + # If there's only one item, use it as the main info dict + only_video = entries[list(entries.keys())[0]] + if entry_id: + info = merge_dicts(only_video, info) + else: + info = merge_dicts(info, only_video) + else: + # Otherwise, we have a playlist. + info['_type'] = 'playlist' + info['entries'] = list(entries.values()) + + if metadata.get('reviews'): + info['comments'] = [] + for review in metadata['reviews']: + info['comments'].append({ + 'id': review.get('review_id'), + 'author': review.get('reviewer'), + 'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), + 'timestamp': unified_timestamp(review.get('createdate')), + 'parent': 'root'}) + + return info + + +class YoutubeWebArchiveIE(InfoExtractor): + IE_NAME = 'web.archive:youtube' + IE_DESC = 'web.archive.org saved youtube videos' + _VALID_URL = r"""(?x)^ + (?:https?://)?web\.archive\.org/ + (?:web/)? + (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional + + (?:https?(?::|%3[Aa])//)? + (?: + (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL + |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url + ) + (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$) + """ + + _TESTS = [ + { + 'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs', + 'info_dict': { + 'id': 'aYAGB11YrSs', + 'ext': 'webm', + 'title': 'Team Fortress 2 - Sandviches!' + } + }, + { + # Internal link + 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0', + 'info_dict': { + 'id': '97t7Xj_iBv0', + 'ext': 'mp4', + 'title': 'How Flexible Machines Could Save The World' + } + }, + { + # Video from 2012, webm format itag 45. + 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en', + 'info_dict': { + 'id': 'AkhihxRKcrs', + 'ext': 'webm', + 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)' + } + }, + { + # Old flash-only video. Webpage title starts with "YouTube - ". + 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw', + 'info_dict': { + 'id': 'jNQXAC9IVRw', + 'ext': 'unknown_video', + 'title': 'Me at the zoo' + } + }, + { + # Flash video with .flv extension (itag 34). Title has prefix "YouTube -" + # Title has some weird unicode characters too. + 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', + 'info_dict': { + 'id': 'lTx3G6h2xyA', + 'ext': 'flv', + 'title': 'Madeon - Pop Culture (live mashup)' + } + }, + { # Some versions of YouTube have "YouTube" as the page title in html (later rewritten by js). + 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + 'info_dict': { + 'id': 'kH-G_aIBlFw', + 'ext': 'mp4', + 'title': 'kH-G_aIBlFw' + }, + 'expected_warnings': [ + 'unable to extract title', + ] + }, + { + # First capture is a 302 redirect intermediary page. 
+ 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M', + 'info_dict': { + 'id': '0altSZ96U4M', + 'ext': 'mp4', + 'title': '0altSZ96U4M' + }, + 'expected_warnings': [ + 'unable to extract title', + ] + }, + { + # Video not archived, only capture is unavailable video page + 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10', + 'only_matching': True, + }, + { # Encoded url + 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den', + 'only_matching': True, + }, + { + 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + title = video_id # fallback in case we are not able to get a title + + def _extract_title(webpage): + page_title = self._html_search_regex( + r'<title>([^<]*)</title>', webpage, 'title', fatal=False) or '' + # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. + try: + page_title = self._html_search_regex( + r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', + page_title, 'title', default='') + except RegexNotFoundError: + page_title = None + + if not page_title: + self.report_warning('unable to extract title', video_id=video_id) + return + return page_title + + # If the video is no longer available, the oldest capture may be from before it was removed. + # Setting the capture date in the url to an early date seems to redirect to the earliest capture. + webpage = self._download_webpage( + 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id, + video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).') + if webpage: + title = _extract_title(webpage) or title + + # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655 + internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id + try: + video_file_webpage = self._request_webpage( + HEADRequest(internal_fake_url), video_id, + note='Fetching video file url', expected_status=True) + except ExtractorError as e: + # HTTP Error 404 is expected if the video is not saved. + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + raise ExtractorError( + 'HTTP Error %s. Most likely the video is not archived, or there is an issue with web.archive.org.' 
% e.cause.code, + expected=True) + raise + video_file_url = compat_urllib_parse_unquote(video_file_webpage.url) + video_file_url_qs = parse_qs(video_file_url) + + # Attempt to recover any ext & format info from playback url + format = {'url': video_file_url} + itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) + if itag and itag in YoutubeIE._formats: # Naughty access but it works + format.update(YoutubeIE._formats[itag]) + format.update({'format_id': itag}) + else: + mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) + ext = mimetype2ext(mime) or determine_ext(video_file_url) + format.update({'ext': ext}) + return { + 'id': video_id, + 'title': title, + 'formats': [format], + 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + } diff --git a/yt_dlp/extractor/arcpublishing.py b/yt_dlp/extractor/arcpublishing.py new file mode 100644 index 000000000..5a9b8181a --- /dev/null +++ b/yt_dlp/extractor/arcpublishing.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # 
https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 'video-api-cdn.%s.arcpublishing.com/api'), + ] + + @staticmethod + def _extract_urls(webpage): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = self._match_valid_url(url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'quality': -10, + }) + self._sort_formats(formats) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: 
x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py new file mode 100644 index 000000000..048d30f27 --- /dev/null +++ b/yt_dlp/extractor/ard.py @@ -0,0 +1,609 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from .generic import GenericIE +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_duration, + qualities, + str_or_none, + try_get, + unified_strdate, + unified_timestamp, + update_url_query, + url_or_none, + xpath_text, +) +from ..compat import compat_etree_fromstring + + +class ARDMediathekBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) + + def _parse_media_info(self, media_info, video_id, fsk): + formats = self._extract_formats(media_info, video_id) + + if not formats: + if fsk: + self.raise_no_formats( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + self.raise_geo_restricted( + 'This video is not available due to geoblocking', + countries=self._GEO_COUNTRIES, metadata_available=True) + + self._sort_formats(formats) + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'ttml', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': int_or_none(media_info.get('_duration')), + 'thumbnail': media_info.get('_previewImage'), + 'is_live': media_info.get('_isLive') is True, + 'formats': formats, + 'subtitles': subtitles, + } + + def _ARD_extract_episode_info(self, title): + """Try to extract season/episode data from the title.""" + res = {} + if not title: + return res + + for pattern in [ + # Pattern for title like "Homo sapiens (S06/E07) - Originalversion" + # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw + r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*', + # E.g.: title="Fritjof aus Norwegen (2) (AD)" + # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/ + r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*', + r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*', + # E.g.: title="Folge 25/42: Symmetrie" + # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/ + # E.g.: title="Folge 1063 - Vertrauen" + # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/ + r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*', + ]: + m = re.match(pattern, title) + if m: + groupdict = m.groupdict() + res['season_number'] = int_or_none(groupdict.get('season_number')) + res['episode_number'] = int_or_none(groupdict.get('episode_number')) + res['episode'] = str_or_none(groupdict.get('episode')) + # Build the episode title by removing numeric episode information: + if groupdict.get('ep_info') and not res['episode']: + res['episode'] = str_or_none( + title.replace(groupdict.get('ep_info'), '')) + if 
res['episode']: + res['episode'] = res['episode'].strip() + break + + # As a fallback use the whole title as the episode name: + if not res.get('episode'): + res['episode'] = title.strip() + return res + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + if not url_or_none(stream_url): + continue + ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + else: + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + m = re.search( + r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', + stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + + +class ARDMediathekIE(ARDMediathekBaseIE): + IE_NAME = 'ARD:mediathek' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
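+    # Editor's sketch, not part of the original commit: what the inherited
+    # _ARD_extract_episode_info() above should yield for one of its own
+    # documented sample titles (values traced by hand through the first
+    # pattern, not taken from a test run):
+    #
+    #   self._ARD_extract_episode_info('Homo sapiens (S06/E07) - Originalversion')
+    #   # == {'season_number': 6, 'episode_number': 7,
+    #   #     'episode': 'Homo sapiens - Originalversion'}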
+ + _TESTS = [{ + # available till 26.07.2022 + 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', + 'info_dict': { + 'id': '44726822', + 'ext': 'mp4', + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'duration': 1740, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', + 'only_matching': True, + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'only_matching': True, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + + def _real_extract(self, url): + # determine video id from url + m = self._match_valid_url(url) + + document_id = None + + numid = re.search(r'documentId=([0-9]+)', url) + if numid: + document_id = video_id = numid.group(1) + else: + video_id = m.group('video_id') + + webpage = self._download_webpage(url, video_id) + + ERRORS = ( + ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'), + ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<', + 'Video %s is no longer available'), + ) + + for pattern, message in ERRORS: + if pattern in webpage: + raise ExtractorError(message % video_id, expected=True) + + if re.search(r'[\?&]rss($|[=&])', url): + doc = compat_etree_fromstring(webpage.encode('utf-8')) + if doc.tag == 'rss': + return GenericIE()._extract_rss(url, video_id, doc) + + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', + r'<meta name="dcterms\.title" content="(.*?)"/>', + r'<h4 class="headline">(.*?)</h4>', + r'<title[^>]*>(.*?)</title>'], + webpage, 'title') + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'dcterms.abstract', webpage, 'description', default=None) + if description is None: + description = self._html_search_meta( + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'<p\s+class="teasertext">(.+?)</p>', + webpage, 'teaser text', default=None) + + # Thumbnail is sometimes not present. + # It is in the mobile version, but that seems to use a different URL + # structure altogether. 
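+        # Editorial note, not part of the original commit: as a consequence,
+        # only the og:image meta tag is consulted below; default=None keeps
+        # extraction going on pages where that tag is absent.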
+ thumbnail = self._og_search_thumbnail(webpage, default=None) + + media_streams = re.findall(r'''(?x) + mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* + "([^"]+)"''', webpage) + + if media_streams: + QUALITIES = qualities(['lo', 'hi', 'hq']) + formats = [] + for furl in set(media_streams): + if furl.endswith('.f4m'): + fid = 'f4m' + else: + fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) + fid = fid_m.group(1) if fid_m else None + formats.append({ + 'quality': QUALITIES(fid), + 'format_id': fid, + 'url': furl, + }) + self._sort_formats(formats) + info = { + 'formats': formats, + } + else: # request JSON file + if not document_id: + video_id = self._search_regex( + (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'), + webpage, 'media id', default=None) + info = self._extract_media_info( + 'http://www.ardmediathek.de/play/media/%s' % video_id, + webpage, video_id) + + info.update({ + 'id': video_id, + 'title': self._live_title(title) if info.get('is_live') else title, + 'description': description, + 'thumbnail': thumbnail, + }) + info.update(self._ARD_extract_episode_info(info['title'])) + + return info + + +class ARDIE(InfoExtractor): + _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html' + _TESTS = [{ + # available till 7.01.2022 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', + 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', + 'info_dict': { + 'id': 'maischberger-die-woche-video100', + 'display_id': 'maischberger-die-woche-video100', + 'ext': 'mp4', + 'duration': 3687.0, + 'title': 'maischberger. die woche vom 7. Januar 2021', + 'upload_date': '20210107', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html', + 'only_matching': True, + }, { + 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + + player_url = mobj.group('mainurl') + '~playerXml.xml' + doc = self._download_xml(player_url, display_id) + video_node = doc.find('./video') + upload_date = unified_strdate(xpath_text( + video_node, './broadcastDate')) + thumbnail = xpath_text(video_node, './/teaserImage//variant/url') + + formats = [] + for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + 
m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue + f = { + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), + } + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) + else: + if not format_url: + continue + f['url'] = format_url + formats.append(f) + self._sort_formats(formats) + + return { + 'id': xpath_text(video_node, './videoId', default=display_id), + 'formats': formats, + 'display_id': display_id, + 'title': video_node.find('./title').text, + 'duration': parse_duration(video_node.find('./duration').text), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + } + + +class ARDBetaMediathekIE(ARDMediathekBaseIE): + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', + 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', + 'info_dict': { + 'display_id': 'die-robuste-roswita', + 'id': '78566716', + 'title': 'Die robuste Roswita', + 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', + 'duration': 5316, + 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', + 'timestamp': 1596658200, + 'upload_date': '20200805', + 'ext': 'mp4', + }, + }, { + 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'only_matching': True, + }, { + 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, + }, { + # playlist of type 'sendung' + 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/', + 'only_matching': True, + }, { + # playlist of type 'sammlung' + 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', + 'only_matching': True, + }] + + def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): + """ Query the ARD server for playlist information + and returns the data in "raw" format """ + if mode == 'sendung': + graphQL = json.dumps({ + 'query': '''{ + showPage( + client: "%s" + showId: "%s" + pageNumber: %d + ) { + pagination { + pageSize + totalElements + } + teasers { # Array + mediumTitle + links { target { id href title } } + type + } + }}''' % (client, 
playlist_id, pageNumber),
+            }).encode()
+        else:  # mode == 'sammlung'
+            graphQL = json.dumps({
+                'query': '''{
+                    morePage(
+                        client: "%s"
+                        compilationId: "%s"
+                        pageNumber: %d
+                    ) {
+                        widget {
+                            pagination {
+                                pageSize
+                                totalElements
+                            }
+                            teasers {  # Array
+                                mediumTitle
+                                links { target { id href title } }
+                                type
+                            }
+                        }
+                    }}''' % (client, playlist_id, pageNumber),
+            }).encode()
+        # Resources for ARD graphQL debugging:
+        # https://api-test.ardmediathek.de/public-gateway
+        show_page = self._download_json(
+            'https://api.ardmediathek.de/public-gateway',
+            '[Playlist] %s' % display_id,
+            data=graphQL,
+            headers={'Content-Type': 'application/json'})['data']
+        # align the structure of the returned data:
+        if mode == 'sendung':
+            show_page = show_page['showPage']
+        else:  # mode == 'sammlung'
+            show_page = show_page['morePage']['widget']
+        return show_page
+
+    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
+        """ Collect all playlist entries and return them as an info dict.
+        Supports playlists of mode 'sendung' and 'sammlung', as well as
+        nested playlists. """
+        entries = []
+        pageNumber = 0
+        while True:  # iterate by pageNumber
+            show_page = self._ARD_load_playlist_snipped(
+                playlist_id, display_id, client, mode, pageNumber)
+            for teaser in show_page['teasers']:  # process playlist items
+                if '/compilation/' in teaser['links']['target']['href']:
+                    # alternative condition: teaser['type'] == "compilation"
+                    # => this is a nested compilation, e.g.:
+                    # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
+                    link_mode = 'sammlung'
+                else:
+                    link_mode = 'video'
+
+                item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
+                    client, link_mode, display_id,
+                    # perform HTML quoting of the episode title, similar to ARD:
+                    re.sub('^-|-$', '',  # remove '-' from the beginning/end
+                           re.sub('[^a-zA-Z0-9]+', '-',  # replace special chars by -
+                                  teaser['links']['target']['title'].lower()
+                                  .replace('ä', 'ae').replace('ö', 'oe')
+                                  .replace('ü', 'ue').replace('ß', 'ss'))),
+                    teaser['links']['target']['id'])
+                entries.append(self.url_result(
+                    item_url,
+                    ie=ARDBetaMediathekIE.ie_key()))
+
+            if (show_page['pagination']['pageSize'] * (pageNumber + 1)
+                    >= show_page['pagination']['totalElements']):
+                # we've processed enough pages to get all playlist entries
+                break
+            pageNumber += 1
+
+        return self.playlist_result(entries, playlist_title=display_id)
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        video_id = mobj.group('video_id')
+        display_id = mobj.group('display_id')
+        if display_id:
+            display_id = display_id.rstrip('/')
+        if not display_id:
+            display_id = video_id
+
+        if mobj.group('mode') in ('sendung', 'sammlung'):
+            # this is a playlist URL
+            return self._ARD_extract_playlist(
+                url, video_id, display_id,
+                mobj.group('client'),
+                mobj.group('mode'))
+
+        player_page = self._download_json(
+            'https://api.ardmediathek.de/public-gateway',
+            display_id, data=json.dumps({
+                'query': '''{
+  playerPage(client:"%s", clipId: "%s") {
+    blockedByFsk
+    broadcastedOn
+    maturityContentRating
+    mediaCollection {
+      _duration
+      _geoblocked
+      _isLive
+      _mediaArray {
+        _mediaStreamArray {
+          _quality
+          _server
+          _stream
+        }
+      }
+      _previewImage
+      _subtitleUrl
+      _type
+    }
+    show {
+      title
+    }
+    synopsis
+    title
+    tracking {
+      atiCustomVars {
+        contentId
+      }
+    }
+  }
+}''' % (mobj.group('client'), video_id),
+            }).encode(), headers={
+                'Content-Type': 'application/json'
+            })['data']['playerPage']
+        title = player_page['title']
+        content_id =
str_or_none(try_get( + player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) + media_collection = player_page.get('mediaCollection') or {} + if not media_collection and content_id: + media_collection = self._download_json( + 'https://www.ardmediathek.de/play/media/' + content_id, + content_id, fatal=False) or {} + info = self._parse_media_info( + media_collection, content_id or video_id, + player_page.get('blockedByFsk')) + age_limit = None + description = player_page.get('synopsis') + maturity_content_rating = player_page.get('maturityContentRating') + if maturity_content_rating: + age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) + if not age_limit and description: + age_limit = int_or_none(self._search_regex( + r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) + info.update({ + 'age_limit': age_limit, + 'display_id': display_id, + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), + 'series': try_get(player_page, lambda x: x['show']['title']), + }) + info.update(self._ARD_extract_episode_info(info['title'])) + return info diff --git a/yt_dlp/extractor/arkena.py b/yt_dlp/extractor/arkena.py new file mode 100644 index 000000000..4f4f457c1 --- /dev/null +++ b/yt_dlp/extractor/arkena.py @@ -0,0 +1,163 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + parse_qs, + try_get, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + video\.(?:arkena|qbrick)\.com/play2/embed/player\?| + play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', + 'md5': '97f117754e5f3c020f5f26da4a44ebaf', + 'info_dict': { + 'id': 'd8ab4607-00090107-aab86310', + 'ext': 'mp4', + 'title': 'EM_HT20_117_roslund_v2.mp4', + 'timestamp': 1608285912, + 'upload_date': '20201218', + 'duration': 1429.162667, + 'subtitles': { + 'sv': 'count:3', + }, + }, + }, { + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'only_matching': True, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }, { + 'url': 'http://video.arkena.com/play2/embed/player?accountId=472718&mediaId=35763b3b-00090078-bf604299&pageStyling=styled', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + # Handle http://video.arkena.com/play2/embed/player URL + if not video_id: + qs = parse_qs(url) + video_id = qs.get('mediaId', 
[None])[0]
+            account_id = qs.get('accountId', [None])[0]
+            if not video_id or not account_id:
+                raise ExtractorError('Invalid URL', expected=True)
+
+        media = self._download_json(
+            'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id),
+            video_id, query={
+                # https://video.qbrick.com/docs/api/examples/library-api.html
+                'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags',
+            })
+        metadata = media.get('metadata') or {}
+        title = metadata['title']
+
+        duration = None
+        formats = []
+        thumbnails = []
+        subtitles = {}
+        for resource in media['asset']['resources']:
+            for rendition in (resource.get('renditions') or []):
+                rendition_type = rendition.get('type')
+                for i, link in enumerate(rendition.get('links') or []):
+                    href = link.get('href')
+                    if not href:
+                        continue
+                    if rendition_type == 'image':
+                        thumbnails.append({
+                            'filesize': int_or_none(rendition.get('size')),
+                            'height': int_or_none(rendition.get('height')),
+                            'id': rendition.get('id'),
+                            'url': href,
+                            'width': int_or_none(rendition.get('width')),
+                        })
+                    elif rendition_type == 'subtitle':
+                        subtitles.setdefault(rendition.get('language') or 'en', []).append({
+                            'url': href,
+                        })
+                    elif rendition_type == 'video':
+                        f = {
+                            'filesize': int_or_none(rendition.get('size')),
+                            'format_id': rendition.get('id'),
+                            'url': href,
+                        }
+                        video = try_get(rendition, lambda x: x['videos'][i], dict)
+                        if video:
+                            if not duration:
+                                duration = float_or_none(video.get('duration'))
+                            f.update({
+                                'height': int_or_none(video.get('height')),
+                                'tbr': int_or_none(video.get('bitrate'), 1000),
+                                'vcodec': video.get('codec'),
+                                'width': int_or_none(video.get('width')),
+                            })
+                            audio = try_get(video, lambda x: x['audios'][0], dict)
+                            if audio:
+                                f.update({
+                                    'acodec': audio.get('codec'),
+                                    'asr': int_or_none(audio.get('sampleRate')),
+                                })
+                        formats.append(f)
+                    elif rendition_type == 'index':
+                        mime_type = link.get('mimeType')
+                        if mime_type == 'application/smil+xml':
+                            formats.extend(self._extract_smil_formats(
+                                href, video_id, fatal=False))
+                        elif mime_type == 'application/x-mpegURL':
+                            formats.extend(self._extract_m3u8_formats(
+                                href, video_id, 'mp4', 'm3u8_native',
+                                m3u8_id='hls', fatal=False))
+                        elif mime_type == 'application/hds+xml':
+                            formats.extend(self._extract_f4m_formats(
+                                href, video_id, f4m_id='hds', fatal=False))
+                        elif mime_type == 'application/dash+xml':
+                            formats.extend(self._extract_mpd_formats(
+                                href, video_id, mpd_id='dash', fatal=False))
+                        elif mime_type == 'application/vnd.ms-sstr+xml':
+                            formats.extend(self._extract_ism_formats(
+                                href, video_id, ism_id='mss', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': metadata.get('description'),
+            'timestamp': parse_iso8601(media.get('created')),
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
+            'duration': duration,
+            'tags': media.get('tags'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/arnes.py b/yt_dlp/extractor/arnes.py
index c0032fcab..c0032fcab 100644
--- a/youtube_dl/extractor/arnes.py
+++ b/yt_dlp/extractor/arnes.py
diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py
new file mode 100644
index 000000000..296b169d2
--- /dev/null
+++ b/yt_dlp/extractor/arte.py
@@ -0,0 +1,255 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_qs,
+    qualities,
+    try_get,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class ArteTVBaseIE(InfoExtractor):
+    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
+    _API_BASE = 'https://api.arte.tv/api/player/v1'
+
+
+class ArteTVIE(ArteTVBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
+                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
+                        )
+                        /(?P<id>\d{6}-\d{3}-[AF])
+                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'info_dict': {
+            'id': '088501-000-A',
+            'ext': 'mp4',
+            'title': 'Mexico: Stealing Petrol to Survive',
+            'upload_date': '20190628',
+        },
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
+        'only_matching': True,
+    }, {
+        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        video_id = mobj.group('id')
+        lang = mobj.group('lang') or mobj.group('lang_2')
+
+        info = self._download_json(
+            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
+        player_info = info['videoJsonPlayer']
+
+        vsr = try_get(player_info, lambda x: x['VSR'], dict)
+        if not vsr:
+            error = None
+            if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
+                error = try_get(
+                    player_info, lambda x: x['custom_msg']['msg'], compat_str)
+            if not error:
+                error = 'Video %s is not available' % (player_info.get('VID') or video_id)
+            raise ExtractorError(error, expected=True)
+
+        upload_date_str = player_info.get('shootingDate')
+        if not upload_date_str:
+            upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
+
+        title = (player_info.get('VTI') or player_info['VID']).strip()
+        subtitle = player_info.get('VSU', '').strip()
+        if subtitle:
+            title += ' - %s' % subtitle
+
+        qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
+
+        LANGS = {
+            'fr': 'F',
+            'de': 'A',
+            'en': 'E[ANG]',
+            'es': 'E[ESP]',
+            'it': 'E[ITA]',
+            'pl': 'E[POL]',
+        }
+
+        langcode = LANGS.get(lang, lang)
+
+        formats = []
+        for format_id, format_dict in vsr.items():
+            f = dict(format_dict)
+            format_url = url_or_none(f.get('url'))
+            streamer = f.get('streamer')
+            if not format_url and not streamer:
+                continue
+            versionCode = f.get('versionCode') or ''
+            l = re.escape(langcode)
+
+            # Language preference from most to least preferred
+            # Reference: section 6.8 of
+            # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
+            PREFERENCES = (
+                # original version in requested language, without subtitles
+                r'VO{0}$'.format(l),
+                # original version in requested language, with partial subtitles in requested language
+                r'VO{0}-ST{0}$'.format(l),
+                # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'VO{0}-STM{0}$'.format(l),
+                # non-original (dubbed) version in requested language, without subtitles
+                r'V{0}$'.format(l),
+                # non-original (dubbed) version in requested language, with partial subtitles in requested language
+                r'V{0}-ST{0}$'.format(l),
+                # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'V{0}-STM{0}$'.format(l),
+                # original version in requested language, with partial subtitles in different language
r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + + media_type = f.get('mediaType') + if media_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for m3u8_format in m3u8_formats: + m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) + continue + + format = { + 'format_id': format_id, + 'language_preference': lang_pref, + 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'tbr': int_or_none(f.get('bitrate')), + 'quality': qfunc(f.get('quality')), + } + + if media_type == 'rtmp': + format['url'] = f['streamer'] + format['play_path'] = 'mp4:' + f['url'] + format['ext'] = 'flv' + else: + format['url'] = f['url'] + + formats.append(format) + + # For this extractor, quality only represents the relative quality + # with respect to other formats with the same resolution + self._sort_formats(formats, ('res', 'quality')) + + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE') or player_info.get('V7T'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } + + +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' + _TESTS = [{ + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', + 'info_dict': { + 'id': '100605-013-A', + 'ext': 'mp4', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', + }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] + + def _real_extract(self, url): + qs = parse_qs(url) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) + + +class ArteTVPlaylistIE(ArteTVBaseIE): + _VALID_URL = 
r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', + 'info_dict': { + 'id': 'RC-016954', + 'title': 'Earn a Living', + 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', + }, + 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, + }] + + def _real_extract(self, url): + lang, playlist_id = self._match_valid_url(url).groups() + collection = self._download_json( + '%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) + title = collection.get('title') + description = collection.get('shortDescription') or collection.get('teaserText') + return self.playlist_result(entries, playlist_id, title, description) diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py new file mode 100644 index 000000000..75a632958 --- /dev/null +++ b/yt_dlp/extractor/asiancrush.py @@ -0,0 +1,200 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + extract_attributes, + int_or_none, + OnDemandPagedList, + parse_age_limit, + strip_or_none, + try_get, +) + + +class AsianCrushBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' + _KALTURA_KEYS = [ + 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', + 'widescreen_thumbnail_url', 'screencap_widescreen', + ] + _API_SUFFIX = {'retrocrush.tv': '-ott'} + + def _call_api(self, host, endpoint, video_id, query, resource): + return self._download_json( + 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, + 'Downloading %s JSON metadata' % resource, query=query, + headers=self.geo_verification_headers())['objects'] + + def _download_object_data(self, host, object_id, resource): + return self._call_api( + host, 'search', object_id, {'id': object_id}, resource)[0] + + def _get_object_description(self, obj): + return strip_or_none(obj.get('long_description') or obj.get('short_description')) + + def _parse_video_data(self, video): + title = video['name'] + + entry_id, partner_id = [None] * 2 + for k in self._KALTURA_KEYS: + k_url = video.get(k) + if k_url: + mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) + if mobj: + partner_id, entry_id = mobj.groups() + break + + meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] + categories = list(filter(None, [c.get('name') for c in meta_categories])) + + show_info = video.get('show_info') or {} + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': KalturaIE.ie_key(), + 'id': entry_id, + 'title': title, + 
'description': self._get_object_description(video), + 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), + 'categories': categories, + 'series': show_info.get('show_name'), + 'season_number': int_or_none(show_info.get('season_num')), + 'season_id': show_info.get('season_id'), + 'episode_number': int_or_none(show_info.get('episode_num')), + } + + +class AsianCrushIE(AsianCrushBaseIE): + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', + 'md5': 'c3b740e48d0ba002a42c0b72857beae6', + 'info_dict': { + 'id': '1_y4tmjm5r', + 'ext': 'mp4', + 'title': 'Women Who Flirt', + 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', + 'timestamp': 1496936429, + 'upload_date': '20170608', + 'uploader_id': 'craig@crifkin.com', + 'age_limit': 13, + 'categories': 'count:5', + 'duration': 5812, + }, + }, { + 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', + 'only_matching': True, + }, { + 'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/', + 'only_matching': True, + }, { + 'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/video/010400v/drifters/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/', + 'only_matching': True, + }, { + 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', + 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', + 'only_matching': True, + }] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).groups() + + if host == 'cocoro.tv': + webpage = self._download_webpage(url, video_id) + embed_vars = self._parse_json(self._search_regex( + r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', + default='{}'), video_id, fatal=False) or {} + video_id = embed_vars.get('entry_id') or video_id + + video = self._download_object_data(host, video_id, 'video') + return self._parse_video_data(video) + + +class AsianCrushPlaylistIE(AsianCrushBaseIE): + _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', + 'info_dict': { + 'id': '6447', + 'title': 'Fruity Samurai', + 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', + }, + 'playlist_count': 13, + }, { + 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/series/016375s/mononoke/', + 'only_matching': True, + }, { + 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', + 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', + 'only_matching': True, + }] + _PAGE_SIZE = 1000000000 + + def _fetch_page(self, domain, parent_id, page): + videos = self._call_api( + domain, 'getreferencedobjects', parent_id, { + 'max': self._PAGE_SIZE, + 'object_type': 'video', + 'parent_id': parent_id, + 'start': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in videos: + yield self._parse_video_data(video) + + def _real_extract(self, url): + host, playlist_id = self._match_valid_url(url).groups() + + if host == 'cocoro.tv': + webpage = self._download_webpage(url, playlist_id) + + entries = [] + + for mobj in 
re.finditer( + r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) + + title = self._html_search_regex( + r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + else: + show = self._download_object_data(host, playlist_id, 'show') + title = show.get('name') + description = self._get_object_description(show) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, host, playlist_id), + self._PAGE_SIZE) + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py new file mode 100644 index 000000000..8143eb4d7 --- /dev/null +++ b/yt_dlp/extractor/atresplayer.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata, +) + + +class AtresPlayerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})' + _NETRC_MACHINE = 'atresplayer' + _TESTS = [ + { + 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', + 'info_dict': { + 'id': '5d4aa2c57ed1a88fc715a615', + 'ext': 'mp4', + 'title': 'Capítulo 7: Asuntos pendientes', + 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', + 'duration': 3413, + }, + 'params': { + 'format': 'bestvideo', + }, + 'skip': 'This video is only available for registered users' + }, + { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, + { + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }, + ] + _API_BASE = 'https://api.atresplayer.com/' + + def _real_initialize(self): + self._login() + + def _handle_error(self, e, code): + if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: + error = self._parse_json(e.cause.read(), None) + if error.get('error') == 'required_registered': + self.raise_login_required() + raise ExtractorError(error['error_description'], expected=True) + raise + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + self._request_webpage( + self._API_BASE + 'login', None, 'Downloading login page') + + try: + target_url = self._download_json( + 'https://account.atresmedia.com/api/login', None, + 'Logging in', headers={ + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=urlencode_postdata({ + 'username': username, + 'password': password, + }))['targetUrl'] + except ExtractorError as e: + self._handle_error(e, 400) + + 
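+        # Editor's note, not part of the original commit: the JSON POST above
+        # returns a one-time 'targetUrl'. The request below follows it,
+        # presumably so the authenticated session cookies get bound; its
+        # response body is never used.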
self._request_webpage(target_url, None, 'Following Target URL')
+
+    def _real_extract(self, url):
+        display_id, video_id = self._match_valid_url(url).groups()
+
+        try:
+            episode = self._download_json(
+                self._API_BASE + 'client/v1/player/episode/' + video_id, video_id)
+        except ExtractorError as e:
+            self._handle_error(e, 403)
+
+        title = episode['titulo']
+
+        formats = []
+        subtitles = {}
+        for source in episode.get('sources', []):
+            src = source.get('src')
+            if not src:
+                continue
+            src_type = source.get('type')
+            if src_type == 'application/vnd.apple.mpegurl':
+                formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                    src, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False)
+            elif src_type == 'application/dash+xml':
+                formats, subtitles = self._extract_mpd_formats_and_subtitles(
+                    src, video_id, mpd_id='dash', fatal=False)
+        self._sort_formats(formats)
+
+        heartbeat = episode.get('heartbeat') or {}
+        omniture = episode.get('omniture') or {}
+        get_meta = lambda x: heartbeat.get(x) or omniture.get(x)
+
+        return {
+            'display_id': display_id,
+            'id': video_id,
+            'title': title,
+            'description': episode.get('descripcion'),
+            'thumbnail': episode.get('imgPoster'),
+            'duration': int_or_none(episode.get('duration')),
+            'formats': formats,
+            'channel': get_meta('channel'),
+            'season': get_meta('season'),
+            'episode_number': int_or_none(get_meta('episodeNumber')),
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/atttechchannel.py b/yt_dlp/extractor/atttechchannel.py
index 8f93fb353..8f93fb353 100644
--- a/youtube_dl/extractor/atttechchannel.py
+++ b/yt_dlp/extractor/atttechchannel.py
diff --git a/yt_dlp/extractor/atvat.py b/yt_dlp/extractor/atvat.py
new file mode 100644
index 000000000..7c30cfcbb
--- /dev/null
+++ b/yt_dlp/extractor/atvat.py
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    jwt_encode_hs256,
+    try_get,
+)
+
+
+class ATVAtIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?atv\.at/tv/(?:[^/]+/){2,3}(?P<id>.*)'
+
+    _TESTS = [{
+        'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/bauer-sucht-frau/bauer-sucht-frau-staffel-18-folge-3-die-hofwochen',
+        'md5': '3c3b4aaca9f63e32b35e04a9c2515903',
+        'info_dict': {
+            'id': 'v-ce9cgn1e70n5-1',
+            'ext': 'mp4',
+            'title': 'Bauer sucht Frau - Staffel 18 Folge 3 - Die Hofwochen',
+        }
+    }, {
+        'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/episode-01/bauer-sucht-frau-staffel-18-vorstellungsfolge-1',
+        'only_matching': True,
+    }]
+
+    # extracted from bootstrap.js function (search for e.encryption_key and use your browser's debugger)
+    _ACCESS_ID = 'x_atv'
+    _ENCRYPTION_KEY = 'Hohnaekeishoogh2omaeghooquooshia'
+
+    def _extract_video_info(self, url, content, video):
+        clip_id = content.get('splitId', content['id'])
+        formats = []
+        clip_urls = video['urls']
+        for protocol, variant in clip_urls.items():
+            source_url = try_get(variant, lambda x: x['clear']['url'])
+            if not source_url:
+                continue
+            if protocol == 'dash':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, clip_id, mpd_id=protocol, fatal=False))
+            elif protocol == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, clip_id, 'mp4', 'm3u8_native',
+                    m3u8_id=protocol, fatal=False))
+            else:
+                formats.append({
+                    'url': source_url,
+                    'format_id': protocol,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': clip_id,
+            'title': content.get('title'),
+            'duration': float_or_none(content.get('duration')),
+            'series':
content.get('tvShowTitle'), + 'formats': formats, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._parse_json( + self._search_regex(r'<script id="state" type="text/plain">(.*)</script>', webpage, 'json_data'), + video_id=video_id) + + video_title = json_data['views']['default']['page']['title'] + contentResource = json_data['views']['default']['page']['contentResource'] + content_id = contentResource[0]['id'] + content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']} + for id, content in enumerate(contentResource)] + + time_of_request = datetime.datetime.now() + not_before = time_of_request - datetime.timedelta(minutes=5) + expire = time_of_request + datetime.timedelta(minutes=5) + payload = { + 'content_ids': { + content_id: content_ids, + }, + 'secure_delivery': True, + 'iat': int(time_of_request.timestamp()), + 'nbf': int(not_before.timestamp()), + 'exp': int(expire.timestamp()), + } + jwt_token = jwt_encode_hs256(payload, self._ENCRYPTION_KEY, headers={'kid': self._ACCESS_ID}) + videos = self._download_json( + 'https://vas-v4.p7s1video.net/4.0/getsources', + content_id, 'Downloading videos JSON', query={ + 'token': jwt_token.decode('utf-8') + }) + + video_id, videos_data = list(videos['data'].items())[0] + entries = [ + self._extract_video_info(url, contentResource[video['id']], video) + for video in videos_data] + + return { + '_type': 'multi_video', + 'id': video_id, + 'title': video_title, + 'entries': entries, + } diff --git a/youtube_dl/extractor/audimedia.py b/yt_dlp/extractor/audimedia.py index 6bd48ef15..6bd48ef15 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/yt_dlp/extractor/audimedia.py diff --git a/youtube_dl/extractor/audioboom.py b/yt_dlp/extractor/audioboom.py index c51837b40..c51837b40 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/yt_dlp/extractor/audioboom.py diff --git a/youtube_dl/extractor/audiomack.py b/yt_dlp/extractor/audiomack.py index cc7771354..cc7771354 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/yt_dlp/extractor/audiomack.py diff --git a/yt_dlp/extractor/audius.py b/yt_dlp/extractor/audius.py new file mode 100644 index 000000000..fa64995d5 --- /dev/null +++ b/yt_dlp/extractor/audius.py @@ -0,0 +1,274 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random + +from .common import InfoExtractor +from ..utils import ExtractorError, try_get, compat_str, str_or_none +from ..compat import compat_urllib_parse_unquote + + +class AudiusBaseIE(InfoExtractor): + _API_BASE = None + _API_V = '/v1' + + def _get_response_data(self, response): + if isinstance(response, dict): + response_data = response.get('data') + if response_data is not None: + return response_data + if len(response) == 1 and 'message' in response: + raise ExtractorError('API error: %s' % response['message'], + expected=True) + raise ExtractorError('Unexpected API response') + + def _select_api_base(self): + """Selecting one of the currently available API hosts""" + response = super(AudiusBaseIE, self)._download_json( + 'https://api.audius.co/', None, + note='Requesting available API hosts', + errnote='Unable to request available API hosts') + hosts = self._get_response_data(response) + if isinstance(hosts, list): + self._API_BASE = random.choice(hosts) + return + raise ExtractorError('Unable to get available API hosts') + + @staticmethod + def _prepare_url(url, title): + """ + Audius removes forward slashes from the uri, but leaves 
backslashes.
+        The problem is that the current version of Chrome replaces backslashes
+        in the address bar with forward slashes, so if you copy the link from
+        there and paste it into youtube-dl, you won't be able to download
+        anything from this link, since the Audius API won't be able to resolve
+        this URL
+        """
+        url = compat_urllib_parse_unquote(url)
+        title = compat_urllib_parse_unquote(title)
+        if '/' in title or '%2F' in title:
+            fixed_title = title.replace('/', '%5C').replace('%2F', '%5C')
+            return url.replace(title, fixed_title)
+        return url
+
+    def _api_request(self, path, item_id=None, note='Downloading JSON metadata',
+                     errnote='Unable to download JSON metadata',
+                     expected_status=None):
+        if self._API_BASE is None:
+            self._select_api_base()
+        try:
+            response = super(AudiusBaseIE, self)._download_json(
+                '%s%s%s' % (self._API_BASE, self._API_V, path), item_id, note=note,
+                errnote=errnote, expected_status=expected_status)
+        except ExtractorError as exc:
+            # some Audius API hosts may not work as expected and return HTML
+            if 'Failed to parse JSON' in compat_str(exc):
+                raise ExtractorError('An error occurred while receiving data. Try again',
+                                     expected=True)
+            raise exc
+        return self._get_response_data(response)
+
+    def _resolve_url(self, url, item_id):
+        return self._api_request('/resolve?url=%s' % url, item_id,
+                                 expected_status=404)
+
+
+class AudiusIE(AudiusBaseIE):
+    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:audius\.co/(?P<uploader>[\w\d-]+)(?!/album|/playlist)/(?P<title>\S+))'''
+    IE_DESC = 'Audius.co'
+    _TESTS = [
+        {
+            # URL from the Chrome address bar, which replaces backslashes with forward slashes
+            'url': 'https://audius.co/test_acc/t%D0%B5%D0%B5%D0%B5est-1.%5E_%7B%7D/%22%3C%3E.%E2%84%96~%60-198631',
+            'md5': '92c35d3e754d5a0f17eef396b0d33582',
+            'info_dict': {
+                'id': 'xd8gY',
+                'title': '''Tеееest/ 1.!@#$%^&*()_+=[]{};'\\\":<>,.?/№~`''',
+                'ext': 'mp3',
+                'description': 'Description',
+                'duration': 30,
+                'track': '''Tеееest/ 1.!@#$%^&*()_+=[]{};'\\\":<>,.?/№~`''',
+                'artist': 'test',
+                'genre': 'Electronic',
+                'thumbnail': r're:https?://.*\.jpg',
+                'view_count': int,
+                'like_count': int,
+                'repost_count': int,
+            }
+        },
+        {
+            # Regular track
+            'url': 'https://audius.co/voltra/radar-103692',
+            'md5': '491898a0a8de39f20c5d6a8a80ab5132',
+            'info_dict': {
+                'id': 'KKdy2',
+                'title': 'RADAR',
+                'ext': 'mp3',
+                'duration': 318,
+                'track': 'RADAR',
+                'artist': 'voltra',
+                'genre': 'Trance',
+                'thumbnail': r're:https?://.*\.jpg',
+                'view_count': int,
+                'like_count': int,
+                'repost_count': int,
+            }
+        },
+    ]
+
+    _ARTWORK_MAP = {
+        "150x150": 150,
+        "480x480": 480,
+        "1000x1000": 1000
+    }
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        track_id = try_get(mobj, lambda x: x.group('track_id'))
+        if track_id is None:
+            title = mobj.group('title')
+            # uploader = mobj.group('uploader')
+            url = self._prepare_url(url, title)
+            track_data = self._resolve_url(url, title)
+        else:  # API link
+            title = None
+            # uploader = None
+            track_data = self._api_request('/tracks/%s' % track_id, track_id)
+
+        if not isinstance(track_data, dict):
+            raise ExtractorError('Unexpected API response')
+
+        track_id = track_data.get('id')
+        if track_id is None:
+            raise ExtractorError('Unable to get ID of the track')
+
+        artworks_data = track_data.get('artwork')
+        thumbnails = []
+        if isinstance(artworks_data, dict):
+            for quality_key, thumbnail_url in artworks_data.items():
+                thumbnail = {
+                    "url": thumbnail_url
+                }
+                quality_code = self._ARTWORK_MAP.get(quality_key)
+                if quality_code is not
None: + thumbnail['preference'] = quality_code + thumbnails.append(thumbnail) + + return { + 'id': track_id, + 'title': track_data.get('title', title), + 'url': '%s/v1/tracks/%s/stream' % (self._API_BASE, track_id), + 'ext': 'mp3', + 'description': track_data.get('description'), + 'duration': track_data.get('duration'), + 'track': track_data.get('title'), + 'artist': try_get(track_data, lambda x: x['user']['name'], compat_str), + 'genre': track_data.get('genre'), + 'thumbnails': thumbnails, + 'view_count': track_data.get('play_count'), + 'like_count': track_data.get('favorite_count'), + 'repost_count': track_data.get('repost_count'), + } + + +class AudiusTrackIE(AudiusIE): + _VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P<track_id>\w+)''' + IE_NAME = 'audius:track' + IE_DESC = 'Audius track ID or API link. Prepend with "audius:"' + _TESTS = [ + { + 'url': 'audius:9RWlo', + 'only_matching': True + }, + { + 'url': 'audius:http://discoveryprovider.audius.prod-us-west-2.staked.cloud/v1/tracks/9RWlo', + 'only_matching': True + }, + ] + + +class AudiusPlaylistIE(AudiusBaseIE): + _VALID_URL = r'https?://(?:www\.)?audius\.co/(?P<uploader>[\w\d-]+)/(?:album|playlist)/(?P<title>\S+)' + IE_NAME = 'audius:playlist' + IE_DESC = 'Audius.co playlists' + _TEST = { + 'url': 'https://audius.co/test_acc/playlist/test-playlist-22910', + 'info_dict': { + 'id': 'DNvjN', + 'title': 'test playlist', + 'description': 'Test description\n\nlol', + }, + 'playlist_count': 175, + } + + def _build_playlist(self, tracks): + entries = [] + for track in tracks: + if not isinstance(track, dict): + raise ExtractorError('Unexpected API response') + track_id = str_or_none(track.get('id')) + if not track_id: + raise ExtractorError('Unable to get track ID from playlist') + entries.append(self.url_result( + 'audius:%s' % track_id, + ie=AudiusTrackIE.ie_key(), video_id=track_id)) + return entries + + def _real_extract(self, url): + self._select_api_base() + mobj = self._match_valid_url(url) + title = mobj.group('title') + # uploader = mobj.group('uploader') + url = self._prepare_url(url, title) + playlist_response = self._resolve_url(url, title) + + if not isinstance(playlist_response, list) or len(playlist_response) != 1: + raise ExtractorError('Unexpected API response') + + playlist_data = playlist_response[0] + if not isinstance(playlist_data, dict): + raise ExtractorError('Unexpected API response') + + playlist_id = playlist_data.get('id') + if playlist_id is None: + raise ExtractorError('Unable to get playlist ID') + + playlist_tracks = self._api_request( + '/playlists/%s/tracks' % playlist_id, + title, note='Downloading playlist tracks metadata', + errnote='Unable to download playlist tracks metadata') + if not isinstance(playlist_tracks, list): + raise ExtractorError('Unexpected API response') + + entries = self._build_playlist(playlist_tracks) + return self.playlist_result(entries, playlist_id, + playlist_data.get('playlist_name', title), + playlist_data.get('description')) + + +class AudiusProfileIE(AudiusPlaylistIE): + IE_NAME = 'audius:artist' + IE_DESC = 'Audius.co profile/artist pages' + _VALID_URL = r'https?://(?:www)?audius\.co/(?P<id>[^\/]+)/?(?:[?#]|$)' + _TEST = { + 'url': 'https://audius.co/pzl/', + 'info_dict': { + 'id': 'ezRo7', + 'description': 'TAMALE\n\nContact: officialpzl@gmail.com', + 'title': 'pzl', + }, + 'playlist_count': 24, + } + + def _real_extract(self, url): + self._select_api_base() + profile_id = self._match_id(url) + try: + _profile_data = 
self._api_request('/full/users/handle/' + profile_id, profile_id) + except ExtractorError as e: + raise ExtractorError('Could not download profile info; ' + str(e)) + profile_audius_id = _profile_data[0]['id'] + profile_bio = _profile_data[0].get('bio') + + api_call = self._api_request('/full/users/handle/%s/tracks' % profile_id, profile_id) + return self.playlist_result(self._build_playlist(api_call), profile_audius_id, profile_id, profile_bio) diff --git a/yt_dlp/extractor/awaan.py b/yt_dlp/extractor/awaan.py new file mode 100644 index 000000000..22cc10d98 --- /dev/null +++ b/yt_dlp/extractor/awaan.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_str, +) +from ..utils import ( + int_or_none, + parse_iso8601, + smuggle_url, + unsmuggle_url, + urlencode_postdata, +) + + +class AWAANIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<id>\d+)/(?P<season_id>\d+))?' + + def _real_extract(self, url): + show_id, video_id, season_id = self._match_valid_url(url).groups() + if video_id and int(video_id) > 0: + return self.url_result( + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo') + elif season_id and int(season_id) > 0: + return self.url_result(smuggle_url( + 'http://awaan.ae/program/season/%s' % season_id, + {'show_id': show_id}), 'AWAANSeason') + else: + return self.url_result( + 'http://awaan.ae/program/%s' % show_id, 'AWAANSeason') + + +class AWAANBaseIE(InfoExtractor): + def _parse_video_data(self, video_data, video_id, is_live): + title = video_data.get('title_en') or video_data['title_ar'] + img = video_data.get('img') + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description_en') or video_data.get('description_ar'), + 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), + 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), + } + + +class AWAANVideoIE(AWAANBaseIE): + IE_NAME = 'awaan:video' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', + 'md5': '5f61c33bfc7794315c671a62d43116aa', + 'info_dict': + { + 'id': '17375', + 'ext': 'mp4', + 'title': 'رحلة العمر : الحلقة 1', + 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', + 'duration': 2041, + 'timestamp': 1227504126, + 'upload_date': '20081124', + 'uploader_id': '71', + }, + }, { + 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, + video_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(video_data, video_id, False) + + embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' 
+ compat_urllib_parse_urlencode({ + 'id': video_data['id'], + 'user_id': video_data['user_id'], + 'signature': video_data['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }) + info.update({ + '_type': 'url_transparent', + 'url': embed_url, + 'ie_key': 'MangomoloVideo', + }) + return info + + +class AWAANLiveIE(AWAANBaseIE): + IE_NAME = 'awaan:live' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' + _TEST = { + 'url': 'http://awaan.ae/live/6/dubai-tv', + 'info_dict': { + 'id': '6', + 'ext': 'mp4', + 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'upload_date': '20150107', + 'timestamp': 1420588800, + 'uploader_id': '71', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + channel_data = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, + channel_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(channel_data, channel_id, True) + + embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse_urlencode({ + 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), + 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), + 'signature': channel_data['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }) + info.update({ + '_type': 'url_transparent', + 'url': embed_url, + 'ie_key': 'MangomoloLive', + }) + return info + + +class AWAANSeasonIE(InfoExtractor): + IE_NAME = 'awaan:season' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _TEST = { + 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', + 'info_dict': + { + 'id': '7910', + 'title': 'محاضرات الشيخ الشعراوي', + }, + 'playlist_mincount': 27, + } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + show_id, season_id = self._match_valid_url(url).groups() + + data = {} + if season_id: + data['season'] = season_id + show_id = smuggled_data.get('show_id') + if show_id is None: + season = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + season_id, headers={'Origin': 'http://awaan.ae'}) + show_id = season['id'] + data['show_id'] = show_id + show = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/show', + show_id, data=urlencode_postdata(data), headers={ + 'Origin': 'http://awaan.ae', + 'Content-Type': 'application/x-www-form-urlencoded' + }) + if not season_id: + season_id = show['default_season'] + for season in show['seasons']: + if season['id'] == season_id: + title = season.get('title_en') or season['title_ar'] + + entries = [] + for video in show['videos']: + video_id = compat_str(video['id']) + entries.append(self.url_result( + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id)) + + return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/aws.py b/yt_dlp/extractor/aws.py index dccfeaf73..dccfeaf73 100644 --- a/youtube_dl/extractor/aws.py +++ b/yt_dlp/extractor/aws.py diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py new file mode 100644 index 000000000..fee640e14 --- /dev/null +++ b/yt_dlp/extractor/azmedien.py @@ -0,0 +1,65 @@ +# 
coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from .kaltura import KalturaIE + + +class AZMedienIE(InfoExtractor): + IE_DESC = 'AZ Medien videos' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?P<host> + telezueri\.ch| + telebaern\.tv| + telem1\.ch + )/ + [^/]+/ + (?P<id> + [^/]+-(?P<article_id>\d+) + ) + (?: + \#video= + (?P<kaltura_id> + [_0-9a-z]+ + ) + )? + ''' + + _TESTS = [{ + 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'info_dict': { + 'id': '1_anruz3wy', + 'ext': 'mp4', + 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', + 'uploader_id': 'TVOnline', + 'upload_date': '20180930', + 'timestamp': 1538328802, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', + 'only_matching': True + }] + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' + _PARTNER_ID = '1719221' + + def _real_extract(self, url): + host, display_id, article_id, entry_id = self._match_valid_url(url).groups() + + if not entry_id: + entry_id = self._download_json( + self._API_TEMPL % (host, host.split('.')[0]), display_id, query={ + 'variables': json.dumps({ + 'contextId': 'NewsArticle:' + article_id, + }), + })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] + + return self.url_result( + 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/yt_dlp/extractor/baidu.py b/yt_dlp/extractor/baidu.py new file mode 100644 index 000000000..364fd9459 --- /dev/null +++ b/yt_dlp/extractor/baidu.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class BaiduVideoIE(InfoExtractor): + IE_DESC = '百度视频' + _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _TESTS = [{ + 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', + 'info_dict': { + 'id': '1069', + 'title': '中华小当家 TV版国语', + 'description': 'md5:51be07afe461cf99fa61231421b5397c', + }, + 'playlist_count': 52, + }, { + 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand', + 'info_dict': { + 'id': '11595', + 'title': 're:^奔跑吧兄弟', + 'description': 'md5:1bf88bad6d850930f542d51547c089b8', + }, + 'playlist_mincount': 12, + }] + + def _call_api(self, path, category, playlist_id, note): + return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % ( + path, category, playlist_id), playlist_id, note) + + def _real_extract(self, url): + category, playlist_id = self._match_valid_url(url).groups() + if category == 'show': + category = 'tvshow' + if category == 'tv': + category = 'tvplay' + + playlist_detail = self._call_api( + 'xqinfo', category, playlist_id, 'Download playlist JSON metadata') + + playlist_title = playlist_detail['title'] + playlist_description = unescapeHTML(playlist_detail.get('intro')) + + episodes_detail = self._call_api( + 'xqsingle', category, playlist_id, 'Download episodes JSON metadata') + + entries = [self.url_result( + episode['url'], video_title=episode['title'] + ) for episode in episodes_detail['videos']] + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bandaichannel.py b/yt_dlp/extractor/bandaichannel.py index 
d67285913..d67285913 100644 --- a/youtube_dl/extractor/bandaichannel.py +++ b/yt_dlp/extractor/bandaichannel.py diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py new file mode 100644 index 000000000..b664145a1 --- /dev/null +++ b/yt_dlp/extractor/bandcamp.py @@ -0,0 +1,431 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + KNOWN_EXTENSIONS, + parse_filesize, + str_or_none, + try_get, + update_url_query, + unified_strdate, + unified_timestamp, + url_or_none, + urljoin, +) + + +class BandcampIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', + 'md5': 'c557841d5e50261777a6585648adf439', + 'info_dict': { + 'id': '1812978515', + 'ext': 'mp3', + 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", + 'duration': 9.8485, + 'uploader': 'youtube-dl "\'/\\ä↭', + 'upload_date': '20121129', + 'timestamp': 1354224127, + }, + '_skip': 'There is a limit of 200 free downloads / month for the test song' + }, { + # free download + 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', + 'info_dict': { + 'id': '2650410135', + 'ext': 'aiff', + 'title': 'Ben Prunty - Lanius (Battle)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Ben Prunty', + 'timestamp': 1396508491, + 'upload_date': '20140403', + 'release_timestamp': 1396483200, + 'release_date': '20140403', + 'duration': 260.877, + 'track': 'Lanius (Battle)', + 'track_number': 1, + 'track_id': '2650410135', + 'artist': 'Ben Prunty', + 'album': 'FTL: Advanced Edition Soundtrack', + }, + }, { + # no free download, mp3 128 + 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', + 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7', + 'info_dict': { + 'id': '2584466013', + 'ext': 'mp3', + 'title': 'Mastodon - Hail to Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mastodon', + 'timestamp': 1322005399, + 'upload_date': '20111122', + 'release_timestamp': 1076112000, + 'release_date': '20040207', + 'duration': 120.79, + 'track': 'Hail to Fire', + 'track_number': 5, + 'track_id': '2584466013', + 'artist': 'Mastodon', + 'album': 'Call of the Mastodon', + }, + }] + + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + + def _real_extract(self, url): + title = self._match_id(url) + webpage = self._download_webpage(url, title) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) + + track_id = None + track = None + track_number = None + duration = None + + formats = [] + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): + for format_id, format_url in file_.items(): + if not url_or_none(format_url): + continue + ext, abr_str = format_id.split('-', 1) + formats.append({ + 'format_id': format_id, + 'url': self._proto_relative_url(format_url, 'http:'), + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + 'abr': int_or_none(abr_str), + }) + track = track_info.get('title') + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) + 
track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) + + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') + timestamp = unified_timestamp( + current.get('publish_date') or tralbum.get('album_publish_date')) + + download_link = tralbum.get('freeDownloadPage') + if download_link: + track_id = compat_str(tralbum['id']) + + download_webpage = self._download_webpage( + download_link, track_id, 'Downloading free downloads page') + + blob = self._extract_data_attr(download_webpage, track_id, 'blob') + + info = try_get( + blob, (lambda x: x['digital_items'][0], + lambda x: x['download_items'][0]), dict) + if info: + downloads = info.get('downloads') + if isinstance(downloads, dict): + if not track: + track = info.get('title') + if not artist: + artist = info.get('artist') + if not thumbnail: + thumbnail = info.get('thumb_url') + + download_formats = {} + download_formats_list = blob.get('download_formats') + if isinstance(download_formats_list, list): + for f in blob['download_formats']: + name, ext = f.get('name'), f.get('file_extension') + if all(isinstance(x, compat_str) for x in (name, ext)): + download_formats[name] = ext.strip('.') + + for format_id, f in downloads.items(): + format_url = f.get('url') + if not format_url: + continue + # Stat URL generation algorithm is reverse engineered from + # download_*_bundle_*.js + stat_url = update_url_query( + format_url.replace('/download/', '/statdownload/'), { + '.rand': int(time.time() * 1000 * random.random()), + }) + format_id = f.get('encoding_name') or format_id + stat = self._download_json( + stat_url, track_id, 'Downloading %s JSON' % format_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], + fatal=False) + if not stat: + continue + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: + continue + formats.append({ + 'url': self._proto_relative_url(retry_url, 'http:'), + 'ext': download_formats.get(format_id), + 'format_id': format_id, + 'format_note': f.get('description'), + 'filesize': parse_filesize(f.get('size_mb')), + 'vcodec': 'none', + }) + + self._sort_formats(formats) + + title = '%s - %s' % (artist, track) if artist else track + + if not duration: + duration = float_or_none(self._html_search_meta( + 'duration', webpage, default=None)) + + return { + 'id': track_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': artist, + 'timestamp': timestamp, + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), + 'duration': duration, + 'track': track, + 'track_number': track_number, + 'track_id': track_id, + 'artist': artist, + 'album': embed.get('album_title'), + 'formats': formats, + } + + +class BandcampAlbumIE(BandcampIE): + IE_NAME = 'Bandcamp:album' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?' 
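+ # The (?!/music) lookahead leaves whole-discography pages to the + # BandcampMusicIE extractor defined further below, while the optional + # /album/<id> group lets bare artist subdomains fall back to the artist's + # default album. A quick sanity check (illustrative only, not used by the + # extractor itself): + # >>> re.match(BandcampAlbumIE._VALID_URL, 'http://dotscale.bandcamp.com') is not None + # True + # >>> re.match(BandcampAlbumIE._VALID_URL, 'https://bandcamp.com/music') is not None + # False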
+ + _TESTS = [{ + 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', + 'playlist': [ + { + 'md5': '39bc1eded3476e927c724321ddf116cf', + 'info_dict': { + 'id': '1353101989', + 'ext': 'mp3', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', + } + }, + { + 'md5': '1a2c32e2691474643e912cc6cd4bffaa', + 'info_dict': { + 'id': '38097443', + 'ext': 'mp3', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', + } + }, + ], + 'info_dict': { + 'title': 'Jazz Format Mixtape vol.1', + 'id': 'jazz-format-mixtape-vol-1', + 'uploader_id': 'blazo', + }, + 'params': { + 'playlistend': 2 + }, + 'skip': 'Bandcamp imposes download limits.' + }, { + 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave', + 'info_dict': { + 'title': 'Hierophany of the Open Grave', + 'uploader_id': 'nightbringer', + 'id': 'hierophany-of-the-open-grave', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://dotscale.bandcamp.com', + 'info_dict': { + 'title': 'Loom', + 'id': 'dotscale', + 'uploader_id': 'dotscale', + }, + 'playlist_mincount': 7, + }, { + # with escaped quote in title + 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', + 'info_dict': { + 'title': '"Entropy" EP', + 'uploader_id': 'jstrecords', + 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', + }, + 'playlist_mincount': 3, + }, { + # not all tracks have songs + 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague', + 'info_dict': { + 'id': 'we-are-the-plague', + 'title': 'WE ARE THE PLAGUE', + 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', + }, + 'playlist_count': 2, + }] + + @classmethod + def suitable(cls, url): + return (False + if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) + else super(BandcampAlbumIE, cls).suitable(url)) + + def _real_extract(self, url): + uploader_id, album_id = self._match_valid_url(url).groups() + playlist_id = album_id or uploader_id + webpage = self._download_webpage(url, playlist_id) + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: + raise ExtractorError('The page doesn\'t contain any tracks') + # Only tracks with duration info have songs + entries = [ + self.url_result( + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] + + current = tralbum.get('current') or {} + + return { + '_type': 'playlist', + 'uploader_id': uploader_id, + 'id': playlist_id, + 'title': current.get('title'), + 'description': current.get('about'), + 'entries': entries, + } + + +class BandcampWeeklyIE(BandcampIE): + IE_NAME = 'Bandcamp:weekly' + _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://bandcamp.com/?show=224', + 'md5': 'b00df799c733cf7e0c567ed187dea0fd', + 'info_dict': { + 'id': '224', + 'ext': 'opus', + 'title': 'BC Weekly April 4th 2017 - Magic Moments', + 'description': 'md5:5d48150916e8e02d030623a48512c874', + 'duration': 5829.77, + 'release_date': '20170404', + 'series': 'Bandcamp Weekly', + 'episode': 'Magic Moments', + 'episode_id': '224', + }, + 'params': { + 'format': 'opus-lo', + }, + }, { + 'url': 'https://bandcamp.com/?blah/blah@&show=228', + 'only_matching': True + }] + + def _real_extract(self, url): + show_id = 
self._match_id(url) + webpage = self._download_webpage(url, show_id) + + blob = self._extract_data_attr(webpage, show_id, 'blob') + + show = blob['bcw_data'][show_id] + + formats = [] + for format_id, format_url in show['audio_stream'].items(): + if not url_or_none(format_url): + continue + for known_ext in KNOWN_EXTENSIONS: + if known_ext in format_id: + ext = known_ext + break + else: + ext = None + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': ext, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = show.get('audio_title') or 'Bandcamp Weekly' + subtitle = show.get('subtitle') + if subtitle: + title += ' - %s' % subtitle + + return { + 'id': show_id, + 'title': title, + 'description': show.get('desc') or show.get('short_desc'), + 'duration': float_or_none(show.get('audio_duration')), + 'is_live': False, + 'release_date': unified_strdate(show.get('published_date')), + 'series': 'Bandcamp Weekly', + 'episode': show.get('subtitle'), + 'episode_id': show_id, + 'formats': formats + } + + +class BandcampMusicIE(InfoExtractor): + _VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music' + _TESTS = [{ + 'url': 'https://steviasphere.bandcamp.com/music', + 'playlist_mincount': 47, + 'info_dict': { + 'id': 'steviasphere', + }, + }, { + 'url': 'https://coldworldofficial.bandcamp.com/music', + 'playlist_mincount': 10, + 'info_dict': { + 'id': 'coldworldofficial', + }, + }, { + 'url': 'https://nuclearwarnowproductions.bandcamp.com/music', + 'playlist_mincount': 399, + 'info_dict': { + 'id': 'nuclearwarnowproductions', + }, + } + ] + + _TYPE_IE_DICT = { + 'album': BandcampAlbumIE.ie_key(), + 'track': BandcampIE.ie_key() + } + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + items = re.findall(r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage) + entries = [ + self.url_result( + f'https://{id}.bandcamp.com/{item[0]}', + ie=self._TYPE_IE_DICT[item[1]]) + for item in items] + return self.playlist_result(entries, id) diff --git a/yt_dlp/extractor/bannedvideo.py b/yt_dlp/extractor/bannedvideo.py new file mode 100644 index 000000000..3db1151f6 --- /dev/null +++ b/yt_dlp/extractor/bannedvideo.py @@ -0,0 +1,158 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + try_get, + int_or_none, + url_or_none, + float_or_none, + unified_timestamp, +) + + +class BannedVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?banned\.video/watch\?id=(?P<id>[0-f]{24})' + _TESTS = [{ + 'url': 'https://banned.video/watch?id=5e7a859644e02200c6ef5f11', + 'md5': '14b6e81d41beaaee2215cd75c6ed56e4', + 'info_dict': { + 'id': '5e7a859644e02200c6ef5f11', + 'ext': 'mp4', + 'title': 'China Discovers Origin of Corona Virus: Issues Emergency Statement', + 'thumbnail': r're:^https?://(?:www\.)?assets\.infowarsmedia.com/images/', + 'description': 'md5:560d96f02abbebe6c6b78b47465f6b28', + 'upload_date': '20200324', + 'timestamp': 1585087895, + } + }] + + _GRAPHQL_GETMETADATA_QUERY = ''' +query GetVideoAndComments($id: String!) { + getVideo(id: $id) { + streamUrl + directUrl + unlisted + live + tags { + name + } + title + summary + playCount + largeImage + videoDuration + channel { + _id + title + } + createdAt + } + getVideoComments(id: $id, limit: 999999, offset: 0) { + _id + content + user { + _id + username + } + voteCount { + positive + } + createdAt + replyCount + } +}''' + + _GRAPHQL_GETCOMMENTSREPLIES_QUERY = ''' +query GetCommentReplies($id: String!) 
{ + getCommentReplies(id: $id, limit: 999999, offset: 0) { + _id + content + user { + _id + username + } + voteCount { + positive + } + createdAt + replyCount + } +}''' + + _GRAPHQL_QUERIES = { + 'GetVideoAndComments': _GRAPHQL_GETMETADATA_QUERY, + 'GetCommentReplies': _GRAPHQL_GETCOMMENTSREPLIES_QUERY, + } + + def _call_api(self, video_id, id, operation, note): + return self._download_json( + 'https://api.infowarsmedia.com/graphql', video_id, note=note, + headers={ + 'Content-Type': 'application/json; charset=utf-8' + }, data=json.dumps({ + 'variables': {'id': id}, + 'operationName': operation, + 'query': self._GRAPHQL_QUERIES[operation] + }).encode('utf8')).get('data') + + def _get_comments(self, video_id, comments, comment_data): + yield from comments + for comment in comment_data.copy(): + comment_id = comment.get('_id') + if comment.get('replyCount') > 0: + reply_json = self._call_api( + video_id, comment_id, 'GetCommentReplies', + f'Downloading replies for comment {comment_id}') + for reply in reply_json.get('getCommentReplies'): + yield self._parse_comment(reply, comment_id) + + @staticmethod + def _parse_comment(comment_data, parent): + return { + 'id': comment_data.get('_id'), + 'text': comment_data.get('content'), + 'author': try_get(comment_data, lambda x: x['user']['username']), + 'author_id': try_get(comment_data, lambda x: x['user']['_id']), + 'timestamp': unified_timestamp(comment_data.get('createdAt')), + 'parent': parent, + 'like_count': try_get(comment_data, lambda x: x['voteCount']['positive']), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_json = self._call_api(video_id, video_id, 'GetVideoAndComments', 'Downloading video metadata') + video_info = video_json['getVideo'] + is_live = video_info.get('live') + comments = [self._parse_comment(comment, 'root') for comment in video_json.get('getVideoComments')] + + formats = [{ + 'format_id': 'direct', + 'quality': 1, + 'url': video_info.get('directUrl'), + 'ext': 'mp4', + }] if url_or_none(video_info.get('directUrl')) else [] + if video_info.get('streamUrl'): + formats.extend(self._extract_m3u8_formats( + video_info.get('streamUrl'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', live=True)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info.get('title')[:-1], + 'formats': formats, + 'is_live': is_live, + 'description': video_info.get('summary'), + 'channel': try_get(video_info, lambda x: x['channel']['title']), + 'channel_id': try_get(video_info, lambda x: x['channel']['_id']), + 'view_count': int_or_none(video_info.get('playCount')), + 'thumbnail': url_or_none(video_info.get('largeImage')), + 'duration': float_or_none(video_info.get('videoDuration')), + 'timestamp': unified_timestamp(video_info.get('createdAt')), + 'tags': [tag.get('name') for tag in video_info.get('tags')], + 'availability': self._availability(is_unlisted=video_info.get('unlisted')), + 'comments': comments, + '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments')) + } diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py new file mode 100644 index 000000000..4e2dcd76b --- /dev/null +++ b/yt_dlp/extractor/bbc.py @@ -0,0 +1,1622 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import itertools +import json +import re + +from .common import InfoExtractor +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, 
+ OnDemandPagedList, + clean_html, + dict_get, + float_or_none, + get_element_by_class, + int_or_none, + js_to_json, + parse_duration, + parse_iso8601, + parse_qs, + strip_or_none, + try_get, + unescapeHTML, + unified_timestamp, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class BBCCoUkIE(InfoExtractor): + IE_NAME = 'bbc.co.uk' + IE_DESC = 'BBC iPlayer' + _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?bbc\.co\.uk/ + (?: + programmes/(?!articles/)| + iplayer(?:/[^/]+)?/(?:episode/|playlist/)| + music/(?:clips|audiovideo/popular)[/#]| + radio/player/| + sounds/play/| + events/[^/]+/play/[^/]+/ + ) + (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) + ''' % _ID_REGEX + + _LOGIN_URL = 'https://account.bbc.com/signin' + _NETRC_MACHINE = 'bbc' + + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ + # Provides HQ HLS streams with even better quality that pc mediaset but fails + # with geolocation in some cases when it's even not geo restricted at all (e.g. + # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. + 'iptv-all', + 'pc', + ] + + _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' + + _TESTS = [ + { + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', + 'info_dict': { + 'id': 'b039d07m', + 'ext': 'flv', + 'title': 'Kaleidoscope, Leonard Cohen', + 'description': 'The Canadian poet and songwriter reflects on his musical career.', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Man in Black: Series 3: The Printed Name', + 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", + 'duration': 1800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Voice UK: Series 3: Blind Auditions 5', + 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.', + 'duration': 5100, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', + 'info_dict': { + 'id': 'b03k3pb7', + 'ext': 'flv', + 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", + 'description': '2. 
Invasion', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, { + 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', + 'info_dict': { + 'id': 'b04v209v', + 'ext': 'flv', + 'title': 'Pete Tong, The Essential New Tune Special', + 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', + 'note': 'Audio', + 'info_dict': { + 'id': 'p022h44j', + 'ext': 'flv', + 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', + 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", + 'duration': 227, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', + 'note': 'Video', + 'info_dict': { + 'id': 'p025c103', + 'ext': 'flv', + 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', + 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', + 'duration': 226, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', + 'info_dict': { + 'id': 'p02n76xf', + 'ext': 'flv', + 'title': 'Natural World, 2015-2016: 2. Super Powered Owls', + 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + # iptv-all mediaset fails with geolocation however there is no geo restriction + # for this programme at all + 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', + 'info_dict': { + 'id': 'b06rkms3', + 'ext': 'flv', + 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", + 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Now it\'s really geo-restricted', + }, { + # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', + 'note': 'Audio', + 'info_dict': { + 'id': 'm0007jz9', + 'ext': 'mp4', + 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', + 'description': "Live BBC Proms. 
West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", + 'duration': 9840, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/m00005xn', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', + 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading signin page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url')) + + response, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + if self._LOGIN_URL in urlh.geturl(): + error = clean_html(get_element_by_class('form-message', response)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id + + def _extract_asx_playlist(self, connection, programme_id): + asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') + return [ref.get('href') for ref in asx.findall('./Entry/ref')] + + def _extract_items(self, playlist): + return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + + def _extract_medias(self, media_selection): + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] + + def _extract_connections(self, media): + return media.get('connection') or [] + + def _get_subtitles(self, media, programme_id): + subtitles = {} + for connection in self._extract_connections(media): + cc_url = url_or_none(connection.get('href')) + if not cc_url: + continue + captions = self._download_xml( + cc_url, programme_id, 'Downloading captions', fatal=False) + if not isinstance(captions, compat_etree_Element): + continue + subtitles['en'] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + ] + break + return subtitles + + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + + def _download_media_selector(self, programme_id): + last_exception = None + for media_set in self._MEDIA_SETS: + try: + return self._download_media_selector_url( + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) + except BBCCoUkIE.MediaSelectionError as e: + if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): + last_exception = e + continue + 
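+ # Geo/availability errors ('notukerror', 'geolocation', + # 'selectionunavailable') fall through to the next media set above; any + # other mediaselector error aborts extraction immediately, and only after + # every media set has failed is the last saved geo error raised.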
self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) + + def _download_media_selector_url(self, url, programme_id=None): + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', + expected_status=(403, 404)) + return self._process_media_selector(media_selection, programme_id) + + def _process_media_selector(self, media_selection, programme_id): + formats = [] + subtitles = None + urls = [] + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + href = connection.get('href') + if href in urls: + continue + if href: + urls.append(href) + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + if not supplier and bitrate: + format_id += '-%d' % bitrate + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'tbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol in ('http', 'https'): + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + else: + continue + formats.append(fmt) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + return formats, subtitles + + def _download_playlist(self, playlist_id): + try: + playlist = self._download_json( + 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + playlist_id, 'Downloading playlist JSON') + + version = playlist.get('defaultAvailableVersion') + if version: + smp_config = version['smpConfig'] + title = smp_config['title'] + description = smp_config['summary'] + for item in smp_config['items']: + kind = item['kind'] + if kind not in ('programme', 'radioProgramme'): + continue + programme_id = item.get('vpid') + duration = int_or_none(item.get('duration')) + formats, subtitles = 
self._download_media_selector(programme_id) + return programme_id, title, description, duration, formats, subtitles + except ExtractorError as ee: + if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + raise + + # fallback to legacy playlist + return self._process_legacy_playlist(playlist_id) + + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): + no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) + if no_items is not None: + reason = no_items.get('reason') + if reason == 'preAvailability': + msg = 'Episode %s is not yet available' % playlist_id + elif reason == 'postAvailability': + msg = 'Episode %s is no longer available' % playlist_id + elif reason == 'noMedia': + msg = 'Episode %s is not currently available' % playlist_id + else: + msg = 'Episode %s is not available: %s' % (playlist_id, reason) + raise ExtractorError(msg, expected=True) + + for item in self._extract_items(playlist): + kind = item.get('kind') + if kind not in ('programme', 'radioProgramme'): + continue + title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text + description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) + description = description_el.text if description_el is not None else None + + def get_programme_id(item): + def get_from_attributes(item): + for p in ('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) + duration = int_or_none(item.get('duration')) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + else: + formats, subtitles = self._process_media_selector(item, playlist_id) + programme_id = playlist_id + + return programme_id, title, description, duration, formats, subtitles + + def _real_extract(self, url): + group_id = self._match_id(url) + + webpage = self._download_webpage(url, group_id, 'Downloading video page') + + error = self._search_regex( + r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + programme_id = None + duration = None + + tviplayer = self._search_regex( + r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', + webpage, 'player', default=None) + + if tviplayer: + player = self._parse_json(tviplayer, group_id).get('player', {}) + duration = int_or_none(player.get('duration')) + programme_id = player.get('vpid') + + if not programme_id: + programme_id = self._search_regex( + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', + 
r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title') + description = self._search_regex( + (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'), + webpage, 'description', default=None) + if not description: + description = self._html_search_meta('description', webpage) + else: + programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) + + self._sort_formats(formats) + + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' + + _MEDIA_SETS = [ + 'pc', + 'mobile-tablet-main', + ] + + _TESTS = [{ + # article with multiple videos embedded with data-playable containing vpids + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade', + 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', + }, + 'playlist_count': 2, + }, { + # article with multiple videos embedded with data-playable (more videos) + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', + }, + 'playlist_count': 9, + 'skip': 'Save time', + }, { + # article with multiple videos embedded with `new SMP()` + # broken + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BUGGER', + }, + 'playlist_count': 18, + }, { + # single video embedded with data-playable containing vpid + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', + 'duration': 47, + 'timestamp': 1427219242, + 'upload_date': '20150324', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and playlist with f4m and m3u8 as streamingUrl + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'info_dict': { + 'id': '150615_telabyad_kentin_cogu', + 'ext': 'mp4', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', + 'timestamp': 1434397334, + 'upload_date': '20150615', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video embedded with data-playable containing XML playlists (regional section) + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'info_dict': { + 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', + 'timestamp': 1434713142, + 'upload_date': '20150619', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video from video 
playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + 'duration': 56, + 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'mp4', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1415867444, + 'upload_date': '20141113', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video embedded with Morph + 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', + 'info_dict': { + 'id': 'p041vhd0', + 'ext': 'mp4', + 'title': "Nigeria v Japan - Men's First Round", + 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', + 'duration': 7980, + 'uploader': 'BBC Sport', + 'uploader_id': 'bbc_sport', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to UK', + }, { + # single video with playlist.sxml URL in playlist param + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with multiple videos embedded with playlist.sxml in playlist param + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': '34475836', + 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', + 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', + }, + 'playlist_count': 3, + }, { + # school report article with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, + }, { + # single video article 
embedded with data-media-vpid + 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, + }, { + 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', + 'info_dict': { + 'id': 'p06w9tws', + 'ext': 'mp4', + 'title': 'md5:2fabf12a726603193a2879a055f72514', + 'description': 'Learn English words and phrases from this story', + }, + 'add_ie': [BBCCoUkIE.ie_key()], + }, { + # BBC Reel + 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness', + 'info_dict': { + 'id': 'p07c6sb9', + 'ext': 'mp4', + 'title': 'How positive thinking is harming your happiness', + 'alt_title': 'The downsides of positive thinking', + 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'duration': 235, + 'thumbnail': r're:https?://.+/p07c9dsr.jpg', + 'upload_date': '20190604', + 'categories': ['Psychology'], + }, + }] + + @classmethod + def suitable(cls, url): + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) + return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) + else super(BBCIE, cls).suitable(url)) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. 
+ # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + + def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(url, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) + timestamp = json_ld_info.get('timestamp') + + playlist_title = json_ld_info.get('title') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)'], + webpage, 'date', default=None)) + + entries = [] + + # article with multiple videos embedded with playlist.sxml (e.g. + # http://www.bbc.com/sport/0/football/34475836) + playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) + if playlists: + entries = [ + self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) + for playlist_url in playlists] + + # news article with multiple videos embedded with data-playable + data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) + if data_playables: + for _, data_playable_json in data_playables: + data_playable = self._parse_json( + unescapeHTML(data_playable_json), playlist_id, fatal=False) + if not data_playable: + continue + settings = data_playable.get('settings', {}) + if settings: + # data-playable with video vpid in settings.playlistObject.items (e.g. 
+                    # http://www.bbc.com/news/world-us-canada-34473351)
+                    playlist_object = settings.get('playlistObject', {})
+                    if playlist_object:
+                        items = playlist_object.get('items')
+                        if items and isinstance(items, list):
+                            title = playlist_object['title']
+                            description = playlist_object.get('summary')
+                            duration = int_or_none(items[0].get('duration'))
+                            programme_id = items[0].get('vpid')
+                            formats, subtitles = self._download_media_selector(programme_id)
+                            self._sort_formats(formats)
+                            entries.append({
+                                'id': programme_id,
+                                'title': title,
+                                'description': description,
+                                'timestamp': timestamp,
+                                'duration': duration,
+                                'formats': formats,
+                                'subtitles': subtitles,
+                            })
+                else:
+                    # data-playable without a vpid but with playlist.sxml URLs
+                    # in otherSettings.playlist (e.g.
+                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
+                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
+                    if playlist:
+                        entry = None
+                        for key in ('streaming', 'progressiveDownload'):
+                            playlist_url = playlist.get('%sUrl' % key)
+                            if not playlist_url:
+                                continue
+                            try:
+                                info = self._extract_from_playlist_sxml(
+                                    playlist_url, playlist_id, timestamp)
+                                if not entry:
+                                    entry = info
+                                else:
+                                    entry['title'] = info['title']
+                                    entry['formats'].extend(info['formats'])
+                            except ExtractorError as e:
+                                # Some playlist URLs may fail with 500 while others
+                                # work fine (e.g.
+                                # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+                                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                    continue
+                                raise
+                        if entry:
+                            self._sort_formats(entry['formats'])
+                            entries.append(entry)
+
+        if entries:
+            return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+        # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
+        group_id = self._search_regex(
+            r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
+            webpage, 'group id', default=None)
+        if group_id:
+            return self.url_result(
+                'https://www.bbc.co.uk/programmes/%s' % group_id,
+                ie=BBCCoUkIE.ie_key())
+
+        # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        programme_id = self._search_regex(
+            [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
+             r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
+             r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
+            webpage, 'vpid', default=None)
+
+        if programme_id:
+            formats, subtitles = self._download_media_selector(programme_id)
+            self._sort_formats(formats)
+            # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
+            digital_data = self._parse_json(
+                self._search_regex(
+                    r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
+                programme_id, fatal=False)
+            page_info = digital_data.get('page', {}).get('pageInfo', {})
+            title = page_info.get('pageName') or self._og_search_title(webpage)
+            description = page_info.get('description') or self._og_search_description(webpage)
+            timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
+            return {
+                'id': programme_id,
+                'title': title,
+                'description': description,
+                'timestamp': timestamp,
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+        # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
+        initial_data = self._parse_json(self._html_search_regex(
+            r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
+            webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
+        if initial_data:
+            init_data = try_get(
+                initial_data, lambda x: x['initData']['items'][0], dict) or {}
+            smp_data = init_data.get('smpData') or {}
+            clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
+            version_id = clip_data.get('versionID')
+            if version_id:
+                title = smp_data['title']
+                formats, subtitles = self._download_media_selector(version_id)
+                self._sort_formats(formats)
+                image_url = smp_data.get('holdingImageURL')
+                display_date = init_data.get('displayDate')
+                topic_title = init_data.get('topicTitle')
+
+                return {
+                    'id': version_id,
+                    'title': title,
+                    'formats': formats,
+                    'alt_title': init_data.get('shortTitle'),
+                    'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
+                    'description': smp_data.get('summary') or init_data.get('shortSummary'),
+                    'upload_date': display_date.replace('-', '') if display_date else None,
+                    'subtitles': subtitles,
+                    'duration': int_or_none(clip_data.get('duration')),
+                    'categories': [topic_title] if topic_title else None,
+                }
+
+        # Morph-based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
+        # Several setPayload calls may be present, but the video always
+        # seems to relate to the first one
+        morph_payload = self._parse_json(
+            self._search_regex(
+                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
+                webpage, 'morph payload', default='{}'),
+            playlist_id, fatal=False)
+        if morph_payload:
+            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
+            for component in components:
+                if not isinstance(component, dict):
+                    continue
+                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
+                if not lead_media:
+                    continue
+                identifiers = lead_media.get('identifiers')
+                if not identifiers or not isinstance(identifiers, dict):
+                    continue
+                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+                if not programme_id:
+                    continue
+                title = lead_media.get('title') or self._og_search_title(webpage)
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                description = lead_media.get('summary')
+                uploader = lead_media.get('masterBrand')
+                uploader_id = lead_media.get('mid')
+                duration = None
+                duration_d = lead_media.get('duration')
+                if isinstance(duration_d, dict):
+                    duration = parse_duration(dict_get(
+                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
+                return {
+                    'id': programme_id,
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'uploader': uploader,
+                    'uploader_id': uploader_id,
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
+
+        preload_state = self._parse_json(self._search_regex(
+            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), playlist_id, fatal=False)
+        if preload_state:
+            current_programme = preload_state.get('programmes', {}).get('current') or {}
+            programme_id = current_programme.get('id')
+            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
+                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                synopses = 
current_programme.get('synopses') or {} + network = current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', 'raw') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } + bbc3_playlist = try_get( + payload, lambda x: x['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + 
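+                    # only blocks of type 'media' in an 'article' response carry a
+                    # playable item, nested under block['model'] (handled by parse_media)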
for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], + playlist_id, playlist_title, playlist_description) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = extract_all(r"data-media-meta='({[^']+})'") + + if not medias: + # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( + self._search_regex( + r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>', + webpage, 'playlist data'), + playlist_id) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats and not self.get_param('ignore_no_formats'): + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + + return 
self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title, description = self._extract_title_and_description(webpage) + + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) + + +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def _get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': 
self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = parse_qs(url) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', + }, + 'playlist_mincount': 8, + }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def _get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ + # Available for over a year unlike 30 days for most other programmes + 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', + 'info_dict': { + 'id': 'p02tcc32', + 'title': 'Bohemian Icons', + 'description': 
'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, + }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') + + +class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' + _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance - Clips - BBC Four', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', + 'only_matching': True, + }] + + def _extract_title_and_description(self, webpage): + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return title, description diff --git a/yt_dlp/extractor/beatport.py b/yt_dlp/extractor/beatport.py new file mode 100644 index 000000000..e1cf8b4fe --- /dev/null +++ b/yt_dlp/extractor/beatport.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class BeatportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 
'https://beatport.com/track/synesthesia-original-mix/5379371', + 'md5': 'b3c34d8639a2f6a7f734382358478887', + 'info_dict': { + 'id': '5379371', + 'display_id': 'synesthesia-original-mix', + 'ext': 'mp4', + 'title': 'Froxic - Synesthesia (Original Mix)', + }, + }, { + 'url': 'https://beatport.com/track/love-and-war-original-mix/3756896', + 'md5': 'e44c3025dfa38c6577fbaeb43da43514', + 'info_dict': { + 'id': '3756896', + 'display_id': 'love-and-war-original-mix', + 'ext': 'mp3', + 'title': 'Wolfgang Gartner - Love & War (Original Mix)', + }, + }, { + 'url': 'https://beatport.com/track/birds-original-mix/4991738', + 'md5': 'a1fd8e8046de3950fd039304c186c05f', + 'info_dict': { + 'id': '4991738', + 'display_id': 'birds-original-mix', + 'ext': 'mp4', + 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + track_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + playables = self._parse_json( + self._search_regex( + r'window\.Playables\s*=\s*({.+?});', webpage, + 'playables info', flags=re.DOTALL), + track_id) + + track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) + + title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] + if track['mix']: + title += ' (' + track['mix'] + ')' + + formats = [] + for ext, info in track['preview'].items(): + if not info['url']: + continue + fmt = { + 'url': info['url'], + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } + if ext == 'mp3': + fmt['acodec'] = 'mp3' + fmt['abr'] = 96 + fmt['asr'] = 44100 + elif ext == 'mp4': + fmt['acodec'] = 'aac' + fmt['abr'] = 96 + fmt['asr'] = 44100 + formats.append(fmt) + self._sort_formats(formats) + + images = [] + for name, info in track['images'].items(): + image_url = info.get('url') + if name == 'dynamic' or not image_url: + continue + image = { + 'id': name, + 'url': image_url, + 'height': int_or_none(info.get('height')), + 'width': int_or_none(info.get('width')), + } + images.append(image) + + return { + 'id': compat_str(track.get('id')) or track_id, + 'display_id': track.get('slug') or display_id, + 'title': title, + 'formats': formats, + 'thumbnails': images, + } diff --git a/yt_dlp/extractor/beeg.py b/yt_dlp/extractor/beeg.py new file mode 100644 index 000000000..8fbabe708 --- /dev/null +++ b/yt_dlp/extractor/beeg.py @@ -0,0 +1,116 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + int_or_none, + parse_qs, + unified_timestamp, +) + + +class BeegIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)' + _TESTS = [{ + # api/v6 v1 + 'url': 'http://beeg.com/5416503', + 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', + 'info_dict': { + 'id': '5416503', + 'ext': 'mp4', + 'title': 'Sultry Striptease', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, + 'age_limit': 18, + } + }, { + # api/v6 v2 + 'url': 'https://beeg.com/1941093077?t=911-1391', + 'only_matching': True, + }, { + # api/v6 v2 w/o t + 'url': 'https://beeg.com/1277207756', + 'only_matching': True, + }, { + 'url': 'https://beeg.porn/video/5416503', + 'only_matching': True, + }, { + 'url': 'https://beeg.porn/5416503', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = 
self._download_webpage(url, video_id) + + beeg_version = self._search_regex( + r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', + default='1546225636701') + + if len(video_id) >= 10: + query = { + 'v': 2, + } + qs = parse_qs(url) + t = qs.get('t', [''])[0].split('-') + if len(t) > 1: + query.update({ + 's': t[0], + 'e': t[1], + }) + else: + query = {'v': 1} + + for api_path in ('', 'api.'): + video = self._download_json( + 'https://%sbeeg.com/api/v6/%s/video/%s' + % (api_path, beeg_version, video_id), video_id, + fatal=api_path == 'api.', query=query) + if video: + break + + formats = [] + for format_id, video_url in video.items(): + if not video_url: + continue + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url( + video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), + 'format_id': format_id, + 'height': int(height), + }) + self._sort_formats(formats) + + title = video['title'] + video_id = compat_str(video.get('id') or video_id) + display_id = video.get('code') + description = video.get('desc') + series = video.get('ps_name') + + timestamp = unified_timestamp(video.get('date')) + duration = int_or_none(video.get('duration')) + + tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'series': series, + 'timestamp': timestamp, + 'duration': duration, + 'tags': tags, + 'formats': formats, + 'age_limit': self._rta_search(webpage), + } diff --git a/yt_dlp/extractor/behindkink.py b/yt_dlp/extractor/behindkink.py new file mode 100644 index 000000000..2c97f9817 --- /dev/null +++ b/yt_dlp/extractor/behindkink.py @@ -0,0 +1,45 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import url_basename + + +class BehindKinkIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' + _TEST = { + 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', + 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', + 'info_dict': { + 'id': '37127', + 'ext': 'mp4', + 'title': 'What are you passionate about – Marley Blaze', + 'description': 'md5:aee8e9611b4ff70186f752975d9b94b4', + 'upload_date': '20141205', + 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r'<source src="([^"]+)"', webpage, 'video URL') + video_id = url_basename(video_url).split('_')[0] + upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day') + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'upload_date': upload_date, + 'age_limit': 18, + } diff --git a/yt_dlp/extractor/bellmedia.py b/yt_dlp/extractor/bellmedia.py new file mode 100644 index 000000000..904c17ed0 --- /dev/null +++ b/yt_dlp/extractor/bellmedia.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class 
BellMediaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?P<domain> + (?: + ctv| + tsn| + bnn(?:bloomberg)?| + thecomedynetwork| + discovery| + discoveryvelocity| + sciencechannel| + investigationdiscovery| + animalplanet| + bravo| + mtv| + space| + etalk| + marilyn + )\.ca| + (?:much|cp24)\.com + )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' + _TESTS = [{ + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', + 'info_dict': { + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, + }, + }, { + 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', + 'only_matching': True, + }, { + 'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549', + 'only_matching': True, + }, { + 'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654', + 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009', + 'only_matching': True, + }, { + 'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016', + 'only_matching': True, + }, { + 'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6', + 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430', + 'only_matching': True, + }, { + 'url': 'http://www.etalk.ca/video?videoid=663455', + 'only_matching': True, + }, { + 'url': 'https://www.cp24.com/video?clipId=1982548', + 'only_matching': True, + }] + _DOMAINS = { + 'thecomedynetwork': 'comedy', + 'discoveryvelocity': 'discvel', + 'sciencechannel': 'discsci', + 'investigationdiscovery': 'invdisc', + 'animalplanet': 'aniplan', + 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', + 'marilyn': 'ctv_marilyn', + } + + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).groups() + domain = domain.split('.')[0] + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id), + 'ie_key': 'NineCNineMedia', + } diff --git a/yt_dlp/extractor/bet.py b/yt_dlp/extractor/bet.py new file mode 100644 index 000000000..2c7144235 --- /dev/null +++ b/yt_dlp/extractor/bet.py @@ -0,0 +1,82 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor +from ..utils import unified_strdate + +# TODO Remove - Reason: Outdated Site + + +class BetIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' + _TESTS = [ + { + 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', + 'info_dict': { + 'id': '07e96bd3-8850-3051-b856-271b457f0ab8', + 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', + 'ext': 'flv', + 'title': 'A Conversation With President Obama', + 'description': 'President Obama urges persistence in confronting racism and bias.', + 'duration': 1534, + 'upload_date': '20141208', + 'thumbnail': r're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', + 'info_dict': { + 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9', + 'display_id': 
'justice-for-ferguson-a-community-reacts', + 'ext': 'flv', + 'title': 'Justice for Ferguson: A Community Reacts', + 'description': 'A BET News special.', + 'duration': 1696, + 'upload_date': '20141125', + 'thumbnail': r're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + ] + + _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" + + def _get_feed_query(self, uri): + return { + 'uuid': uri, + } + + def _extract_mgid(self, webpage): + return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + mgid = self._extract_mgid(webpage) + videos_info = self._get_videos_info(mgid) + + info_dict = videos_info['entries'][0] + + upload_date = unified_strdate(self._html_search_meta('date', webpage)) + description = self._html_search_meta('description', webpage) + + info_dict.update({ + 'display_id': display_id, + 'description': description, + 'upload_date': upload_date, + }) + + return info_dict diff --git a/youtube_dl/extractor/bfi.py b/yt_dlp/extractor/bfi.py index 60c8944b5..60c8944b5 100644 --- a/youtube_dl/extractor/bfi.py +++ b/yt_dlp/extractor/bfi.py diff --git a/youtube_dl/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py index 501f69d80..501f69d80 100644 --- a/youtube_dl/extractor/bfmtv.py +++ b/yt_dlp/extractor/bfmtv.py diff --git a/youtube_dl/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index 56c2bfee8..56c2bfee8 100644 --- a/youtube_dl/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py diff --git a/youtube_dl/extractor/bigflix.py b/yt_dlp/extractor/bigflix.py index 28e3e59f6..28e3e59f6 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/yt_dlp/extractor/bigflix.py diff --git a/youtube_dl/extractor/bild.py b/yt_dlp/extractor/bild.py index b8dfbd42b..b8dfbd42b 100644 --- a/youtube_dl/extractor/bild.py +++ b/yt_dlp/extractor/bild.py diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py new file mode 100644 index 000000000..d6c77e418 --- /dev/null +++ b/yt_dlp/extractor/bilibili.py @@ -0,0 +1,867 @@ +# coding: utf-8 + +import hashlib +import itertools +import functools +import re +import math + +from .common import InfoExtractor, SearchInfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, + compat_urllib_parse_urlparse +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_iso8601, + traverse_obj, + try_get, + smuggle_url, + srt_subtitles_timecode, + str_or_none, + str_to_int, + strip_jsonp, + unified_timestamp, + unsmuggle_url, + urlencode_postdata, + OnDemandPagedList +) + + +class BiliBiliIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|bangumi)\.)? + bilibili\.(?:tv|com)/ + (?: + (?: + video/[aA][vV]| + anime/(?P<anime_id>\d+)/play\# + )(?P<id>\d+)| + (s/)?video/[bB][vV](?P<id_bv>[^/?#&]+) + ) + (?:/?\?p=(?P<page>\d+))? 
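+                        # optional part number; '#' comments like this are valid here
+                        # because of the (?x) verbose-mode flag at the start of this pattern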
+ ''' + + _TESTS = [{ + 'url': 'http://www.bilibili.com/video/av1074402/', + 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', + 'info_dict': { + 'id': '1074402', + 'ext': 'flv', + 'title': '【金坷垃】金泡沫', + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'duration': 308.067, + 'timestamp': 1398012678, + 'upload_date': '20140420', + 'thumbnail': r're:^https?://.+\.jpg', + 'uploader': '菊子桑', + 'uploader_id': '156160', + }, + }, { + # Tested in BiliBiliBangumiIE + 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', + 'only_matching': True, + }, { + # bilibili.tv + 'url': 'http://www.bilibili.tv/video/av1074402/', + 'only_matching': True, + }, { + 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', + 'md5': '3f721ad1e75030cc06faf73587cfec57', + 'info_dict': { + 'id': '100643', + 'ext': 'mp4', + 'title': 'CHAOS;CHILD', + 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', + }, + 'skip': 'Geo-restricted to China', + }, { + # Title with double quotes + 'url': 'http://www.bilibili.com/video/av8903802/', + 'info_dict': { + 'id': '8903802', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + }, + 'playlist': [{ + 'info_dict': { + 'id': '8903802_part1', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'info_dict': { + 'id': '8903802_part2', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, + }, + }] + }, { + # new BV video id format + 'url': 'https://www.bilibili.com/video/BV1JE411F741', + 'only_matching': True, + }, { + # Anthology + 'url': 'https://www.bilibili.com/video/BV1bK411W797', + 'info_dict': { + 'id': 'BV1bK411W797', + 'title': '物语中的人物是如何吐槽自己的OP的' + }, + 'playlist_count': 17, + }] + + _APP_KEY = 'iVGUTjsxvpLeuDCf' + _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' + + def _report_error(self, result): + if 'message' in result: + raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) + elif 'code' in result: + raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) + else: + raise ExtractorError('Can\'t extract Bangumi episode ID') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = self._match_valid_url(url) + video_id = mobj.group('id_bv') or mobj.group('id') + + av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) + video_id = av_id + + anime_id = mobj.group('anime_id') + page_id = mobj.group('page') + webpage = self._download_webpage(url, video_id) + + # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
+ # If the video has no page argument, check to see if it's an anthology + if page_id is None: + if not self.get_param('noplaylist'): + r = self._extract_anthology_entries(bv_id, video_id, webpage) + if r is not None: + self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id) + return r + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + if 'anime/' not in url: + cid = self._search_regex( + r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid', + default=None + ) or self._search_regex( + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', + default=None + ) or compat_parse_qs(self._search_regex( + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', + r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + webpage, 'player parameters'))['cid'][0] + else: + if 'no_bangumi_tip' not in smuggled_data: + self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run yt-dlp with %s' % ( + video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) + headers = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url + } + headers.update(self.geo_verification_headers()) + + js = self._download_json( + 'http://bangumi.bilibili.com/web_api/get_source', video_id, + data=urlencode_postdata({'episode_id': video_id}), + headers=headers) + if 'result' not in js: + self._report_error(js) + cid = js['result']['cid'] + + headers = { + 'Accept': 'application/json', + 'Referer': url + } + headers.update(self.geo_verification_headers()) + + entries = [] + + RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') + for num, rendition in enumerate(RENDITIONS, start=1): + payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + + if not video_info: + continue + + if 'durl' not in video_info: + if num < len(RENDITIONS): + continue + self._report_error(video_info) + + for idx, durl in enumerate(video_info['durl']): + formats = [{ + 'url': durl['url'], + 'filesize': int_or_none(durl['size']), + }] + for backup_url in durl.get('backup_url', []): + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'quality': -2 if 'hd.mp4' in backup_url else -3, + }) + + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, + }) + break + + title = self._html_search_regex( + (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + group='title') + + # Get part title for anthologies + if page_id is not None: + # TODO: The json is already downloaded by _extract_anthology_entries. 
Don't redownload for each video + part_title = try_get( + self._download_json( + f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', + video_id, note='Extracting videos in anthology'), + lambda x: x['data'][int(page_id) - 1]['part']) + title = part_title or title + + description = self._html_search_meta('description', webpage) + timestamp = unified_timestamp(self._html_search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', + default=None) or self._html_search_meta( + 'uploadDate', webpage, 'timestamp', default=None)) + thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) + + # TODO 'view_count' requires deobfuscating Javascript + info = { + 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id), + 'cid': cid, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnail': thumbnail, + 'duration': float_or_none(video_info.get('timelength'), scale=1000), + } + + uploader_mobj = re.search( + r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<', + webpage) + if uploader_mobj: + info.update({ + 'uploader': uploader_mobj.group('name').strip(), + 'uploader_id': uploader_mobj.group('id'), + }) + + if not info.get('uploader'): + info['uploader'] = self._html_search_meta( + 'author', webpage, 'uploader', default=None) + + top_level_info = { + 'tags': traverse_obj(self._download_json( + f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}', + video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), + } + + entries[0]['subtitles'] = { + 'danmaku': [{ + 'ext': 'xml', + 'url': f'https://comment.bilibili.com/{cid}.xml', + }] + } + + r''' + # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 + # See https://github.com/animelover1984/youtube-dl + + raw_danmaku = self._download_webpage( + f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments') + danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576) + entries[0]['subtitles'] = { + 'danmaku': [{ + 'ext': 'ass', + 'data': danmaku + }] + } + ''' + + top_level_info['__post_extractor'] = self.extract_comments(video_id) + + for entry in entries: + entry.update(info) + + if len(entries) == 1: + entries[0].update(top_level_info) + return entries[0] + + for idx, entry in enumerate(entries): + entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + + return { + '_type': 'multi_video', + 'id': str(video_id), + 'bv_id': bv_id, + 'title': title, + 'description': description, + 'entries': entries, + **info, **top_level_info + } + + def _extract_anthology_entries(self, bv_id, video_id, webpage): + title = self._html_search_regex( + (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + group='title') + json_data = self._download_json( + f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', + video_id, note='Extracting videos in anthology') + + if json_data['data']: + return self.playlist_from_matches( + json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(), + getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])) + + def _get_video_id_set(self, id, is_bv): + query = {'bvid': id} if is_bv else {'aid': id} + response = self._download_json( + "http://api.bilibili.cn/x/web-interface/view", + id, query=query, + note='Grabbing original ID via API') + + if response['code'] == -400: + raise 
ExtractorError('Video ID does not exist', expected=True, video_id=id) + elif response['code'] != 0: + raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})', + expected=True, video_id=id) + return response['data']['aid'], response['data']['bvid'] + + def _get_comments(self, video_id, commentPageNumber=0): + for idx in itertools.count(1): + replies = traverse_obj( + self._download_json( + f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', + video_id, note=f'Extracting comments from page {idx}'), + ('data', 'replies')) or [] + for children in map(self._get_all_children, replies): + yield from children + + def _get_all_children(self, reply): + yield { + 'author': traverse_obj(reply, ('member', 'uname')), + 'author_id': traverse_obj(reply, ('member', 'mid')), + 'id': reply.get('rpid'), + 'text': traverse_obj(reply, ('content', 'message')), + 'timestamp': reply.get('ctime'), + 'parent': reply.get('parent') or 'root', + } + for children in map(self._get_all_children, reply.get('replies') or []): + yield from children + + +class BiliBiliBangumiIE(InfoExtractor): + _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)' + + IE_NAME = 'bangumi.bilibili.com' + IE_DESC = 'BiliBili番剧' + + _TESTS = [{ + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist_count': 26, + }, { + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist': [{ + 'md5': '91da8621454dd58316851c27c68b0c13', + 'info_dict': { + 'id': '40062', + 'ext': 'mp4', + 'title': '混沌武士', + 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...', + 'timestamp': 1414538739, + 'upload_date': '20141028', + 'episode': '疾风怒涛 Tempestuous Temperaments', + 'episode_number': 1, + }, + }], + 'params': { + 'playlist_items': '1', + }, + }] + + @classmethod + def suitable(cls, url): + return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url) + + def _real_extract(self, url): + bangumi_id = self._match_id(url) + + # Sometimes this API returns a JSONP response + season_info = self._download_json( + 'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id, + bangumi_id, transform_source=strip_jsonp)['result'] + + entries = [{ + '_type': 'url_transparent', + 'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}), + 'ie_key': BiliBiliIE.ie_key(), + 'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '), + 'episode': episode.get('index_title'), + 'episode_number': int_or_none(episode.get('index')), + } for episode in season_info['episodes']] + + entries = sorted(entries, key=lambda entry: entry.get('episode_number')) + + return self.playlist_result( + entries, bangumi_id, + season_info.get('bangumi_title'), season_info.get('evaluate')) + + +class BilibiliChannelIE(InfoExtractor): + _VALID_URL = r'https?://space.bilibili\.com/(?P<id>\d+)' + _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp" + _TESTS = [{ + 'url': 'https://space.bilibili.com/3985676/video', + 'info_dict': {}, + 'playlist_mincount': 112, + }] + + def _entries(self, list_id): + count, max_count = 0, None + + for page_num in itertools.count(1): + data = self._download_json( + self._API_URL % (list_id, 
page_num), list_id, note=f'Downloading page {page_num}')['data'] + + max_count = max_count or try_get(data, lambda x: x['page']['count']) + + entries = try_get(data, lambda x: x['list']['vlist']) + if not entries: + return + for entry in entries: + yield self.url_result( + 'https://www.bilibili.com/video/%s' % entry['bvid'], + BiliBiliIE.ie_key(), entry['bvid']) + + count += len(entries) + if max_count and count >= max_count: + return + + def _real_extract(self, url): + list_id = self._match_id(url) + return self.playlist_result(self._entries(list_id), list_id) + + +class BilibiliCategoryIE(InfoExtractor): + IE_NAME = 'Bilibili category extractor' + _MAX_RESULTS = 1000000 + _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' + _TESTS = [{ + 'url': 'https://www.bilibili.com/v/kichiku/mad', + 'info_dict': { + 'id': 'kichiku: mad', + 'title': 'kichiku: mad' + }, + 'playlist_mincount': 45, + 'params': { + 'playlistend': 45 + } + }] + + def _fetch_page(self, api_url, num_pages, query, page_num): + parsed_json = self._download_json( + api_url, query, query={'Search_key': query, 'pn': page_num}, + note='Extracting results from page %s of %s' % (page_num, num_pages)) + + video_list = try_get(parsed_json, lambda x: x['data']['archives'], list) + if not video_list: + raise ExtractorError('Failed to retrieve video list for page %d' % page_num) + + for video in video_list: + yield self.url_result( + 'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid']) + + def _entries(self, category, subcategory, query): + # map of categories : subcategories : RIDs + rid_map = { + 'kichiku': { + 'mad': 26, + 'manual_vocaloid': 126, + 'guide': 22, + 'theatre': 216, + 'course': 127 + }, + } + + if category not in rid_map: + raise ExtractorError( + f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}') + if subcategory not in rid_map[category]: + raise ExtractorError( + f'The subcategory {subcategory} isn\'t supported for this category. 
Supported subcategories: {list(rid_map[category].keys())}') + rid_value = rid_map[category][subcategory] + + api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value + page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'}) + page_data = try_get(page_json, lambda x: x['data']['page'], dict) + count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size')) + if count is None or not size: + raise ExtractorError('Failed to calculate either page count or size') + + num_pages = math.ceil(count / size) + + return OnDemandPagedList(functools.partial( + self._fetch_page, api_url, num_pages, query), size) + + def _real_extract(self, url): + u = compat_urllib_parse_urlparse(url) + category, subcategory = u.path.split('/')[2:4] + query = '%s: %s' % (category, subcategory) + + return self.playlist_result(self._entries(category, subcategory, query), query, query) + + +class BiliBiliSearchIE(SearchInfoExtractor): + IE_DESC = 'Bilibili video search, "bilisearch" keyword' + _MAX_RESULTS = 100000 + _SEARCH_KEY = 'bilisearch' + + def _search_results(self, query): + for page_num in itertools.count(1): + videos = self._download_json( + 'https://api.bilibili.com/x/web-interface/search/type', query, + note=f'Extracting results from page {page_num}', query={ + 'Search_key': query, + 'keyword': query, + 'page': page_num, + 'context': '', + 'order': 'pubdate', + 'duration': 0, + 'tids_2': '', + '__refresh__': 'true', + 'search_type': 'video', + 'tids': 0, + 'highlight': 1, + })['data'].get('result') or [] + for video in videos: + yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid'])) + + +class BilibiliAudioBaseIE(InfoExtractor): + def _call_api(self, path, sid, query=None): + if not query: + query = {'sid': sid} + return self._download_json( + 'https://www.bilibili.com/audio/music-service-c/web/' + path, + sid, query=query)['data'] + + +class BilibiliAudioIE(BilibiliAudioBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)' + _TEST = { + 'url': 'https://www.bilibili.com/audio/au1003142', + 'md5': 'fec4987014ec94ef9e666d4d158ad03b', + 'info_dict': { + 'id': '1003142', + 'ext': 'm4a', + 'title': '【tsukimi】YELLOW / 神山羊', + 'artist': 'tsukimi', + 'comment_count': int, + 'description': 'YELLOW的mp3版!', + 'duration': 183, + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + 'thumbnail': r're:^https?://.+\.jpg', + 'timestamp': 1564836614, + 'upload_date': '20190803', + 'uploader': 'tsukimi-つきみぐー', + 'view_count': int, + }, + } + + def _real_extract(self, url): + au_id = self._match_id(url) + + play_data = self._call_api('url', au_id) + formats = [{ + 'url': play_data['cdns'][0], + 'filesize': int_or_none(play_data.get('size')), + 'vcodec': 'none' + }] + + song = self._call_api('song/info', au_id) + title = song['title'] + statistic = song.get('statistic') or {} + + subtitles = None + lyric = song.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }] + } + + return { + 'id': au_id, + 'title': title, + 'formats': formats, + 'artist': song.get('author'), + 'comment_count': int_or_none(statistic.get('comment')), + 'description': song.get('intro'), + 'duration': int_or_none(song.get('duration')), + 'subtitles': subtitles, + 'thumbnail': song.get('cover'), + 'timestamp': int_or_none(song.get('passtime')), + 'uploader': song.get('uname'), + 'view_count': int_or_none(statistic.get('play')), + } + + +class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): + 
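+    # Album ('am<id>') playlist: every track is delegated to BilibiliAudioIE
+    # through its https://www.bilibili.com/audio/au<id> URL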
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)' + _TEST = { + 'url': 'https://www.bilibili.com/audio/am10624', + 'info_dict': { + 'id': '10624', + 'title': '每日新曲推荐(每日11:00更新)', + 'description': '每天11:00更新,为你推送最新音乐', + }, + 'playlist_count': 19, + } + + def _real_extract(self, url): + am_id = self._match_id(url) + + songs = self._call_api( + 'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data'] + + entries = [] + for song in songs: + sid = str_or_none(song.get('id')) + if not sid: + continue + entries.append(self.url_result( + 'https://www.bilibili.com/audio/au' + sid, + BilibiliAudioIE.ie_key(), sid)) + + if entries: + album_data = self._call_api('menu/info', am_id) or {} + album_title = album_data.get('title') + if album_title: + for entry in entries: + entry['album'] = album_title + return self.playlist_result( + entries, am_id, album_title, album_data.get('intro')) + + return self.playlist_result(entries, am_id) + + +class BiliBiliPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)' + _TEST = { + 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.bilibili.tv/video/av%s/' % video_id, + ie=BiliBiliIE.ie_key(), video_id=video_id) + + +class BiliIntlBaseIE(InfoExtractor): + _API_URL = 'https://api.bili{}/intl/gateway{}' + + def _call_api(self, type, endpoint, id): + return self._download_json(self._API_URL.format(type, endpoint), id)['data'] + + def json2srt(self, json): + data = '\n\n'.join( + f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' + for i, line in enumerate(json['body'])) + return data + + def _get_subtitles(self, type, ep_id): + sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id) + subtitles = {} + for sub in sub_json.get('subtitles', []): + sub_url = sub.get('url') + if not sub_url: + continue + sub_data = self._download_json(sub_url, ep_id, fatal=False) + if not sub_data: + continue + subtitles.setdefault(sub.get('key', 'en'), []).append({ + 'ext': 'srt', + 'data': self.json2srt(sub_data) + }) + return subtitles + + def _get_formats(self, type, ep_id): + video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id) + if not video_json: + self.raise_login_required(method='cookies') + video_json = video_json['playurl'] + formats = [] + for vid in video_json.get('video', []): + video_res = vid.get('video_resource') or {} + video_info = vid.get('stream_info') or {} + if not video_res.get('url'): + continue + formats.append({ + 'url': video_res['url'], + 'ext': 'mp4', + 'format_note': video_info.get('desc_words'), + 'width': video_res.get('width'), + 'height': video_res.get('height'), + 'vbr': video_res.get('bandwidth'), + 'acodec': 'none', + 'vcodec': video_res.get('codecs'), + 'filesize': video_res.get('size'), + }) + for aud in video_json.get('audio_resource', []): + if not aud.get('url'): + continue + formats.append({ + 'url': aud['url'], + 'ext': 'mp4', + 'abr': aud.get('bandwidth'), + 'acodec': aud.get('codecs'), + 'vcodec': 'none', + 'filesize': aud.get('size'), + }) + + self._sort_formats(formats) + return formats + + def _extract_ep_info(self, type, episode_data, ep_id): + return { + 'id': ep_id, + 'title': episode_data.get('long_title') or episode_data['title'], + 'thumbnail': episode_data.get('cover'), + 
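+            # the API seems to report the episode number in 'title' (a bare string
+            # like '2'), while 'long_title' holds the actual episode name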
'episode_number': str_to_int(episode_data.get('title')),
+            'formats': self._get_formats(type, ep_id),
+            'subtitles': self._get_subtitles(type, ep_id),
+            'extractor_key': BiliIntlIE.ie_key(),
+        }
+
+
+class BiliIntlIE(BiliIntlBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.bilibili.tv/en/play/34613/341736',
+        'info_dict': {
+            'id': '341736',
+            'ext': 'mp4',
+            'title': 'The First Night',
+            'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+            'episode_number': 2,
+        },
+        'params': {
+            'format': 'bv',
+        },
+    }, {
+        'url': 'https://www.biliintl.com/en/play/34613/341736',
+        'info_dict': {
+            'id': '341736',
+            'ext': 'mp4',
+            'title': 'The First Night',
+            'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+            'episode_number': 2,
+        },
+        'params': {
+            'format': 'bv',
+        },
+    }]
+
+    def _real_extract(self, url):
+        type, season_id, id = self._match_valid_url(url).groups()
+        data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id)
+        episode_data = next(
+            episode for episode in data_json.get('episodes', [])
+            if str(episode.get('ep_id')) == id)
+        return self._extract_ep_info(type, episode_data, id)
+
+
+class BiliIntlSeriesIE(BiliIntlBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
+    _TESTS = [{
+        'url': 'https://www.bilibili.tv/en/play/34613',
+        'playlist_mincount': 15,
+        'info_dict': {
+            'id': '34613',
+        },
+        'params': {
+            'skip_download': True,
+            'format': 'bv',
+        },
+    }, {
+        'url': 'https://www.biliintl.com/en/play/34613',
+        'playlist_mincount': 15,
+        'info_dict': {
+            'id': '34613',
+        },
+        'params': {
+            'skip_download': True,
+            'format': 'bv',
+        },
+    }]
+
+    def _entries(self, id, type):
+        data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id)
+        for episode in data_json.get('episodes', []):
+            episode_id = str(episode.get('ep_id'))
+            yield self._extract_ep_info(type, episode, episode_id)
+
+    def _real_extract(self, url):
+        type, id = self._match_valid_url(url).groups()
+        return self.playlist_result(self._entries(id, type), playlist_id=id)
diff --git a/youtube_dl/extractor/biobiochiletv.py b/yt_dlp/extractor/biobiochiletv.py
index dc86c57c5..dc86c57c5 100644
--- a/youtube_dl/extractor/biobiochiletv.py
+++ b/yt_dlp/extractor/biobiochiletv.py
diff --git a/youtube_dl/extractor/biqle.py b/yt_dlp/extractor/biqle.py
index 17ebbb257..17ebbb257 100644
--- a/youtube_dl/extractor/biqle.py
+++ b/yt_dlp/extractor/biqle.py
diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py
new file mode 100644
index 000000000..dcae6f4cc
--- /dev/null
+++ b/yt_dlp/extractor/bitchute.py
@@ -0,0 +1,158 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    GeoRestrictedError,
+    orderedSet,
+    unified_strdate,
+    urlencode_postdata,
+)
+
+
+class BitChuteIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
+        'md5': '7e427d7ed7af5a75b5855705ec750e2b',
+        'info_dict': {
+            'id': 'UGlrF9o9b-Q',
+            'ext': 'mp4',
+            'title': 'This is the first video on #BitChute !',
+            'description': 'md5:a0337e7b1fe39e32336974af8173a034',
+            'thumbnail':
r're:^https?://.*\.jpg$', + 'uploader': 'BitChute', + 'upload_date': '20170103', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL, + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', + }) + + title = self._html_search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + + formats = [ + {'url': format_url} + for format_url in orderedSet(format_urls)] + + if not formats: + entries = self._parse_html5_media_entries( + url, webpage, video_id) + if not entries: + error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video') + if error == 'Video Unavailable': + raise GeoRestrictedError(error) + raise ExtractorError(error) + formats = entries[0]['formats'] + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = self._html_search_regex( + (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), + webpage, 'uploader', fatal=False) + + upload_date = unified_strdate(self._search_regex( + r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', + webpage, 'upload date', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + offset = 0 + for page_num in itertools.count(1): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % page_num, + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': offset, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 
'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + offset += len(video_ids) + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff --git a/yt_dlp/extractor/bitwave.py b/yt_dlp/extractor/bitwave.py new file mode 100644 index 000000000..eb16c469d --- /dev/null +++ b/yt_dlp/extractor/bitwave.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BitwaveReplayIE(InfoExtractor): + IE_NAME = 'bitwave:replay' + _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$' + _TEST = { + 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr', + 'only_matching': True + } + + def _real_extract(self, url): + replay_id = self._match_id(url) + replay = self._download_json( + 'https://api.bitwave.tv/v1/replays/' + replay_id, + replay_id + ) + + return { + 'id': replay_id, + 'title': replay['data']['title'], + 'uploader': replay['data']['name'], + 'uploader_id': replay['data']['name'], + 'url': replay['data']['url'], + 'thumbnails': [ + {'url': x} for x in replay['data']['thumbnails'] + ], + } + + +class BitwaveStreamIE(InfoExtractor): + IE_NAME = 'bitwave:stream' + _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$' + _TEST = { + 'url': 'https://bitwave.tv/doomtube', + 'only_matching': True + } + + def _real_extract(self, url): + username = self._match_id(url) + channel = self._download_json( + 'https://api.bitwave.tv/v1/channels/' + username, + username) + + formats = self._extract_m3u8_formats( + channel['data']['url'], username, + 'mp4') + self._sort_formats(formats) + + return { + 'id': username, + 'title': self._live_title(channel['data']['title']), + 'uploader': username, + 'uploader_id': username, + 'formats': formats, + 'thumbnail': channel['data']['thumbnail'], + 'is_live': True, + 'view_count': channel['data']['viewCount'] + } diff --git a/yt_dlp/extractor/blackboardcollaborate.py b/yt_dlp/extractor/blackboardcollaborate.py new file mode 100644 index 000000000..8ae294198 --- /dev/null +++ b/yt_dlp/extractor/blackboardcollaborate.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import parse_iso8601 + + +class BlackboardCollaborateIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?P<region>[a-z-]+)\.bbcollab\.com/ + (?: + collab/ui/session/playback/load| + recording + )/ + (?P<id>[^/]+)''' + _TESTS = [ + { + 'url': 'https://us-lti.bbcollab.com/collab/ui/session/playback/load/0a633b6a88824deb8c918f470b22b256', + 'md5': 'bb7a055682ee4f25fdb5838cdf014541', + 'info_dict': { + 'id': '0a633b6a88824deb8c918f470b22b256', + 'title': 'HESI A2 Information Session - Thursday, May 6, 2021 - recording_1', + 'ext': 'mp4', + 'duration': 1896000, + 'timestamp': 1620331399, + 'upload_date': '20210506', + }, + }, + { + 'url': 'https://us.bbcollab.com/collab/ui/session/playback/load/76761522adfe4345a0dee6794bbcabda', + 'only_matching': True, + }, + { + 'url': 'https://ca.bbcollab.com/collab/ui/session/playback/load/b6399dcb44df4f21b29ebe581e22479d', 
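# recordings from every regional deployment (us, ca, eu, au, ...) are
+            # fetched via the same /collab/api/csa/recordings/<id>/data endpoint
+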
+ 'only_matching': True, + }, + { + 'url': 'https://eu.bbcollab.com/recording/51ed7b50810c4444a106e48cefb3e6b5', + 'only_matching': True, + }, + { + 'url': 'https://au.bbcollab.com/collab/ui/session/playback/load/2bccf7165d7c419ab87afc1ec3f3bb15', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + region = mobj.group('region') + video_id = mobj.group('id') + info = self._download_json( + 'https://{}.bbcollab.com/collab/api/csa/recordings/{}/data'.format(region, video_id), video_id) + duration = info.get('duration') + title = info['name'] + upload_date = info.get('created') + streams = info['streams'] + formats = [{'format_id': k, 'url': url} for k, url in streams.items()] + + return { + 'duration': duration, + 'formats': formats, + 'id': video_id, + 'timestamp': parse_iso8601(upload_date), + 'title': title, + } diff --git a/youtube_dl/extractor/bleacherreport.py b/yt_dlp/extractor/bleacherreport.py index d1bf8e829..d1bf8e829 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/yt_dlp/extractor/bleacherreport.py diff --git a/yt_dlp/extractor/blinkx.py b/yt_dlp/extractor/blinkx.py new file mode 100644 index 000000000..d70a3b30f --- /dev/null +++ b/yt_dlp/extractor/blinkx.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + remove_start, + int_or_none, +) + + +class BlinkxIE(InfoExtractor): + _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' + IE_NAME = 'blinkx' + + _TEST = { + 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', + 'md5': '337cf7a344663ec79bf93a526a2e06c7', + 'info_dict': { + 'id': 'Da0Gw3xc', + 'ext': 'mp4', + 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', + 'uploader': 'IGN News', + 'upload_date': '20150217', + 'timestamp': 1424215740, + 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', + 'duration': 47.743333, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + display_id = video_id[:8] + + api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + + 'video=%s' % video_id) + data_json = self._download_webpage(api_url, display_id) + data = json.loads(data_json)['api']['results'][0] + duration = None + thumbnails = [] + formats = [] + for m in data['media']: + if m['type'] == 'jpg': + thumbnails.append({ + 'url': m['link'], + 'width': int(m['w']), + 'height': int(m['h']), + }) + elif m['type'] == 'original': + duration = float(m['d']) + elif m['type'] == 'youtube': + yt_id = m['link'] + self.to_screen('Youtube video detected: %s' % yt_id) + return self.url_result(yt_id, 'Youtube', video_id=yt_id) + elif m['type'] in ('flv', 'mp4'): + vcodec = remove_start(m['vcodec'], 'ff') + acodec = remove_start(m['acodec'], 'ff') + vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) + abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) + tbr = vbr + abr if vbr and abr else None + format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) + formats.append({ + 'format_id': format_id, + 'url': m['link'], + 'vcodec': vcodec, + 'acodec': acodec, + 'abr': abr, + 'vbr': vbr, + 'tbr': tbr, + 'width': int_or_none(m.get('w')), + 'height': int_or_none(m.get('h')), + }) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'fullid': video_id, + 'title': 
data['title'], + 'formats': formats, + 'uploader': data.get('channel_name'), + 'timestamp': data.get('pubdate_epoch'), + 'description': data.get('description'), + 'thumbnails': thumbnails, + 'duration': duration, + } diff --git a/youtube_dl/extractor/bloomberg.py b/yt_dlp/extractor/bloomberg.py index 2fbfad1ba..2fbfad1ba 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/yt_dlp/extractor/bloomberg.py diff --git a/yt_dlp/extractor/bokecc.py b/yt_dlp/extractor/bokecc.py new file mode 100644 index 000000000..6a89d36f4 --- /dev/null +++ b/yt_dlp/extractor/bokecc.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', + webpage, 'player params', group='query') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'quality': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', + 'info_dict': { + 'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(self._match_valid_url(url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } diff --git a/yt_dlp/extractor/bongacams.py b/yt_dlp/extractor/bongacams.py new file mode 100644 index 000000000..9e7551136 --- /dev/null +++ b/yt_dlp/extractor/bongacams.py @@ -0,0 +1,59 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class BongaCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://de.bongacams.com/azumi-8', + 'only_matching': True, + }, { + 'url': 'https://cn.bongacams.com/azumi-8', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + channel_id = mobj.group('id') + + amf = self._download_json( + 'https://%s/tools/amf.php' % host, channel_id, + data=urlencode_postdata(( + ('method', 'getRoomData'), + ('args[]', channel_id), + ('args[]', 'false'), + )), headers={'X-Requested-With': 'XMLHttpRequest'}) + + server_url = amf['localData']['videoServerUrl'] + + uploader_id = try_get( + amf, lambda x: 
x['performerData']['username'], compat_str) or channel_id + uploader = try_get( + amf, lambda x: x['performerData']['displayName'], compat_str) + like_count = int_or_none(try_get( + amf, lambda x: x['performerData']['loversCount'])) + + formats = self._extract_m3u8_formats( + '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(uploader or uploader_id), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': 18, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bostonglobe.py b/yt_dlp/extractor/bostonglobe.py index 57882fbee..57882fbee 100644 --- a/youtube_dl/extractor/bostonglobe.py +++ b/yt_dlp/extractor/bostonglobe.py diff --git a/yt_dlp/extractor/box.py b/yt_dlp/extractor/box.py new file mode 100644 index 000000000..8214086a6 --- /dev/null +++ b/yt_dlp/extractor/box.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if 
authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py new file mode 100644 index 000000000..8f6ef3cf0 --- /dev/null +++ b/yt_dlp/extractor/bpb.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + determine_ext, +) + + +class BpbIE(InfoExtractor): + IE_DESC = 'Bundeszentrale für politische Bildung' + _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' + + _TEST = { + 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', + # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 + 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', + 'info_dict': { + 'id': '297', + 'ext': 'mp4', + 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', + 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<h2 class="white">(.*?)</h2>', webpage, 'title') + video_info_dicts = re.findall( + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) + + formats = [] + for video_info in video_info_dicts: + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' + formats.append({ + 'url': video_url, + 'quality': 10 if quality == 'high' else 0, + 'format_note': quality, + 'format_id': '%s-%s' % (quality, determine_ext(video_url)), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': self._og_search_description(webpage), + } diff --git a/yt_dlp/extractor/br.py b/yt_dlp/extractor/br.py new file mode 100644 index 000000000..7169eceb6 --- /dev/null +++ b/yt_dlp/extractor/br.py @@ -0,0 +1,310 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_duration, + parse_iso8601, + xpath_element, + xpath_text, +) + + +class BRIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk' + _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' + + _TESTS = [ + { + 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', + 'md5': '83a0477cf0b8451027eb566d88b51106', + 'info_dict': { + 'id': '48f656ef-287e-486f-be86-459122db22cc', + 'ext': 'mp4', + 'title': 'Die böse Überraschung', + 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', + 'duration': 
180, + 'uploader': 'Reinhard Weber', + 'upload_date': '20150422', + }, + 'skip': '404 not found', + }, + { + 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', + 'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef', + 'info_dict': { + 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', + 'ext': 'flv', + 'title': 'Manfred Schreiber ist tot', + 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', + 'duration': 26, + }, + 'skip': '404 not found', + }, + { + 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', + 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', + 'info_dict': { + 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', + 'ext': 'aac', + 'title': 'Kurzweilig und sehr bewegend', + 'description': 'md5:0351996e3283d64adeb38ede91fac54e', + 'duration': 296, + }, + 'skip': '404 not found', + }, + { + 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', + 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', + 'info_dict': { + 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', + 'ext': 'mp4', + 'title': 'Umweltbewusster Häuslebauer', + 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', + 'duration': 116, + } + }, + { + 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', + 'md5': '23bca295f1650d698f94fc570977dae3', + 'info_dict': { + 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', + 'ext': 'mp4', + 'title': 'Folge 1 - Metaphysik', + 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', + 'duration': 893, + 'uploader': 'Eva Maria Steimle', + 'upload_date': '20170208', + } + }, + ] + + def _real_extract(self, url): + base_url, display_id = self._match_valid_url(url).groups() + page = self._download_webpage(url, display_id) + xml_url = self._search_regex( + r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') + xml = self._download_xml(base_url + xml_url, display_id) + + medias = [] + + for xml_media in xml.findall('video') + xml.findall('audio'): + media_id = xml_media.get('externalId') + media = { + 'id': media_id, + 'title': xpath_text(xml_media, 'title', 'title', True), + 'duration': parse_duration(xpath_text(xml_media, 'duration')), + 'formats': self._extract_formats(xpath_element( + xml_media, 'assets'), media_id), + 'thumbnails': self._extract_thumbnails(xpath_element( + xml_media, 'teaserImage/variants'), base_url), + 'description': xpath_text(xml_media, 'desc'), + 'webpage_url': xpath_text(xml_media, 'permalink'), + 'uploader': xpath_text(xml_media, 'author'), + } + broadcast_date = xpath_text(xml_media, 'broadcastDate') + if broadcast_date: + media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) + medias.append(media) + + if len(medias) > 1: + self.report_warning( + 'found multiple medias; please ' + 'report this with the video URL to http://yt-dl.org/bug') + if not medias: + raise ExtractorError('No media entries found') + return medias[0] + + def _extract_formats(self, assets, media_id): + formats = [] + for asset in assets.findall('asset'): + format_url = xpath_text(asset, ['downloadUrl', 'url']) + asset_type = asset.get('type') + if asset_type.startswith('HDS'): + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) + elif asset_type.startswith('HLS'): + formats.extend(self._extract_m3u8_formats( + format_url, 
media_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+            else:
+                format_info = {
+                    'ext': xpath_text(asset, 'mediaType'),
+                    'width': int_or_none(xpath_text(asset, 'frameWidth')),
+                    'height': int_or_none(xpath_text(asset, 'frameHeight')),
+                    'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')),
+                    'abr': int_or_none(xpath_text(asset, 'bitrateAudio')),
+                    'vcodec': xpath_text(asset, 'codecVideo'),
+                    'acodec': xpath_text(asset, 'codecAudio'),
+                    'container': xpath_text(asset, 'mediaType'),
+                    'filesize': int_or_none(xpath_text(asset, 'size')),
+                }
+                format_url = self._proto_relative_url(format_url)
+                if format_url:
+                    http_format_info = format_info.copy()
+                    http_format_info.update({
+                        'url': format_url,
+                        'format_id': 'http-%s' % asset_type,
+                    })
+                    formats.append(http_format_info)
+                server_prefix = xpath_text(asset, 'serverPrefix')
+                if server_prefix:
+                    rtmp_format_info = format_info.copy()
+                    rtmp_format_info.update({
+                        'url': server_prefix,
+                        'play_path': xpath_text(asset, 'fileName'),
+                        'format_id': 'rtmp-%s' % asset_type,
+                    })
+                    formats.append(rtmp_format_info)
+        self._sort_formats(formats)
+        return formats
+
+    def _extract_thumbnails(self, variants, base_url):
+        thumbnails = [{
+            'url': base_url + xpath_text(variant, 'url'),
+            'width': int_or_none(xpath_text(variant, 'width')),
+            'height': int_or_none(xpath_text(variant, 'height')),
+        } for variant in variants.findall('variant') if xpath_text(variant, 'url')]
+        thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)
+        return thumbnails
+
+
+class BRMediathekIE(InfoExtractor):
+    IE_DESC = 'Bayerischer Rundfunk Mediathek'
+    _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
+
+    _TESTS = [{
+        'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
+        'md5': 'fdc3d485835966d1622587d08ba632ec',
+        'info_dict': {
+            'id': 'av:5a1e6a6e8fce6d001871cc8e',
+            'ext': 'mp4',
+            'title': 'Die Sendung vom 28.11.2017',
+            'description': 'md5:6000cdca5912ab2277e5b7339f201ccc',
+            'timestamp': 1511942766,
+            'upload_date': '20171129',
+        }
+    }]
+
+    def _real_extract(self, url):
+        clip_id = self._match_id(url)
+
+        clip = self._download_json(
+            'https://proxy-base.master.mango.express/graphql',
+            clip_id, data=json.dumps({
+                "query": """{
+  viewer {
+    clip(id: "%s") {
+      title
+      description
+      duration
+      createdAt
+      ageRestriction
+      videoFiles {
+        edges {
+          node {
+            publicLocation
+            fileSize
+            videoProfile {
+              width
+              height
+              bitrate
+              encoding
+            }
+          }
+        }
+      }
+      captionFiles {
+        edges {
+          node {
+            publicLocation
+          }
+        }
+      }
+      teaserImages {
+        edges {
+          node {
+            imageFiles {
+              edges {
+                node {
+                  publicLocation
+                  width
+                  height
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}""" % clip_id}).encode(), headers={
+                'Content-Type': 'application/json',
+            })['data']['viewer']['clip']
+        title = clip['title']
+
+        formats = []
+        for edge in clip.get('videoFiles', {}).get('edges', []):
+            node = edge.get('node', {})
+            n_url = node.get('publicLocation')
+            if not n_url:
+                continue
+            ext = determine_ext(n_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    n_url, clip_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                video_profile = node.get('videoProfile', {})
+                tbr = int_or_none(video_profile.get('bitrate'))
+                format_id = 'http'
+                if tbr:
+                    format_id += '-%d' % tbr
+                formats.append({
+                    'format_id': format_id,
+                    'url': n_url,
+                    'width': int_or_none(video_profile.get('width')),
+                    'height': int_or_none(video_profile.get('height')),
+                    'tbr': tbr,
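+                    # bitrate-suffixed ids such as "http-1400" (illustrative)
+                    # keep the progressive renditions distinguishable
+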
'filesize': int_or_none(node.get('fileSize')), + }) + self._sort_formats(formats) + + subtitles = {} + for edge in clip.get('captionFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + subtitles.setdefault('de', []).append({ + 'url': n_url, + }) + + thumbnails = [] + for edge in clip.get('teaserImages', {}).get('edges', []): + for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): + node = image_edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + thumbnails.append({ + 'url': n_url, + 'width': int_or_none(node.get('width')), + 'height': int_or_none(node.get('height')), + }) + + return { + 'id': clip_id, + 'title': title, + 'description': clip.get('description'), + 'duration': int_or_none(clip.get('duration')), + 'timestamp': parse_iso8601(clip.get('createdAt')), + 'age_limit': int_or_none(clip.get('ageRestriction')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py new file mode 100644 index 000000000..139d51c09 --- /dev/null +++ b/yt_dlp/extractor/bravotv.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + smuggle_url, + update_url_query, + int_or_none, + float_or_none, + try_get, + dict_get, +) + + +class BravoTVIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', + 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', + 'info_dict': { + 'id': 'epL0pmK1kQlT', + 'ext': 'mp4', + 'title': 'The Top Chef Season 16 Winner Is...', + 'description': 'Find out who takes the title of Top Chef!', + 'uploader': 'NBCU-BRAV', + 'upload_date': '20190314', + 'timestamp': 1552591860, + 'season_number': 16, + 'episode_number': 15, + 'series': 'Top Chef', + 'episode': 'The Top Chef Season 16 Winner Is...', + 'duration': 190.0, + } + }, { + 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', + 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + site, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + settings = self._parse_json(self._search_regex( + r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), + display_id) + info = {} + query = { + 'mbr': 'true', + } + account_pid, release_pid = [None] * 2 + tve = settings.get('ls_tve') + if tve: + query['manifest'] = 'm3u' + mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) + if mobj: + account_pid, tp_path = mobj.groups() + release_pid = tp_path.strip('/').split('/')[-1] + else: + account_pid = 'HNK2IC' + tp_path = release_pid = tve['release_pid'] + if tve.get('entitlement') == 'auth': + adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId') or site, + tve['title'], release_pid, tve.get('rating')) + query['auth'] = self._extract_mvpd_auth( + url, 
release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) + else: + shared_playlist = settings['ls_playlist'] + account_pid = shared_playlist['account_pid'] + metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] + tp_path = release_pid = metadata.get('release_pid') + if not release_pid: + release_pid = metadata['guid'] + tp_path = 'media/guid/2140479951/' + release_pid + info.update({ + 'title': metadata['title'], + 'description': metadata.get('description'), + 'season_number': int_or_none(metadata.get('season_num')), + 'episode_number': int_or_none(metadata.get('episode_num')), + }) + query['switch'] = 'progressive' + + tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path) + + tp_metadata = self._download_json( + update_url_query(tp_url, {'format': 'preview'}), + display_id, fatal=False) + if tp_metadata: + info.update({ + 'title': tp_metadata.get('title'), + 'description': tp_metadata.get('description'), + 'duration': float_or_none(tp_metadata.get('duration'), 1000), + 'season_number': int_or_none( + dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))), + 'episode_number': int_or_none( + dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))), + # For some reason the series is sometimes wrapped into a single element array. + 'series': try_get( + dict_get(tp_metadata, ('pl1$show', 'nbcu$show')), + lambda x: x[0] if isinstance(x, list) else x, + expected_type=str), + 'episode': dict_get( + tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')), + }) + + info.update({ + '_type': 'url_transparent', + 'id': release_pid, + 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}), + 'ie_key': 'ThePlatform', + }) + return info diff --git a/yt_dlp/extractor/breakcom.py b/yt_dlp/extractor/breakcom.py new file mode 100644 index 000000000..f38789f99 --- /dev/null +++ b/yt_dlp/extractor/breakcom.py @@ -0,0 +1,90 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + int_or_none, + url_or_none, +) + + +class BreakIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', + 'info_dict': { + 'id': '2468056', + 'ext': 'mp4', + 'title': 'When Girls Act Like D-Bags', + 'age_limit': 13, + }, + }, { + # youtube embed + 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', + 'info_dict': { + 'id': 'RrrDLdeL2HQ', + 'ext': 'mp4', + 'title': 'Whale Watching Boat Crashing Into San Diego Dock', + 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', + 'upload_date': '20160331', + 'uploader': 'Steve Holden', + 'uploader_id': 'sdholden07', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, display_id) + + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + content = self._parse_json( + self._search_regex( + r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, + 'content'), + display_id) + + formats = [] + for video in content: + video_url = url_or_none(video.get('url')) + if not video_url: + continue + bitrate = int_or_none(self._search_regex( + 
r'(\d+)_kbps', video_url, 'tbr', default=None)) + formats.append({ + 'url': video_url, + 'format_id': 'http-%d' % bitrate if bitrate else 'http', + 'tbr': bitrate, + }) + self._sort_formats(formats) + + title = self._search_regex( + (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') + + def get(key, name): + return int_or_none(self._search_regex( + r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, + default=None)) + + age_limit = get('ratings', 'age limit') + video_id = video_id or get('pid', 'video id') or display_id + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py new file mode 100644 index 000000000..cd1c3f01c --- /dev/null +++ b/yt_dlp/extractor/brightcove.py @@ -0,0 +1,686 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re +import struct + +from .adobepass import AdobePassIE +from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, + compat_parse_qs, + compat_urlparse, + compat_xml_parse_error, +) +from ..utils import ( + clean_html, + extract_attributes, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + float_or_none, + int_or_none, + js_to_json, + mimetype2ext, + parse_iso8601, + parse_qs, + smuggle_url, + str_or_none, + try_get, + unescapeHTML, + unsmuggle_url, + UnsupportedError, + update_url_query, + url_or_none, +) + + +class BrightcoveLegacyIE(InfoExtractor): + IE_NAME = 'brightcove:legacy' + _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' + + _TESTS = [ + { + # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', + 'md5': '5423e113865d26e40624dce2e4b45d95', + 'note': 'Test Brightcove downloads and detection in GenericIE', + 'info_dict': { + 'id': '2371591881001', + 'ext': 'mp4', + 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + 'uploader': '8TV', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 'uploader_id': '1589608506001', + }, + 'skip': 'The player has been deactivated by the content owner', + }, + { + # From http://medianetwork.oracle.com/video/player/1785452137001 + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', + 'info_dict': { + 'id': '1785452137001', + 'ext': 'flv', + 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', + 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', + 'uploader': 'Oracle', + 'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': '1460825906', + }, + 'skip': 'video not playable', + }, + { + # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ + 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', + 'info_dict': { + 'id': '2750934548001', + 'ext': 'mp4', + 'title': 'This Bracelet Acts as a Personal Thermostat', + 'description': 
'md5:547b78c64f4112766ccf4e151c20b6a0', + # 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': '1130468786001', + }, + }, + { + # test that the default referer works + # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ + 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', + 'info_dict': { + 'id': '2878862109001', + 'ext': 'mp4', + 'title': 'Lost in Motion II', + 'description': 'md5:363109c02998fee92ec02211bd8000df', + 'uploader': 'National Ballet of Canada', + }, + 'skip': 'Video gone', + }, + { + # test flv videos served by akamaihd.net + # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + # The md5 checksum changes on each download + 'info_dict': { + 'id': '3750436379001', + 'ext': 'flv', + 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'uploader': 'RBTV Old (do not use)', + 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': '710858724001', + }, + 'skip': 'Video gone', + }, + { + # playlist with 'videoList' + # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', + 'info_dict': { + 'title': 'Sealife', + 'id': '3550319591001', + }, + 'playlist_mincount': 7, + 'skip': 'Unsupported URL', + }, + { + # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) + 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', + 'info_dict': { + 'id': '1522758701001', + 'title': 'Lesson 08', + }, + 'playlist_mincount': 10, + 'skip': 'Unsupported URL', + }, + { + # playerID inferred from bcpid + # from http://www.un.org/chinese/News/story.asp?NewsID=27724 + 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', + 'only_matching': True, # Tested in GenericIE + } + ] + + @classmethod + def _build_brightcove_url(cls, object_str): + """ + Build a Brightcove url from a xml string containing + <object class="BrightcoveExperience">{params}</object> + """ + + # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553 + object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>', + lambda m: m.group(1) + '/>', object_str) + # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608 + object_str = object_str.replace('<--', '<!--') + # remove namespace to simplify extraction + object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) + object_str = fix_xml_ampersands(object_str) + + try: + object_doc = 
compat_etree_fromstring(object_str.encode('utf-8')) + except compat_xml_parse_error: + return + + fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') + if fv_el is not None: + flashvars = dict( + (k, v[0]) + for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + else: + flashvars = {} + + data_url = object_doc.attrib.get('data', '') + data_url_params = parse_qs(data_url) + + def find_param(name): + if name in flashvars: + return flashvars[name] + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return data_url_params.get(name) + + params = {} + + playerID = find_param('playerID') or find_param('playerId') + if playerID is None: + raise ExtractorError('Cannot find player ID') + params['playerID'] = playerID + + playerKey = find_param('playerKey') + # Not all pages define this value + if playerKey is not None: + params['playerKey'] = playerKey + # These fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') + if videoPlayer is not None: + if isinstance(videoPlayer, list): + videoPlayer = videoPlayer[0] + videoPlayer = videoPlayer.strip() + # UUID is also possible for videoPlayer (e.g. + # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd + # or http://www8.hp.com/cn/zh/home.html) + if not (re.match( + r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', + videoPlayer) or videoPlayer.startswith('ref:')): + return None + params['@videoPlayer'] = videoPlayer + linkBase = find_param('linkBaseURL') + if linkBase is not None: + params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brightcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove <object /> XML + # } + m = re.search( + r'''(?x)customBC\.createVideo\( + .*? 
# skipping width and height + ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID + ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P<videoID>\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): + return update_url_query( + 'http://c.brightcove.com/services/viewer/htmlFederated', params) + + @classmethod + def _extract_brightcove_url(cls, webpage): + """Try to extract the brightcove url from the webpage, returns None + if it can't be found + """ + urls = cls._extract_brightcove_urls(webpage) + return urls[0] if urls else None + + @classmethod + def _extract_brightcove_urls(cls, webpage): + """Return a list of all Brightcove URLs from the webpage """ + + url_m = re.search( + r'''(?x) + <meta\s+ + (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+ + content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2 + ''', webpage) + if url_m: + url = unescapeHTML(url_m.group('url')) + # Some sites don't add it, we can't download with this url, for example: + # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ + if 'playerKey' in url or 'videoId' in url or 'idVideo' in url: + return [url] + + matches = re.findall( + r'''(?sx)<object + (?: + [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | + [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ + ).+?>\s*</object>''', + webpage) + if matches: + return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) + + matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) + if matches: + return list(filter(None, [ + cls._build_brightcove_url_from_js(custom_bc) + for custom_bc in matches])) + return [src for _, src in re.findall( + r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + # Change the 'videoId' and others field to '@videoPlayer' + url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url) + # Change bckey (used by bcove.me urls) to playerKey + url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) + mobj = self._match_valid_url(url) + query_str = mobj.group('query') + query = compat_urlparse.parse_qs(query_str) + + videoPlayer = query.get('@videoPlayer') + if videoPlayer: + # We set the original url as the default 'Referer' header + referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url) + video_id = videoPlayer[0] + if 'playerID' not in query: + mobj = re.search(r'/bcpid(\d+)', url) + if mobj is not None: + query['playerID'] = [mobj.group(1)] + publisher_id = query.get('publisherId') + if publisher_id and publisher_id[0].isdigit(): + publisher_id = publisher_id[0] + if not publisher_id: + player_key = query.get('playerKey') + if player_key and ',' in player_key[0]: + player_key = player_key[0] + else: + player_id = query.get('playerID') + if player_id and player_id[0].isdigit(): + headers = {} + if referer: + headers['Referer'] = referer + player_page = self._download_webpage( + 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + video_id, headers=headers, fatal=False) + if player_page: + player_key = self._search_regex( + r'<param\s+name="playerKey"\s+value="([\w~,-]+)"', + player_page, 'player key', fatal=False) + if 
player_key: + enc_pub_id = player_key.split(',')[1].replace('~', '=') + publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] + if publisher_id: + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + if referer: + brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer}) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + # TODO: figure out if it's possible to extract playlistId from playerKey + # elif 'playerKey' in query: + # player_key = query['playerKey'] + # return self._get_playlist_info(player_key[0]) + raise UnsupportedError(url) + + +class BrightcoveNewIE(AdobePassIE): + IE_NAME = 'brightcove:new' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)' + _TESTS = [{ + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'mp4', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'duration': 165.768, + 'timestamp': 1441391203, + 'upload_date': '20150904', + 'uploader_id': '929656772001', + 'formats': 'mincount:20', + }, + }, { + # with rtmp streams + 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', + 'info_dict': { + 'id': '4279049078001', + 'ext': 'mp4', + 'title': 'Titansgrave: Chapter 0', + 'description': 'Titansgrave: Chapter 0', + 'duration': 1242.058, + 'timestamp': 1433556729, + 'upload_date': '20150606', + 'uploader_id': '4036320279001', + 'formats': 'mincount:39', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # playlist stream + 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', + 'info_dict': { + 'id': '5718313430001', + 'title': 'No Audio Playlist', + }, + 'playlist_count': 7, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001', + 'only_matching': True, + }, { + # ref: prefixed video id + 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', + 'only_matching': True, + }, { + # non numeric ref: prefixed video id + 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', + 'only_matching': True, + }, { + # unavailable video without message but with error_code + 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(ie, webpage): + urls = BrightcoveNewIE._extract_urls(ie, webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(ie, webpage): + # Reference: + # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag + # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript + # 4. 
http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html + # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player + + entries = [] + + # Look for iframe embeds [1] + for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) + + # Look for <video> tags [2] and embed_in_page embeds [3] + # [2] looks like: + for video, script_tag, account_id, player_id, embed in re.findall( + r'''(?isx) + (<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) + (?:.*? + (<script[^>]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + ) + )? + ''', webpage): + attrs = extract_attributes(video) + + # According to examples from [4] it's unclear whether video id + # may be optional and what to do when it is + video_id = attrs.get('data-video-id') + if not video_id: + continue + + account_id = account_id or attrs.get('data-account') + if not account_id: + continue + + player_id = player_id or attrs.get('data-player') or 'default' + embed = embed or attrs.get('data-embed') or 'default' + + bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( + account_id, player_id, embed, video_id) + + # Some brightcove videos may be embedded with video tag only and + # without script tag or any mentioning of brightcove at all. Such + # embeds are considered ambiguous since they are matched based only + # on data-video-id and data-account attributes and in the wild may + # not be brightcove embeds at all. Let's check reconstructed + # brightcove URLs in case of such embeds and only process valid + # ones. By this we ensure there is indeed a brightcove embed. 
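# e.g. bare markup like <video data-video-id="123" data-account="456">
+            # (hypothetical values) yields a candidate URL that is kept only
+            # if it actually resolves
+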
+ if not script_tag and not ie._is_valid_url( + bc_url, video_id, 'possible brightcove video'): + continue + + entries.append(bc_url) + + return entries + + def _parse_brightcove_metadata(self, json_data, video_id, headers={}): + title = json_data['name'].strip() + + num_drm_sources = 0 + formats, subtitles = [], {} + sources = json_data.get('sources') or [] + for source in sources: + container = source.get('container') + ext = mimetype2ext(source.get('type')) + src = source.get('src') + skip_unplayable = not self.get_param('allow_unplayable_formats') + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if skip_unplayable and (container == 'WVM' or source.get('key_systems')): + num_drm_sources += 1 + continue + elif ext == 'ism' and skip_unplayable: + continue + elif ext == 'm3u8' or container == 'M2TS': + if not src: + continue + f, subs = self._extract_m3u8_formats_and_subtitles( + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(f) + subtitles = self._merge_subtitles(subtitles, subs) + elif ext == 'mpd': + if not src: + continue + f, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False) + formats.extend(f) + subtitles = self._merge_subtitles(subtitles, subs) + else: + streaming_src = source.get('streaming_src') + stream_name, app_name = source.get('stream_name'), source.get('app_name') + if not src and not streaming_src and (not stream_name or not app_name): + continue + tbr = float_or_none(source.get('avg_bitrate'), 1000) + height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) + f = { + 'tbr': tbr, + 'filesize': int_or_none(source.get('size')), + 'container': container, + 'ext': ext or container.lower(), + } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) + + def build_format_id(kind): + format_id = kind + if tbr: + format_id += '-%dk' % int(tbr) + if height: + format_id += '-%dp' % height + return format_id + + if src or streaming_src: + f.update({ + 'url': src or streaming_src, + 'format_id': build_format_id('http' if src else 'http-streaming'), + 'source_preference': 0 if src else -1, + }) + else: + f.update({ + 'url': app_name, + 'play_path': stream_name, + 'format_id': build_format_id('rtmp'), + }) + formats.append(f) + + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + self.raise_no_formats( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + elif (not self.get_param('allow_unplayable_formats') + and sources and num_drm_sources == len(sources)): + self.report_drm(video_id) + + self._sort_formats(formats) + + for f in formats: + f.setdefault('http_headers', {}).update(headers) + + for text_track in json_data.get('text_tracks', []): + if text_track.get('kind') != 'captions': + continue + text_track_url = url_or_none(text_track.get('src')) + if not text_track_url: + continue + lang = (str_or_none(text_track.get('srclang')) + or str_or_none(text_track.get('label')) or 'en').lower() + subtitles.setdefault(lang, []).append({ + 'url': text_track_url, + }) + + is_live = False + duration = float_or_none(json_data.get('duration'), 1000) + if duration is not None and duration <= 0: + is_live = True + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': clean_html(json_data.get('description')), + 'thumbnail': 
json_data.get('thumbnail') or json_data.get('poster'), + 'duration': duration, + 'timestamp': parse_iso8601(json_data.get('published_at')), + 'uploader_id': json_data.get('account_id'), + 'formats': formats, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), + 'is_live': is_live, + } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + 'ip_blocks': smuggled_data.get('geo_ip_blocks'), + }) + + account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups() + + policy_key_id = '%s_%s' % (account_id, player_id) + policy_key = self._downloader.cache.load('brightcove', policy_key_id) + policy_key_extracted = False + store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) + + def extract_policy_key(): + base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) + config = self._download_json( + base_url + 'config.json', video_id, fatal=False) or {} + policy_key = try_get( + config, lambda x: x['video_cloud']['policy_key']) + if not policy_key: + webpage = self._download_webpage( + base_url + 'index.min.js', video_id) + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', + webpage, 'policy key', group='pk') + + store_pk(policy_key) + return policy_key + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) + headers = {} + referrer = smuggled_data.get('referrer') + if referrer: + headers.update({ + 'Referer': referrer, + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + }) + + for _ in range(2): + if not policy_key: + policy_key = extract_policy_key() + policy_key_extracted = True + headers['Accept'] = 'application/json;pk=%s' % policy_key + try: + json_data = self._download_json(api_url, video_id, headers=headers) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: + policy_key = None + store_pk(None) + continue + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + if content_type == 'playlist': + return self.playlist_result( + [self._parse_brightcove_metadata(vid, vid.get('id'), headers) + for vid in json_data.get('videos', []) if vid.get('id')], + json_data.get('id'), json_data.get('name'), + json_data.get('description')) + + return self._parse_brightcove_metadata( + json_data, video_id, headers=headers) diff 
--git a/youtube_dl/extractor/businessinsider.py b/yt_dlp/extractor/businessinsider.py index 73a57b1e4..73a57b1e4 100644 --- a/youtube_dl/extractor/businessinsider.py +++ b/yt_dlp/extractor/businessinsider.py diff --git a/youtube_dl/extractor/buzzfeed.py b/yt_dlp/extractor/buzzfeed.py index ec411091e..ec411091e 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/yt_dlp/extractor/buzzfeed.py diff --git a/yt_dlp/extractor/byutv.py b/yt_dlp/extractor/byutv.py new file mode 100644 index 000000000..f4d5086ed --- /dev/null +++ b/yt_dlp/extractor/byutv.py @@ -0,0 +1,122 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + merge_dicts, + parse_duration, + url_or_none, +) + + +class BYUtvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' + _TESTS = [{ + # ooyalaVOD + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', + 'info_dict': { + 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', + 'display_id': 'studio-c-season-5-episode-5', + 'ext': 'mp4', + 'title': 'Season 5 Episode 5', + 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', + 'thumbnail': r're:^https?://.*', + 'duration': 1486.486, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + # dvr + 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', + 'info_dict': { + 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', + 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', + 'ext': 'mp4', + 'title': 'Pacific vs. BYU (4/12/19)', + 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', + 'duration': 11645, + }, + 'params': { + 'skip_download': True + }, + }, { + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'only_matching': True, + }, { + 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', + display_id, query={ + 'contentid': video_id, + 'channel': 'byutv', + 'x-byutv-context': 'web$US', + }, headers={ + 'x-byutv-context': 'web$US', + 'x-byutv-platformkey': 'xsaaw9c7y5', + }) + + ep = video.get('ooyalaVOD') + if ep: + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ep['providerId'], + 'id': video_id, + 'display_id': display_id, + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + } + + info = {} + formats = [] + subtitles = {} + for format_id, ep in video.items(): + if not isinstance(ep, dict): + continue + video_url = url_or_none(ep.get('videoUrl')) + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif ext == 'mpd': + mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_fmts) + 
subtitles = self._merge_subtitles(subtitles, mpd_subs) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + merge_dicts(info, { + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + 'duration': parse_duration(ep.get('length')), + }) + self._sort_formats(formats) + + return merge_dicts(info, { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'formats': formats, + 'subtitles': subtitles, + }) diff --git a/yt_dlp/extractor/c56.py b/yt_dlp/extractor/c56.py new file mode 100644 index 000000000..a853c530c --- /dev/null +++ b/yt_dlp/extractor/c56.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import js_to_json + + +class C56IE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' + IE_NAME = '56.com' + _TESTS = [{ + 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', + 'md5': 'e59995ac63d0457783ea05f93f12a866', + 'info_dict': { + 'id': '93440716', + 'ext': 'flv', + 'title': '网事知多少 第32期:车怒', + 'duration': 283.813, + }, + }, { + 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', + 'md5': '', + 'info_dict': { + 'id': '82247482', + 'title': '爱的诅咒之杜鹃花开', + }, + 'playlist_count': 7, + 'add_ie': ['Sohu'], + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + text_id = mobj.group('textid') + + webpage = self._download_webpage(url, text_id) + sohu_video_info_str = self._search_regex( + r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) + if sohu_video_info_str: + sohu_video_info = self._parse_json( + sohu_video_info_str, text_id, transform_source=js_to_json) + return self.url_result(sohu_video_info['url'], 'Sohu') + + page = self._download_json( + 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') + + info = page['info'] + + formats = [ + { + 'format_id': f['type'], + 'filesize': int(f['filesize']), + 'url': f['url'] + } for f in info['rfiles'] + ] + self._sort_formats(formats) + + return { + 'id': info['vid'], + 'title': info['Subject'], + 'duration': int(info['duration']) / 1000.0, + 'formats': formats, + 'thumbnail': info.get('bimg') or info.get('img'), + } diff --git a/yt_dlp/extractor/cam4.py b/yt_dlp/extractor/cam4.py new file mode 100644 index 000000000..30daf2be9 --- /dev/null +++ b/yt_dlp/extractor/cam4.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CAM4IE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?cam4\.com/(?P<id>[a-z0-9_]+)' + _TEST = { + 'url': 'https://www.cam4.com/foxynesss', + 'info_dict': { + 'id': 'foxynesss', + 'ext': 'mp4', + 'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + m3u8_playlist = self._download_json('https://www.cam4.com/rest/v1.0/profile/{}/streamInfo'.format(channel_id), channel_id).get('cdnURL') + + formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(channel_id), + 'is_live': True, + 'age_limit': 18, + 'formats': formats, + } diff --git a/youtube_dl/extractor/camdemy.py b/yt_dlp/extractor/camdemy.py index 8f0c6c545..8f0c6c545 100644 --- a/youtube_dl/extractor/camdemy.py +++ 
b/yt_dlp/extractor/camdemy.py diff --git a/yt_dlp/extractor/cammodels.py b/yt_dlp/extractor/cammodels.py new file mode 100644 index 000000000..eb2a8b4c6 --- /dev/null +++ b/yt_dlp/extractor/cammodels.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, +) + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + 'age_limit': 18 + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), + ) + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + + manifest = self._download_json( + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = url_or_none(media.get('location')) + if not media_url: + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ + 'ext': 'mp4', + # hls skips fragments, preferring rtmp + 'quality': -10, + }) + else: + continue + formats.append(f) + self._sort_formats(formats) + + return { + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, + 'age_limit': 18 + } diff --git a/youtube_dl/extractor/camwithher.py b/yt_dlp/extractor/camwithher.py index bbc5205fd..bbc5205fd 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/yt_dlp/extractor/camwithher.py diff --git a/youtube_dl/extractor/canalc2.py b/yt_dlp/extractor/canalc2.py index 407cc8084..407cc8084 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/yt_dlp/extractor/canalc2.py diff --git a/yt_dlp/extractor/canalplus.py b/yt_dlp/extractor/canalplus.py new file mode 100644 index 000000000..211ea267a --- /dev/null +++ b/yt_dlp/extractor/canalplus.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + # ExtractorError, + # HEADRequest, + int_or_none, + qualities, + unified_strdate, +) + + +class CanalplusIE(InfoExtractor): 
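# For illustration (a walk-through of the lookup below, not added
# behaviour): the <site> group from _VALID_URL is translated through
# _SITE_ID_MAP before being substituted into _VIDEO_INFO_TEMPLATE, so
# the piwiplus.fr test URL with vid=1108190 resolves its metadata from
#
#   http://service.canal-plus.com/video/rest/getVideosLiees/teletoon/1108190?format=json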
+ IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' + _SITE_ID_MAP = { + 'mycanal': 'cplus', + 'piwiplus': 'teletoon', + } + + # Only works for direct mp4 URLs + _GEO_COUNTRIES = ['FR'] + + _TESTS = [{ + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', + 'info_dict': { + 'id': '1397061', + 'display_id': 'lolywood', + 'ext': 'mp4', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', + }, + }, { + # geo restricted, bypassed + 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', + 'info_dict': { + 'id': '1108190', + 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger', + 'ext': 'mp4', + 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe', + 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', + 'upload_date': '20140724', + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }] + + def _real_extract(self, url): + site, display_id, video_id = self._match_valid_url(url).groups() + + site_id = self._SITE_ID_MAP[site] + + info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) + video_data = self._download_json(info_url, video_id, 'Downloading video JSON') + + if isinstance(video_data, list): + video_data = [video for video in video_data if video.get('ID') == video_id][0] + media = video_data['MEDIA'] + infos = video_data['INFOS'] + + preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) + + # _, fmt_url = next(iter(media['VIDEOS'].items())) + # if '/geo' in fmt_url.lower(): + # response = self._request_webpage( + # HEADRequest(fmt_url), video_id, + # 'Checking if the video is georestricted') + # if '/blocage' in response.geturl(): + # raise ExtractorError( + # 'The video is not available in your country', + # expected=True) + + formats = [] + for format_id, format_url in media['VIDEOS'].items(): + if not format_url: + continue + if format_id == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + elif format_id == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js + 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', + 'format_id': format_id, + 'quality': preference(format_id), + }) + self._sort_formats(formats) + + thumbnails = [{ + 'id': image_id, + 'url': image_url, + } for image_id, image_url in media.get('images', {}).items()] + + titrage = infos['TITRAGE'] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': '%s - %s' % (titrage['TITRE'], + titrage['SOUS_TITRE']), + 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')), + 'thumbnails': thumbnails, + 'description': infos.get('DESCRIPTION'), + 'duration': int_or_none(infos.get('DURATION')), + 'view_count': int_or_none(infos.get('NB_VUES')), + 'like_count': int_or_none(infos.get('NB_LIKES')), + 'comment_count': int_or_none(infos.get('NB_COMMENTS')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py new file mode 100644 index 000000000..49e7e4e39 --- /dev/null 
+++ b/yt_dlp/extractor/canvas.py @@ -0,0 +1,395 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from .gigya import GigyaBaseIE +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + float_or_none, + get_element_by_class, + int_or_none, + merge_dicts, + str_or_none, + strip_or_none, + url_or_none, + urlencode_postdata +) + + +class CanvasIE(InfoExtractor): + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', + 'info_dict': { + 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'ext': 'mp4', + 'title': 'Nachtwacht: De Greystook', + 'description': 'Nachtwacht: De Greystook', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1468.02, + }, + 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'only_matching': True, + }] + _GEO_BYPASS = False + _HLS_ENTRY_PROTOCOLS_MAP = { + 'HLS': 'm3u8_native', + 'HLS_AES': 'm3u8', + } + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + site_id, video_id = mobj.group('site_id'), mobj.group('id') + + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) + + # New API endpoint + if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) + token = self._download_json( + '%s/tokens' % self._REST_API_BASE, video_id, + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + data = self._download_json( + '%s/videos/%s' % (self._REST_API_BASE, video_id), + video_id, 'Downloading video JSON', query={ + 'vrtPlayerToken': token, + 'client': '%s@PROD' % site_id, + }, expected_status=400) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) + + title = data['title'] + description = data.get('description') + + formats = [] + subtitles = {} + for target in data['targetUrls']: + format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) + if not format_url or not format_type: + continue + format_type = format_type.upper() + if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], + m3u8_id=format_type, fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_type, fatal=False) + 
formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif format_type == 'HSS': + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + self._sort_formats(formats) + + subtitle_urls = data.get('subtitleUrls') + if isinstance(subtitle_urls, list): + for subtitle in subtitle_urls: + subtitle_url = subtitle.get('url') + if subtitle_url and subtitle.get('type') == 'CLOSED': + subtitles.setdefault('nl', []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': float_or_none(data.get('duration'), 1000), + 'thumbnail': data.get('posterImageUrl'), + 'subtitles': subtitles, + } + + +class CanvasEenIE(InfoExtractor): + IE_DESC = 'canvas.be and een.be' + _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', + 'md5': 'ed66976748d12350b118455979cca293', + 'info_dict': { + 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', + 'ext': 'flv', + 'title': 'De afspraak veilt voor de Warmste Week', + 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 49.02, + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # with subtitles + 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', + 'info_dict': { + 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', + 'display_id': 'pieter-0167', + 'ext': 'mp4', + 'title': 'Pieter 0167', + 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2553.08, + 'subtitles': { + 'nl': [{ + 'ext': 'vtt', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Pagina niet gevonden', + }, { + 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', + 'info_dict': { + 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', + 'display_id': 'emma-pakt-thilly-aan', + 'ext': 'mp4', + 'title': 'Emma pakt Thilly aan', + 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 118.24, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + site_id, display_id = mobj.group('site_id'), mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(self._search_regex( + r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None)) + + video_id = self._html_search_regex( + r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + } + + +class VrtNUIE(GigyaBaseIE): + IE_DESC = 'VrtNU.be' + _VALID_URL = 
r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' + _TESTS = [{ + # Available via old API endpoint + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', + 'info_dict': { + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'ext': 'mp4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', + 'duration': 1457.04, + 'thumbnail': r're:^https?://.*\.jpg$', + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', + 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', + }, + 'skip': 'This video is only available for registered users', + 'params': { + 'username': '<snip>', + 'password': '<snip>', + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # Only available via new API endpoint + 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', + 'info_dict': { + 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', + 'ext': 'mp4', + 'title': 'Aflevering 5', + 'description': 'Wie valt door de mand tijdens een missie?', + 'duration': 2967.06, + 'season': 'Season 1', + 'season_number': 1, + 'episode_number': 5, + }, + 'skip': 'This video is only available for registered users', + 'params': { + 'username': '<snip>', + 'password': '<snip>', + }, + 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], + }] + _NETRC_MACHINE = 'vrtnu' + _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _CONTEXT_ID = 'R3595707040' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + auth_info = self._download_json( + 'https://accounts.vrt.be/accounts.login', None, + note='Login data', errnote='Could not get Login data', + headers={}, data=urlencode_postdata({ + 'loginID': username, + 'password': password, + 'sessionExpiration': '-2', + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + })) + + if auth_info.get('errorDetails'): + raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) + + # Sometimes authentication fails for no good reason, retry + login_attempt = 1 + while login_attempt <= 3: + try: + self._request_webpage('https://token.vrt.be/vrtnuinitlogin', + None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', + query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) + + post_data = { + 'UID': auth_info['UID'], + 'UIDSignature': auth_info['UIDSignature'], + 'signatureTimestamp': auth_info['signatureTimestamp'], + 'client_id': 'vrtnu-site', + '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + } + + self._request_webpage( + 'https://login.vrt.be/perform_login', + None, note='Requesting a token', errnote='Could not get a token', + headers={}, data=urlencode_postdata(post_data)) + + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + login_attempt += 1 + self.report_warning('Authentication failed') + self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') + else: + raise e + else: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + attrs = extract_attributes(self._search_regex( + 
r'(<nui-media[^>]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id + + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} + + info = self._search_json_ld(webpage, display_id, default={}) + return merge_dicts(info, { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'season_number': int_or_none(page.get('episode_season')), + }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/carambatv.py b/yt_dlp/extractor/carambatv.py index b57b86af7..b57b86af7 100644 --- a/youtube_dl/extractor/carambatv.py +++ b/yt_dlp/extractor/carambatv.py diff --git a/youtube_dl/extractor/cartoonnetwork.py b/yt_dlp/extractor/cartoonnetwork.py index 48b33617f..48b33617f 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/yt_dlp/extractor/cartoonnetwork.py diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py new file mode 100644 index 000000000..5e4526c53 --- /dev/null +++ b/yt_dlp/extractor/cbc.py @@ -0,0 +1,425 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + js_to_json, + smuggle_url, + try_get, + orderedSet, + strip_or_none, + ExtractorError, +) + + +class CBCIE(InfoExtractor): + IE_NAME = 'cbc.ca' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ + # with mediaId + 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', + 'md5': '97e24d09672fc4cf56256d6faa6c25bc', + 'info_dict': { + 'id': '2682904050', + 'ext': 'mp4', + 'title': 'Don Cherry – All-Stars', + 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', + 'timestamp': 1454463000, + 'upload_date': 
'20160203', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Geo-restricted to Canada', + }, { + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + 'skip': 'No longer available', + }, { + # with clipId, feed only available via tpfeed.cbc.ca + 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', + 'md5': '0274a90b51a9b4971fe005c63f592f12', + 'info_dict': { + 'id': '2487345465', + 'ext': 'mp4', + 'title': 'Robin Williams freestyles on 90 Minutes Live', + 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', + 'upload_date': '19780210', + 'uploader': 'CBCC-NEW', + 'timestamp': 255977160, + }, + }, { + # multiple iframes + 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', + 'playlist': [{ + 'md5': '377572d0b49c4ce0c9ad77470e0b96b4', + 'info_dict': { + 'id': '2680832926', + 'ext': 'mp4', + 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', + 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', + 'upload_date': '20160201', + 'timestamp': 1454342820, + 'uploader': 'CBCC-NEW', + }, + }, { + 'md5': '415a0e3f586113894174dfb31aa5bb1a', + 'info_dict': { + 'id': '2658915080', + 'ext': 'mp4', + 'title': 'Fly like an eagle!', + 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', + 'upload_date': '20150315', + 'timestamp': 1426443984, + 'uploader': 'CBCC-NEW', + }, + }], + 'skip': 'Geo-restricted to Canada', + }, { + # multiple CBC.APP.Caffeine.initInstance(...) 
+ 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', + 'info_dict': { + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'id': 'dog-indoor-exercise-winter-1.3928238', + 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', + }, + 'playlist_mincount': 6, + }] + + @classmethod + def suitable(cls, url): + return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) + + def _extract_player_init(self, player_init, display_id): + player_info = self._parse_json(player_init, display_id, js_to_json) + media_id = player_info.get('mediaId') + if not media_id: + clip_id = player_info['clipId'] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] + return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + entries = [ + self._extract_player_init(player_init, display_id) + for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', + r'<div[^>]+\bid=["\']player-(\d+)', + r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) + entries.extend([ + self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + for media_id in orderedSet(media_ids)]) + return self.playlist_result( + entries, display_id, strip_or_none(title), + self._og_search_description(webpage)) + + +class CBCPlayerIE(InfoExtractor): + IE_NAME = 'cbc.ca:player' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.cbc.ca/player/play/2683190193', + 'md5': '64d25f841ddf4ddb28a235338af32e2c', + 'info_dict': { + 'id': '2683190193', + 'ext': 'mp4', + 'title': 'Gerry Runs a Sweat Shop', + 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', + 'timestamp': 1455071400, + 'upload_date': '20160210', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Geo-restricted to Canada', + }, { + # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '2657631896', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. 
Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'uploader': 'CBCC-NEW', + }, + }, { + 'url': 'http://www.cbc.ca/player/play/2164402062', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', + 'info_dict': { + 'id': '2164402062', + 'ext': 'mp4', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'uploader': 'CBCC-NEW', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { + 'force_smil_url': True + }), + 'id': video_id, + } + + +class CBCGemIE(InfoExtractor): + IE_NAME = 'gem.cbc.ca' + _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' + _TESTS = [{ + # This is a normal, public, TV show video + 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', + 'md5': '93dbb31c74a8e45b378cf13bd3f6f11e', + 'info_dict': { + 'id': 'schitts-creek/s06e01', + 'ext': 'mp4', + 'title': 'Smoke Signals', + 'description': 'md5:929868d20021c924020641769eb3e7f1', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)', + 'duration': 1314, + 'categories': ['comedy'], + 'series': 'Schitt\'s Creek', + 'season': 'Season 6', + 'season_number': 6, + 'episode': 'Smoke Signals', + 'episode_number': 1, + 'episode_id': 'schitts-creek/s06e01', + }, + 'params': {'format': 'bv'}, + 'skip': 'Geo-restricted to Canada', + }, { + # This video requires an account in the browser, but works fine in yt-dlp + 'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01', + 'md5': '297a9600f554f2258aed01514226a697', + 'info_dict': { + 'id': 'schitts-creek/s01e01', + 'ext': 'mp4', + 'title': 'The Cup Runneth Over', + 'description': 'md5:9bca14ea49ab808097530eb05a29e797', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)', + 'series': 'Schitt\'s Creek', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'The Cup Runneth Over', + 'episode_id': 'schitts-creek/s01e01', + 'duration': 1309, + 'categories': ['comedy'], + }, + 'params': {'format': 'bv'}, + 'skip': 'Geo-restricted to Canada', + }] + _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_json(self._API_BASE + video_id, video_id) + + last_error = None + attempt = -1 + retries = self.get_param('extractor_retries', 15) + while attempt < retries: + attempt += 1 + if last_error: + self.report_warning('%s. Retrying ...' 
% last_error) + m3u8_info = self._download_json( + video_info['playSession']['url'], video_id, + note='Downloading JSON metadata%s' % f' (attempt {attempt})') + m3u8_url = m3u8_info.get('url') + if m3u8_url: + break + elif m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + else: + last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' + # 35 means media unavailable, but retries work + if m3u8_info.get('errorCode') != 35 or attempt >= retries: + raise ExtractorError(last_error) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + self._remove_duplicate_formats(formats) + + for i, format in enumerate(formats): + if format.get('vcodec') == 'none': + if format.get('ext') is None: + format['ext'] = 'm4a' + if format.get('acodec') is None: + format['acodec'] = 'mp4a.40.2' + + # Put described audio at the beginning of the list, so that it + # isn't chosen by default, as most people won't want it. + if 'descriptive' in format['format_id'].lower(): + format['preference'] = -2 + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'description': video_info.get('description'), + 'thumbnail': video_info.get('image'), + 'series': video_info.get('series'), + 'season_number': video_info.get('season'), + 'season': f'Season {video_info.get("season")}', + 'episode_number': video_info.get('episode'), + 'episode': video_info.get('title'), + 'episode_id': video_id, + 'duration': video_info.get('duration'), + 'categories': [video_info.get('category')], + 'formats': formats, + 'release_timestamp': video_info.get('airDate'), + 'timestamp': video_info.get('availableDate'), + } + + +class CBCGemPlaylistIE(InfoExtractor): + IE_NAME = 'gem.cbc.ca:playlist' + _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' + _TESTS = [{ + # TV show playlist, all public videos + 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', + 'playlist_count': 16, + 'info_dict': { + 'id': 'schitts-creek/s06', + 'title': 'Season 6', + 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', + }, + }] + _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' + + def _real_extract(self, url): + match = self._match_valid_url(url) + season_id = match.group('id') + show = match.group('show') + show_info = self._download_json(self._API_BASE + show, season_id) + season = int(match.group('season')) + season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + if season_info is None: + raise ExtractorError(f'Couldn\'t find season {season} of {show}') + + episodes = [] + for episode in season_info['assets']: + episodes.append({ + '_type': 'url_transparent', + 'ie_key': 'CBCGem', + 'url': 'https://gem.cbc.ca/media/' + episode['id'], + 'id': episode['id'], + 'title': episode.get('title'), + 'description': episode.get('description'), + 'thumbnail': episode.get('image'), + 'series': episode.get('series'), + 'season_number': episode.get('season'), + 'season': season_info['title'], + 'season_id': season_info.get('id'), + 'episode_number': episode.get('episode'), + 'episode': episode.get('title'), + 'episode_id': episode['id'], + 'duration': episode.get('duration'), + 'categories': [episode.get('category')], + }) + + thumbnail = None + tn_uri = season_info.get('image') + # the-national was observed to use a "data:image/png;base64" + # URI for their 'image' value. The image was 1x1, and is + # probably just a placeholder, so it is ignored. 
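# For illustration, such a placeholder value looks like (payload
# shortened):
#
#   data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB...
#
# i.e. an inline image rather than a fetchable URL, which is why the
# startswith('data:') guard below discards it.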
+ if tn_uri is not None and not tn_uri.startswith('data:'): + thumbnail = tn_uri + + return { + '_type': 'playlist', + 'entries': episodes, + 'id': season_id, + 'title': season_info['title'], + 'description': season_info.get('description'), + 'thumbnail': thumbnail, + 'series': show_info.get('title'), + 'season_number': season_info.get('season'), + 'season': season_info['title'], + } + + +class CBCGemLiveIE(InfoExtractor): + IE_NAME = 'gem.cbc.ca:live' + _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})' + _TEST = { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'is_live': True, + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'timestamp': 1492106160, + 'upload_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', + } + + # It's unclear where the chars at the end come from, but they appear to be + # constant. Might need updating in the future. + _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT' + + def _real_extract(self, url): + video_id = self._match_id(url) + live_info = self._download_json(self._API, video_id)['entries'] + + video_info = None + for stream in live_info: + if stream.get('guid') == video_id: + video_info = stream + + if video_info is None: + raise ExtractorError( + 'Couldn\'t find video metadata, maybe this livestream is now offline', + expected=True) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': video_info['content'][0]['url'], + 'id': video_id, + 'title': video_info.get('title'), + 'description': video_info.get('description'), + 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), + 'thumbnail': video_info.get('cbc$staticImage'), + 'is_live': True, + } diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py new file mode 100644 index 000000000..ae9ce5862 --- /dev/null +++ b/yt_dlp/extractor/cbs.py @@ -0,0 +1,167 @@ +from __future__ import unicode_literals + +from .theplatform import ThePlatformFeedIE +from ..utils import ( + ExtractorError, + int_or_none, + find_xpath_attr, + xpath_element, + xpath_text, + update_url_query, + url_or_none, +) + + +class CBSBaseIE(ThePlatformFeedIE): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + subtitles = {} + for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: + cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k) + if cc_e is not None: + cc_url = cc_e.get('value') + if cc_url: + subtitles.setdefault(subtitles_lang, []).append({ + 'ext': ext, + 'url': cc_url, + }) + return subtitles + + def _extract_common_video_info(self, content_id, asset_types, mpx_acc, extra_info): + tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) + tp_release_url = f'https://link.theplatform.com/s/{tp_path}' + info = self._extract_theplatform_metadata(tp_path, content_id) + + formats, subtitles = [], {} + last_e = None + for asset_type, query in asset_types.items(): + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data' % asset_type) + except ExtractorError as e: + last_e = e + if asset_type != 'fallback': + continue + query['formats'] = '' # blank query to check if expired + try: + tp_formats, tp_subtitles = 
self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data, trying again with another format' % asset_type) + except ExtractorError as e: + last_e = e + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + self.raise_no_formats(last_e, True, content_id) + self._sort_formats(formats) + + extra_info.update({ + 'id': content_id, + 'formats': formats, + 'subtitles': subtitles, + }) + info.update({k: v for k, v in extra_info.items() if v is not None}) + return info + + def _extract_video_info(self, *args, **kwargs): + # Extract assets + metadata and call _extract_common_video_info + raise NotImplementedError('This method must be implemented by subclasses') + + def _real_extract(self, url): + return self._extract_video_info(self._match_id(url)) + + +class CBSIE(CBSBaseIE): + _VALID_URL = r'''(?x) + (?: + cbs:| + https?://(?:www\.)?(?: + cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/| + colbertlateshow\.com/(?:video|podcasts)/) + )(?P<id>[\w-]+)''' + + # All tests are blocked outside US + _TESTS = [{ + 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'info_dict': { + 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'ext': 'mp4', + 'title': 'Connect Chat feat. Garth Brooks', + 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-', + 'info_dict': { + 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2', + 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)', + 'timestamp': 1624507140, + 'description': 'md5:e01af24e95c74d55e8775aef86117b95', + 'uploader': 'CBSI-NEW', + 'upload_date': '20210624', + }, + 'params': { + 'ignore_no_formats_error': True, + 'skip_download': True, + }, + 'expected_warnings': [ + 'This content expired on', 'No video formats found', 'Requested format is not available'], + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, + }] + + def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): + items_data = self._download_xml( + 'https://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': site, 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title') or xpath_text(video_data, 'videotitle', 'title') + + asset_types = {} + has_drm = False + for item in items_data.findall('.//item'): + asset_type = xpath_text(item, 'assetType') + query = { + 'mbr': 'true', + 'assetTypes': asset_type, + } + if not asset_type: + # fallback for content_ids that videoPlayerService doesn't return anything for + asset_type = 'fallback' + query['formats'] = 'M3U+none,MPEG4,M3U+appleHlsEncryption,MP3' + del query['assetTypes'] + if asset_type in asset_types: + continue + elif 
any(excluded in asset_type for excluded in ('HLS_FPS', 'DASH_CENC', 'OnceURL')): + if 'DASH_CENC' in asset_type: + has_drm = True + continue + if asset_type.startswith('HLS') or 'StreamPack' in asset_type: + query['formats'] = 'MPEG4,M3U' + elif asset_type in ('RTMP', 'WIFI', '3G'): + query['formats'] = 'MPEG4,FLV' + asset_types[asset_type] = query + + if not asset_types and has_drm: + self.report_drm(content_id) + + return self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info={ + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')), + }) diff --git a/yt_dlp/extractor/cbsinteractive.py b/yt_dlp/extractor/cbsinteractive.py new file mode 100644 index 000000000..9d4f75435 --- /dev/null +++ b/yt_dlp/extractor/cbsinteractive.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .cbs import CBSIE +from ..utils import int_or_none + + +class CBSInteractiveIE(CBSIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' + _TESTS = [{ + 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', + 'info_dict': { + 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', + 'display_id': 'hands-on-with-microsofts-windows-8-1-update', + 'ext': 'mp4', + 'title': 'Hands-on with Microsoft Windows 8.1 Update', + 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', + 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', + 'uploader': 'Sarah Mitroff', + 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', + 'info_dict': { + 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', + 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', + 'ext': 'mp4', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', + }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', + 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'duration': 731, + 'timestamp': 1449129925, + 'upload_date': '20151203', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', + 'only_matching': True, + }] + + MPX_ACCOUNTS = { + 'cnet': 2198311517, + 'zdnet': 2387448114, + } + + def _real_extract(self, url): + site, 
display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + + data_json = self._html_search_regex( + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", + webpage, 'data json') + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] + + video_id = vdata['mpxRefId'] + + title = vdata['title'] + author = vdata.get('author') + if author: + uploader = '%s %s' % (author['firstName'], author['lastName']) + uploader_id = author.get('id') + else: + uploader = None + uploader_id = None + + info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) + info.update({ + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'duration': int_or_none(vdata.get('duration')), + 'uploader': uploader, + 'uploader_id': uploader_id, + }) + return info diff --git a/youtube_dl/extractor/cbslocal.py b/yt_dlp/extractor/cbslocal.py index 3b7e1a8b9..3b7e1a8b9 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/yt_dlp/extractor/cbslocal.py diff --git a/youtube_dl/extractor/cbsnews.py b/yt_dlp/extractor/cbsnews.py index 1285ed65e..1285ed65e 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/yt_dlp/extractor/cbsnews.py diff --git a/yt_dlp/extractor/cbssports.py b/yt_dlp/extractor/cbssports.py new file mode 100644 index 000000000..b8a6e5967 --- /dev/null +++ b/yt_dlp/extractor/cbssports.py @@ -0,0 +1,112 @@ +from __future__ import unicode_literals + + +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) + + +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? 
+ (?: + ids%3D(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P<pcid>\d+) + )''' + _TESTS = [{ + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, + }, { + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', + 'only_matching': True, + }] + + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + + def _real_extract(self, url): + uuid, pcid = self._match_valid_url(url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + iframe_url = self._search_regex( + r'<iframe[^>]+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P<id>\d+)' + _TESTS = [{ + 'url': 
'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/youtube_dl/extractor/ccc.py b/yt_dlp/extractor/ccc.py index 36e6dff72..36e6dff72 100644 --- a/youtube_dl/extractor/ccc.py +++ b/yt_dlp/extractor/ccc.py diff --git a/yt_dlp/extractor/ccma.py b/yt_dlp/extractor/ccma.py new file mode 100644 index 000000000..ea98f8688 --- /dev/null +++ b/yt_dlp/extractor/ccma.py @@ -0,0 +1,154 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import calendar +import datetime + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_timezone, + int_or_none, + parse_duration, + parse_resolution, + try_get, + url_or_none, +) + + +class CCMAIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + 'md5': '7296ca43977c8ea4469e719c609b0871', + 'info_dict': { + 'id': '5630208', + 'ext': 'mp4', + 'title': 'L\'espot de La Marató de TV3', + 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, + } + }, { + 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + 'md5': 'fa3e38f269329a278271276330261425', + 'info_dict': { + 'id': '943685', + 'ext': 'mp3', + 'title': 'El Consell de Savis analitza el derbi', + 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', + } + }] + + def _real_extract(self, url): + media_type, media_id = self._match_valid_url(url).groups() + + media = self._download_json( + 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'media': media_type, + 'idint': media_id, + }) + + formats = [] + media_url = media['media']['url'] + if isinstance(media_url, list): + for format_ in media_url: + format_url = url_or_none(format_.get('file')) + if not format_url: + continue + label = format_.get('label') + f = parse_resolution(label) + f.update({ + 'url': format_url, + 'format_id': label, + }) + formats.append(f) + else: + formats.append({ + 'url': media_url, + 'vcodec': 'none' if media_type == 'audio' else None, + }) + self._sort_formats(formats) + + informacio = media['informacio'] + title = informacio['titol'] + durada = informacio.get('durada') or {} + duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timezone, data_utc = 
extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) + except TypeError: + pass + + subtitles = {} + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') + if sub_url: + subtitles.setdefault( + st.get('iso') or st.get('text') or 'ca', []).append({ + 'url': sub_url, + }) + + thumbnails = [] + imatges = media.get('imatges', {}) + if imatges: + thumbnail_url = imatges.get('url') + if thumbnail_url: + thumbnails = [{ + 'url': thumbnail_url, + 'width': int_or_none(imatges.get('amplada')), + 'height': int_or_none(imatges.get('alcada')), + }] + + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(informacio.get('descripcio')), + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), + } diff --git a/yt_dlp/extractor/cctv.py b/yt_dlp/extractor/cctv.py new file mode 100644 index 000000000..9b8612138 --- /dev/null +++ b/yt_dlp/extractor/cctv.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, + unified_timestamp, +) + + +class CCTVIE(InfoExtractor): + IE_DESC = '央视网' + _VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P<id>[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)' + _TESTS = [{ + # fo.addVariable("videoCenterId","id") + 'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml', + 'md5': 'd61ec00a493e09da810bf406a078f691', + 'info_dict': { + 'id': '5ecdbeab623f4973b40ff25f18b174e8', + 'ext': 'mp4', + 'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)', + 'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95', + 'duration': 98, + 'uploader': 'songjunjie', + 'timestamp': 1455279956, + 'upload_date': '20160212', + }, + }, { + # var guid = "id" + 'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml', + 'info_dict': { + 'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae', + 'ext': 'mp4', + 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)', + 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。', + 'duration': 37, + 'uploader': 'shujun', + 'timestamp': 1454677291, + 'upload_date': '20160205', + }, + 'params': { + 'skip_download': True, + }, + }, { + # changePlayer('id') + 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml', + 'info_dict': { + 'id': '4bb9bb4db7a6471ba85fdeda5af0381e', + 'ext': 'mp4', + 'title': 'NHnews008 ANNUAL POLITICAL SEASON', + 'description': 'Four Comprehensives', + 'duration': 60, + 'uploader': 'zhangyunlei', + 'timestamp': 1425385521, + 'upload_date': '20150303', + }, + 'params': { + 'skip_download': True, + }, + }, { + # loadvideo('id') + 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml', + 'info_dict': { + 'id': 'b15f009ff45c43968b9af583fc2e04b2', + 'ext': 'mp4', + 
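To make the CCMA timestamp code above concrete: extract_timezone splits a trailing UTC offset off the date string and returns it as a timedelta, after which calendar.timegm treats the offset-corrected naive datetime as UTC. A worked sketch with a hypothetical data_emissio.utc value; note the extractor's '%Y-%d-%mT%H:%M:%S' pattern reads day before month, so the sample follows that field order:

import calendar
import datetime

from yt_dlp.utils import extract_timezone

data_utc = '2016-08-11T13:29:00+0100'  # hypothetical API value, day-month order
timezone, data_utc = extract_timezone(data_utc)  # timedelta(hours=1), '2016-08-11T13:29:00'
timestamp = calendar.timegm((datetime.datetime.strptime(
    data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
print(timestamp)  # 1478608140, i.e. 2016-11-08T12:29:00Z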
'title': 'Путь,усыпанный космеями Серия 1', + 'description': 'Путь, усыпанный космеями', + 'duration': 2645, + 'uploader': 'renxue', + 'timestamp': 1477479241, + 'upload_date': '20161026', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var initMyAray = 'id' + 'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml', + 'info_dict': { + 'id': 'a194cfa7f18c426b823d876668325946', + 'ext': 'mp4', + 'title': '小泽征尔音乐塾 音乐梦想无国界', + 'duration': 2173, + 'timestamp': 1369248264, + 'upload_date': '20130522', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var ids = ["id"] + 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', + 'info_dict': { + 'id': 'a8606119a4884588a79d81c02abecc16', + 'ext': 'mp3', + 'title': '来自维也纳的新年贺礼', + 'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7', + 'duration': 1578, + 'uploader': 'djy', + 'timestamp': 1482942419, + 'upload_date': '20161228', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44', + 'only_matching': True, + }, { + 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', + r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', + r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', + r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', + r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', + r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'], + webpage, 'video id') + + data = self._download_json( + 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id, + query={ + 'pid': video_id, + 'url': url, + 'idl': 32, + 'idlr': 32, + 'modifyed': 'false', + }) + + title = data['title'] + + formats = [] + + video = data.get('video') + if isinstance(video, dict): + for quality, chapters_key in enumerate(('lowChapters', 'chapters')): + video_url = try_get( + video, lambda x: x[chapters_key][0]['url'], compat_str) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + 'quality': quality, + 'source_preference': -10 + }) + + hls_url = try_get(data, lambda x: x['hls_url'], compat_str) + if hls_url: + hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + uploader = data.get('editer_name') + description = self._html_search_meta( + 'description', webpage, default=None) + timestamp = unified_timestamp(data.get('f_pgmtime')) + duration = float_or_none(try_get(video, lambda x: x['totalLength'])) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py new file mode 100644 index 000000000..72c47050f --- /dev/null +++ b/yt_dlp/extractor/cda.py @@ -0,0 +1,234 @@ +# coding: utf-8 +from __future__ 
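The CCTV extractor above copes with six different embedding styles by handing _search_regex a list of patterns, which returns the first capture group that matches. A rough plain-re equivalent of that probe, against a hypothetical page snippet:

import re

patterns = [
    r'var\s+guid\s*=\s*["\']([\da-fA-F]+)',
    r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)',
    r'changePlayer\s*\(\s*["\']([\da-fA-F]+)',
]
webpage = 'fo.addVariable("videoCenterId","5ecdbeab623f4973b40ff25f18b174e8");'  # hypothetical
video_id = next(
    (m.group(1) for m in (re.search(p, webpage) for p in patterns) if m), None)
print(video_id)  # 5ecdbeab623f4973b40ff25f18b174e8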
import unicode_literals + +import codecs +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + merge_dicts, + multipart_encode, + parse_duration, + random_birthday, + urljoin, + try_get, +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' + _BASE_URL = 'http://www.cda.pl/' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'description': 'md5:269ccd135d550da90d1662651fcb9772', + 'thumbnail': r're:^https?://.*\.jpg$', + 'average_rating': float, + 'duration': 39, + 'age_limit': 0, + 'upload_date': '20160221', + 'timestamp': 1456078244, + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'crash404', + 'view_count': int, + 'average_rating': float, + 'duration': 137, + 'age_limit': 0, + } + }, { + # Age-restricted + 'url': 'http://www.cda.pl/video/1273454c4', + 'info_dict': { + 'id': '1273454c4', + 'ext': 'mp4', + 'title': 'Bronson (2008) napisy HD 1080p', + 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'height': 1080, + 'uploader': 'boniek61', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5554, + 'age_limit': 18, + 'view_count': int, + 'average_rating': float, + }, + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _download_age_confirm_page(self, url, video_id, *args, **kwargs): + form_data = random_birthday('rok', 'miesiac', 'dzien') + form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) + data, content_type = multipart_encode(form_data) + return self._download_webpage( + urljoin(url, '/a/validatebirth'), video_id, *args, + data=data, headers={ + 'Referer': url, + 'Content-Type': content_type, + }, **kwargs) + + def _real_extract(self, url): + video_id = self._match_id(url) + self._set_cookie('cda.pl', 'cda.player', 'html5') + webpage = self._download_webpage( + self._BASE_URL + '/video/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): + self.raise_geo_restricted() + + need_confirm_age = False + if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', + webpage, 'birthday validate form', default=None): + webpage = self._download_age_confirm_page( + url, video_id, note='Confirming age') + need_confirm_age = True + + formats = [] + + uploader = self._search_regex(r'''(?x) + <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> + (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? 
+ <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> + ''', webpage, 'uploader', default=None, group='uploader') + view_count = self._search_regex( + r'Odsłony:(?:\s| )*([0-9]+)', webpage, + 'view_count', default=None) + average_rating = self._search_regex( + (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', + r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, + group='rating_value') + + info_dict = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'uploader': uploader, + 'view_count': int_or_none(view_count), + 'average_rating': float_or_none(average_rating), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': None, + 'age_limit': 18 if need_confirm_age else 0, + } + + info = self._search_json_ld(webpage, video_id, default={}) + + # Source: https://www.cda.pl/js/player.js?t=1606154898 + def decrypt_file(a): + for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): + a = a.replace(p, '') + a = compat_urllib_parse_unquote(a) + b = [] + for c in a: + f = compat_ord(c) + b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f <= 126 else compat_chr(f)) + a = ''.join(b) + a = a.replace('.cda.mp4', '') + for p in ('.2cda.pl', '.3cda.pl'): + a = a.replace(p, '.cda.pl') + if '/upstream' in a: + a = a.replace('/upstream', '.mp4/upstream') + return 'https://' + a + return 'https://' + a + '.mp4' + + def extract_format(page, version): + json_str = self._html_search_regex( + r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, + '%s player_json' % version, fatal=False, group='player_data') + if not json_str: + return + player_data = self._parse_json( + json_str, '%s player_data' % version, fatal=False) + if not player_data: + return + video = player_data.get('video') + if not video or 'file' not in video: + self.report_warning('Unable to extract %s version information' % version) + return + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) + video_quality = video.get('quality') + qualities = video.get('qualities', {}) + video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality) + info_dict['formats'].append({ + 'url': video['file'], + 'format_id': video_quality, + 'height': int_or_none(video_quality[:-1]), + }) + for quality, cda_quality in qualities.items(): + if quality == video_quality: + continue + data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2, + 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]} + data = json.dumps(data).encode('utf-8') + video_url = self._download_json( + f'https://www.cda.pl/video/{video_id}', video_id, headers={ + 'Content-Type': 'application/json', + 'X-Requested-With': 'XMLHttpRequest' + }, data=data, note=f'Fetching {quality} url', + errnote=f'Failed to fetch {quality} url', fatal=False) + if try_get(video_url, lambda x: x['result']['status']) == 'ok': + video_url = try_get(video_url, lambda x: x['result']['resp']) + info_dict['formats'].append({ + 'url': video_url, + 'format_id': quality, + 'height': int_or_none(quality[:-1]) + }) + + if not info_dict['duration']: + info_dict['duration'] = parse_duration(video.get('duration')) + + extract_format(webpage, 
'default') + + for href, resolution in re.findall( + r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + if need_confirm_age: + handler = self._download_age_confirm_page + else: + handler = self._download_webpage + + webpage = handler( + urljoin(self._BASE_URL, href), video_id, + 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return merge_dicts(info_dict, info) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py new file mode 100644 index 000000000..5e04d38a2 --- /dev/null +++ b/yt_dlp/extractor/ceskatelevize.py @@ -0,0 +1,290 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + sanitized_Request, + unescapeHTML, + update_url_query, + urlencode_postdata, + USER_AGENTS, +) + + +class CeskaTelevizeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _TESTS = [{ + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '61924494877246241', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Život v Grónsku', + 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + 'info_dict': { + 'id': '61924494877028507', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'description': 'English Subtittles', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 81.3, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to Czech Republic', + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' 
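The decrypt_file routine in the CDA extractor above is, at its core, ROT47: 33 + (f + 14) % 94 rotates every printable ASCII character 47 places around a 94-character wheel, which is why applying it twice returns the input. A self-contained sketch of just that step (the path is made up):

def rot47(s):
    # same arithmetic as decrypt_file above: rotate printable ASCII
    # (codes 33-126) by 47 positions modulo 94, pass everything else through
    return ''.join(
        chr(33 + (ord(c) + 14) % 94) if 33 <= ord(c) <= 126 else c
        for c in s)

scrambled = rot47('vcdn.example/video.mp4')  # hypothetical scrambled path
assert rot47(scrambled) == 'vcdn.example/video.mp4'  # ROT47 is an involution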
+ if '%s</p>' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + type_ = None + episode_id = None + + playlist = self._parse_json( + self._search_regex( + r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', + default='{}'), playlist_id) + if playlist: + type_ = playlist.get('type') + episode_id = playlist.get('id') + + if not type_: + type_ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', + webpage, 'type') + if not episode_id: + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', + webpage, 'episode_id') + + data = { + 'playlist[0][type]': type_, + 'playlist[0][id]': episode_id, + 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestSource': 'iVysilani', + } + + entries = [] + + for user_agent in (None, USER_AGENTS['Safari']): + req = sanitized_Request( + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=urlencode_postdata(data)) + + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + if user_agent: + req.add_header('User-Agent', user_agent) + req.add_header('Referer', url) + + playlistpage = self._download_json(req, playlist_id, fatal=False) + + if not playlistpage: + continue + + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) + req.add_header('Referer', url) + + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) + + playlist = self._download_json(req, playlist_id, fatal=False) + if not playlist: + continue + + playlist = playlist.get('playlist') + if not isinstance(playlist, list): + continue + + playlist_len = len(playlist) + + for num, item in enumerate(playlist): + is_live = item.get('type') == 'LIVE' + formats = [] + for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'playerType=flash' in stream_url: + stream_formats = self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % format_id, fatal=False) + else: + stream_formats = self._extract_mpd_formats( + stream_url, playlist_id, + mpd_id='dash-%s' % format_id, fatal=False) + if 'drmOnly=true' in stream_url: + for f in stream_formats: + f['has_drm'] = True + # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031 + if format_id == 'audioDescription': + for f in stream_formats: + f['source_preference'] = -10 + formats.extend(stream_formats) + + if user_agent and len(entries) == playlist_len: + entries[num]['formats'].extend(formats) + continue + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + if playlist_len == 1: + final_title = playlist_title or title + if is_live: + final_title = self._live_title(final_title) + else: + final_title = '%s (%s)' % (playlist_title, title) + + entries.append({ + 'id': item_id, + 'title': final_title, + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 
'subtitles': subtitles, + 'is_live': is_live, + }) + + for e in entries: + self._sort_formats(e['formats']) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + + @staticmethod + def _fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield '{0} --> {1}'.format(start, stop) + else: + yield line + + return '\r\n'.join(_fix_subtitle(subtitles)) + + +class CeskaTelevizePoradyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _TESTS = [{ + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494876844842', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data_url = update_url_query(unescapeHTML(self._search_regex( + (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe player url', group='url')), query={ + 'autoStart': 'true', + }) + + return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/yt_dlp/extractor/cgtn.py b/yt_dlp/extractor/cgtn.py new file mode 100644 index 000000000..89f173887 --- /dev/null +++ b/yt_dlp/extractor/cgtn.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_timestamp, +) + + +class CGTNIE(InfoExtractor): + _VALID_URL = r'https?://news\.cgtn\.com/news/[0-9]{4}-[0-9]{2}-[0-9]{2}/[a-zA-Z0-9-]+-(?P<id>[a-zA-Z0-9-]+)/index\.html' + _TESTS = [ + { + 'url': 'https://news.cgtn.com/news/2021-03-09/Up-and-Out-of-Poverty-Ep-1-A-solemn-promise-YuOUaOzGQU/index.html', + 'info_dict': { + 'id': 'YuOUaOzGQU', + 'ext': 'mp4', + 'title': 'Up and Out of Poverty Ep. 
1: A solemn promise', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1615295940, + 'upload_date': '20210309', + }, + 'params': { + 'skip_download': True + } + }, { + 'url': 'https://news.cgtn.com/news/2021-06-06/China-Indonesia-vow-to-further-deepen-maritime-cooperation-10REvJCewCY/index.html', + 'info_dict': { + 'id': '10REvJCewCY', + 'ext': 'mp4', + 'title': 'China, Indonesia vow to further deepen maritime cooperation', + 'thumbnail': r're:^https?://.*\.png$', + 'description': 'China and Indonesia vowed to upgrade their cooperation into the maritime sector and also for political security, economy, and cultural and people-to-people exchanges.', + 'author': 'CGTN', + 'category': 'China', + 'timestamp': 1622950200, + 'upload_date': '20210606', + }, + 'params': { + 'skip_download': False + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + download_url = self._html_search_regex(r'data-video ="(?P<url>.+m3u8)"', webpage, 'download_url') + datetime_str = self._html_search_regex(r'<span class="date">\s*(.+?)\s*</span>', webpage, 'datetime_str', fatal=False) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(download_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'), + 'category': self._html_search_regex(r'<span class="section">\s*(.+?)\s*</span>', + webpage, 'category', fatal=False), + 'author': self._html_search_regex(r'<div class="news-author-name">\s*(.+?)\s*</div>', + webpage, 'author', default=None, fatal=False), + 'timestamp': try_get(unified_timestamp(datetime_str), lambda x: x - 8 * 3600), + } diff --git a/yt_dlp/extractor/channel9.py b/yt_dlp/extractor/channel9.py new file mode 100644 index 000000000..90024dbba --- /dev/null +++ b/yt_dlp/extractor/channel9.py @@ -0,0 +1,260 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_iso8601, + qualities, + unescapeHTML, +) + + +class Channel9IE(InfoExtractor): + IE_DESC = 'Channel 9' + IE_NAME = 'channel9' + _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + + _TESTS = [{ + 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + 'md5': '32083d4eaf1946db6d454313f44510ca', + 'info_dict': { + 'id': '6c413323-383a-49dc-88f9-a22800cab024', + 'ext': 'wmv', + 'title': 'Developer Kick-Off Session: Stuff We Love', + 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', + 'duration': 4576, + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1377717420, + 'upload_date': '20130828', + 'session_code': 'KOS002', + 'session_room': 'Arena 1A', + 'session_speakers': 'count:5', + }, + }, { + 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', + 'info_dict': { + 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', + 'ext': 'wmv', + 'title': 'Self-service BI with Power BI - nuclear testing', + 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', + 'duration': 1540, + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1386381991, + 'upload_date': '20131207', + 'authors': ['Mike Wilmot'], + }, + }, { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': 
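On the CGTN timestamp above: the date shown on the page is apparently Beijing time (UTC+8), so eight hours are subtracted after parsing, and wrapping the result in try_get keeps the expression None-safe when the date span is missing. A sketch with a hypothetical on-page value:

from yt_dlp.utils import try_get, unified_timestamp

datetime_str = '2021-03-09 21:19:00'  # hypothetical on-page value, Beijing time
timestamp = try_get(unified_timestamp(datetime_str), lambda x: x - 8 * 3600)
print(timestamp)  # 1615295940, i.e. 2021-03-09T13:19:00Z

assert try_get(unified_timestamp(None), lambda x: x - 8 * 3600) is None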
{ + 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', + 'duration': 5646, + 'thumbnail': r're:https?://.*\.jpg', + 'upload_date': '20150930', + 'timestamp': 1443640735, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', + 'info_dict': { + 'id': 'Events/DEVintersection/DEVintersection-2016', + 'title': 'DEVintersection 2016 Orlando Sessions', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', + 'only_matching': True, + }, { + 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', + 'only_matching': True, + }] + + _RSS_URL = 'http://channel9.msdn.com/%s/RSS' + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', + webpage) + + def _extract_list(self, video_id, rss_url=None): + if not rss_url: + rss_url = self._RSS_URL % video_id + rss = self._download_xml(rss_url, video_id, 'Downloading RSS') + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, video_id, title_text) + + def _real_extract(self, url): + content_path, rss = self._match_valid_url(url).groups() + + if rss: + return self._extract_list(content_path, url) + + webpage = self._download_webpage( + url, content_path, 'Downloading web page') + + episode_data = self._search_regex( + r"data-episode='([^']+)'", webpage, 'episode data', default=None) + if episode_data: + episode_data = self._parse_json(unescapeHTML( + episode_data), content_path) + content_id = episode_data['contentId'] + is_session = '/Sessions(' in episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' + if is_session: + content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' + else: + content_url += 'Authors,Body&$expand=Authors' + content_data = self._download_json(content_url, content_id) + title = content_data['Title'] + + QUALITIES = ( + 'mp3', + 'wmv', 'mp4', + 'wmv-low', 'mp4-low', + 'wmv-mid', 'mp4-mid', + 'wmv-high', 'mp4-high', + ) + + quality_key = qualities(QUALITIES) + + def quality(quality_id, format_url): + return (len(QUALITIES) if '_Source.' 
in format_url + else quality_key(quality_id)) + + formats = [] + urls = set() + + SITE_QUALITIES = { + 'MP3': 'mp3', + 'MP4': 'mp4', + 'Low Quality WMV': 'wmv-low', + 'Low Quality MP4': 'mp4-low', + 'Mid Quality WMV': 'wmv-mid', + 'Mid Quality MP4': 'mp4-mid', + 'High Quality WMV': 'wmv-high', + 'High Quality MP4': 'mp4-high', + } + + formats_select = self._search_regex( + r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, + 'formats select', default=None) + if formats_select: + for mobj in re.finditer( + r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', + formats_select): + format_url = mobj.group('url') + if format_url in urls: + continue + urls.add(format_url) + format_id = mobj.group('format') + quality_id = SITE_QUALITIES.get(format_id, format_id) + formats.append({ + 'url': format_url, + 'format_id': quality_id, + 'quality': quality(quality_id, format_url), + 'vcodec': 'none' if quality_id == 'mp3' else None, + }) + + API_QUALITIES = { + 'VideoMP4Low': 'mp4-low', + 'VideoWMV': 'wmv-mid', + 'VideoMP4Medium': 'mp4-mid', + 'VideoMP4High': 'mp4-high', + 'VideoWMVHQ': 'wmv-hq', + } + + for format_id, q in API_QUALITIES.items(): + q_url = content_data.get(format_id) + if not q_url or q_url in urls: + continue + urls.add(q_url) + formats.append({ + 'url': q_url, + 'format_id': q, + 'quality': quality(q, q_url), + }) + + slides = content_data.get('Slides') + zip_file = content_data.get('ZipFile') + + if not formats and not slides and not zip_file: + self.raise_no_formats( + 'None of recording, slides or zip are available for %s' % content_path) + self._sort_formats(formats) + + subtitles = {} + for caption in content_data.get('Captions', []): + caption_url = caption.get('Url') + if not caption_url: + continue + subtitles.setdefault(caption.get('Language', 'en'), []).append({ + 'url': caption_url, + 'ext': 'vtt', + }) + + common = { + 'id': content_id, + 'title': title, + 'description': clean_html(content_data.get('Description') or content_data.get('Body')), + 'thumbnail': content_data.get('VideoPlayerPreviewImage'), + 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), + 'timestamp': parse_iso8601(content_data.get('PublishedDate')), + 'avg_rating': int_or_none(content_data.get('Rating')), + 'rating_count': int_or_none(content_data.get('RatingCount')), + 'view_count': int_or_none(content_data.get('Views')), + 'comment_count': int_or_none(content_data.get('CommentCount')), + 'subtitles': subtitles, + } + if is_session: + speakers = [] + for s in content_data.get('Speakers', []): + speaker_name = s.get('FullName') + if not speaker_name: + continue + speakers.append(speaker_name) + + common.update({ + 'session_code': content_data.get('Code'), + 'session_room': content_data.get('Room'), + 'session_speakers': speakers, + }) + else: + authors = [] + for a in content_data.get('Authors', []): + author_name = a.get('DisplayName') + if not author_name: + continue + authors.append(author_name) + common['authors'] = authors + + contents = [] + + if slides: + d = common.copy() + d.update({'title': title + '-Slides', 'url': slides}) + contents.append(d) + + if zip_file: + d = common.copy() + d.update({'title': title + '-Zip', 'url': zip_file}) + contents.append(d) + + if formats: + d = common.copy() + d.update({'title': title, 'formats': formats}) + contents.append(d) + return self.playlist_result(contents) + else: + return self._extract_list(content_path) diff --git a/youtube_dl/extractor/charlierose.py b/yt_dlp/extractor/charlierose.py 
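The format ranking in Channel9IE above leans on yt-dlp's qualities helper: a format id's position in the tuple is its preference, unknown ids get -1, and '_Source.' downloads are forced above everything with len(QUALITIES). A quick sketch; note that the 'wmv-hq' id emitted via API_QUALITIES is not in the tuple, so it sorts below all listed ids:

from yt_dlp.utils import qualities

QUALITIES = ('mp3', 'wmv', 'mp4', 'wmv-low', 'mp4-low',
             'wmv-mid', 'mp4-mid', 'wmv-high', 'mp4-high')
quality_key = qualities(QUALITIES)
print(quality_key('mp4-high'))  # 8, most preferred of the listed ids
print(quality_key('mp3'))       # 0, least preferred
print(quality_key('wmv-hq'))    # -1, unknown id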
index 42c9af263..42c9af263 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/yt_dlp/extractor/charlierose.py diff --git a/youtube_dl/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index a459dcb8d..a459dcb8d 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py diff --git a/yt_dlp/extractor/chilloutzone.py b/yt_dlp/extractor/chilloutzone.py new file mode 100644 index 000000000..fd5202b9e --- /dev/null +++ b/yt_dlp/extractor/chilloutzone.py @@ -0,0 +1,95 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import compat_b64decode +from ..utils import ( + clean_html, + ExtractorError +) + + +class ChilloutzoneIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html' + _TESTS = [{ + 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', + 'md5': 'a76f3457e813ea0037e5244f509e66d1', + 'info_dict': { + 'id': 'enemene-meck-alle-katzen-weg', + 'ext': 'mp4', + 'title': 'Enemene Meck - Alle Katzen weg', + 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', + }, + }, { + 'note': 'Video hosted at YouTube', + 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', + 'info_dict': { + 'id': '1YVQaAgHyRU', + 'ext': 'mp4', + 'title': '16 Photos Taken 1 Second Before Disaster', + 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', + 'uploader': 'BuzzFeedVideo', + 'uploader_id': 'BuzzFeedVideo', + 'upload_date': '20131105', + }, + }, { + 'note': 'Video hosted at Vimeo', + 'url': 'http://www.chilloutzone.net/video/icon-blending.html', + 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', + 'info_dict': { + 'id': '85523671', + 'ext': 'mp4', + 'title': 'The Sunday Times - Icons', + 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', + 'uploader': 'Us', + 'uploader_id': 'usfilms', + 'upload_date': '20140131' + }, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + base64_video_info = self._html_search_regex( + r'var cozVidData = "(.+?)";', webpage, 'video data') + decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') + video_info_dict = json.loads(decoded_video_info) + + # get video information from dict + video_url = video_info_dict['mediaUrl'] + description = clean_html(video_info_dict.get('description')) + title = video_info_dict['title'] + native_platform = video_info_dict['nativePlatform'] + native_video_id = video_info_dict['nativeVideoId'] + source_priority = video_info_dict['sourcePriority'] + + # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) + if native_platform is None: + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + # Non Fallback: Decide to use native source (e.g. 
youtube or vimeo) or
+        # its own CDN
+        if source_priority == 'native':
+            if native_platform == 'youtube':
+                return self.url_result(native_video_id, ie='Youtube')
+            if native_platform == 'vimeo':
+                return self.url_result(
+                    'http://vimeo.com/' + native_video_id, ie='Vimeo')
+
+        if not video_url:
+            raise ExtractorError('No video found')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': title,
+            'description': description,
+        }
diff --git a/yt_dlp/extractor/chingari.py b/yt_dlp/extractor/chingari.py
new file mode 100644
index 000000000..6bdc4f6bb
--- /dev/null
+++ b/yt_dlp/extractor/chingari.py
@@ -0,0 +1,209 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    url_or_none,
+)
+
+
+class ChingariBaseIE(InfoExtractor):
+    def _get_post(self, id, post_data):
+        media_data = post_data['mediaLocation']
+        base_url = media_data['base']
+        author_data = post_data.get('authorData', {})
+        song_data = post_data.get('song', {})  # revisit this in future to differentiate between 'art' and 'author'
+
+        formats = [{
+            'format_id': frmt,
+            'width': str_to_int(frmt[1:]),
+            'url': base_url + frmt_path,
+        } for frmt, frmt_path in media_data.get('transcoded', {}).items()]
+
+        if media_data.get('path'):
+            formats.append({
+                'format_id': 'original',
+                'format_note': 'Direct video.',
+                'url': base_url + '/apipublic' + media_data['path'],
+                'quality': 10,
+            })
+        self._sort_formats(formats)
+        timestamp = str_to_int(post_data.get('created_at'))
+        if timestamp:
+            timestamp = int_or_none(timestamp, 1000)
+
+        thumbnail, uploader_url = None, None
+        if media_data.get('thumbnail'):
+            thumbnail = base_url + media_data.get('thumbnail')
+        if author_data.get('username'):
+            uploader_url = 'https://chingari.io/' + author_data.get('username')
+
+        return {
+            'id': id,
+            'title': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))),
+            'description': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))),
+            'duration': media_data.get('duration'),
+            'thumbnail': url_or_none(thumbnail),
+            'like_count': post_data.get('likeCount'),
+            'view_count': post_data.get('viewsCount'),
+            'comment_count': post_data.get('commentCount'),
+            'repost_count': post_data.get('shareCount'),
+            'timestamp': timestamp,
+            'uploader_id': post_data.get('userId') or author_data.get('_id'),
+            'uploader': author_data.get('name'),
+            'uploader_url': url_or_none(uploader_url),
+            'track': song_data.get('title'),
+            'artist': song_data.get('author'),
+            'formats': formats,
+        }
+
+
+class ChingariIE(ChingariBaseIE):
+    _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)'
+    _TESTS = [{
+        'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb',
+        'info_dict': {
+            'id': '612f8f4ce1dc57090e8a7beb',
+            'ext': 'mp4',
+            'title': 'Happy birthday Srila Prabhupada',
+            'description': 'md5:c7080ebfdfeb06016e638c286d6bc3fa',
+            'duration': 0,
+            'thumbnail': 'https://media.chingari.io/uploads/c41d30e2-06b6-4e3b-9b4b-edbb929cec06-1630506826911/thumbnail/198f993f-ce87-4623-82c6-cd071bd6d4f4-1630506828016.jpg',
+            'like_count': int,
+            'view_count': int,
+            'comment_count': int,
+            'repost_count': int,
+            'timestamp': 1630506828,
+            'upload_date': '20210901',
+            'uploader_id': '5f0403982c8bd344f4813f8c',
+            'uploader': 'ISKCON,Inc.',
+            'uploader_url': 'https://chingari.io/iskcon,inc',
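ChilloutzoneIE above reads its entire video description out of a base64-encoded JSON blob stored in var cozVidData. A round-trip sketch with hypothetical field values:

import base64
import json

blob = base64.b64encode(json.dumps({
    'mediaUrl': 'https://cdn.example/clip.mp4',  # hypothetical fields
    'nativePlatform': None,
    'sourcePriority': 'native',
}).encode()).decode()

video_info = json.loads(base64.b64decode(blob).decode('utf-8'))
print(video_info['mediaUrl'])  # https://cdn.example/clip.mp4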
'track': None, + 'artist': None, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + post_json = self._download_json(f'https://api.chingari.io/post/post_details/{id}', id) + if post_json['code'] != 200: + raise ExtractorError(post_json['message'], expected=True) + post_data = post_json['data'] + return self._get_post(id, post_data) + + +class ChingariUserIE(ChingariBaseIE): + _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' + _TESTS = [{ + 'url': 'https://chingari.io/dada1023', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'dada1023', + }, + 'entries': [{ + 'url': 'https://chingari.io/share/post?id=614781f3ade60b3a0bfff42a', + 'info_dict': { + 'id': '614781f3ade60b3a0bfff42a', + 'ext': 'mp4', + 'title': '#chingaribappa ', + 'description': 'md5:d1df21d84088770468fa63afe3b17857', + 'duration': 7, + 'thumbnail': 'https://media.chingari.io/uploads/346d86d4-abb2-474e-a164-ffccf2bbcb72-1632076273717/thumbnail/b0b3aac2-2b86-4dd1-909d-9ed6e57cf77c-1632076275552.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1632076275, + 'upload_date': '20210919', + 'uploader_id': '5efc4b12cca35c3d1794c2d3', + 'uploader': 'dada (girish) dhawale', + 'uploader_url': 'https://chingari.io/dada1023', + 'track': None, + 'artist': None + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://chingari.io/share/post?id=6146b132bcbf860959e12cba', + 'info_dict': { + 'id': '6146b132bcbf860959e12cba', + 'ext': 'mp4', + 'title': 'Tactor harvesting', + 'description': 'md5:8403f12dce68828b77ecee7eb7e887b7', + 'duration': 59.3, + 'thumbnail': 'https://media.chingari.io/uploads/b353ca70-7a87-400d-93a6-fa561afaec86-1632022814584/thumbnail/c09302e3-2043-41b1-a2fe-77d97e5bd676-1632022834260.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1632022834, + 'upload_date': '20210919', + 'uploader_id': '5efc4b12cca35c3d1794c2d3', + 'uploader': 'dada (girish) dhawale', + 'uploader_url': 'https://chingari.io/dada1023', + 'track': None, + 'artist': None + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://chingari.io/share/post?id=6145651b74cb030a64c40b82', + 'info_dict': { + 'id': '6145651b74cb030a64c40b82', + 'ext': 'mp4', + 'title': '#odiabhajan ', + 'description': 'md5:687ea36835b9276cf2af90f25e7654cb', + 'duration': 56.67, + 'thumbnail': 'https://media.chingari.io/uploads/6cbf216b-babc-4cce-87fe-ceaac8d706ac-1631937782708/thumbnail/8855754f-6669-48ce-b269-8cc0699ed6da-1631937819522.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1631937819, + 'upload_date': '20210918', + 'uploader_id': '5efc4b12cca35c3d1794c2d3', + 'uploader': 'dada (girish) dhawale', + 'uploader_url': 'https://chingari.io/dada1023', + 'track': None, + 'artist': None + }, + 'params': {'skip_download': True} + }], + }, { + 'url': 'https://chingari.io/iskcon%2Cinc', + 'playlist_mincount': 1025, + 'info_dict': { + 'id': 'iskcon%2Cinc', + }, + }] + + def _entries(self, id): + skip = 0 + has_more = True + for page in itertools.count(): + posts = self._download_json('https://api.chingari.io/users/getPosts', id, + data=json.dumps({'userId': id, 'ownerId': id, 'skip': skip, 'limit': 20}).encode(), + headers={'content-type': 'application/json;charset=UTF-8'}, + note='Downloading page %s' % page) + for post in posts.get('data', []): + post_data = post['post'] + yield 
self._get_post(post_data['_id'], post_data) + skip += 20 + has_more = posts['hasMoreData'] + if not has_more: + break + + def _real_extract(self, url): + alt_id = self._match_id(url) + post_json = self._download_json(f'https://api.chingari.io/user/{alt_id}', alt_id) + if post_json['code'] != 200: + raise ExtractorError(post_json['message'], expected=True) + id = post_json['data']['_id'] + return self.playlist_result(self._entries(id), playlist_id=alt_id) diff --git a/youtube_dl/extractor/chirbit.py b/yt_dlp/extractor/chirbit.py index 8d75cdf19..8d75cdf19 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/yt_dlp/extractor/chirbit.py diff --git a/youtube_dl/extractor/cinchcast.py b/yt_dlp/extractor/cinchcast.py index b861d54b0..b861d54b0 100644 --- a/youtube_dl/extractor/cinchcast.py +++ b/yt_dlp/extractor/cinchcast.py diff --git a/yt_dlp/extractor/cinemax.py b/yt_dlp/extractor/cinemax.py new file mode 100644 index 000000000..2c3ff8d4f --- /dev/null +++ b/yt_dlp/extractor/cinemax.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .hbo import HBOBaseIE + + +class CinemaxIE(HBOBaseIE): + _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P<path>[^/]+/video/[0-9a-z-]+-(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', + 'md5': '82e0734bba8aa7ef526c9dd00cf35a05', + 'info_dict': { + 'id': '20126903', + 'ext': 'mp4', + 'title': 'S1 Ep 1: Recap', + }, + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], + }, { + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, video_id = self._match_valid_url(url).groups() + info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id) + info['id'] = video_id + return info diff --git a/yt_dlp/extractor/ciscolive.py b/yt_dlp/extractor/ciscolive.py new file mode 100644 index 000000000..349c5eb50 --- /dev/null +++ b/yt_dlp/extractor/ciscolive.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + parse_qs, + try_get, + urlencode_postdata, +) + + +class CiscoLiveBaseIE(InfoExtractor): + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer, note=None): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) + + def _parse_rf_item(self, rf_item): + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = 
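ChingariUserIE._entries above pages through a profile with a skip/limit cursor POSTed as JSON, stopping on the server's hasMoreData flag. The shape of that loop, with fetch_page standing in for the _download_json call (a hypothetical helper):

import itertools

def paged_posts(fetch_page, limit=20):
    # fetch_page(skip, limit) -> {'data': [...], 'hasMoreData': bool}
    skip = 0
    for page in itertools.count():  # page number is only used for progress notes in the extractor
        posts = fetch_page(skip, limit)
        yield from posts.get('data', [])
        skip += limit
        if not posts['hasMoreData']:
            break

pages = iter([{'data': [1, 2], 'hasMoreData': True},
              {'data': [3], 'hasMoreData': False}])
print(list(paged_posts(lambda skip, limit: next(pages))))  # [1, 2, 3]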
try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, + 'description': description, + 'duration': duration, + 'creator': presenter_name, + 'location': location, + 'series': event_name, + } + + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P<id>[^/?&]+)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS', + 'only_matching': True, + }] + + def _real_extract(self, url): + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Search query', + }, + 'playlist_count': 5, + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + + def _real_extract(self, url): + query = parse_qs(url) + query['type'] = 'session' + return self.playlist_result( + 
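CiscoLiveSearchIE._entries above walks Rainfocus search results with a 'from'/'size' window, adopting whatever page size the server echoes back and stopping once from + size would pass total. The windowing logic reduces to:

def windows(total, size):
    # yields (offset, count) pairs analogous to the 'from'/'size' updates above
    start = 0
    while start < total:
        yield start, min(size, total - start)
        start += size

print(list(windows(120, 50)))  # [(0, 50), (50, 50), (100, 20)]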
self._entries(query, url), playlist_title='Search query') diff --git a/yt_dlp/extractor/ciscowebex.py b/yt_dlp/extractor/ciscowebex.py new file mode 100644 index 000000000..882dae91b --- /dev/null +++ b/yt_dlp/extractor/ciscowebex.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class CiscoWebexIE(InfoExtractor): + IE_NAME = 'ciscowebex' + IE_DESC = 'Cisco Webex' + _VALID_URL = r'''(?x) + (?P<url>https?://(?P<subdomain>[^/#?]*)\.webex\.com/(?: + (?P<siteurl_1>[^/#?]*)/(?:ldr|lsr).php\?(?:[^#]*&)*RCID=(?P<rcid>[0-9a-f]{32})| + (?:recordingservice|webappng)/sites/(?P<siteurl_2>[^/#?]*)/recording/(?:playback/|play/)?(?P<id>[0-9a-f]{32}) + ))''' + + _TESTS = [{ + 'url': 'https://demosubdomain.webex.com/demositeurl/ldr.php?RCID=e58e803bc0f766bb5f6376d2e86adb5b', + 'only_matching': True, + }, { + 'url': 'http://demosubdomain.webex.com/demositeurl/lsr.php?RCID=bc04b4a7b5ea2cc3a493d5ae6aaff5d7', + 'only_matching': True, + }, { + 'url': 'https://demosubdomain.webex.com/recordingservice/sites/demositeurl/recording/88e7a42f7b19f5b423c54754aecc2ce9/playback', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + rcid = mobj.group('rcid') + if rcid: + webpage = self._download_webpage(url, None, note='Getting video ID') + url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url') + url = self._request_webpage(url, None, note='Resolving final URL').geturl() + mobj = self._match_valid_url(url) + subdomain = mobj.group('subdomain') + siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') + video_id = mobj.group('id') + + stream = self._download_json( + 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), + video_id, fatal=False, query={'siteurl': siteurl}) + if not stream: + self.raise_login_required(method='cookies') + + video_id = stream.get('recordUUID') or video_id + + formats = [{ + 'format_id': 'video', + 'url': stream['fallbackPlaySrc'], + 'ext': 'mp4', + 'vcodec': 'avc1.640028', + 'acodec': 'mp4a.40.2', + }] + if stream.get('preventDownload') is False: + mp4url = try_get(stream, lambda x: x['downloadRecordingInfo']['downloadInfo']['mp4URL']) + if mp4url: + formats.append({ + 'format_id': 'video', + 'url': mp4url, + 'ext': 'mp4', + 'vcodec': 'avc1.640028', + 'acodec': 'mp4a.40.2', + }) + audiourl = try_get(stream, lambda x: x['downloadRecordingInfo']['downloadInfo']['audioURL']) + if audiourl: + formats.append({ + 'format_id': 'audio', + 'url': audiourl, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': stream['recordName'], + 'description': stream.get('description'), + 'uploader': stream.get('ownerDisplayName'), + 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), # mail or id + 'timestamp': unified_timestamp(stream.get('createTime')), + 'duration': int_or_none(stream.get('duration'), 1000), + 'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id), + 'formats': formats, + } diff --git a/yt_dlp/extractor/cjsw.py b/yt_dlp/extractor/cjsw.py new file mode 100644 index 000000000..1dea0d7c7 --- /dev/null +++ b/yt_dlp/extractor/cjsw.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + 
unescapeHTML, +) + + +class CJSWIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', + 'md5': 'cee14d40f1e9433632c56e3d14977120', + 'info_dict': { + 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', + 'ext': 'mp3', + 'title': 'Freshly Squeezed – Episode June 20, 2017', + 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', + 'series': 'Freshly Squeezed', + 'episode_id': '20170620', + }, + }, { + # no description + 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + program, episode_id = mobj.group('program', 'id') + audio_id = '%s/%s' % (program, episode_id) + + webpage = self._download_webpage(url, episode_id) + + title = unescapeHTML(self._search_regex( + (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', + r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title')) + + audio_url = self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'audio url', group='url') + + audio_id = self._search_regex( + r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', + audio_url, 'audio id', default=audio_id) + + formats = [{ + 'url': audio_url, + 'ext': determine_ext(audio_url, 'mp3'), + 'vcodec': 'none', + }] + + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', + default=None) + series = self._search_regex( + r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, + 'series', default=program, group='name') + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'formats': formats, + 'series': series, + 'episode_id': episode_id, + } diff --git a/youtube_dl/extractor/cliphunter.py b/yt_dlp/extractor/cliphunter.py index f2ca7a337..f2ca7a337 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/yt_dlp/extractor/cliphunter.py diff --git a/youtube_dl/extractor/clippit.py b/yt_dlp/extractor/clippit.py index a1a7a774c..a1a7a774c 100644 --- a/youtube_dl/extractor/clippit.py +++ b/yt_dlp/extractor/clippit.py diff --git a/youtube_dl/extractor/cliprs.py b/yt_dlp/extractor/cliprs.py index d55b26d59..d55b26d59 100644 --- a/youtube_dl/extractor/cliprs.py +++ b/yt_dlp/extractor/cliprs.py diff --git a/youtube_dl/extractor/clipsyndicate.py b/yt_dlp/extractor/clipsyndicate.py index 6cdb42f5a..6cdb42f5a 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/yt_dlp/extractor/clipsyndicate.py diff --git a/youtube_dl/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py index 26243d52d..26243d52d 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/yt_dlp/extractor/closertotruth.py diff --git a/youtube_dl/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index 2fdcfbb3a..2fdcfbb3a 100644 --- a/youtube_dl/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py diff --git a/youtube_dl/extractor/cloudy.py b/yt_dlp/extractor/cloudy.py index 85ca20ecc..85ca20ecc 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/yt_dlp/extractor/cloudy.py diff --git a/youtube_dl/extractor/clubic.py b/yt_dlp/extractor/clubic.py index 98f9cb596..98f9cb596 100644 --- a/youtube_dl/extractor/clubic.py +++ b/yt_dlp/extractor/clubic.py diff --git a/yt_dlp/extractor/clyp.py b/yt_dlp/extractor/clyp.py new file mode 100644 index 000000000..e6b2ac4d4 --- /dev/null +++ 
b/yt_dlp/extractor/clyp.py @@ -0,0 +1,79 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_qs, + unified_timestamp, +) + + +class ClypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://clyp.it/ojz2wfah', + 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'info_dict': { + 'id': 'ojz2wfah', + 'ext': 'mp3', + 'title': 'Krisson80 - bits wip wip', + 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + 'duration': 263.21, + 'timestamp': 1443515251, + 'upload_date': '20150929', + }, + }, { + 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', + 'info_dict': { + 'id': 'b04p1odi', + 'ext': 'mp3', + 'title': 'GJ! (Reward Edit)', + 'description': 'Metal Resistance (THE ONE edition)', + 'duration': 177.789, + 'timestamp': 1528241278, + 'upload_date': '20180605', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + qs = parse_qs(url) + token = qs.get('token', [None])[0] + + query = {} + if token: + query['token'] = token + + metadata = self._download_json( + 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) + + formats = [] + for secure in ('', 'Secure'): + for ext in ('Ogg', 'Mp3'): + format_id = '%s%s' % (secure, ext) + format_url = metadata.get('%sUrl' % format_id) + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = metadata['Title'] + description = metadata.get('Description') + duration = float_or_none(metadata.get('Duration')) + timestamp = unified_timestamp(metadata.get('DateCreated')) + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/yt_dlp/extractor/cmt.py b/yt_dlp/extractor/cmt.py new file mode 100644 index 000000000..a4ddb9160 --- /dev/null +++ b/yt_dlp/extractor/cmt.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +from .mtv import MTVIE + +# TODO Remove - Reason: Outdated Site + + +class CMTIE(MTVIE): + IE_NAME = 'cmt.com' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', + 'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2', + 'info_dict': { + 'id': '989124', + 'ext': 'mp4', + 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', + 'description': 'Blame It All On My Roots', + }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 
109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, + }, { + 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes', + 'only_matching': True, + }] + + def _extract_mgid(self, webpage, url): + mgid = self._search_regex( + r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', + webpage, 'mgid', group='mgid', default=None) + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage, url) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/yt_dlp/extractor/cnbc.py b/yt_dlp/extractor/cnbc.py new file mode 100644 index 000000000..da3730cc8 --- /dev/null +++ b/yt_dlp/extractor/cnbc.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class CNBCIE(InfoExtractor): + _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://video.cnbc.com/gallery/?video=3000503714', + 'info_dict': { + 'id': '3000503714', + 'ext': 'mp4', + 'title': 'Fighting zombies is big business', + 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, + {'force_smil_url': True}), + 'id': video_id, + } + + +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': "Trump: I don't necessarily agree with raising rates", + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + path, display_id = self._match_valid_url(url).groups() + video_id = self._download_json( + 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ + 'query': '''{ + page(path: "%s") { + vcpsId + } +}''' % path, + })['data']['page']['vcpsId'] + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%d' % video_id, + CNBCIE.ie_key()) diff --git a/yt_dlp/extractor/cnn.py b/yt_dlp/extractor/cnn.py new file mode 100644 index 000000000..af11d95b4 --- /dev/null +++ b/yt_dlp/extractor/cnn.py @@ -0,0 +1,146 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from .turner import TurnerBaseIE +from ..utils import url_basename + + +class CNNIE(TurnerBaseIE): + _VALID_URL = 
r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' + + _TESTS = [{ + 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + 'md5': '3e6121ea48df7e2259fe73a0628605c4', + 'info_dict': { + 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', + 'ext': 'mp4', + 'title': 'Nadal wins 8th French Open title', + 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + 'duration': 135, + 'upload_date': '20130609', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', + 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', + 'info_dict': { + 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'ext': 'mp4', + 'title': "Student's epic speech stuns new freshmen", + 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", + 'upload_date': '20130821', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', + 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'info_dict': { + 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'ext': 'mp4', + 'title': 'Nashville Ep. 1: Hand crafted skateboards', + 'description': 'md5:e7223a503315c9f150acac52e76de086', + 'upload_date': '20141222', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? 
Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', + 'only_matching': True, + }, { + 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', + 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, + }] + + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + + def _extract_timestamp(self, video_data): + # TODO: fix timestamp extraction + return None + + def _real_extract(self, url): + sub_domain, path, page_title = self._match_valid_url(url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + return self._extract_cvp_info( + config['data_src'] % path, page_title, { + 'default': { + 'media_src': config['media_src'], + }, + 'f4m': { + 'host': 'cnn-vh.akamaihd.net', + }, + }) + + +class CNNBlogsIE(InfoExtractor): + _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' + _TEST = { + 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', + 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', + 'info_dict': { + 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', + 'ext': 'mp4', + 'title': 'Criminalizing journalism?', + 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', + 'upload_date': '20140209', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'add_ie': ['CNN'], + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, url_basename(url)) + cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') + return self.url_result(cnn_url, CNNIE.ie_key()) + + +class CNNArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' + _TEST = { + 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', + 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', + 'info_dict': { + 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', + 'ext': 'mp4', + 'title': 'Obama: Cyberattack not an act of war', + 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', + 'upload_date': '20141221', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'add_ie': ['CNN'], + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, url_basename(url)) + cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') + return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/yt_dlp/extractor/comedycentral.py b/yt_dlp/extractor/comedycentral.py new file mode 100644 index 000000000..5a12ab5e6 --- /dev/null +++ b/yt_dlp/extractor/comedycentral.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class 
ComedyCentralIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<id>[0-9a-z]{6})' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', + 'info_dict': { + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', + 'ext': 'mp4', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', + }, + }, { + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', + 'only_matching': True, + }, { + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', + 'only_matching': True, + }, { + 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', + 'only_matching': True, + }] + + +class ComedyCentralTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' + _TESTS = [{ + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', + 'info_dict': { + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', + }, + }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] + + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, + } diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py new file mode 100644 index 000000000..dbe7dfcbf --- /dev/null +++ b/yt_dlp/extractor/common.py @@ -0,0 +1,3662 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import datetime +import hashlib +import itertools +import json +import netrc +import os +import random +import re +import sys +import time +import math + +from ..compat import ( + compat_cookiejar_Cookie, + compat_cookies_SimpleCookie, + compat_etree_Element, + compat_etree_fromstring, + compat_expanduser, + compat_getpass, + compat_http_client, + compat_os_name, + compat_str, + compat_urllib_error, + compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, + compat_urllib_request, + compat_urlparse, + compat_xml_parse_error, +) +from ..downloader import FileDownloader +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) +from ..utils import ( + age_restricted, + base_url, + bug_reports_message, + clean_html, + compiled_regex_type, + determine_ext, + determine_protocol, + dict_get, + error_to_compat_str, + extract_attributes, + ExtractorError, + fix_xml_ampersands, + float_or_none, + format_field, + GeoRestrictedError, + GeoUtils, + int_or_none, + js_to_json, + JSON_LD_RE, + mimetype2ext, + network_exceptions, + NO_DEFAULT, + orderedSet, + parse_bitrate, + parse_codecs, + parse_duration, + parse_iso8601, + parse_m3u8_attributes, + parse_resolution, + RegexNotFoundError, + sanitize_filename, + sanitized_Request, + str_or_none, + str_to_int, + strip_or_none, + traverse_obj, + unescapeHTML, + unified_strdate, + unified_timestamp, + update_Request, + update_url_query, + url_basename, + url_or_none, + urljoin, + 
variadic, + xpath_element, + xpath_text, + xpath_with_ns, +) + + +class InfoExtractor(object): + """Information Extractor class. + + Information extractors are the classes that, given a URL, extract + information about the video (or videos) the URL refers to. This + information includes the real video URL, the video title, author and + others. The information is stored in a dictionary which is then + passed to the YoutubeDL. The YoutubeDL processes this + information possibly downloading the video to the file system, among + other possible outcomes. + + The type field determines the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: + + id: Video identifier. + title: Video title, unescaped. + + Additionally, it must contain either a formats entry or a url one: + + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: + * url The mandatory URL representing the media: + for plain file media - HTTP URL of this file, + for RTMP - RTMP URL, + for HLS - URL of the M3U8 media playlist, + for HDS - URL of the F4M manifest, + for DASH + - HTTP URL to plain file media (in case of + unfragmented media) + - URL of the MPD manifest or base URL + representing the media if MPD manifest + is parsed from a string (in case of + fragmented media) + for MSS - URL of the ISM manifest. + * manifest_url + The URL of the manifest file in case of + fragmented media: + for HLS - URL of the M3U8 master playlist, + for HDS - URL of the F4M manifest, + for DASH - URL of the MPD manifest, + for MSS - URL of the ISM manifest. + * ext Will be calculated from URL if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz + * vbr Average video bitrate in KBit/s + * fps Frame rate + * vcodec Name of the video codec in use + * container Name of the container format + * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes + * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe", + "m3u8", "m3u8_native" or "http_dash_segments". + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. 
Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url + * "duration" (optional, int or float) + * "filesize" (optional, int) + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field, regardless of all other values. + -1 for default (order by other properties), + -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language Language code, e.g. "de" or "en-US". + * language_preference Is this in the language mentioned in + the URL? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * stretched_ratio If given and not 1, indicates that the + video's pixels are not square. + width : height ratio as float. + * no_resume The server does not support resuming the + (HTTP or RTMP) download. Boolean. + * has_drm The format has DRM and cannot be downloaded. Boolean + * downloader_options A dictionary of downloader options as + described in FileDownloader + RTMP formats can also have the additional fields: page_url, + app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, + rtmp_protocol, rtmp_real_time + + url: Final video URL. + ext: Video filename extension. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). + + The following fields are optional: + + alt_title: A secondary title of the video. + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "id" (optional, string) - Thumbnail format ID + * "url" + * "preference" (optional, int) - quality of the image + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height}", + deprecated) + * "filesize" (optional, int) + * "_test_url" (optional, bool) - If true, test the URL + thumbnail: Full URL to a video thumbnail image. + description: Full video description. + uploader: Full name of the video uploader. + license: License name the video is licensed under. + creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. + release_date: The date (YYYYMMDD) when the video was released. + timestamp: UNIX timestamp of the moment the video was uploaded + upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. + uploader_id: Nickname or id of the video uploader. + uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. 
+    location:       Physical location where the video was filmed.
+    subtitles:      The available subtitles as a dictionary in the format
+                    {tag: subformats}. "tag" is usually a language code, and
+                    "subformats" is a list sorted from lower to higher
+                    preference, each element is a dictionary with the "ext"
+                    entry and one of:
+                        * "data": The subtitles file contents
+                        * "url": A URL pointing to the subtitles file
+                    It can optionally also have:
+                        * "name": Name or description of the subtitles
+                    "ext" will be calculated from URL if missing
+    automatic_captions: Like 'subtitles'; contains automatically generated
+                    captions instead of normal subtitles
+    duration:       Length of the video in seconds, as an integer or float.
+    view_count:     How many users have watched the video on the platform.
+    like_count:     Number of positive ratings of the video
+    dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
+    average_rating: Average rating given by users, the scale used depends on the webpage
+    comment_count:  Number of comments on the video
+    comments:       A list of comments, each with one or more of the following
+                    properties (all but one of text or html optional):
+                        * "author" - human-readable name of the comment author
+                        * "author_id" - user ID of the comment author
+                        * "author_thumbnail" - The thumbnail of the comment author
+                        * "id" - Comment ID
+                        * "html" - Comment as HTML
+                        * "text" - Plain text of the comment
+                        * "timestamp" - UNIX timestamp of comment
+                        * "parent" - ID of the comment this one is replying to.
+                                     Set to "root" to indicate that this is a
+                                     comment to the original video.
+                        * "like_count" - Number of positive ratings of the comment
+                        * "dislike_count" - Number of negative ratings of the comment
+                        * "is_favorited" - Whether the comment is marked as
+                                           favorite by the video uploader
+                        * "author_is_uploader" - Whether the comment is made by
+                                                 the video uploader
+    age_limit:      Age restriction for the video, as an integer (years)
+    webpage_url:    The URL to the video webpage, if given to yt-dlp it
+                    should allow getting the same result again. (It will be set
+                    by YoutubeDL if it's missing)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+    cast:           A list of the video cast
+    is_live:        True, False, or None (=unknown). Whether this video is a
+                    live stream that goes on instead of a fixed-length video.
+    was_live:       True, False, or None (=unknown). Whether this video was
+                    originally a live stream.
+    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
+                    If absent, automatically set from is_live, was_live
+    start_time:     Time in seconds where the reproduction should start, as
+                    specified in the URL.
+    end_time:       Time in seconds where the reproduction should end, as
+                    specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)
+    playable_in_embed: Whether this video is allowed to play in embedded
+                    players on other sites. Can be True (=always allowed),
+                    False (=never allowed), None (=unknown), or a string
+                    specifying the criteria for embedability (e.g. 'whitelist')
+    availability:   Under what condition the video is available. One of
+                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
+                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
+                    to set it
+    __post_extractor: A function to be called just before the metadata is
+                    written to either disk, logger or console. The function
+                    must return a dict which will be added to the info_dict.
+                    This is useful for additional information that is
+                    time-consuming to extract. Note that the fields thus
+                    extracted will not be available to output template and
+                    match_filter. So, only "comments" and "comment_count" are
+                    currently allowed to be extracted via this method.
+
+    The following fields should only be used when the video belongs to some logical
+    chapter or section:
+
+    chapter:        Name or title of the chapter the video belongs to.
+    chapter_number: Number of the chapter the video belongs to, as an integer.
+    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
+
+    The following fields should only be used when the video is an episode of some
+    series, programme or podcast:
+
+    series:         Title of the series or programme the video episode belongs to.
+    season:         Title of the season the video episode belongs to.
+    season_number:  Number of the season the video episode belongs to, as an integer.
+    season_id:      Id of the season the video episode belongs to, as a unicode string.
+    episode:        Title of the video episode. Unlike mandatory video title field,
+                    this field should denote the exact title of the video episode
+                    without any kind of decoration.
+    episode_number: Number of the video episode within a season, as an integer.
+    episode_id:     Id of the video episode, as a unicode string.
+
+    The following fields should only be used when the media is a track or a part of
+    a music album:
+
+    track:          Title of the track.
+    track_number:   Number of the track within an album or a disc, as an integer.
+    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
+                    as a unicode string.
+    artist:         Artist(s) of the track.
+    genre:          Genre(s) of the track.
+    album:          Title of the album the track belongs to.
+    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc.)
+    album_artist:   List of all artists that appeared on the album (e.g.
+                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
+                    and compilations).
+    disc_number:    Number of the disc or other physical medium the track belongs to,
+                    as an integer.
+    release_year:   Year (YYYY) when the album was released.
+
+    Unless mentioned otherwise, the fields should be Unicode strings.
+
+    Unless mentioned otherwise, None is equivalent to absence of information.
+
+
+    _type "playlist" indicates multiple videos.
+    There must be a key "entries", which is a list, an iterable, or a PagedList
+    object, each element of which is a valid dictionary by this specification.
+
+    Additionally, playlists can have "id", "title", and any other relevant
+    attributes with the same semantics as videos (see above).
+
+
+    _type "multi_video" indicates that there are multiple videos that
+    form a single show, for example multiple acts of an opera or TV episode.
+    It must have an entries key like a playlist and contain all the keys
+    required for a video at the same time.
+
+
+    _type "url" indicates that the video must be extracted from another
+    location, possibly by a different extractor. Its only required key is:
+    "url" - the next URL to extract.
+    The key "ie_key" can be set to the class name (minus the trailing "IE",
+    e.g. "Youtube") if the extractor class is known in advance.
+ Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + + Subclasses of this one should re-define the _real_initialize() and + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. + + Subclasses may also override suitable() if necessary, but ensure the function + signature is preserved and that this function imports everything it needs + (except other extractors), so that lazy_extractors works correctly + + _GEO_BYPASS attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with geo_bypass_country. + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. + + _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted + IP blocks in CIDR notation for this extractor. One of these IP blocks + will be used by geo restriction bypass mechanism similarly + to _GEO_COUNTRIES. + + The _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. + """ + + _ready = False + _downloader = None + _x_forwarded_for_ip = None + _GEO_BYPASS = True + _GEO_COUNTRIES = None + _GEO_IP_BLOCKS = None + _WORKING = True + + _LOGIN_HINTS = { + 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'cookies': ( + 'Use --cookies-from-browser or --cookies for the authentication. ' + 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), + 'password': 'Use --username and --password or --netrc to provide account credentials', + } + + def __init__(self, downloader=None): + """Constructor. 
Receives an optional downloader.""" + self._ready = False + self._x_forwarded_for_ip = None + self._printed_messages = set() + self.set_downloader(downloader) + + @classmethod + def _match_valid_url(cls, url): + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + # This function must import everything it needs (except other extractors), + # so that lazy_extractors works correctly + return cls._match_valid_url(url) is not None + + @classmethod + def _match_id(cls, url): + return cls._match_valid_url(url).group('id') + + @classmethod + def get_temp_id(cls, url): + try: + return cls._match_id(url) + except (IndexError, AttributeError): + return None + + @classmethod + def working(cls): + """Getter method for _WORKING.""" + return cls._WORKING + + def initialize(self): + """Initializes an instance (authentication, etc).""" + self._printed_messages = set() + self._initialize_geo_bypass({ + 'countries': self._GEO_COUNTRIES, + 'ip_blocks': self._GEO_IP_BLOCKS, + }) + if not self._ready: + self._real_initialize() + self._ready = True + + def _initialize_geo_bypass(self, geo_bypass_context): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP belonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES and + _GEO_IP_BLOCKS. + + You may also manually call it from extractor's code if geo bypass + information is not available beforehand (e.g. obtained during + extraction) or due to some other reason. In this case you should pass + this information in geo bypass context passed as first argument. It may + contain following fields: + + countries: List of geo unrestricted countries (similar + to _GEO_COUNTRIES) + ip_blocks: List of geo unrestricted IP blocks in CIDR notation + (similar to _GEO_IP_BLOCKS) + + """ + if not self._x_forwarded_for_ip: + + # Geo bypass mechanism is explicitly disabled by user + if not self.get_param('geo_bypass', True): + return + + if not geo_bypass_context: + geo_bypass_context = {} + + # Backward compatibility: previously _initialize_geo_bypass + # expected a list of countries, some 3rd party code may still use + # it this way + if isinstance(geo_bypass_context, (list, tuple)): + geo_bypass_context = { + 'countries': geo_bypass_context, + } + + # The whole point of geo bypass mechanism is to fake IP + # as X-Forwarded-For HTTP header based on some IP block or + # country code. 
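+
+            # For illustration (hypothetical values): with
+            # geo_bypass_context = {'countries': ['US', 'DE']} this method
+            # may pick 'US' and set self._x_forwarded_for_ip to a random
+            # address from that country (e.g. '203.0.113.42'), which
+            # _request_webpage() then sends as the X-Forwarded-For header.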
+ + # Path 1: bypassing based on IP block in CIDR notation + + # Explicit IP block specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + ip_block = self.get_param('geo_bypass_ip_block', None) + + # Otherwise use random IP block from geo bypass context but only + # if extractor is known as geo bypassable + if not ip_block: + ip_blocks = geo_bypass_context.get('ip_blocks') + if self._GEO_BYPASS and ip_blocks: + ip_block = random.choice(ip_blocks) + + if ip_block: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) + self._downloader.write_debug( + '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip) + return + + # Path 2: bypassing based on country code + + # Explicit country code specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + country = self.get_param('geo_bypass_country', None) + + # Otherwise use random country code from geo bypass context but + # only if extractor is known as geo bypassable + if not country: + countries = geo_bypass_context.get('countries') + if self._GEO_BYPASS and countries: + country = random.choice(countries) + + if country: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) + self._downloader.write_debug( + 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper())) + + def extract(self, url): + """Extracts URL information and returns it in list of dicts.""" + try: + for _ in range(2): + try: + self.initialize() + self.write_debug('Extracting URL: %s' % url) + ie_result = self._real_extract(url) + if ie_result is None: + return None + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + subtitles = ie_result.get('subtitles') + if (subtitles and 'live_chat' in subtitles + and 'no-live-chat' in self.get_param('compat_opts', [])): + del subtitles['live_chat'] + return ie_result + except GeoRestrictedError as e: + if self.__maybe_fake_ip_and_retry(e.countries): + continue + raise + except ExtractorError as e: + video_id = e.video_id or self.get_temp_id(url) + raise ExtractorError( + e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause) + except compat_http_client.IncompleteRead as e: + raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) + except (KeyError, StopIteration) as e: + raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) + + def __maybe_fake_ip_and_retry(self, countries): + if (not self.get_param('geo_bypass_country', None) + and self._GEO_BYPASS + and self.get_param('geo_bypass', True) + and not self._x_forwarded_for_ip + and countries): + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) + return True + return False + + def set_downloader(self, downloader): + """Sets the downloader for this IE.""" + self._downloader = downloader + + def _real_initialize(self): + """Real initialization process. Redefine in subclasses.""" + pass + + def _real_extract(self, url): + """Real extraction process. 
Redefine in subclasses.""" + pass + + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + + @property + def IE_NAME(self): + return compat_str(type(self).__name__[:-2]) + + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + elif callable(expected_status): + return expected_status(err.code) is True + else: + return err.code in variadic(expected_status) + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ + if not self._downloader._first_webpage_request: + sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0 + if sleep_interval > 0: + self.to_screen('Sleeping %s seconds ...' % sleep_interval) + time.sleep(sleep_interval) + else: + self._downloader._first_webpage_request = False + + if note is None: + self.report_download_webpage(video_id) + elif note is not False: + if video_id is None: + self.to_screen('%s' % (note,)) + else: + self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: + if query: + url_or_request = update_url_query(url_or_request, query) + if data is not None or headers: + url_or_request = sanitized_Request(url_or_request, data, headers) + try: + return self._downloader.urlopen(url_or_request) + except network_exceptions as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. + err.fp._error = err + return err.fp + + if errnote is False: + return False + if errnote is None: + errnote = 'Unable to download webpage' + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + else: + self.report_warning(errmsg) + return False + + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. 
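+
+        A minimal usage sketch (hypothetical URL):
+
+            webpage, urlh = self._download_webpage_handle(
+                'https://example.com/watch/123', video_id)
+            final_url = urlh.geturl()  # resolved URL, e.g. after redirects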
+ """ + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + if urlh is False: + assert not fatal + return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + return (content, urlh) + + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' + else: + encoding = 'utf-8' + + return encoding + + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked</title>' in content + and 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'<iframe src="([^"]+)"', content, + 'Websense information URL', default=None) + if blocked_iframe: + msg += ' Visit %s for more details' % blocked_iframe + raise ExtractorError(msg, expected=True) + if '<title>The URL you requested has been blocked</title>' in first_block: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'</h1><p>(.*?)</p>', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) + if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content + and 'blocklist.rkn.gov.ru' in content): + raise ExtractorError( + 'Access to this webpage has been blocked by decision of the Russian government. 
'
+                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+                expected=True)
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
+        if self.get_param('dump_intermediate_pages', False):
+            self.to_screen('Dumping request to ' + urlh.geturl())
+            dump = base64.b64encode(webpage_bytes).decode('ascii')
+            self._downloader.to_screen(dump)
+        if self.get_param('write_pages', False):
+            basen = '%s_%s' % (video_id, urlh.geturl())
+            trim_length = self.get_param('trim_file_name') or 240
+            if len(basen) > trim_length:
+                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+                basen = basen[:trim_length - len(h)] + h
+            raw_filename = basen + '.dump'
+            filename = sanitize_filename(raw_filename, restricted=True)
+            self.to_screen('Saving request to ' + filename)
+            # Working around MAX_PATH limitation on Windows (see
+            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+            if compat_os_name == 'nt':
+                absfilepath = os.path.abspath(filename)
+                if len(absfilepath) > 259:
+                    filename = '\\\\?\\' + absfilepath
+            with open(filename, 'wb') as outf:
+                outf.write(webpage_bytes)
+
+        try:
+            content = webpage_bytes.decode(encoding, 'replace')
+        except LookupError:
+            content = webpage_bytes.decode('utf-8', 'replace')
+
+        self.__check_blocked(content)
+
+        return content
+
+    def _download_webpage(
+            self, url_or_request, video_id, note=None, errnote=None,
+            fatal=True, tries=1, timeout=5, encoding=None, data=None,
+            headers={}, query={}, expected_status=None):
+        """
+        Return the data of the page as a string.
+
+        Arguments:
+        url_or_request -- plain text URL as a string or
+            a compat_urllib_request.Request object
+        video_id -- Video/playlist/item identifier (string)
+
+        Keyword arguments:
+        note -- note printed before downloading (string)
+        errnote -- note printed in case of an error (string)
+        fatal -- flag denoting whether error should be considered fatal,
+            i.e. whether it should cause ExtractorError to be raised,
+            otherwise a warning will be reported and extraction continued
+        tries -- number of tries
+        timeout -- sleep interval between tries
+        encoding -- encoding for a page content decoding, guessed automatically
+            when not explicitly specified
+        data -- POST data (bytes)
+        headers -- HTTP headers (dict)
+        query -- URL query (dict)
+        expected_status -- allows accepting failed HTTP requests (non 2xx
+            status code) by explicitly specifying a set of accepted status
+            codes. Can be any of the following entities:
+                - an integer type specifying an exact failed status code to
+                  accept
+                - a list or a tuple of integer types specifying a list of
+                  failed status codes to accept
+                - a callable accepting an actual failed status code and
+                  returning True if it should be accepted
+            Note that this argument does not affect success status codes (2xx)
+            which are always accepted.
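+
+        A minimal usage sketch (hypothetical URL and expected status):
+
+            webpage = self._download_webpage(
+                'https://example.com/watch/123', video_id,
+                note='Downloading watch page', fatal=False,
+                expected_status=404)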
+ """ + + success = False + try_count = 0 + while success is False: + try: + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) + success = True + except compat_http_client.IncompleteRead as e: + try_count += 1 + if try_count >= tries: + raise e + self._sleep(timeout, video_id) + if res is False: + return res + else: + content, _ = res + return content + + def _download_xml_handle( + self, url_or_request, video_id, note='Downloading XML', + errnote='Unable to download XML', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (xml as an compat_etree_Element, URL handle). + + See _download_webpage docstring for arguments specification. + """ + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) + if res is False: + return res + xml_string, urlh = res + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_xml( + self, url_or_request, video_id, + note='Downloading XML', errnote='Unable to download XML', + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}, expected_status=None): + """ + Return the xml as an compat_etree_Element. + + See _download_webpage docstring for arguments specification. + """ + res = self._download_xml_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query, + expected_status=expected_status) + return res if res is False else res[0] + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): + if transform_source: + xml_string = transform_source(xml_string) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) + + def _download_json_handle( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. + """ + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) + if res is False: + return res + json_string, urlh = res + return self._parse_json( + json_string, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_json( + self, url_or_request, video_id, note='Downloading JSON metadata', + errnote='Unable to download JSON metadata', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. 
+ """ + res = self._download_json_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query, + expected_status=expected_status) + return res if res is False else res[0] + + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): + if transform_source: + json_string = transform_source(json_string) + try: + return json.loads(json_string) + except ValueError as ve: + errmsg = '%s: Failed to parse JSON ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) + + def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True): + return self._parse_json( + data[data.find('{'):data.rfind('}') + 1], + video_id, transform_source, fatal) + + def _download_socket_json_handle( + self, url_or_request, video_id, note='Polling socket', + errnote='Unable to poll socket', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return a tuple (JSON object, URL handle). + + See _download_webpage docstring for arguments specification. + """ + res = self._download_webpage_handle( + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query, + expected_status=expected_status) + if res is False: + return res + webpage, urlh = res + return self._parse_socket_response_as_json( + webpage, video_id, transform_source=transform_source, + fatal=fatal), urlh + + def _download_socket_json( + self, url_or_request, video_id, note='Polling socket', + errnote='Unable to poll socket', transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, + expected_status=None): + """ + Return the JSON object as a dict. + + See _download_webpage docstring for arguments specification. 
+ """ + res = self._download_socket_json_handle( + url_or_request, video_id, note=note, errnote=errnote, + transform_source=transform_source, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query, + expected_status=expected_status) + return res if res is False else res[0] + + def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs): + idstr = format_field(video_id, template='%s: ') + msg = f'[{self.IE_NAME}] {idstr}{msg}' + if only_once: + if f'WARNING: {msg}' in self._printed_messages: + return + self._printed_messages.add(f'WARNING: {msg}') + self._downloader.report_warning(msg, *args, **kwargs) + + def to_screen(self, msg, *args, **kwargs): + """Print msg to screen, prefixing it with '[ie_name]'""" + self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs) + + def write_debug(self, msg, *args, **kwargs): + self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs) + + def get_param(self, name, default=None, *args, **kwargs): + if self._downloader: + return self._downloader.params.get(name, default, *args, **kwargs) + return default + + def report_drm(self, video_id, partial=False): + self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id) + + def report_extraction(self, id_or_name): + """Report information extraction.""" + self.to_screen('%s: Extracting information' % id_or_name) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self.to_screen('%s: Downloading webpage' % video_id) + + def report_age_confirmation(self): + """Report attempt to confirm age.""" + self.to_screen('Confirming age') + + def report_login(self): + """Report attempt to log in.""" + self.to_screen('Logging in') + + def raise_login_required( + self, msg='This video is only available for registered users', + metadata_available=False, method='any'): + if metadata_available and self.get_param('ignore_no_formats_error'): + self.report_warning(msg) + if method is not None: + msg = '%s. 
%s' % (msg, self._LOGIN_HINTS[method]) + raise ExtractorError(msg, expected=True) + + def raise_geo_restricted( + self, msg='This video is not available from your location due to geo restriction', + countries=None, metadata_available=False): + if metadata_available and self.get_param('ignore_no_formats_error'): + self.report_warning(msg) + else: + raise GeoRestrictedError(msg, countries=countries) + + def raise_no_formats(self, msg, expected=False, video_id=None): + if expected and self.get_param('ignore_no_formats_error'): + self.report_warning(msg, video_id) + elif isinstance(msg, ExtractorError): + raise msg + else: + raise ExtractorError(msg, expected=expected, video_id=video_id) + + # Methods for following #608 + @staticmethod + def url_result(url, ie=None, video_id=None, video_title=None, **kwargs): + """Returns a URL that points to a page that should be processed""" + # TODO: ie should be the class used for getting the info + video_info = {'_type': 'url', + 'url': url, + 'ie_key': ie} + video_info.update(kwargs) + if video_id is not None: + video_info['id'] = video_id + if video_title is not None: + video_info['title'] = video_title + return video_info + + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): + urls = orderedSet( + self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) + for m in matches) + return self.playlist_result( + urls, playlist_id=playlist_id, playlist_title=playlist_title) + + @staticmethod + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs): + """Returns a playlist""" + video_info = {'_type': 'playlist', + 'entries': entries} + video_info.update(kwargs) + if playlist_id: + video_info['id'] = playlist_id + if playlist_title: + video_info['title'] = playlist_title + if playlist_description is not None: + video_info['description'] = playlist_description + return video_info + + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + RegexNotFoundError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: + break + + _name = self._downloader._color_text(name, 'blue') + + if mobj: + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif isinstance(group, (list, tuple)): + return tuple(mobj.group(g) for g in group) + else: + return mobj.group(group) + elif default is not NO_DEFAULT: + return default + elif fatal: + raise RegexNotFoundError('Unable to extract %s' % _name) + else: + self.report_warning('unable to extract %s' % _name + bug_reports_message()) + return None + + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): + """ + Like _search_regex, but strips HTML tags and unescapes entities. 
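+
+        A minimal usage sketch (illustrative only; `webpage` is assumed to
+        have been downloaded earlier):
+
+            title = self._html_search_regex(
+                r'<h1[^>]*>(.+?)</h1>', webpage, 'title', fatal=False)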
+ """ + res = self._search_regex(pattern, string, name, default, fatal, flags, group) + if res: + return clean_html(res).strip() + else: + return res + + def _get_netrc_login_info(self, netrc_machine=None): + username = None + password = None + netrc_machine = netrc_machine or self._NETRC_MACHINE + + if self.get_param('usenetrc', False): + try: + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError( + 'No authenticators for %s' % netrc_machine) + except (IOError, netrc.NetrcParseError) as err: + self.report_warning( + 'parsing .netrc: %s' % error_to_compat_str(err)) + + return username, password + + def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): + """ + Get the login info as (username, password) + First look for the manually specified credentials using username_option + and password_option as keys in params dictionary. If no such credentials + available look in the netrc file using the netrc_machine or _NETRC_MACHINE + value. + If there's no info available, return (None, None) + """ + + # Attempt to use provided username and password or .netrc data + username = self.get_param(username_option) + if username is not None: + password = self.get_param(password_option) + else: + username, password = self._get_netrc_login_info(netrc_machine) + + return username, password + + def _get_tfa_info(self, note='two-factor verification code'): + """ + Get the two-factor authentication info + TODO - asking the user will be required for sms/phone verify + currently just uses the command line option + If there's no info available, return None + """ + + tfa = self.get_param('twofactor') + if tfa is not None: + return tfa + + return compat_getpass('Type %s and press [Return]: ' % note) + + # Helper functions for extracting OpenGraph info + @staticmethod + def _og_regexes(prop): + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' + % {'prop': re.escape(prop)}) + template = r'<meta[^>]+?%s[^>]+?%s' + return [ + template % (property_re, content_re), + template % (content_re, property_re), + ] + + @staticmethod + def _meta_regex(prop): + return r'''(?isx)<meta + (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) + + def _og_search_property(self, prop, html, name=None, **kargs): + prop = variadic(prop) + if name is None: + name = 'OpenGraph %s' % prop[0] + og_regexes = [] + for p in prop: + og_regexes.extend(self._og_regexes(p)) + escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) + if escaped is None: + return None + return unescapeHTML(escaped) + + def _og_search_thumbnail(self, html, **kargs): + return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) + + def _og_search_description(self, html, **kargs): + return self._og_search_property('description', html, fatal=False, **kargs) + + def _og_search_title(self, html, **kargs): + return self._og_search_property('title', html, **kargs) + + def _og_search_video_url(self, html, name='video url', secure=True, **kargs): + regexes = self._og_regexes('video') + self._og_regexes('video:url') + if 
secure: + regexes = self._og_regexes('video:secure_url') + regexes + return self._html_search_regex(regexes, html, name, **kargs) + + def _og_search_url(self, html, **kargs): + return self._og_search_property('url', html, **kargs) + + def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + name = variadic(name) + if display_name is None: + display_name = name[0] + return self._html_search_regex( + [self._meta_regex(n) for n in name], + html, display_name, fatal=fatal, group='content', **kwargs) + + def _dc_search_uploader(self, html): + return self._html_search_meta('dc.creator', html, 'uploader') + + def _rta_search(self, html): + # See http://www.rtalabel.org/index.php?content=howtofaq#single + if re.search(r'(?ix)<meta\s+name="rating"\s+' + r' content="RTA-5042-1996-1400-1577-RTA"', + html): + return 18 + return 0 + + def _media_rating_search(self, html): + # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ + rating = self._html_search_meta('rating', html) + + if not rating: + return None + + RATING_TABLE = { + 'safe for kids': 0, + 'general': 8, + '14 years': 14, + 'mature': 17, + 'restricted': 19, + } + return RATING_TABLE.get(rating.lower()) + + def _family_friendly_search(self, html): + # See http://schema.org/VideoObject + family_friendly = self._html_search_meta( + 'isFamilyFriendly', html, default=None) + + if not family_friendly: + return None + + RATING_TABLE = { + '1': 0, + 'true': 0, + '0': 18, + 'false': 18, + } + return RATING_TABLE.get(family_friendly.lower()) + + def _twitter_search_player(self, html): + return self._html_search_meta('twitter:player', html, + 'twitter card player') + + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): + json_ld_list = list(re.finditer(JSON_LD_RE, html)) + default = kwargs.get('default', NO_DEFAULT) + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. 
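+        # (Illustrative consequence: with default={} passed by the caller, a
+        # page containing no JSON-LD yields {} instead of raising
+        # RegexNotFoundError.)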
+ fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False + json_ld = [] + for mobj in json_ld_list: + json_ld_item = self._parse_json( + mobj.group('json_ld'), video_id, fatal=fatal) + if not json_ld_item: + continue + if isinstance(json_ld_item, dict): + json_ld.append(json_ld_item) + elif isinstance(json_ld_item, (list, tuple)): + json_ld.extend(json_ld_item) + if json_ld: + json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) + if json_ld: + return json_ld + if default is not NO_DEFAULT: + return default + elif fatal: + raise RegexNotFoundError('Unable to extract JSON-LD') + else: + self.report_warning('unable to extract JSON-LD %s' % bug_reports_message()) + return {} + + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): + if isinstance(json_ld, compat_str): + json_ld = self._parse_json(json_ld, video_id, fatal=fatal) + if not json_ld: + return {} + info = {} + if not isinstance(json_ld, (list, tuple, dict)): + return info + if isinstance(json_ld, dict): + json_ld = [json_ld] + + INTERACTION_TYPE_MAP = { + 'CommentAction': 'comment', + 'AgreeAction': 'like', + 'DisagreeAction': 'dislike', + 'LikeAction': 'like', + 'DislikeAction': 'dislike', + 'ListenAction': 'view', + 'WatchAction': 'view', + 'ViewAction': 'view', + } + + def extract_interaction_type(e): + interaction_type = e.get('interactionType') + if isinstance(interaction_type, dict): + interaction_type = interaction_type.get('@type') + return str_or_none(interaction_type) + + def extract_interaction_statistic(e): + interaction_statistic = e.get('interactionStatistic') + if isinstance(interaction_statistic, dict): + interaction_statistic = [interaction_statistic] + if not isinstance(interaction_statistic, list): + return + for is_e in interaction_statistic: + if not isinstance(is_e, dict): + continue + if is_e.get('@type') != 'InteractionCounter': + continue + interaction_type = extract_interaction_type(is_e) + if not interaction_type: + continue + # For interaction count some sites provide string instead of + # an integer (as per spec) with non digit characters (e.g. ",") + # so extracting count with more relaxed str_to_int + interaction_count = str_to_int(is_e.get('userInteractionCount')) + if interaction_count is None: + continue + count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) + if not count_kind: + continue + count_key = '%s_count' % count_kind + if info.get(count_key) is not None: + continue + info[count_key] = interaction_count + + def extract_video_object(e): + assert e['@type'] == 'VideoObject' + author = e.get('author') + info.update({ + 'url': url_or_none(e.get('contentUrl')), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have 'name' property(inherited from 'Thing' type). [1] + # however some websites are using 'Text' type instead. + # 1. 
https://schema.org/VideoObject + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + 'view_count': int_or_none(e.get('interactionCount')), + }) + extract_interaction_statistic(e) + + for e in json_ld: + if '@context' in e: + item_type = e.get('@type') + if expected_type is not None and expected_type != item_type: + continue + if item_type in ('TVEpisode', 'Episode'): + episode_name = unescapeHTML(e.get('name')) + info.update({ + 'episode': episode_name, + 'episode_number': int_or_none(e.get('episodeNumber')), + 'description': unescapeHTML(e.get('description')), + }) + if not info.get('title') and episode_name: + info['title'] = episode_name + part_of_season = e.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): + info.update({ + 'season': unescapeHTML(part_of_season.get('name')), + 'season_number': int_or_none(part_of_season.get('seasonNumber')), + }) + part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Movie': + info.update({ + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('dateCreated')), + }) + elif item_type in ('Article', 'NewsArticle'): + info.update({ + 'timestamp': parse_iso8601(e.get('datePublished')), + 'title': unescapeHTML(e.get('headline')), + 'description': unescapeHTML(e.get('articleBody')), + }) + elif item_type == 'VideoObject': + extract_video_object(e) + if expected_type is None: + continue + else: + break + video = e.get('video') + if isinstance(video, dict) and video.get('@type') == 'VideoObject': + extract_video_object(video) + if expected_type is None: + continue + else: + break + return dict((k, v) for k, v in info.items() if v is not None) + + @staticmethod + def _hidden_inputs(html): + html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) + hidden_inputs = {} + for input in re.findall(r'(?i)(<input[^>]+>)', html): + attrs = extract_attributes(input) + if not input: + continue + if attrs.get('type') not in ('hidden', 'submit'): + continue + name = attrs.get('name') or attrs.get('id') + value = attrs.get('value') + if name and value is not None: + hidden_inputs[name] = value + return hidden_inputs + + def _form_hidden_inputs(self, form_id, html): + form = self._search_regex( + r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, + html, '%s form' % form_id, group='form') + return self._hidden_inputs(form) + + class FormatSort: + regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? 
*$' + + default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', + 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr', + 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases + ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', + 'height', 'width', 'proto', 'vext', 'abr', 'aext', + 'fps', 'fs_approx', 'source', 'format_id') + + settings = { + 'vcodec': {'type': 'ordered', 'regex': True, + 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'acodec': {'type': 'ordered', 'regex': True, + 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']}, + 'vext': {'type': 'ordered', 'field': 'video_ext', + 'order': ('mp4', 'webm', 'flv', '', 'none'), + 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, + 'aext': {'type': 'ordered', 'field': 'audio_ext', + 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), + 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')}, + 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, + 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', + 'field': ('vcodec', 'acodec'), + 'function': lambda it: int(any(v != 'none' for v in it))}, + 'ie_pref': {'priority': True, 'type': 'extractor'}, + 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, + 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, + 'lang': {'convert': 'ignore', 'field': 'language_preference'}, + 'quality': {'convert': 'float_none', 'default': -1}, + 'filesize': {'convert': 'bytes'}, + 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, + 'id': {'convert': 'string', 'field': 'format_id'}, + 'height': {'convert': 'float_none'}, + 'width': {'convert': 'float_none'}, + 'fps': {'convert': 'float_none'}, + 'tbr': {'convert': 'float_none'}, + 'vbr': {'convert': 'float_none'}, + 'abr': {'convert': 'float_none'}, + 'asr': {'convert': 'float_none'}, + 'source': {'convert': 'ignore', 'field': 'source_preference'}, + + 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, + 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, + 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, + 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, + 'res': {'type': 'multiple', 'field': ('height', 'width'), + 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, + + # Most of these exist only for compatibility reasons + 'dimension': {'type': 'alias', 'field': 'res'}, + 'resolution': {'type': 'alias', 'field': 'res'}, + 'extension': {'type': 'alias', 'field': 'ext'}, + 'bitrate': {'type': 'alias', 'field': 'br'}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr'}, + 'video_bitrate': {'type': 'alias', 'field': 'vbr'}, + 'audio_bitrate': {'type': 'alias', 'field': 'abr'}, + 'framerate': {'type': 'alias', 'field': 'fps'}, + 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists + 'protocol': {'type': 'alias', 'field': 'proto'}, + 'source_preference': {'type': 'alias', 'field': 'source'}, + 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, + 'filesize_estimate': {'type': 'alias', 'field': 
'size'}, + 'samplerate': {'type': 'alias', 'field': 'asr'}, + 'video_ext': {'type': 'alias', 'field': 'vext'}, + 'audio_ext': {'type': 'alias', 'field': 'aext'}, + 'video_codec': {'type': 'alias', 'field': 'vcodec'}, + 'audio_codec': {'type': 'alias', 'field': 'acodec'}, + 'video': {'type': 'alias', 'field': 'hasvid'}, + 'has_video': {'type': 'alias', 'field': 'hasvid'}, + 'audio': {'type': 'alias', 'field': 'hasaud'}, + 'has_audio': {'type': 'alias', 'field': 'hasaud'}, + 'extractor': {'type': 'alias', 'field': 'ie_pref'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'}, + 'format_id': {'type': 'alias', 'field': 'id'}, + } + + _order = [] + + def _get_field_setting(self, field, key): + if field not in self.settings: + self.settings[field] = {} + propObj = self.settings[field] + if key not in propObj: + type = propObj.get('type') + if key == 'field': + default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field + elif key == 'convert': + default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore' + else: + default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None) + propObj[key] = default + return propObj[key] + + def _resolve_field_value(self, field, value, convertNone=False): + if value is None: + if not convertNone: + return None + else: + value = value.lower() + conversion = self._get_field_setting(field, 'convert') + if conversion == 'ignore': + return None + if conversion == 'string': + return value + elif conversion == 'float_none': + return float_or_none(value) + elif conversion == 'bytes': + return FileDownloader.parse_bytes(value) + elif conversion == 'order': + order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') + use_regex = self._get_field_setting(field, 'regex') + list_length = len(order_list) + empty_pos = order_list.index('') if '' in order_list else list_length + 1 + if use_regex and value is not None: + for i, regex in enumerate(order_list): + if regex and re.match(regex, value): + return list_length - i + return list_length - empty_pos # not in list + else: # not regex or value = None + return list_length - (order_list.index(value) if value in order_list else empty_pos) + else: + if value.isnumeric(): + return float(value) + else: + self.settings[field]['convert'] = 'string' + return value + + def evaluate_params(self, params, sort_extractor): + self._use_free_order = params.get('prefer_free_formats', False) + self._sort_user = params.get('format_sort', []) + self._sort_extractor = sort_extractor + + def add_item(field, reverse, closest, limit_text): + field = field.lower() + if field in self._order: + return + self._order.append(field) + limit = self._resolve_field_value(field, limit_text) + data = { + 'reverse': reverse, + 'closest': False if limit is None else closest, + 'limit_text': limit_text, + 'limit': limit} + if field in self.settings: + self.settings[field].update(data) + else: + self.settings[field] = data + + sort_list = ( + tuple(field for field in self.default if self._get_field_setting(field, 'forced')) + + (tuple() if params.get('format_sort_force', False) + else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) + + tuple(self._sort_user) + tuple(sort_extractor) + self.default) + + for item in sort_list: + match = re.match(self.regex, item) + if match is None: + raise 
ExtractorError('Invalid format sort string "%s" given by extractor' % item) + field = match.group('field') + if field is None: + continue + if self._get_field_setting(field, 'type') == 'alias': + field = self._get_field_setting(field, 'field') + reverse = match.group('reverse') is not None + closest = match.group('separator') == '~' + limit_text = match.group('limit') + + has_limit = limit_text is not None + has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' + has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') + + fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) + limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() + limit_count = len(limits) + for (i, f) in enumerate(fields): + add_item(f, reverse, closest, + limits[i] if i < limit_count + else limits[0] if has_limit and not has_multiple_limits + else None) + + def print_verbose_info(self, write_debug): + if self._sort_user: + write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) + if self._sort_extractor: + write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) + write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( + '+' if self._get_field_setting(field, 'reverse') else '', field, + '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':', + self._get_field_setting(field, 'limit_text'), + self._get_field_setting(field, 'limit')) + if self._get_field_setting(field, 'limit_text') is not None else '') + for field in self._order if self._get_field_setting(field, 'visible')])) + + def _calculate_field_preference_from_value(self, format, field, type, value): + reverse = self._get_field_setting(field, 'reverse') + closest = self._get_field_setting(field, 'closest') + limit = self._get_field_setting(field, 'limit') + + if type == 'extractor': + maximum = self._get_field_setting(field, 'max') + if value is None or (maximum is not None and value >= maximum): + value = -1 + elif type == 'boolean': + in_list = self._get_field_setting(field, 'in_list') + not_in_list = self._get_field_setting(field, 'not_in_list') + value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1 + elif type == 'ordered': + value = self._resolve_field_value(field, value, True) + + # try to convert to number + val_num = float_or_none(value, default=self._get_field_setting(field, 'default')) + is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None + if is_num: + value = val_num + + return ((-10, 0) if value is None + else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher + else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest + else (0, value, 0) if not reverse and (limit is None or value <= limit) + else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit + else (-1, value, 0)) + + def _calculate_field_preference(self, format, field): + type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple + get_value = lambda f: format.get(self._get_field_setting(f, 'field')) + if type == 'multiple': + type = 'field' # Only 'field' is allowed in multiple for now + actual_fields = self._get_field_setting(field, 'field') + + value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields) + else: + value = 
get_value(field) + return self._calculate_field_preference_from_value(format, field, type, value) + + def calculate_preference(self, format): + # Determine missing protocol + if not format.get('protocol'): + format['protocol'] = determine_protocol(format) + + # Determine missing ext + if not format.get('ext') and 'url' in format: + format['ext'] = determine_ext(format['url']) + if format.get('vcodec') == 'none': + format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' + format['video_ext'] = 'none' + else: + format['video_ext'] = format['ext'] + format['audio_ext'] = 'none' + # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? + # format['preference'] = -1000 + + # Determine missing bitrates + if format.get('tbr') is None: + if format.get('vbr') is not None and format.get('abr') is not None: + format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) + else: + if format.get('vcodec') != 'none' and format.get('vbr') is None: + format['vbr'] = format.get('tbr') - format.get('abr', 0) + if format.get('acodec') != 'none' and format.get('abr') is None: + format['abr'] = format.get('tbr') - format.get('vbr', 0) + + return tuple(self._calculate_field_preference(format, field) for field in self._order) + + def _sort_formats(self, formats, field_preference=[]): + if not formats: + return + format_sort = self.FormatSort() # params and to_screen are taken from the downloader + format_sort.evaluate_params(self._downloader.params, field_preference) + if self.get_param('verbose', False): + format_sort.print_verbose_info(self._downloader.write_debug) + formats.sort(key=lambda f: format_sort.calculate_preference(f)) + + def _check_formats(self, formats, video_id): + if formats: + formats[:] = filter( + lambda f: self._is_valid_url( + f['url'], video_id, + item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), + formats) + + @staticmethod + def _remove_duplicate_formats(formats): + format_urls = set() + unique_formats = [] + for f in formats: + if f['url'] not in format_urls: + format_urls.add(f['url']) + unique_formats.append(f) + formats[:] = unique_formats + + def _is_valid_url(self, url, video_id, item='video', headers={}): + url = self._proto_relative_url(url, scheme='http:') + # For now assume non HTTP(S) URLs always valid + if not (url.startswith('http://') or url.startswith('https://')): + return True + try: + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) + return True + except ExtractorError as e: + self.to_screen( + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) + return False + + def http_scheme(self): + """ Either "http:" or "https:", depending on the user's preferences """ + return ( + 'http:' + if self.get_param('prefer_insecure', False) + else 'https:') + + def _proto_relative_url(self, url, scheme=None): + if url is None: + return url + if url.startswith('//'): + if scheme is None: + scheme = self.http_scheme() + return scheme + url + else: + return url + + def _sleep(self, timeout, video_id, msg_template=None): + if msg_template is None: + msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' + msg = msg_template % {'video_id': video_id, 'timeout': timeout} + self.to_screen(msg) + time.sleep(timeout) + + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True, m3u8_id=None, data=None, 
headers={}, query={}): + manifest = self._download_xml( + manifest_url, video_id, 'Downloading f4m manifest', + 'Unable to download f4m manifest', + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) + transform_source=transform_source, + fatal=fatal, data=data, headers=headers, query=query) + + if manifest is False: + return [] + + return self._parse_f4m_formats( + manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) + + def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True, m3u8_id=None): + if not isinstance(manifest, compat_etree_Element) and not fatal: + return [] + + # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + + formats = [] + manifest_version = '1.0' + media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') + if not media_nodes: + manifest_version = '2.0' + media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + # Remove unsupported DRM protected media from final formats + # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573). + media_nodes = remove_encrypted_media(media_nodes) + if not media_nodes: + return formats + + manifest_base_url = get_base_url(manifest) + + bootstrap_info = xpath_element( + manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], + 'bootstrap info', default=None) + + vcodec = None + mime_type = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], + 'base URL', default=None) + if mime_type and mime_type.startswith('audio/'): + vcodec = 'none' + + for i, media_el in enumerate(media_nodes): + tbr = int_or_none(media_el.attrib.get('bitrate')) + width = int_or_none(media_el.attrib.get('width')) + height = int_or_none(media_el.attrib.get('height')) + format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + # If <bootstrapInfo> is present, the specified f4m is a + # stream-level manifest, and only set-level manifests may refer to + # external resources. 
See section 11.4 and section 4 of F4M spec + if bootstrap_info is None: + media_url = None + # @href is introduced in 2.0, see section 11.6 of F4M spec + if manifest_version == '2.0': + media_url = media_el.attrib.get('href') + if media_url is None: + media_url = media_el.attrib.get('url') + if not media_url: + continue + manifest_url = ( + media_url if media_url.startswith('http://') or media_url.startswith('https://') + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + # If media_url is itself a f4m manifest do the recursive extraction + # since bitrates in parent manifest (this one) and media_url manifest + # may differ leading to inability to resolve the format by requested + # bitrate in f4m downloader + ext = determine_ext(manifest_url) + if ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, + transform_source=transform_source, fatal=fatal) + # Sometimes stream-level manifest contains single media entry that + # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). + # At the same time parent's media entry in set-level manifest may + # contain it. We will copy it from parent in such cases. + if len(f4m_formats) == 1: + f = f4m_formats[0] + f.update({ + 'tbr': f.get('tbr') or tbr, + 'width': f.get('width') or width, + 'height': f.get('height') or height, + 'format_id': f.get('format_id') if not tbr else format_id, + 'vcodec': vcodec, + }) + formats.extend(f4m_formats) + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', preference=preference, + quality=quality, m3u8_id=m3u8_id, fatal=fatal)) + continue + formats.append({ + 'format_id': format_id, + 'url': manifest_url, + 'manifest_url': manifest_url, + 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', + 'tbr': tbr, + 'width': width, + 'height': height, + 'vcodec': vcodec, + 'preference': preference, + 'quality': quality, + }) + return formats + + def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): + return { + 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'url': m3u8_url, + 'ext': ext, + 'protocol': 'm3u8', + 'preference': preference - 100 if preference else -100, + 'quality': quality, + 'resolution': 'multiple', + 'format_note': 'Quality selection URL', + } + + def _report_ignoring_subs(self, name): + self.report_warning(bug_reports_message( + f'Ignoring subtitle tracks found in the {name} manifest; ' + 'if any subtitle tracks are missing,' + ), only_once=True) + + def _extract_m3u8_formats(self, *args, **kwargs): + fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('HLS') + return fmts + + def _extract_m3u8_formats_and_subtitles( + self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native', + preference=None, quality=None, m3u8_id=None, note=None, + errnote=None, fatal=True, live=False, data=None, headers={}, + query={}): + + res = self._download_webpage_handle( + m3u8_url, video_id, + note='Downloading m3u8 information' if note is None else note, + errnote='Failed to download m3u8 information' if errnote is None else errnote, + fatal=fatal, data=data, headers=headers, query=query) + + if res is False: + return [], {} + + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() + + return self._parse_m3u8_formats_and_subtitles( + m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, + 
preference=preference, quality=quality, m3u8_id=m3u8_id, + note=note, errnote=errnote, fatal=fatal, live=live, data=data, + headers=headers, query=query, video_id=video_id) + + def _parse_m3u8_formats_and_subtitles( + self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native', + preference=None, quality=None, m3u8_id=None, live=False, note=None, + errnote=None, fatal=True, data=None, headers={}, query={}, + video_id=None): + formats, subtitles = [], {} + + if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access + return formats, subtitles + + has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) + + def format_url(url): + return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) + + if self.get_param('hls_split_discontinuity', False): + def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): + if not m3u8_doc: + if not manifest_url: + return [] + m3u8_doc = self._download_webpage( + manifest_url, video_id, fatal=fatal, data=data, headers=headers, + note=False, errnote='Failed to download m3u8 playlist information') + if m3u8_doc is False: + return [] + return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines())) + + else: + def _extract_m3u8_playlist_indices(*args, **kwargs): + return [None] + + # References: + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 + # 2. https://github.com/ytdl-org/youtube-dl/issues/12211 + # 3. https://github.com/ytdl-org/youtube-dl/issues/18923 + + # We should try extracting formats only from master playlists [1, 4.3.4], + # i.e. playlists that describe available qualities. On the other hand + # media playlists [1, 4.3.3] should be returned as is since they contain + # just the media without qualities renditions. + # Fortunately, master playlist can be easily distinguished from media + # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] + # master playlist tags MUST NOT appear in a media playlist and vice versa. + # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every + # media playlist and MUST NOT appear in master playlist thus we can + # clearly detect media playlist with this criterion. + + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is + formats = [{ + 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_index': idx, + 'url': m3u8_url, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + 'quality': quality, + 'has_drm': has_drm, + } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)] + + return formats, subtitles + + groups = {} + last_stream_inf = {} + + def extract_media(x_media_line): + media = parse_m3u8_attributes(x_media_line) + # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED + media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') + if not (media_type and group_id and name): + return + groups.setdefault(group_id, []).append(media) + # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1> + if media_type == 'SUBTITLES': + # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the + # EXT-X-MEDIA tag if the media type is SUBTITLES. + # However, lack of URI has been spotted in the wild. + # e.g. 
NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339 + if not media.get('URI'): + return + url = format_url(media['URI']) + sub_info = { + 'url': url, + 'ext': determine_ext(url), + } + if sub_info['ext'] == 'm3u8': + # Per RFC 8216 §3.1, the only possible subtitle format m3u8 + # files may contain is WebVTT: + # <https://tools.ietf.org/html/rfc8216#section-3.1> + sub_info['ext'] = 'vtt' + sub_info['protocol'] = 'm3u8_native' + lang = media.get('LANGUAGE') or 'und' + subtitles.setdefault(lang, []).append(sub_info) + if media_type not in ('VIDEO', 'AUDIO'): + return + media_url = media.get('URI') + if media_url: + manifest_url = format_url(media_url) + formats.extend({ + 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_note': name, + 'format_index': idx, + 'url': manifest_url, + 'manifest_url': m3u8_url, + 'language': media.get('LANGUAGE'), + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + 'quality': quality, + 'vcodec': 'none' if media_type == 'AUDIO' else None, + } for idx in _extract_m3u8_playlist_indices(manifest_url)) + + def build_stream_name(): + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] + # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) + # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 + stream_name = last_stream_inf.get('NAME') + if stream_name: + return stream_name + # If there is no NAME in EXT-X-STREAM-INF it will be obtained + # from corresponding rendition group + stream_group_id = last_stream_inf.get('VIDEO') + if not stream_group_id: + return + stream_group = groups.get(stream_group_id) + if not stream_group: + return stream_group_id + rendition = stream_group[0] + return rendition.get('NAME') or stream_group_id + + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the + # chance to detect video only formats when EXT-X-STREAM-INF tags + # precede EXT-X-MEDIA tags in HLS manifest such as [3]. + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-MEDIA:'): + extract_media(line) + + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-STREAM-INF:'): + last_stream_inf = parse_m3u8_attributes(line) + elif line.startswith('#') or not line.strip(): + continue + else: + tbr = float_or_none( + last_stream_inf.get('AVERAGE-BANDWIDTH') + or last_stream_inf.get('BANDWIDTH'), scale=1000) + manifest_url = format_url(line.strip()) + + for idx in _extract_m3u8_playlist_indices(manifest_url): + format_id = [m3u8_id, None, idx] + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. 
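+                    # (Illustrative: a rendition advertised at
+                    # BANDWIDTH=2176000 on one playlist refresh may report
+                    # 2304000 on the next, so a derived id like 'hls-2176'
+                    # would not be stable across reloads.)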
+ if not live: + stream_name = build_stream_name() + format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + f = { + 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_index': idx, + 'url': manifest_url, + 'manifest_url': m3u8_url, + 'tbr': tbr, + 'ext': ext, + 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), + 'protocol': entry_protocol, + 'preference': preference, + 'quality': quality, + } + resolution = last_stream_inf.get('RESOLUTION') + if resolution: + mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) + if mobj: + f['width'] = int(mobj.group('width')) + f['height'] = int(mobj.group('height')) + # Unified Streaming Platform + mobj = re.search( + r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) + if mobj: + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) + f.update({ + 'vbr': vbr, + 'abr': abr, + }) + codecs = parse_codecs(last_stream_inf.get('CODECS')) + f.update(codecs) + audio_group_id = last_stream_inf.get('AUDIO') + # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which + # references a rendition group MUST have a CODECS attribute. + # However, this is not always respected, for example, [2] + # contains EXT-X-STREAM-INF tag which references AUDIO + # rendition group but does not have CODECS and despite + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them + # as complete formats. + if audio_group_id and codecs and f.get('vcodec') != 'none': + audio_group = groups.get(audio_group_id) + if audio_group and audio_group[0].get('URI'): + # TODO: update acodec for audio only formats with + # the same GROUP-ID + f['acodec'] = 'none' + if not f.get('ext'): + f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4' + formats.append(f) + + # for DailyMotion + progressive_uri = last_stream_inf.get('PROGRESSIVE-URI') + if progressive_uri: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': progressive_uri, + }) + formats.append(http_f) + + last_stream_inf = {} + return formats, subtitles + + def _extract_m3u8_vod_duration( + self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}): + + m3u8_vod = self._download_webpage( + m3u8_vod_url, video_id, + note='Downloading m3u8 VOD manifest' if note is None else note, + errnote='Failed to download VOD manifest' if errnote is None else errnote, + fatal=False, data=data, headers=headers, query=query) + + return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id) + + def _parse_m3u8_vod_duration(self, m3u8_vod, video_id): + if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod: + return None + + return int(sum( + float(line[len('#EXTINF:'):].split(',')[0]) + for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None + + @staticmethod + def _xpath_ns(path, namespace=None): + if not namespace: + return path + out = [] + for c in path.split('/'): + if not c or c == '.': + out.append(c) + else: + out.append('{%s}%s' % (namespace, c)) + return '/'.join(out) + + def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) + + if smil is False: + assert not fatal + return [] + + namespace = 
self._parse_smil_namespace(smil) + + fmts = self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subs = self._parse_smil_subtitles( + smil, namespace=namespace) + + return fmts, subs + + def _extract_smil_formats(self, *args, **kwargs): + fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: + return {} + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) + + def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): + return self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) + + def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): + namespace = self._parse_smil_namespace(smil) + + formats = self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subtitles = self._parse_smil_subtitles(smil, namespace=namespace) + + video_id = os.path.splitext(url_basename(smil_url))[0] + title = None + description = None + upload_date = None + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + name = meta.attrib.get('name') + content = meta.attrib.get('content') + if not name or not content: + continue + if not title and name == 'title': + title = content + elif not description and name in ('description', 'abstract'): + description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] + + return { + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + def _parse_smil_namespace(self, smil): + return self._search_regex( + r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + base = smil_url + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + b = meta.get('base') or meta.get('httpBase') + if b: + base = b + break + + formats = [] + rtmp_count = 0 + http_count = 0 + m3u8_count = 0 + imgs_count = 0 + + srcs = set() + media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) + for medium in media: + src = medium.get('src') + if not src or src in srcs: + continue + srcs.add(src) + + bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) + filesize = int_or_none(medium.get('size') or medium.get('fileSize')) + width = int_or_none(medium.get('width')) + height = int_or_none(medium.get('height')) + proto = medium.get('proto') + ext = medium.get('ext') + src_ext = determine_ext(src) + streamer = medium.get('streamer') or base + + if proto == 'rtmp' or streamer.startswith('rtmp'): + rtmp_count += 1 + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, 
+ 'filesize': filesize, + 'width': width, + 'height': height, + }) + if transform_rtmp_url: + streamer, src = transform_rtmp_url(streamer, src) + formats[-1].update({ + 'url': streamer, + 'play_path': src, + }) + continue + + src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + src_url = src_url.strip() + + if proto == 'm3u8' or src_ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if len(m3u8_formats) == 1: + m3u8_count += 1 + m3u8_formats[0].update({ + 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + formats.extend(m3u8_formats) + elif src_ext == 'f4m': + f4m_url = src_url + if not f4m_params: + f4m_params = { + 'hdcore': '3.2.0', + 'plugin': 'flowplayer-3.2.0.1', + } + f4m_url += '&' if '?' in f4m_url else '?' + f4m_url += compat_urllib_parse_urlencode(f4m_params) + formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): + http_count += 1 + formats.append({ + 'url': src_url, + 'ext': ext or src_ext or 'flv', + 'format_id': 'http-%d' % (bitrate or http_count), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + + for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)): + src = medium.get('src') + if not src or src in srcs: + continue + srcs.add(src) + + imgs_count += 1 + formats.append({ + 'format_id': 'imagestream-%d' % (imgs_count), + 'url': src, + 'ext': mimetype2ext(medium.get('type')), + 'acodec': 'none', + 'vcodec': 'none', + 'width': int_or_none(medium.get('width')), + 'height': int_or_none(medium.get('height')), + 'format_note': 'SMIL storyboards', + }) + + return formats + + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + urls = [] + subtitles = {} + for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): + src = textstream.get('src') + if not src or src in urls: + continue + urls.append(src) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang + subtitles.setdefault(lang, []).append({ + 'url': src, + 'ext': ext, + }) + return subtitles + + def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): + xspf = self._download_xml( + xspf_url, playlist_id, 'Downloading xpsf playlist', + 'Unable to download xspf manifest', fatal=fatal) + if xspf is False: + return [] + return self._parse_xspf( + xspf, playlist_id, xspf_url=xspf_url, + xspf_base_url=base_url(xspf_url)) + + def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): + NS_MAP = { + 'xspf': 'http://xspf.org/ns/0/', + 's1': 'http://static.streamone.nl/player/ns/0', + } + + entries = [] + for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + title = xpath_text( + track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) + description = xpath_text( + track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') + 
thumbnail = xpath_text( + track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') + duration = float_or_none( + xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) + + formats = [] + for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): + format_url = urljoin(xspf_base_url, location.text) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'manifest_url': xspf_url, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + }) + self._sort_formats(formats) + + entries.append({ + 'id': playlist_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + }) + return entries + + def _extract_mpd_formats(self, *args, **kwargs): + fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('DASH') + return fmts + + def _extract_mpd_formats_and_subtitles( + self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, + fatal=True, data=None, headers={}, query={}): + res = self._download_xml_handle( + mpd_url, video_id, + note='Downloading MPD manifest' if note is None else note, + errnote='Failed to download MPD manifest' if errnote is None else errnote, + fatal=fatal, data=data, headers=headers, query=query) + if res is False: + return [], {} + mpd_doc, urlh = res + if mpd_doc is None: + return [], {} + mpd_base_url = base_url(urlh.geturl()) + + return self._parse_mpd_formats_and_subtitles( + mpd_doc, mpd_id, mpd_base_url, mpd_url) + + def _parse_mpd_formats(self, *args, **kwargs): + fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('DASH') + return fmts + + def _parse_mpd_formats_and_subtitles( + self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): + """ + Parse formats from MPD manifest. + References: + 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ + if not self.get_param('dynamic_mpd', True): + if mpd_doc.get('type') == 'dynamic': + return [], {} + + namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) + + def _add_ns(path): + return self._xpath_ns(path, namespace) + + def is_drm_protected(element): + return element.find(_add_ns('ContentProtection')) is not None + + def extract_multisegment_info(element, ms_parent_info): + ms_info = ms_parent_info.copy() + + # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some + # common attributes and elements. We will only extract relevant + # for us. 
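+            # (Illustrative: the shared pieces read below are startNumber,
+            # timescale and duration, plus an optional SegmentTimeline child.)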
+ def extract_common(source): + segment_timeline = source.find(_add_ns('SegmentTimeline')) + if segment_timeline is not None: + s_e = segment_timeline.findall(_add_ns('S')) + if s_e: + ms_info['total_number'] = 0 + ms_info['s'] = [] + for s in s_e: + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) + start_number = source.get('startNumber') + if start_number: + ms_info['start_number'] = int(start_number) + timescale = source.get('timescale') + if timescale: + ms_info['timescale'] = int(timescale) + segment_duration = source.get('duration') + if segment_duration: + ms_info['segment_duration'] = float(segment_duration) + + def extract_Initialization(source): + initialization = source.find(_add_ns('Initialization')) + if initialization is not None: + ms_info['initialization_url'] = initialization.attrib['sourceURL'] + + segment_list = element.find(_add_ns('SegmentList')) + if segment_list is not None: + extract_common(segment_list) + extract_Initialization(segment_list) + segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) + if segment_urls_e: + ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] + else: + segment_template = element.find(_add_ns('SegmentTemplate')) + if segment_template is not None: + extract_common(segment_template) + media = segment_template.get('media') + if media: + ms_info['media'] = media + initialization = segment_template.get('initialization') + if initialization: + ms_info['initialization'] = initialization + else: + extract_Initialization(segment_template) + return ms_info + + mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) + formats, subtitles = [], {} + stream_numbers = {'audio': 0, 'video': 0} + for period in mpd_doc.findall(_add_ns('Period')): + period_duration = parse_duration(period.get('duration')) or mpd_duration + period_ms_info = extract_multisegment_info(period, { + 'start_number': 1, + 'timescale': 1, + }) + for adaptation_set in period.findall(_add_ns('AdaptationSet')): + adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) + for representation in adaptation_set.findall(_add_ns('Representation')): + representation_attrib = adaptation_set.attrib.copy() + representation_attrib.update(representation.attrib) + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory + mime_type = representation_attrib['mimeType'] + content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) + + codecs = representation_attrib.get('codecs', '') + if content_type not in ('video', 'audio', 'text'): + if mime_type == 'image/jpeg': + content_type = mime_type + elif codecs.split('.')[0] == 'stpp': + content_type = 'text' + elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): + content_type = 'text' + else: + self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) + continue + + base_url = '' + for element in (representation, adaptation_set, period, mpd_doc): + base_url_e = element.find(_add_ns('BaseURL')) + if base_url_e is not None: + base_url = base_url_e.text + base_url + if re.match(r'^https?://', base_url): + break + if mpd_base_url and base_url.startswith('/'): + base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + elif mpd_base_url and not re.match(r'^https?://', base_url): + if not mpd_base_url.endswith('/'): + mpd_base_url += '/' + base_url = 
mpd_base_url + base_url + representation_id = representation_attrib.get('id') + lang = representation_attrib.get('lang') + url_el = representation.find(_add_ns('BaseURL')) + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + bandwidth = int_or_none(representation_attrib.get('bandwidth')) + if representation_id is not None: + format_id = representation_id + else: + format_id = content_type + if mpd_id: + format_id = mpd_id + '-' + format_id + if content_type in ('video', 'audio'): + f = { + 'format_id': format_id, + 'manifest_url': mpd_url, + 'ext': mimetype2ext(mime_type), + 'width': int_or_none(representation_attrib.get('width')), + 'height': int_or_none(representation_attrib.get('height')), + 'tbr': float_or_none(bandwidth, 1000), + 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), + 'fps': int_or_none(representation_attrib.get('frameRate')), + 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, + 'format_note': 'DASH %s' % content_type, + 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', + 'manifest_stream_number': stream_numbers[content_type] + } + f.update(parse_codecs(codecs)) + stream_numbers[content_type] += 1 + elif content_type == 'text': + f = { + 'ext': mimetype2ext(mime_type), + 'manifest_url': mpd_url, + 'filesize': filesize, + } + elif content_type == 'image/jpeg': + # See test case in VikiIE + # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 + f = { + 'format_id': format_id, + 'ext': 'mhtml', + 'manifest_url': mpd_url, + 'format_note': 'DASH storyboards (jpeg)', + 'acodec': 'none', + 'vcodec': 'none', + } + if is_drm_protected(adaptation_set) or is_drm_protected(representation): + f['has_drm'] = True + representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) + + def prepare_template(template_name, identifiers): + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/ytdl-org/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) 
counterparts to be used with % operator + if representation_id is not None: + t = t.replace('$RepresentationID$', representation_id) + t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) + t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t = t.replace('$$', '$') + return t + + # @initialization is a regular template like @media one + # so it should be handled just the same way (see + # https://github.com/ytdl-org/youtube-dl/issues/11605) + if 'initialization' in representation_ms_info: + initialization_template = prepare_template( + 'initialization', + # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and + # $Time$ shall not be included for @initialization thus + # only $Bandwidth$ remains + ('Bandwidth', )) + representation_ms_info['initialization_url'] = initialization_template % { + 'Bandwidth': bandwidth, + } + + def location_key(location): + return 'url' if re.match(r'^https?://', location) else 'path' + + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + + media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) + media_location_key = location_key(media_template) + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template and 's' not in representation_ms_info: + segment_duration = None + if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: + segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) + representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + representation_ms_info['fragments'] = [{ + media_location_key: media_template % { + 'Number': segment_number, + 'Bandwidth': bandwidth, + }, + 'duration': segment_duration, + } for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + # $Number*$ or $Time$ in media template with S list available + # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg + # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 + representation_ms_info['fragments'] = [] + segment_time = 0 + segment_d = None + segment_number = representation_ms_info['start_number'] + + def add_segment_url(): + segment_url = media_template % { + 'Time': segment_time, + 'Bandwidth': bandwidth, + 'Number': segment_number, + } + representation_ms_info['fragments'].append({ + media_location_key: segment_url, + 'duration': float_or_none(segment_d, representation_ms_info['timescale']), + }) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + segment_d = s['d'] + add_segment_url() + segment_number += 1 + for r in range(s.get('r', 0)): + segment_time += segment_d + add_segment_url() + segment_number += 1 + segment_time += segment_d + elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: + # No media template + # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # or any YouTube dashsegments video + fragments = [] + segment_index = 0 + timescale = representation_ms_info['timescale'] + for s in representation_ms_info['s']: + duration = float_or_none(s['d'], timescale) + for r in range(s.get('r', 0) + 1): + segment_uri = representation_ms_info['segment_urls'][segment_index] + fragments.append({ +
location_key(segment_uri): segment_uri, + 'duration': duration, + }) + segment_index += 1 + representation_ms_info['fragments'] = fragments + elif 'segment_urls' in representation_ms_info: + # Segment URLs with no SegmentTimeline + # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # https://github.com/ytdl-org/youtube-dl/pull/14844 + fragments = [] + segment_duration = float_or_none( + representation_ms_info['segment_duration'], + representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None + for segment_url in representation_ms_info['segment_urls']: + fragment = { + location_key(segment_url): segment_url, + } + if segment_duration: + fragment['duration'] = segment_duration + fragments.append(fragment) + representation_ms_info['fragments'] = fragments + # If there is a fragments key available then we correctly recognized fragmented media. + # Otherwise we will assume unfragmented media with direct access. Technically, such + # assumption is not necessarily correct since we may simply have no support for + # some forms of fragmented media renditions yet, but for now we'll use this fallback. + if 'fragments' in representation_ms_info: + f.update({ + # NB: mpd_url may be empty when MPD manifest is parsed from a string + 'url': mpd_url or base_url, + 'fragment_base_url': base_url, + 'fragments': [], + 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml', + }) + if 'initialization_url' in representation_ms_info: + initialization_url = representation_ms_info['initialization_url'] + if not f.get('url'): + f['url'] = initialization_url + f['fragments'].append({location_key(initialization_url): initialization_url}) + f['fragments'].extend(representation_ms_info['fragments']) + else: + # Assuming direct URL to unfragmented media. + f['url'] = base_url + if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + formats.append(f) + elif content_type == 'text': + subtitles.setdefault(lang or 'und', []).append(f) + + return formats, subtitles + + def _extract_ism_formats(self, *args, **kwargs): + fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('ISM') + return fmts + + def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + res = self._download_xml_handle( + ism_url, video_id, + note='Downloading ISM manifest' if note is None else note, + errnote='Failed to download ISM manifest' if errnote is None else errnote, + fatal=fatal, data=data, headers=headers, query=query) + if res is False: + return [], {} + ism_doc, urlh = res + if ism_doc is None: + return [], {} + + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) + + def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): + """ + Parse formats from ISM manifest. + References: + 1. 
[MS-SSTR]: Smooth Streaming Protocol, + https://msdn.microsoft.com/en-us/library/ff469518.aspx + """ + if ism_doc.get('IsLive') == 'TRUE': + return [], {} + + duration = int(ism_doc.attrib['Duration']) + timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 + + formats = [] + subtitles = {} + for stream in ism_doc.findall('StreamIndex'): + stream_type = stream.get('Type') + if stream_type not in ('video', 'audio', 'text'): + continue + url_pattern = stream.attrib['Url'] + stream_timescale = int_or_none(stream.get('TimeScale')) or timescale + stream_name = stream.get('Name') + stream_language = stream.get('Language', 'und') + for track in stream.findall('QualityLevel'): + fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None) + # TODO: add support for WVC1 and WMAP + if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'): + self.report_warning('%s is not a supported codec' % fourcc) + continue + tbr = int(track.attrib['Bitrate']) // 1000 + # [1] does not mention Width and Height attributes. However, + # they're often present while MaxWidth and MaxHeight are + # missing, so should be used as fallbacks + width = int_or_none(track.get('MaxWidth') or track.get('Width')) + height = int_or_none(track.get('MaxHeight') or track.get('Height')) + sampling_rate = int_or_none(track.get('SamplingRate')) + + track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) + track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + + fragments = [] + fragment_ctx = { + 'time': 0, + } + stream_fragments = stream.findall('c') + for stream_fragment_index, stream_fragment in enumerate(stream_fragments): + fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] + fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 + fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) + if not fragment_ctx['duration']: + try: + next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) + except IndexError: + next_fragment_time = duration + fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat + for _ in range(fragment_repeat): + fragments.append({ + 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'duration': fragment_ctx['duration'] / stream_timescale, + }) + fragment_ctx['time'] += fragment_ctx['duration'] + + format_id = [] + if ism_id: + format_id.append(ism_id) + if stream_name: + format_id.append(stream_name) + format_id.append(compat_str(tbr)) + + if stream_type == 'text': + subtitles.setdefault(stream_language, []).append({ + 'ext': 'ismt', + 'protocol': 'ism', + 'url': ism_url, + 'manifest_url': ism_url, + 'fragments': fragments, + '_download_params': { + 'stream_type': stream_type, + 'duration': duration, + 'timescale': stream_timescale, + 'fourcc': fourcc, + 'language': stream_language, + 'codec_private_data': track.get('CodecPrivateData'), + } + }) + elif stream_type in ('video', 'audio'): + formats.append({ + 'format_id': '-'.join(format_id), + 'url': ism_url, + 'manifest_url': ism_url, + 'ext': 'ismv' if stream_type == 'video' else 'isma', + 'width': width, + 'height': height, + 'tbr': tbr, + 'asr': sampling_rate, + 'vcodec': 'none' if stream_type == 'audio' else fourcc, + 'acodec': 'none' if stream_type == 'video' else fourcc, + 'protocol': 'ism', + 'fragments': fragments, + 'has_drm': ism_doc.find('Protection') is not None, + '_download_params': { + 'stream_type': stream_type, + 'duration': duration, 
+ 'timescale': stream_timescale, + 'width': width or 0, + 'height': height or 0, + 'fourcc': fourcc, + 'language': stream_language, + 'codec_private_data': track.get('CodecPrivateData'), + 'sampling_rate': sampling_rate, + 'channels': int_or_none(track.get('Channels', 2)), + 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), + 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), + }, + }) + return formats, subtitles + + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): + def absolute_url(item_url): + return urljoin(base_url, item_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + def _media_formats(src, cur_media_type, type_info={}): + full_url = absolute_url(src) + ext = type_info.get('ext') or determine_ext(full_url) + if ext == 'm3u8': + is_plain_url = False + formats = self._extract_m3u8_formats( + full_url, video_id, ext='mp4', + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, + preference=preference, quality=quality, fatal=False) + elif ext == 'mpd': + is_plain_url = False + formats = self._extract_mpd_formats( + full_url, video_id, mpd_id=mpd_id, fatal=False) + else: + is_plain_url = True + formats = [{ + 'url': full_url, + 'vcodec': 'none' if cur_media_type == 'audio' else None, + }] + return is_plain_url, formats + + entries = [] + # amp-video and amp-audio are very similar to their HTML5 counterparts + # so we will include them right here (see + # https://www.ampproject.org/docs/reference/components/amp-video) + # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ + _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' + media_tags = [(media_tag, media_tag_name, media_type, '') + for media_tag, media_tag_name, media_type + in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] + media_tags.extend(re.findall( + # We only allow video|audio followed by a whitespace or '>'. + # Allowing more characters may end up in a significant slowdown (see + # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: + # http://www.porntrex.com/maps/videositemap.xml).
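# Illustrative aside (not part of the patch): this second pass captures
# paired tags together with their inner HTML, e.g. (markup assumed)
#   <video poster="p.jpg"><source src="v.mp4" type="video/mp4"></video>
# so that the <source> and <track> children can be parsed out of the
# captured content further below.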
+ r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)) + for media_tag, _, media_type, media_content in media_tags: + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = strip_or_none(media_attributes.get('src')) + if src: + _, formats = _media_formats(src, media_type) + media_info['formats'].extend(formats) + media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) + if media_content: + for source_tag in re.findall(r'<source[^>]+>', media_content): + s_attr = extract_attributes(source_tag) + # data-video-src and data-src are non standard but seen + # several times in the wild + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) + if not src: + continue + f = parse_content_type(s_attr.get('type')) + is_plain_url, formats = _media_formats(src, media_type, f) + if is_plain_url: + # width, height, res, label and title attributes are + # all not standard but seen several times in the wild + labels = [ + s_attr.get(lbl) + for lbl in ('label', 'title') + if str_or_none(s_attr.get(lbl)) + ] + width = int_or_none(s_attr.get('width')) + height = (int_or_none(s_attr.get('height')) + or int_or_none(s_attr.get('res'))) + if not width or not height: + for lbl in labels: + resolution = parse_resolution(lbl) + if not resolution: + continue + width = width or resolution.get('width') + height = height or resolution.get('height') + for lbl in labels: + tbr = parse_bitrate(lbl) + if tbr: + break + else: + tbr = None + f.update({ + 'width': width, + 'height': height, + 'tbr': tbr, + 'format_id': s_attr.get('label') or s_attr.get('title'), + }) + f.update(formats[0]) + media_info['formats'].append(f) + else: + media_info['formats'].extend(formats) + for track_tag in re.findall(r'<track[^>]+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind in ('subtitles', 'captions'): + src = strip_or_none(track_attributes.get('src')) + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + for f in media_info['formats']: + f.setdefault('http_headers', {})['Referer'] = base_url + if media_info['formats'] or media_info['subtitles']: + entries.append(media_info) + return entries + + def _extract_akamai_formats(self, *args, **kwargs): + fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('akamai') + return fmts + + def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}): + signed = 'hdnea=' in manifest_url + if not signed: + # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html + manifest_url = re.sub( + r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?', + '', manifest_url).strip('?') + + formats = [] + subtitles = {} + + hdcore_sign = 'hdcore=3.7.0' + f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + hds_host = hosts.get('hds') + if hds_host: + f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) + if 'hdcore=' not in f4m_url: + f4m_url += ('&' if '?' 
in f4m_url else '?') + hdcore_sign + f4m_formats = self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False) + for entry in f4m_formats: + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.extend(f4m_formats) + + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + hls_host = hosts.get('hls') + if hls_host: + m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) + m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + subtitles = self._merge_subtitles(subtitles, m3u8_subtitles) + + http_host = hosts.get('http') + if http_host and m3u8_formats and not signed: + REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(m3u8_formats) in (qualities_length, qualities_length + 1): + i = 0 + for f in m3u8_formats: + if f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + formats.append(http_f) + i += 1 + + return formats, subtitles + + def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + query = compat_urlparse.urlparse(url).query + url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) + mobj = re.search( + r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) + url_base = mobj.group('url') + http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) + formats = [] + + def manifest_url(manifest): + m_url = '%s/%s' % (http_base_url, manifest) + if query: + m_url += '?%s' % query + return m_url + + if 'm3u8' not in skip_protocols: + formats.extend(self._extract_m3u8_formats( + manifest_url('playlist.m3u8'), video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + if 'f4m' not in skip_protocols: + formats.extend(self._extract_f4m_formats( + manifest_url('manifest.f4m'), + video_id, f4m_id='hds', fatal=False)) + if 'dash' not in skip_protocols: + formats.extend(self._extract_mpd_formats( + manifest_url('manifest.mpd'), + video_id, mpd_id='dash', fatal=False)) + if re.search(r'(?:/smil:|\.smil)', url_base): + if 'smil' not in skip_protocols: + rtmp_formats = self._extract_smil_formats( + manifest_url('jwplayer.smil'), + video_id, fatal=False) + for rtmp_format in rtmp_formats: + rtsp_format = rtmp_format.copy() + rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([rtmp_format, rtsp_format]) + else: + for protocol in ('rtmp', 'rtsp'): + if protocol not in skip_protocols: + formats.append({ + 'url': '%s:%s' % (protocol, url_base), + 'format_id': protocol, + 'protocol': protocol, + }) + return formats + + def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): + mobj = re.search( + r'(?s)jwplayer\((?P<quote>[\'"])[^\'" 
]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', + webpage) + if mobj: + try: + jwplayer_data = self._parse_json(mobj.group('options'), + video_id=video_id, + transform_source=transform_source) + except ExtractorError: + pass + else: + if isinstance(jwplayer_data, dict): + return jwplayer_data + + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) + return self._parse_jwplayer_data( + jwplayer_data, video_id, *args, **kwargs) + + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + # JWPlayer backward compatibility: flattened playlists + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if 'playlist' not in jwplayer_data: + jwplayer_data = {'playlist': [jwplayer_data]} + + entries = [] + + # JWPlayer backward compatibility: single playlist item + # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 + if not isinstance(jwplayer_data['playlist'], list): + jwplayer_data['playlist'] = [jwplayer_data['playlist']] + + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] + + this_video_id = video_id or video_data['mediaid'] + + formats = self._parse_jwplayer_formats( + video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, + mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = track.get('kind') + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) + + entry = { + 'id': this_video_id, + 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), + 'description': clean_html(video_data.get('description')), + 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), + 'subtitles': subtitles, + } + # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 + if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): + entry.update({ + '_type': 'url_transparent', + 'url': formats[0]['url'], + }) + else: + self._sort_formats(formats) + entry['formats'] = formats + entries.append(entry) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) + + def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + urls = [] + formats = [] + for source in jwplayer_sources_data: + if not isinstance(source, dict): + continue + source_url = urljoin( + base_url, self._proto_relative_url(source.get('file'))) + if not source_url or source_url in urls: + continue + urls.append(source_url) + 
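# Illustrative aside (not part of the patch): a typical JWPlayer source
# entry handled by this loop looks roughly like (example values assumed)
#   {'file': 'https://cdn.example.com/master.m3u8', 'type': 'hls'}
# or, for a progressive rendition,
#   {'file': 'https://cdn.example.com/720.mp4', 'label': '720p', 'bitrate': 2000}
# and the 'type'/extension checks below decide which branch handles it.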
source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=m3u8_id, fatal=False)) + elif source_type == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, video_id, mpd_id=mpd_id, fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ( + 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like "1080p", "720p SD", or 1080. + height = int_or_none(self._search_regex( + r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'tbr': int_or_none(source.get('bitrate')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + return formats + + def _live_title(self, name): + """ Generate the title for a live video """ + now = datetime.datetime.now() + now_str = now.strftime('%Y-%m-%d %H:%M') + return name + ' ' + now_str + + def _int(self, v, name, fatal=False, **kwargs): + res = int_or_none(v, **kwargs) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self.report_warning(msg) + return res + + def _float(self, v, name, fatal=False, **kwargs): + res = float_or_none(v, **kwargs) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self.report_warning(msg) + return res + + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): + cookie = compat_cookiejar_Cookie( + 0, name, value, port, port is not None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) + self._downloader.cookiejar.set_cookie(cookie) + + def _get_cookies(self, url): + """ Return a compat_cookies_SimpleCookie with the cookies for the url """ + req = sanitized_Request(url) + self._downloader.cookiejar.add_cookie_header(req) + return compat_cookies_SimpleCookie(req.get_header('Cookie')) + + def _apply_first_set_cookie_header(self, url_handle, cookie): + """ + Apply first Set-Cookie header instead of the last. Experimental. + + Some sites (e.g. [1-3]) may serve two cookies under the same name + in the Set-Cookie header and expect the first (old) one to be set rather + than the second (new). However, per RFC 6265 it is the newer cookie + that should be set into the cookie store, and that is what actually happens.
+ We will workaround this issue by resetting the cookie to + the first one manually. + 1. https://new.vk.com/ + 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 + 3. https://learning.oreilly.com/ + """ + for header, cookies in url_handle.headers.items(): + if header.lower() != 'set-cookie': + continue + if sys.version_info[0] >= 3: + cookies = cookies.encode('iso-8859-1') + cookies = cookies.decode('utf-8') + cookie_value = re.search( + r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) + if cookie_value: + value, domain = cookie_value.groups() + self._set_cookie(domain, cookie, value) + break + + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if tc.get('playlist', []): + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + + def extract_subtitles(self, *args, **kwargs): + if (self.get_param('writesubtitles', False) + or self.get_param('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + + def extract_comments(self, *args, **kwargs): + if not self.get_param('getcomments'): + return None + generator = self._get_comments(*args, **kwargs) + + def extractor(): + comments = [] + try: + while True: + comments.append(next(generator)) + except KeyboardInterrupt: + interrupted = True + self.to_screen('Interrupted by user') + except StopIteration: + interrupted = False + comment_count = len(comments) + self.to_screen(f'Extracted {comment_count} comments') + return { + 'comments': comments, + 'comment_count': None if interrupted else comment_count + } + return extractor + + def _get_comments(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + + @staticmethod + def _merge_subtitle_items(subtitle_list1, subtitle_list2): + """ Merge subtitle items for one language. Items with duplicated URLs + will be dropped. """ + list1_urls = set([item['url'] for item in subtitle_list1]) + ret = list(subtitle_list1) + ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + return ret + + @classmethod + def _merge_subtitles(cls, *dicts, target=None): + """ Merge subtitle dictionaries, language by language. 
""" + if target is None: + target = {} + for d in dicts: + for lang, subs in d.items(): + target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs) + return target + + def extract_automatic_captions(self, *args, **kwargs): + if (self.get_param('writeautomaticsub', False) + or self.get_param('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + + def mark_watched(self, *args, **kwargs): + if not self.get_param('mark_watched', False): + return + if (self._get_login_info()[0] is not None + or self.get_param('cookiefile') + or self.get_param('cookiesfrombrowser')): + self._mark_watched(*args, **kwargs) + + def _mark_watched(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self.get_param('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + + def _generic_id(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + + def _generic_title(self, url): + return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + + @staticmethod + def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): + all_known = all(map( + lambda x: x is not None, + (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))) + return ( + 'private' if is_private + else 'premium_only' if needs_premium + else 'subscriber_only' if needs_subscription + else 'needs_auth' if needs_auth + else 'unlisted' if is_unlisted + else 'public' if all_known + else None) + + def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False): + ''' + @returns A list of values for the extractor argument given by "key" + or "default" if no such key is present + @param default The default value to return when the key is not present (default: []) + @param casesense When false, the values are converted to lower case + ''' + val = traverse_obj( + self._downloader.params, ('extractor_args', self.ie_key().lower(), key)) + if val is None: + return [] if default is NO_DEFAULT else default + return list(val) if casesense else [x.lower() for x in val] + + +class SearchInfoExtractor(InfoExtractor): + """ + Base class for paged search queries extractors. + They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} + Instances should define _SEARCH_KEY and _MAX_RESULTS. 
+ """ + + @classmethod + def _make_valid_url(cls): + return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY + + @classmethod + def suitable(cls, url): + return re.match(cls._make_valid_url(), url) is not None + + def _real_extract(self, query): + mobj = re.match(self._make_valid_url(), query) + if mobj is None: + raise ExtractorError('Invalid search query "%s"' % query) + + prefix = mobj.group('prefix') + query = mobj.group('query') + if prefix == '': + return self._get_n_results(query, 1) + elif prefix == 'all': + return self._get_n_results(query, self._MAX_RESULTS) + else: + n = int(prefix) + if n <= 0: + raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) + elif n > self._MAX_RESULTS: + self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) + n = self._MAX_RESULTS + return self._get_n_results(query, n) + + def _get_n_results(self, query, n): + """Get a specified number of results for a query. + Either this function or _search_results must be overridden by subclasses """ + return self.playlist_result( + itertools.islice(self._search_results(query), 0, None if n == float('inf') else n), + query, query) + + def _search_results(self, query): + """Returns an iterator of search results""" + raise NotImplementedError('This method must be implemented by subclasses') + + @property + def SEARCH_KEY(self): + return self._SEARCH_KEY diff --git a/yt_dlp/extractor/commonmistakes.py b/yt_dlp/extractor/commonmistakes.py new file mode 100644 index 000000000..051269652 --- /dev/null +++ b/yt_dlp/extractor/commonmistakes.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +import sys + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class CommonMistakesIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'''(?x) + (?:url|URL)$ + ''' + + _TESTS = [{ + 'url': 'url', + 'only_matching': True, + }, { + 'url': 'URL', + 'only_matching': True, + }] + + def _real_extract(self, url): + msg = ( + 'You\'ve asked yt-dlp to download the URL "%s". ' + 'That doesn\'t make any sense. ' + 'Simply remove the parameter in your command or configuration.' + ) % url + if not self.get_param('verbose'): + msg += ' Add -v to the command line to see what arguments and configuration yt-dlp has' + raise ExtractorError(msg, expected=True) + + +class UnicodeBOMIE(InfoExtractor): + IE_DESC = False + _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' + + # Disable test for python 3.2 since BOM is broken in re in this version + # (see https://github.com/ytdl-org/youtube-dl/issues/9751) + _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ + 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', + 'only_matching': True, + }] + + def _real_extract(self, url): + real_url = self._match_id(url) + self.report_warning( + 'Your URL starts with a Byte Order Mark (BOM). ' + 'Removing the BOM and looking for "%s" ...' 
% real_url) + return self.url_result(real_url) diff --git a/yt_dlp/extractor/commonprotocols.py b/yt_dlp/extractor/commonprotocols.py new file mode 100644 index 000000000..3708c6ad2 --- /dev/null +++ b/yt_dlp/extractor/commonprotocols.py @@ -0,0 +1,74 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, +) + + +class RtmpIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)rtmp[est]?://.+' + + _TESTS = [{ + 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', + 'only_matching': True, + }, { + 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._generic_id(url) + title = self._generic_title(url) + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'ext': 'flv', + 'format_id': compat_urlparse.urlparse(url).scheme, + }], + } + + +class MmsIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)mms://.+' + + _TEST = { + # Direct MMS link + 'url': 'mms://kentro.kaist.ac.kr/200907/MilesReid(0709).wmv', + 'info_dict': { + 'id': 'MilesReid(0709)', + 'ext': 'wmv', + 'title': 'MilesReid(0709)', + }, + 'params': { + 'skip_download': True, # rtsp downloads, requiring mplayer or mpv + }, + } + + def _real_extract(self, url): + video_id = self._generic_id(url) + title = self._generic_title(url) + + return { + 'id': video_id, + 'title': title, + 'url': url, + } + + +class ViewSourceIE(InfoExtractor): + IE_DESC = False + _VALID_URL = r'view-source:(?P<url>.+)' + + _TEST = { + 'url': 'view-source:https://www.youtube.com/watch?v=BaW_jenozKc', + 'only_matching': True + } + + def _real_extract(self, url): + return self.url_result(self._match_valid_url(url).group('url')) diff --git a/yt_dlp/extractor/condenast.py b/yt_dlp/extractor/condenast.py new file mode 100644 index 000000000..54e7af8b0 --- /dev/null +++ b/yt_dlp/extractor/condenast.py @@ -0,0 +1,251 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_urlparse, +) +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + js_to_json, + mimetype2ext, + orderedSet, + parse_iso8601, + strip_or_none, + try_get, +) + + +class CondeNastIE(InfoExtractor): + """ + Condé Nast is a media group, some of its sites use a custom HTML5 player + that works the same in all of them. + """ + + # The keys are the supported sites and the values are the name to be shown + # to the user and in the extractor description. 
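# Illustrative aside (not part of the patch): the keys of _SITES are
# interpolated into _VALID_URL and EMBED_URL below via
# '|'.join(_SITES.keys()), and the values feed IE_DESC, so a URL such as
#   http://video.wired.com/watch/3d-printed-speakers-lit-with-led
# reaches this extractor through the 'wired' key.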
+ _SITES = { + 'allure': 'Allure', + 'architecturaldigest': 'Architectural Digest', + 'arstechnica': 'Ars Technica', + 'bonappetit': 'Bon Appétit', + 'brides': 'Brides', + 'cnevids': 'Condé Nast', + 'cntraveler': 'Condé Nast Traveler', + 'details': 'Details', + 'epicurious': 'Epicurious', + 'glamour': 'Glamour', + 'golfdigest': 'Golf Digest', + 'gq': 'GQ', + 'newyorker': 'The New Yorker', + 'self': 'SELF', + 'teenvogue': 'Teen Vogue', + 'vanityfair': 'Vanity Fair', + 'vogue': 'Vogue', + 'wired': 'WIRED', + 'wmagazine': 'W Magazine', + } + + _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ + (?: + (?: + embed(?:js)?| + (?:script|inline)/video + )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| + (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) + )''' % '|'.join(_SITES.keys()) + IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) + + EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) + + _TESTS = [{ + 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', + 'md5': '1921f713ed48aabd715691f774c451f7', + 'info_dict': { + 'id': '5171b343c2b4c00dd0c1ccb3', + 'ext': 'mp4', + 'title': '3D Printed Speakers Lit With LED', + 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + 'uploader': 'wired', + 'upload_date': '20130314', + 'timestamp': 1363219200, + } + }, { + 'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series', + 'info_dict': { + 'id': '58d1865bfd2e6126e2000015', + 'ext': 'mp4', + 'title': 'The Only True Surprise? 
Trump’s an Idiot', + 'uploader': 'gq', + 'upload_date': '20170321', + 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', + }, + }, { + # JS embed + 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', + 'md5': 'f1a6f9cafb7083bab74a710f65d08999', + 'info_dict': { + 'id': '55f9cf8b61646d1acf00000c', + 'ext': 'mp4', + 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + 'uploader': 'arstechnica', + 'upload_date': '20150916', + 'timestamp': 1442434920, + } + }, { + 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', + 'only_matching': True, + }, { + 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', + 'only_matching': True, + }] + + def _extract_series(self, url, webpage): + title = self._html_search_regex( + r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>', + webpage, 'series title') + url_object = compat_urllib_parse_urlparse(url) + base_url = '%s://%s' % (url_object.scheme, url_object.netloc) + m_paths = re.finditer( + r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage) + paths = orderedSet(m.group(1) for m in m_paths) + build_url = lambda path: compat_urlparse.urljoin(base_url, path) + entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] + return self.playlist_result(entries, playlist_title=title) + + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) + else: + params = extract_attributes(self._search_regex( + r'(<[^>]+data-js="video-player"[^>]+>)', + webpage, 'player params element')) + query.update({ + 'videoId': params['data-video'], + 'playerId': params['data-player'], + 'target': params['id'], + }) + return query + + def _extract_video(self, params): + video_id = params['videoId'] + + video_info = None + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: + info_page = self._download_webpage( + 'https://player.cnevids.com/inline/video/%s.js' % video_id, + video_id, 'Downloading inline info', query={ + 'target': params.get('target', 'embedplayer') + }) + + if not video_info: + video_info = self._parse_json( + self._search_regex( + r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), + video_id, transform_source=js_to_json)['video'] + + title = video_info['title'] + + formats = [] + for fdata in video_info['sources']: + src = fdata.get('src') + if not src: + continue + ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + if ext == 'm3u8': + 
formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = fdata.get('quality') + formats.append({ + 'format_id': ext + ('-%s' % quality if quality else ''), + 'url': src, + 'ext': ext, + 'quality': 1 if quality == 'high' else 0, + }) + self._sort_formats(formats) + + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': video_info.get('poster_frame'), + 'uploader': video_info.get('brand'), + 'duration': int_or_none(video_info.get('duration')), + 'tags': video_info.get('tags'), + 'series': video_info.get('series_title'), + 'season': video_info.get('season_title'), + 'timestamp': parse_iso8601(video_info.get('premiere_date')), + 'categories': video_info.get('categories'), + 'subtitles': subtitles, + } + + def _real_extract(self, url): + video_id, player_id, target, url_type, display_id = self._match_valid_url(url).groups() + + if video_id: + return self._extract_video({ + 'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) + + webpage = self._download_webpage(url, display_id) + + if url_type == 'series': + return self._extract_series(url, webpage) + else: + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) + info.update(self._extract_video(params)) + return info diff --git a/youtube_dl/extractor/contv.py b/yt_dlp/extractor/contv.py index 84b462d40..84b462d40 100644 --- a/youtube_dl/extractor/contv.py +++ b/yt_dlp/extractor/contv.py diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py new file mode 100644 index 000000000..352951e20 --- /dev/null +++ b/yt_dlp/extractor/corus.py @@ -0,0 +1,159 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .theplatform import ThePlatformFeedIE +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, +) + + +class CorusIE(ThePlatformFeedIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?P<domain> + (?: + globaltv| + etcanada| + seriesplus| + wnetwork| + ytv + )\.com| + (?: + hgtv| + foodnetwork| + slice| + history| + showcase| + bigbrothercanada| + abcspark| + disney(?:channel|lachaine) + )\.ca + ) + /(?:[^/]+/)* + (?: + video\.html\?.*?\bv=| + videos?/(?:[^/]+/)*(?:[a-z0-9-]+-)? 
+ ) + (?P<id> + [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}| + (?:[A-Z]{4})?\d{12,20} + ) + ''' + _TESTS = [{ + 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', + 'info_dict': { + 'id': '870923331648', + 'ext': 'mp4', + 'title': 'Movie Night Popcorn with Bryan', + 'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.', + 'upload_date': '20170206', + 'timestamp': 1486392197, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], + }, { + 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', + 'only_matching': True, + }, { + 'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/', + 'only_matching': True, + }, { + 'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video', + 'only_matching': True, + }, { + 'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video', + 'only_matching': True, + }, { + 'url': 'http://www.bigbrothercanada.ca/video/1457812035894/', + 'only_matching': True + }, { + 'url': 'https://www.bigbrothercanada.ca/video/big-brother-canada-704/1457812035894/', + 'only_matching': True + }, { + 'url': 'https://www.seriesplus.com/emissions/dre-mary-mort-sur-ordonnance/videos/deux-coeurs-battant/SERP0055626330000200/', + 'only_matching': True + }, { + 'url': 'https://www.disneychannel.ca/shows/gabby-duran-the-unsittables/video/crybaby-duran-clip/2f557eec-0588-11ea-ae2b-e2c6776b770e/', + 'only_matching': True + }] + _GEO_BYPASS = False + _SITE_MAP = { + 'globaltv': 'series', + 'etcanada': 'series', + 'foodnetwork': 'food', + 'bigbrothercanada': 'series', + 'disneychannel': 'disneyen', + 'disneylachaine': 'disneyfr', + } + + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).groups() + site = domain.split('.')[0] + path = self._SITE_MAP.get(site, site) + if path != 'series': + path = 'migration/' + path + video = self._download_json( + 'https://globalcontent.corusappservices.com/templates/%s/playlist/' % path, + video_id, query={'byId': video_id}, + headers={'Accept': 'application/json'})[0] + title = video['title'] + + formats = [] + for source in video.get('sources', []): + smil_url = source.get('file') + if not smil_url: + continue + source_type = source.get('type') + note = 'Downloading%s smil file' % (' ' + source_type if source_type else '') + resp = self._download_webpage( + smil_url, video_id, note, fatal=False, + headers=self.geo_verification_headers()) + if not resp: + continue + error = self._parse_json(resp, video_id, fatal=False) + if error: + if error.get('exception') == 'GeoLocationBlocked': + self.raise_geo_restricted(countries=['CA']) + raise ExtractorError(error['description']) + smil = self._parse_xml(resp, video_id, fatal=False) + if smil is None: + continue + namespace = self._parse_smil_namespace(smil) + formats.extend(self._parse_smil_formats( + smil, smil_url, video_id, namespace)) + if not formats and video.get('drm'): + self.report_drm(video_id) + self._sort_formats(formats) + + subtitles = {} + for track in video.get('tracks', []): + track_url = track.get('file') + if not track_url: + continue + lang = 'fr' if site in ('disneylachaine', 'seriesplus') else 'en' + subtitles.setdefault(lang, []).append({'url': track_url}) + + metadata = video.get('metadata') or 
{} + get_number = lambda x: int_or_none(video.get('pl1$' + x) or metadata.get(x + 'Number')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': dict_get(video, ('defaultThumbnailUrl', 'thumbnail', 'image')), + 'description': video.get('description'), + 'timestamp': int_or_none(video.get('availableDate'), 1000), + 'subtitles': subtitles, + 'duration': float_or_none(metadata.get('duration')), + 'series': dict_get(video, ('show', 'pl1$show')), + 'season_number': get_number('season'), + 'episode_number': get_number('episode'), + } diff --git a/yt_dlp/extractor/coub.py b/yt_dlp/extractor/coub.py new file mode 100644 index 000000000..eba6b73ba --- /dev/null +++ b/yt_dlp/extractor/coub.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + qualities, +) + + +class CoubIE(InfoExtractor): + _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)' + + _TESTS = [{ + 'url': 'http://coub.com/view/5u5n1', + 'info_dict': { + 'id': '5u5n1', + 'ext': 'mp4', + 'title': 'The Matrix Moonwalk', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4.6, + 'timestamp': 1428527772, + 'upload_date': '20150408', + 'uploader': 'Artyom Loskutnikov', + 'uploader_id': 'artyom.loskutnikov', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + }, + }, { + 'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4', + 'only_matching': True, + }, { + 'url': 'coub:5u5n1', + 'only_matching': True, + }, { + # longer video id + 'url': 'http://coub.com/view/237d5l5h', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + coub = self._download_json( + 'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id) + + if coub.get('error'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, coub['error']), expected=True) + + title = coub['title'] + + file_versions = coub['file_versions'] + + QUALITIES = ('low', 'med', 'high') + + MOBILE = 'mobile' + IPHONE = 'iphone' + HTML5 = 'html5' + + SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5) + + quality_key = qualities(QUALITIES) + preference_key = qualities(SOURCE_PREFERENCE) + + formats = [] + + for kind, items in file_versions.get(HTML5, {}).items(): + if kind not in ('video', 'audio'): + continue + if not isinstance(items, dict): + continue + for quality, item in items.items(): + if not isinstance(item, dict): + continue + item_url = item.get('url') + if not item_url: + continue + formats.append({ + 'url': item_url, + 'format_id': '%s-%s-%s' % (HTML5, kind, quality), + 'filesize': int_or_none(item.get('size')), + 'vcodec': 'none' if kind == 'audio' else None, + 'quality': quality_key(quality), + 'source_preference': preference_key(HTML5), + }) + + iphone_url = file_versions.get(IPHONE, {}).get('url') + if iphone_url: + formats.append({ + 'url': iphone_url, + 'format_id': IPHONE, + 'source_preference': preference_key(IPHONE), + }) + + mobile_url = file_versions.get(MOBILE, {}).get('audio_url') + if mobile_url: + formats.append({ + 'url': mobile_url, + 'format_id': '%s-audio' % MOBILE, + 'source_preference': preference_key(MOBILE), + }) + + self._sort_formats(formats) + + thumbnail = coub.get('picture') + duration = float_or_none(coub.get('duration')) + timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at')) + 
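# Illustrative aside (not part of the patch): the qualities() helper used
# for the format list built above maps a known value to its index in the
# given tuple, so with QUALITIES = ('low', 'med', 'high'),
# quality_key('med') == 1 and quality_key('high') == 2; a larger value
# ranks the format higher during format sorting.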
uploader = coub.get('channel', {}).get('title') + uploader_id = coub.get('channel', {}).get('permalink') + + view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) + like_count = int_or_none(coub.get('likes_count')) + repost_count = int_or_none(coub.get('recoubs_count')) + + age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) + if age_restricted is not None: + age_limit = 18 if age_restricted is True else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'like_count': like_count, + 'repost_count': repost_count, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cracked.py b/yt_dlp/extractor/cracked.py index f77a68ece..f77a68ece 100644 --- a/youtube_dl/extractor/cracked.py +++ b/yt_dlp/extractor/cracked.py diff --git a/yt_dlp/extractor/crackle.py b/yt_dlp/extractor/crackle.py new file mode 100644 index 000000000..2c9d28d2e --- /dev/null +++ b/yt_dlp/extractor/crackle.py @@ -0,0 +1,245 @@ +# coding: utf-8 +from __future__ import unicode_literals, division + +import hashlib +import hmac +import re +import time + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + orderedSet, + parse_age_limit, + parse_duration, + url_or_none, + ExtractorError +) + + +class CrackleIE(InfoExtractor): + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' + _TESTS = [{ + # geo restricted to CA + 'url': 'https://www.crackle.com/andromeda/2502343', + 'info_dict': { + 'id': '2502343', + 'ext': 'mp4', + 'title': 'Under The Night', + 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', + 'duration': 2583, + 'view_count': int, + 'average_rating': 0, + 'age_limit': 14, + 'genre': 'Action, Sci-Fi', + 'creator': 'Allan Kroeker', + 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', + 'release_year': 2000, + 'series': 'Andromeda', + 'episode': 'Under The Night', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'only_matching': True, + }] + + _MEDIA_FILE_SLOTS = { + '360p.mp4': { + 'width': 640, + 'height': 360, + }, + '480p.mp4': { + 'width': 768, + 'height': 432, + }, + '480p_1mbps.mp4': { + 'width': 852, + 'height': 480, + }, + } + + def _download_json(self, url, *args, **kwargs): + # Authorization generation algorithm is reverse engineered from: + # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js + timestamp = time.strftime('%Y%m%d%H%M', time.gmtime()) + h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([url, timestamp]).encode(), hashlib.sha1).hexdigest().upper() + headers = { + 'Accept': 'application/json', + 'Authorization': '|'.join([h, timestamp, '117', '1']), + } + return InfoExtractor._download_json(self, url, *args, headers=headers, **kwargs) + + def _real_extract(self, url): + video_id = self._match_id(url) + + geo_bypass_country = self.get_param('geo_bypass_country', None) + countries = orderedSet((geo_bypass_country, 'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI', '')) + num_countries, num = len(countries) - 1, 0 + + media = {} + for num, country in enumerate(countries): + if num == 
1: # start of hard-coded list + self.report_warning('%s. Trying with a list of known countries' % ( + 'Unable to obtain video formats from %s API' % geo_bypass_country if geo_bypass_country + else 'No country code was given with --geo-bypass-country')) + elif num == num_countries: # end of list + geo_info = self._download_json( + 'https://web-api-us.crackle.com/Service.svc/geo/country', + video_id, fatal=False, note='Downloading geo-location information from crackle API', + errnote='Unable to fetch geo-location information from crackle') or {} + country = geo_info.get('CountryCode') + if country is None: + continue + self.to_screen('%s identified country as %s' % (self.IE_NAME, country)) + if country in countries: + self.to_screen('Downloading from %s API was already attempted. Skipping...' % country) + continue + + if country is None: + continue + try: + media = self._download_json( + 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country), + video_id, note='Downloading media JSON from %s API' % country, + errnote='Unable to download media JSON') + except ExtractorError as e: + # 401 means geo restriction; try the next country + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + continue + raise + + status = media.get('status') or {} + if status.get('messageCode') != '0': + raise ExtractorError( + '%s said: %s %s - %s' % ( + self.IE_NAME, status.get('messageCodeDescription'), status.get('messageCode'), status.get('message')), + expected=True) + + # Found video formats + if isinstance(media.get('MediaURLs'), list): + break + + ignore_no_formats = self.get_param('ignore_no_formats_error') + allow_unplayable_formats = self.get_param('allow_unplayable_formats') + + if not media or (not media.get('MediaURLs') and not ignore_no_formats): + raise ExtractorError( + 'Unable to access the crackle API. Try passing your country code ' + 'to --geo-bypass-country. 
If it still does not work and the ' + 'video is available in your country, report this issue') + title = media['Title'] + + formats, subtitles = [], {} + has_drm = False + for e in media.get('MediaURLs') or []: + if e.get('UseDRM'): + has_drm = True + if not allow_unplayable_formats: + continue + format_url = url_or_none(e.get('Path')) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif format_url.endswith('.ism/Manifest'): + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + else: + mfs_path = e.get('Type') + mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path) + if not mfs_info: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-' + mfs_path.split('.')[0], + 'width': mfs_info['width'], + 'height': mfs_info['height'], + }) + if not formats and has_drm: + self.report_drm(video_id) + self._sort_formats(formats) + + description = media.get('Description') + duration = int_or_none(media.get( + 'DurationInSeconds')) or parse_duration(media.get('Duration')) + view_count = int_or_none(media.get('CountViews')) + average_rating = float_or_none(media.get('UserRating')) + age_limit = parse_age_limit(media.get('Rating')) + genre = media.get('Genre') + release_year = int_or_none(media.get('ReleaseYear')) + creator = media.get('Directors') + artist = media.get('Cast') + + if media.get('MediaTypeDisplayValue') == 'Full Episode': + series = media.get('ShowName') + episode = title + season_number = int_or_none(media.get('Season')) + episode_number = int_or_none(media.get('Episode')) + else: + series = episode = season_number = episode_number = None + + cc_files = media.get('ClosedCaptionFiles') + if isinstance(cc_files, list): + for cc_file in cc_files: + if not isinstance(cc_file, dict): + continue + cc_url = url_or_none(cc_file.get('Path')) + if not cc_url: + continue + lang = cc_file.get('Locale') or 'en' + subtitles.setdefault(lang, []).append({'url': cc_url}) + + thumbnails = [] + images = media.get('Images') + if isinstance(images, dict): + for image_key, image_url in images.items(): + mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key) + if not mobj: + continue + thumbnails.append({ + 'url': image_url, + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'genre': genre, + 'creator': creator, + 'artist': artist, + 'release_year': release_year, + 'series': series, + 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/crooksandliars.py b/yt_dlp/extractor/crooksandliars.py index 7fb782db7..7fb782db7 100644 --- a/youtube_dl/extractor/crooksandliars.py +++ b/yt_dlp/extractor/crooksandliars.py diff --git a/yt_dlp/extractor/crunchyroll.py 
b/yt_dlp/extractor/crunchyroll.py new file mode 100644 index 000000000..511ac1b2c --- /dev/null +++ b/yt_dlp/extractor/crunchyroll.py @@ -0,0 +1,757 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json +import zlib + +from hashlib import sha1 +from math import pow, sqrt, floor +from .common import InfoExtractor +from .vrv import VRVIE +from ..compat import ( + compat_b64decode, + compat_etree_Element, + compat_etree_fromstring, + compat_str, + compat_urllib_parse_urlencode, + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + bytes_to_intlist, + extract_attributes, + float_or_none, + intlist_to_bytes, + int_or_none, + lowercase_escape, + merge_dicts, + remove_end, + sanitized_Request, + try_get, + urlencode_postdata, + xpath_text, +) +from ..aes import ( + aes_cbc_decrypt, +) + + +class CrunchyrollBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.crunchyroll.com/login' + _LOGIN_FORM = 'login_form' + _NETRC_MACHINE = 'crunchyroll' + + def _call_rpc_api(self, method, video_id, note=None, data=None): + data = data or {} + data['req'] = 'RpcApi' + method + data = compat_urllib_parse_urlencode(data).encode('utf-8') + return self._download_xml( + 'https://www.crunchyroll.com/xml/', + video_id, note, fatal=False, data=data, headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + def is_logged(webpage): + return 'href="/logout"' in webpage + + # Already logged in + if is_logged(login_page): + return + + login_form_str = self._search_regex( + r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, + login_page, 'login form', group='form') + + post_url = extract_attributes(login_form_str).get('action') + if not post_url: + post_url = self._LOGIN_URL + elif not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) + + login_form.update({ + 'login_form[name]': username, + 'login_form[password]': password, + }) + + response = self._download_webpage( + post_url, None, 'Logging in', 'Wrong login info', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if is_logged(response): + return + + error = self._html_search_regex( + '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + @staticmethod + def _add_skip_wall(url): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_urlparse.parse_qs(parsed_url.query) + # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: + # > This content may be inappropriate for some people. + # > Are you sure you want to continue? + # since it's not disabled by default in crunchyroll account's settings. + # See https://github.com/ytdl-org/youtube-dl/issues/7202. 
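+ # For example, a (hypothetical) episode URL + # http://www.crunchyroll.com/show/episode-1-foo-123456 + # becomes http://www.crunchyroll.com/show/episode-1-foo-123456?skip_wall=1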
+ qs['skip_wall'] = ['1'] + return compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + +class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' + _TESTS = [{ + 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', + 'info_dict': { + 'id': '645513', + 'ext': 'mp4', + 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', + 'description': 'md5:2d17137920c64f2f49981a7797d275ef', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Yomiuri Telecasting Corporation (YTV)', + 'upload_date': '20131013', + 'url': 're:(?!.*&)', + }, + 'params': { + # rtmp + 'skip_download': True, + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', + 'info_dict': { + 'id': '589804', + 'ext': 'flv', + 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', + 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Danny Choo Network', + 'upload_date': '20120213', + }, + 'params': { + # rtmp + 'skip_download': True, + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', + 'info_dict': { + 'id': '702409', + 'ext': 'mp4', + 'title': compat_str, + 'description': compat_str, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Re:Zero Partners', + 'timestamp': 1462098900, + 'upload_date': '20160501', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', + 'info_dict': { + 'id': '727589', + 'ext': 'mp4', + 'title': compat_str, + 'description': compat_str, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Kadokawa Pictures Inc.', + 'timestamp': 1484130900, + 'upload_date': '20170111', + 'series': compat_str, + 'season': "KONOSUBA -God's blessing on this wonderful world! 
2", + 'season_number': 2, + 'episode': 'Give Me Deliverance From This Judicial Injustice!', + 'episode_number': 1, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', + 'only_matching': True, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', + 'only_matching': True, + }, { + # A description with double quotes + 'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080', + 'info_dict': { + 'id': '535080', + 'ext': 'mp4', + 'title': compat_str, + 'description': compat_str, + 'uploader': 'Marvelous AQL Inc.', + 'timestamp': 1255512600, + 'upload_date': '20091014', + }, + 'params': { + # Just test metadata extraction + 'skip_download': True, + }, + }, { + # make sure we can extract an uploader name that's not a link + 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', + 'info_dict': { + 'id': '606899', + 'ext': 'mp4', + 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', + 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', + 'uploader': 'Geneon Entertainment', + 'upload_date': '20120717', + }, + 'params': { + # just test metadata extraction + 'skip_download': True, + }, + 'skip': 'Video gone', + }, { + # A video with a vastly different season name compared to the series name + 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', + 'info_dict': { + 'id': '590532', + 'ext': 'mp4', + 'title': compat_str, + 'description': compat_str, + 'uploader': 'TV TOKYO', + 'timestamp': 1330956000, + 'upload_date': '20120305', + 'series': 'Nyarko-san: Another Crawling Chaos', + 'season': 'Haiyoru! Nyaruani (ONA)', + }, + 'params': { + # Just test metadata extraction + 'skip_download': True, + }, + }, { + 'url': 'http://www.crunchyroll.com/media-723735', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', + 'only_matching': True, + }] + + _FORMAT_IDS = { + '360': ('60', '106'), + '480': ('61', '106'), + '720': ('62', '106'), + '1080': ('80', '108'), + } + + def _download_webpage(self, url_or_request, *args, **kwargs): + request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) + else sanitized_Request(url_or_request)) + # Accept-Language must be set explicitly to accept any language to avoid issues + # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. + # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction + # should be imposed or not (from what I can see it just takes the first language + # ignoring the priority and requires it to correspond to the IP). This also causes + # Crunchyroll to not work in georestriction cases in some browsers that don't place + # the locale lang first in the header. However, allowing any language seems to work around the issue. 
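+ # 'Accept-Language: *' is the standard wildcard (it matches any language), so no + # specific locale is ever placed first in the header.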
+ request.add_header('Accept-Language', '*') + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) + + def _decrypt_subtitles(self, data, iv, id): + data = bytes_to_intlist(compat_b64decode(data)) + iv = bytes_to_intlist(compat_b64decode(iv)) + id = int(id) + + def obfuscate_key_aux(count, modulo, start): + output = list(start) + for _ in range(count): + output.append(output[-1] + output[-2]) + # cut off start values + output = output[2:] + output = list(map(lambda x: x % modulo + 33, output)) + return output + + def obfuscate_key(key): + num1 = int(floor(pow(2, 25) * sqrt(6.9))) + num2 = (num1 ^ key) << 5 + num3 = key ^ num1 + num4 = num3 ^ (num3 >> 3) ^ num2 + prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) + shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) + # Extend 160 Bit hash to 256 Bit + return shaHash + [0] * 12 + + key = obfuscate_key(id) + + decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) + return zlib.decompress(decrypted_data) + + def _convert_subtitles_to_srt(self, sub_root): + output = '' + + for i, event in enumerate(sub_root.findall('./events/event'), 1): + start = event.attrib['start'].replace('.', ',') + end = event.attrib['end'].replace('.', ',') + text = event.attrib['text'].replace('\\N', '\n') + output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) + return output + + def _convert_subtitles_to_ass(self, sub_root): + output = '' + + def ass_bool(strvalue): + assvalue = '0' + if strvalue == '1': + assvalue = '-1' + return assvalue + + output = '[Script Info]\n' + output += 'Title: %s\n' % sub_root.attrib['title'] + output += 'ScriptType: v4.00+\n' + output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] + output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] + output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] + output += """ +[V4+ Styles] +Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding +""" + for style in sub_root.findall('./styles/style'): + output += 'Style: ' + style.attrib['name'] + output += ',' + style.attrib['font_name'] + output += ',' + style.attrib['font_size'] + output += ',' + style.attrib['primary_colour'] + output += ',' + style.attrib['secondary_colour'] + output += ',' + style.attrib['outline_colour'] + output += ',' + style.attrib['back_colour'] + output += ',' + ass_bool(style.attrib['bold']) + output += ',' + ass_bool(style.attrib['italic']) + output += ',' + ass_bool(style.attrib['underline']) + output += ',' + ass_bool(style.attrib['strikeout']) + output += ',' + style.attrib['scale_x'] + output += ',' + style.attrib['scale_y'] + output += ',' + style.attrib['spacing'] + output += ',' + style.attrib['angle'] + output += ',' + style.attrib['border_style'] + output += ',' + style.attrib['outline'] + output += ',' + style.attrib['shadow'] + output += ',' + style.attrib['alignment'] + output += ',' + style.attrib['margin_l'] + output += ',' + style.attrib['margin_r'] + output += ',' + style.attrib['margin_v'] + output += ',' + style.attrib['encoding'] + output += '\n' + + output += """ +[Events] +Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text +""" + for event in sub_root.findall('./events/event'): + output += 'Dialogue: 0' + output += ',' + event.attrib['start'] + output += ',' + event.attrib['end'] + output += ',' + 
event.attrib['style'] + output += ',' + event.attrib['name'] + output += ',' + event.attrib['margin_l'] + output += ',' + event.attrib['margin_r'] + output += ',' + event.attrib['margin_v'] + output += ',' + event.attrib['effect'] + output += ',' + event.attrib['text'] + output += '\n' + + return output + + def _extract_subtitles(self, subtitle): + sub_root = compat_etree_fromstring(subtitle) + return [{ + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }] + + def _get_subtitles(self, video_id, webpage): + subtitles = {} + for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): + sub_doc = self._call_rpc_api( + 'Subtitle_GetXml', video_id, + 'Downloading subtitles for ' + sub_name, data={ + 'subtitle_script_id': sub_id, + }) + if not isinstance(sub_doc, compat_etree_Element): + continue + sid = sub_doc.get('id') + iv = xpath_text(sub_doc, 'iv', 'subtitle iv') + data = xpath_text(sub_doc, 'data', 'subtitle data') + if not sid or not iv or not data: + continue + subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8') + lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) + if not lang_code: + continue + subtitles[lang_code] = self._extract_subtitles(subtitle) + return subtitles + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + if mobj.group('prefix') == 'm': + mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') + webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url') + else: + webpage_url = 'http://www.' + mobj.group('url') + + webpage = self._download_webpage( + self._add_skip_wall(webpage_url), video_id, + headers=self.geo_verification_headers()) + note_m = self._html_search_regex( + r'<div class="showmedia-trailer-notice">(.+?)</div>', + webpage, 'trailer-notice', default='') + if note_m: + raise ExtractorError(note_m, expected=True) + + mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage) + if mobj: + msg = json.loads(mobj.group('msg')) + if msg.get('type') == 'error': + raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) + + if 'To view this, please log in to verify you are 18 or older.' in webpage: + self.raise_login_required() + + media = self._parse_json(self._search_regex( + r'vilos\.config\.media\s*=\s*({.+?});', + webpage, 'vilos media', default='{}'), video_id) + media_metadata = media.get('metadata') or {} + + language = self._search_regex( + r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1', + webpage, 'language', default=None, group='lang') + + video_title = self._html_search_regex( + (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>', + r'<title>(.+?),\s+-\s+.+? 
Crunchyroll'), + webpage, 'video_title', default=None) + if not video_title: + video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) + video_title = re.sub(r' {2,}', ' ', video_title) + video_description = (self._parse_json(self._html_search_regex( + r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, + webpage, 'description', default='{}'), video_id) or media_metadata).get('description') + + thumbnails = [] + thumbnail_url = (self._parse_json(self._html_search_regex( + r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>', + webpage, 'thumbnail_url', default='{}'), video_id)).get('image') + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'width': 1920, + 'height': 1080 + }) + + if video_description: + video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) + video_uploader = self._html_search_regex( + # try looking for both an uploader that's a link and one that's not + [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], + webpage, 'video_uploader', default=False) + + formats = [] + for stream in media.get('streams', []): + audio_lang = stream.get('audio_lang') + hardsub_lang = stream.get('hardsub_lang') + vrv_formats = self._extract_vrv_formats( + stream.get('url'), video_id, stream.get('format'), + audio_lang, hardsub_lang) + for f in vrv_formats: + f['language_preference'] = 1 if audio_lang == language else 0 + f['quality'] = ( + 1 if not hardsub_lang + else 0 if hardsub_lang == language + else -1) + formats.extend(vrv_formats) + if not formats: + available_fmts = [] + for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break + if not available_fmts: + available_fmts = self._FORMAT_IDS.keys() + video_encode_ids = [] + + for fmt in available_fmts: + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt + 'p' + stream_infos = [] + streamdata = self._call_rpc_api( + 'VideoPlayer_GetStandardConfig', video_id, + 'Downloading media info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_quality': stream_quality, + 'current_page': url, + }) + if isinstance(streamdata, compat_etree_Element): + stream_info = streamdata.find('./{default}preload/stream_info') + if stream_info is not None: + stream_infos.append(stream_info) + stream_info = self._call_rpc_api( + 'VideoEncode_GetStreamInfo', video_id, + 'Downloading stream info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_encode_quality': stream_quality, + }) + if isinstance(stream_info, compat_etree_Element): + stream_infos.append(stream_info) + for stream_info in stream_infos: + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) + + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + + video_url = 
xpath_text(stream_info, './host') + if not video_url: + continue + metadata = stream_info.find('./metadata') + format_info = { + 'format': video_format, + 'height': int_or_none(xpath_text(metadata, './height')), + 'width': int_or_none(xpath_text(metadata, './width')), + } + + if '.fplive.net/' in video_url: + video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) + parsed_video_url = compat_urlparse.urlparse(video_url) + direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( + netloc='v.lvlt.crcdn.net', + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) + if self._is_valid_url(direct_video_url, video_id, video_format): + format_info.update({ + 'format_id': 'http-' + video_format, + 'url': direct_video_url, + }) + formats.append(format_info) + continue + + format_info.update({ + 'format_id': 'rtmp-' + video_format, + 'url': video_url, + 'play_path': video_file, + 'ext': 'flv', + }) + formats.append(format_info) + self._sort_formats(formats) + + metadata = self._call_rpc_api( + 'VideoPlayer_GetMediaMetadata', video_id, + note='Downloading media info', data={ + 'media_id': video_id, + }) + + subtitles = {} + for subtitle in media.get('subtitles', []): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + if not subtitles: + subtitles = self.extract_subtitles(video_id, webpage) + + # webpage provide more accurate data than series_title from XML + series = self._html_search_regex( + r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', + webpage, 'series', fatal=False) + + season = episode = episode_number = duration = None + + if isinstance(metadata, compat_etree_Element): + season = xpath_text(metadata, 'series_title') + episode = xpath_text(metadata, 'episode_title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + duration = float_or_none(media_metadata.get('duration'), 1000) + + if not episode: + episode = media_metadata.get('title') + if not episode_number: + episode_number = int_or_none(media_metadata.get('episode_number')) + thumbnail_url = try_get(media, lambda x: x['thumbnail']['url']) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'width': 640, + 'height': 360 + }) + + season_number = int_or_none(self._search_regex( + r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', + webpage, 'season number', default=None)) + + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts({ + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'duration': duration, + 'thumbnails': thumbnails, + 'uploader': video_uploader, + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'subtitles': subtitles, + 'formats': formats, + }, info) + + +class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' + + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', + 'info_dict': { + 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', + 'title': 'A Bridge to the Starry Skies 
- Hoshizora e Kakaru Hashi' + }, + 'playlist_count': 13, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', + 'info_dict': { + 'id': 'cosplay-complex-ova', + 'title': 'Cosplay Complex OVA' + }, + 'playlist_count': 3, + 'skip': 'Georestricted', + }, { + # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 + 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', + 'only_matching': True, + }, { + 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers', + 'only_matching': True, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + + webpage = self._download_webpage( + # https:// gives a 403, but http:// does not + self._add_skip_wall(url).replace('https://', 'http://'), show_id, + headers=self.geo_verification_headers()) + title = self._html_search_meta('name', webpage, default=None) + + episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' + season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)' + paths = re.findall(f'(?s){episode_re}|{season_re}', webpage) + + entries, current_season = [], None + for ep_id, ep, season in paths: + if season: + current_season = season + continue + entries.append(self.url_result( + f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season)) + + return { + '_type': 'playlist', + 'id': show_id, + 'title': title, + 'entries': reversed(entries), + } + + +class CrunchyrollBetaIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:beta' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _TESTS = [{ + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'info_dict': { + 'id': '696363', + 'ext': 'mp4', + 'timestamp': 1459610100, + 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', + 'uploader': 'Toei Animation', + 'title': 'World Trigger Episode 73 – To the Future', + 'upload_date': '20160402', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }] + + def _real_extract(self, url): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') + webpage = self._download_webpage(url, display_id) + episode_data = self._parse_json( + self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), + display_id)['content']['byId'][internal_id] + video_id = episode_data['external_id'].split('.')[1] + series_id = episode_data['episode_metadata']['series_slug_title'] + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', + CrunchyrollIE.ie_key(), video_id) + + +class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist:beta' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _TESTS = [{ + 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', + 'info_dict': { + 'id': 'girl-friend-beta', + 'title': 'Girl Friend BETA', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'only_matching': True, + }] + + def _real_extract(self, url): + lang, series_id = self._match_valid_url(url).group('lang', 'id') + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', + CrunchyrollShowPlaylistIE.ie_key(), series_id) diff --git a/youtube_dl/extractor/cspan.py 
b/yt_dlp/extractor/cspan.py index 2e01aff48..2e01aff48 100644 --- a/youtube_dl/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py diff --git a/youtube_dl/extractor/ctsnews.py b/yt_dlp/extractor/ctsnews.py index 679f1d92e..679f1d92e 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/yt_dlp/extractor/ctsnews.py diff --git a/youtube_dl/extractor/ctv.py b/yt_dlp/extractor/ctv.py index 756bcc2be..756bcc2be 100644 --- a/youtube_dl/extractor/ctv.py +++ b/yt_dlp/extractor/ctv.py diff --git a/youtube_dl/extractor/ctvnews.py b/yt_dlp/extractor/ctvnews.py index 03f8cefb7..03f8cefb7 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/yt_dlp/extractor/ctvnews.py diff --git a/yt_dlp/extractor/cultureunplugged.py b/yt_dlp/extractor/cultureunplugged.py new file mode 100644 index 000000000..9002e4cef --- /dev/null +++ b/yt_dlp/extractor/cultureunplugged.py @@ -0,0 +1,69 @@ +from __future__ import unicode_literals + +import time + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + HEADRequest, +) + + +class CultureUnpluggedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?' + _TESTS = [{ + 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West', + 'md5': 'ac6c093b089f7d05e79934dcb3d228fc', + 'info_dict': { + 'id': '53662', + 'display_id': 'The-Next--Best-West', + 'ext': 'mp4', + 'title': 'The Next, Best West', + 'description': 'md5:0423cd00833dea1519cf014e9d0903b1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'Coldstream Creative', + 'duration': 2203, + 'view_count': int, + } + }, { + 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + # Request setClientTimezone.php to get the PHPSESSID cookie, which is needed to get valid JSON data in the next request + self._request_webpage(HEADRequest( + 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) + movie_data = self._download_json( + 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) + + video_url = movie_data['url'] + title = movie_data['title'] + + description = movie_data.get('synopsis') + creator = movie_data.get('producer') + duration = int_or_none(movie_data.get('duration')) + view_count = int_or_none(movie_data.get('views')) + + thumbnails = [{ + 'url': movie_data['%s_thumb' % size], + 'id': size, + 'preference': preference, + } for preference, size in enumerate(( + 'small', 'large')) if movie_data.get('%s_thumb' % size)] + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'creator': creator, + 'duration': duration, + 'view_count': view_count, + 'thumbnails': thumbnails, + } diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py new file mode 100644 index 000000000..034a5c92a --- /dev/null +++ b/yt_dlp/extractor/curiositystream.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urlencode_postdata, + compat_str, + ExtractorError, +) + + +class CuriosityStreamBaseIE(InfoExtractor): + _NETRC_MACHINE = 'curiositystream' + _auth_token = None + _API_BASE_URL = 
'https://api.curiositystream.com/v1/' + + def _handle_errors(self, result): + error = result.get('error', {}).get('message') + if error: + if isinstance(error, dict): + error = ', '.join(error.values()) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + + def _call_api(self, path, video_id, query=None): + headers = {} + if self._auth_token: + headers['X-Auth-Token'] = self._auth_token + result = self._download_json( + self._API_BASE_URL + path, video_id, headers=headers, query=query) + self._handle_errors(result) + return result['data'] + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + result = self._download_json( + self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'email': email, + 'password': password, + })) + self._handle_errors(result) + self._auth_token = result['message']['auth_token'] + + +class CuriosityStreamIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/video/2', + 'info_dict': { + 'id': '2', + 'ext': 'mp4', + 'title': 'How Did You Develop The Internet?', + 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = [] + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: + continue + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) + self._sort_formats(formats) + + title = media['title'] + + subtitles = {} + for closed_caption in media.get('closed_captions', []): + sub_url = closed_caption.get('file') + if not sub_url: + continue + lang = closed_caption.get('code') or closed_caption.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 
'description': media.get('description'), + 'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'), + 'duration': int_or_none(media.get('duration')), + 'tags': media.get('tags'), + 'subtitles': subtitles, + } + + +class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream:collection' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' + _TESTS = [{ + 'url': 'https://curiositystream.com/collections/86', + 'info_dict': { + 'id': '86', + 'title': 'Staff Picks', + 'description': 'Wondering where to start? Here are a few of our favorite series and films... from our couch to yours.', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://app.curiositystream.com/collection/2', + 'info_dict': { + 'id': '2', + 'title': 'Curious Minds: The Internet', + 'description': 'How is the internet shaping our lives in the 21st Century?', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://curiositystream.com/series/2', + 'only_matching': True, + }, { + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) diff --git a/youtube_dl/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py index 73382431b..73382431b 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/yt_dlp/extractor/cwtv.py diff --git a/youtube_dl/extractor/dailymail.py b/yt_dlp/extractor/dailymail.py index 67b88fd56..67b88fd56 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/yt_dlp/extractor/dailymail.py diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py new file mode 100644 index 000000000..e04e10b86 --- /dev/null +++ b/yt_dlp/extractor/dailymotion.py @@ -0,0 +1,393 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + age_restricted, + clean_html, + ExtractorError, + int_or_none, + OnDemandPagedList, + try_get, + unescapeHTML, + urlencode_postdata, +) + + +class DailymotionBaseInfoExtractor(InfoExtractor): + _FAMILY_FILTER = None + _HEADERS = { + 'Content-Type': 'application/json', + 'Origin': 'https://www.dailymotion.com', + } + _NETRC_MACHINE = 'dailymotion' + + def _get_dailymotion_cookies(self): + return self._get_cookies('https://www.dailymotion.com/') + + @staticmethod + def _get_cookie_value(cookies, name): + cookie = cookies.get(name) + if cookie: + return cookie.value + + def _set_dailymotion_cookie(self, name, value): + self._set_cookie('www.dailymotion.com', name, value) + + def _real_initialize(self): + cookies = self._get_dailymotion_cookies() + ff = self._get_cookie_value(cookies, 'ff') + self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self.get_param('age_limit')) + self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off') + + def 
_call_api(self, object_type, xid, object_fields, note, filter_extra=None): + if not self._HEADERS.get('Authorization'): + cookies = self._get_dailymotion_cookies() + token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') + if not token: + data = { + 'client_id': 'f1a362d288c1b98099c7', + 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', + } + username, password = self._get_login_info() + if username: + data.update({ + 'grant_type': 'password', + 'password': password, + 'username': username, + }) + else: + data['grant_type'] = 'client_credentials' + try: + token = self._download_json( + 'https://graphql.api.dailymotion.com/oauth/token', + None, 'Downloading Access Token', + data=urlencode_postdata(data))['access_token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), xid)['error_description'], expected=True) + raise + self._set_dailymotion_cookie('access_token' if username else 'client_token', token) + self._HEADERS['Authorization'] = 'Bearer ' + token + + resp = self._download_json( + 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({ + 'query': '''{ + %s(xid: "%s"%s) { + %s + } +}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields), + }).encode(), headers=self._HEADERS) + obj = resp['data'][object_type] + if not obj: + raise ExtractorError(resp['errors'][0]['message'], expected=True) + return obj + + +class DailymotionIE(DailymotionBaseInfoExtractor): + _VALID_URL = r'''(?ix) + https?:// + (?: + (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| + (?:www\.)?lequipe\.fr/video + ) + /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? + ''' + IE_NAME = 'dailymotion' + _TESTS = [{ + 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', + 'md5': '074b95bdee76b9e3654137aee9c79dfe', + 'info_dict': { + 'id': 'x5kesuj', + 'ext': 'mp4', + 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. 
Miller', + 'duration': 187, + 'timestamp': 1493651285, + 'upload_date': '20170501', + 'uploader': 'Deadline', + 'uploader_id': 'x1xm8ri', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', + 'md5': '2137c41a8e78554bb09225b8eb322406', + 'info_dict': { + 'id': 'x2iuewm', + 'ext': 'mp4', + 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'description': 'Several come bundled with the Steam Controller.', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, + }, + 'skip': 'video gone', + }, { + # Vevo video + 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + 'info_dict': { + 'title': 'Roar (Official)', + 'id': 'USUV71301934', + 'ext': 'mp4', + 'uploader': 'Katy Perry', + 'upload_date': '20130905', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'VEVO is only available in some countries', + }, { + # age-restricted video + 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + 'md5': '0d667a7b9cebecc3c89ee93099c4159d', + 'info_dict': { + 'id': 'xyh2zz', + 'ext': 'mp4', + 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + 'uploader': 'HotWaves1012', + 'age_limit': 18, + }, + 'skip': 'video gone', + }, { + # geo-restricted, player v5 + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, + }, { + # with subtitles + 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', + 'only_matching': True, + }, { + 'url': 'https://www.lequipe.fr/video/x791mem', + 'only_matching': True, + }, { + 'url': 'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2', + 'only_matching': True, + }, { + 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw', + 'only_matching': True, + }] + _GEO_BYPASS = False + _COMMON_MEDIA_FIELDS = '''description + geoblockedCountries { + allowed + } + xid''' + + @staticmethod + def _extract_urls(webpage): + urls = [] + # Look for embedded Dailymotion player + # https://developer.dailymotion.com/player#player-parameters + for mobj in re.finditer( + r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage): + urls.append(unescapeHTML(mobj.group('url'))) + for mobj in re.finditer( + r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): + urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id')) + return urls + + def _real_extract(self, url): + video_id, playlist_id = self._match_valid_url(url).groups() + + if playlist_id: + if not self.get_param('noplaylist'): + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + return self.url_result( + 'http://www.dailymotion.com/playlist/' + playlist_id, + 'DailymotionPlaylist', playlist_id) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + password = 
self.get_param('videopassword') + media = self._call_api( + 'media', video_id, '''... on Video { + %s + stats { + likes { + total + } + views { + total + } + } + } + ... on Live { + %s + audienceCount + isOnAir + }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata', + 'password: "%s"' % self.get_param('videopassword') if password else None) + xid = media['xid'] + + metadata = self._download_json( + 'https://www.dailymotion.com/player/metadata/video/' + xid, + xid, 'Downloading metadata JSON', + query={'app': 'com.dailymotion.neon'}) + + error = metadata.get('error') + if error: + title = error.get('title') or error['raw_message'] + # See https://developer.dailymotion.com/api#access-error + if error.get('code') == 'DM007': + allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list) + self.raise_geo_restricted(msg=title, countries=allowed_countries) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, title), expected=True) + + title = metadata['title'] + is_live = media.get('isOnAir') + formats = [] + for quality, media_list in metadata['qualities'].items(): + for m in media_list: + media_url = m.get('url') + media_type = m.get('type') + if not media_url or media_type == 'application/vnd.lumberjack.manifest': + continue + if media_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + f = { + 'url': media_url, + 'format_id': 'http-' + quality, + } + m = re.search(r'/H264-(\d+)x(\d+)(?:-(60)/)?', media_url) + if m: + width, height, fps = map(int_or_none, m.groups()) + f.update({ + 'fps': fps, + 'height': height, + 'width': width, + }) + formats.append(f) + for f in formats: + f['url'] = f['url'].split('#')[0] + if not f.get('fps') and f['format_id'].endswith('@60'): + f['fps'] = 60 + self._sort_formats(formats) + + subtitles = {} + subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} + for subtitle_lang, subtitle in subtitles_data.items(): + subtitles[subtitle_lang] = [{ + 'url': subtitle_url, + } for subtitle_url in subtitle.get('urls', [])] + + thumbnails = [] + for height, poster_url in metadata.get('posters', {}).items(): + thumbnails.append({ + 'height': int_or_none(height), + 'id': height, + 'url': poster_url, + }) + + owner = metadata.get('owner') or {} + stats = media.get('stats') or {} + get_count = lambda x: int_or_none(try_get(stats, lambda y: y[x + 's']['total'])) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': clean_html(media.get('description')), + 'thumbnails': thumbnails, + 'duration': int_or_none(metadata.get('duration')) or None, + 'timestamp': int_or_none(metadata.get('created_time')), + 'uploader': owner.get('screenname'), + 'uploader_id': owner.get('id') or metadata.get('screenname'), + 'age_limit': 18 if metadata.get('explicit') else 0, + 'tags': metadata.get('tags'), + 'view_count': get_count('view') or int_or_none(media.get('audienceCount')), + 'like_count': get_count('like'), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } + + +class DailymotionPlaylistBaseIE(DailymotionBaseInfoExtractor): + _PAGE_SIZE = 100 + + def _fetch_page(self, playlist_id, page): + page += 1 + videos = self._call_api( + self._OBJECT_TYPE, playlist_id, + '''videos(allowExplicit: %s, first: %d, page: %d) { + edges { + node { + xid + url + } + } + }''' % ('false' if 
self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page), + 'Downloading page %d' % page)['videos'] + for edge in videos['edges']: + node = edge['node'] + yield self.url_result( + node['url'], DailymotionIE.ie_key(), node['xid']) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, playlist_id), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id) + + +class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): + IE_NAME = 'dailymotion:playlist' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', + 'info_dict': { + 'id': 'xv4bw', + }, + 'playlist_mincount': 20, + }] + _OBJECT_TYPE = 'collection' + + +class DailymotionUserIE(DailymotionPlaylistBaseIE): + IE_NAME = 'dailymotion:user' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + }, + 'playlist_mincount': 152, + }, { + 'url': 'http://www.dailymotion.com/user/UnderProject', + 'info_dict': { + 'id': 'UnderProject', + }, + 'playlist_mincount': 1000, + 'skip': 'Takes too long', + }, { + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + }, + 'playlist_mincount': 148, + 'params': { + 'age_limit': 0, + }, + }] + _OBJECT_TYPE = 'channel' diff --git a/yt_dlp/extractor/damtomo.py b/yt_dlp/extractor/damtomo.py new file mode 100644 index 000000000..456cd35a4 --- /dev/null +++ b/yt_dlp/extractor/damtomo.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError, clean_html, int_or_none, try_get, unified_strdate +from ..compat import compat_str + + +class DamtomoBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, handle = self._download_webpage_handle(self._WEBPAGE_URL_TMPL % video_id, video_id, encoding='sjis') + + if handle.url == 'https://www.clubdam.com/sorry/': + raise ExtractorError('You are rate-limited. Try again later.', expected=True) + if '<h2>予期せぬエラーが発生しました。</h2>' in webpage: + raise ExtractorError('There was a server-side error. 
Try again later.', expected=True) + + description = self._search_regex(r'(?m)<div id="public_comment">\s*<p>\s*([^<]*?)\s*</p>', webpage, 'description', default=None) + uploader_id = self._search_regex(r'<a href="https://www\.clubdam\.com/app/damtomo/member/info/Profile\.do\?damtomoId=([^"]+)"', webpage, 'uploader_id', default=None) + + data_dict = { + mobj.group('class'): re.sub(r'\s+', ' ', clean_html(mobj.group('value'))) + for mobj in re.finditer(r'(?s)<(p|div)\s+class="(?P<class>[^" ]+?)">(?P<value>.+?)</\1>', webpage)} + + # since videos do not have title, give the name of song instead + data_dict['user_name'] = re.sub(r'\s*さん\s*$', '', data_dict['user_name']) + title = data_dict.get('song_title') + + stream_tree = self._download_xml( + self._DKML_XML_URL % video_id, video_id, note='Requesting stream information', encoding='sjis', + # doing this has no problem since there is no character outside ASCII, + # and never likely to happen in the future + transform_source=lambda x: re.sub(r'\s*encoding="[^"]+?"', '', x)) + m3u8_url = try_get(stream_tree, lambda x: x.find( + './/d:streamingUrl', {'d': self._DKML_XML_NS}).text.strip(), compat_str) + if not m3u8_url: + raise ExtractorError('Failed to obtain m3u8 URL') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'uploader_id': uploader_id, + 'description': description, + 'uploader': data_dict.get('user_name'), + 'upload_date': unified_strdate(self._search_regex(r'(\d{4}/\d{2}/\d{2})', data_dict.get('date'), 'upload_date', default=None)), + 'view_count': int_or_none(self._search_regex(r'(\d+)', data_dict['audience'], 'view_count', default=None)), + 'like_count': int_or_none(self._search_regex(r'(\d+)', data_dict['nice'], 'like_count', default=None)), + 'track': title, + 'artist': data_dict.get('song_artist'), + 'formats': formats, + } + + +class DamtomoVideoIE(DamtomoBaseIE): + IE_NAME = 'damtomo:video' + _VALID_URL = r'https?://(?:www\.)?clubdam\.com/app/damtomo/(?:SP/)?karaokeMovie/StreamingDkm\.do\?karaokeMovieId=(?P<id>\d+)' + _WEBPAGE_URL_TMPL = 'https://www.clubdam.com/app/damtomo/karaokeMovie/StreamingDkm.do?karaokeMovieId=%s' + _DKML_XML_URL = 'https://www.clubdam.com/app/damtomo/karaokeMovie/GetStreamingDkmUrlXML.do?movieSelectFlg=2&karaokeMovieId=%s' + _DKML_XML_NS = 'https://www.clubdam.com/app/damtomo/karaokeMovie/GetStreamingDkmUrlXML' + _TESTS = [{ + 'url': 'https://www.clubdam.com/app/damtomo/karaokeMovie/StreamingDkm.do?karaokeMovieId=2414316', + 'info_dict': { + 'id': '2414316', + 'title': 'Get Wild', + 'uploader': 'Kドロン', + 'uploader_id': 'ODk5NTQwMzQ', + 'track': 'Get Wild', + 'artist': 'TM NETWORK(TMN)', + 'upload_date': '20201226', + } + }] + + +class DamtomoRecordIE(DamtomoBaseIE): + IE_NAME = 'damtomo:record' + _VALID_URL = r'https?://(?:www\.)?clubdam\.com/app/damtomo/(?:SP/)?karaokePost/StreamingKrk\.do\?karaokeContributeId=(?P<id>\d+)' + _WEBPAGE_URL_TMPL = 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=%s' + _DKML_XML_URL = 'https://www.clubdam.com/app/damtomo/karaokePost/GetStreamingKrkUrlXML.do?karaokeContributeId=%s' + _DKML_XML_NS = 'https://www.clubdam.com/app/damtomo/karaokePost/GetStreamingKrkUrlXML' + _TESTS = [{ + 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27376862', + 'info_dict': { + 'id': '27376862', + 'title': 'イカSUMMER [良音]', + 'description': None, + 'uploader': 'NANA', + 'uploader_id': 'MzAyMDExNTY', + 
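
The transform_source above exists because ElementTree refuses already-decoded strings that still carry an XML encoding declaration. A small standalone sketch of the same fix:

    import re
    import xml.etree.ElementTree as ET

    def parse_decoded_xml(text):
        # ET.fromstring(text) would raise ValueError here, since the string
        # was already decoded from Shift_JIS but still declares an encoding
        return ET.fromstring(re.sub(r'\s*encoding="[^"]+?"', '', text, count=1))

    tree = parse_decoded_xml('<?xml version="1.0" encoding="Shift_JIS"?><root><a>1</a></root>')
    assert tree.find('a').text == '1'
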
'upload_date': '20210721', + 'view_count': 4, + 'like_count': 1, + 'track': 'イカSUMMER [良音]', + 'artist': 'ORANGE RANGE', + } + }, { + 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27489418', + 'info_dict': { + 'id': '27489418', + 'title': '心みだれて〜say it with flowers〜(生音)', + 'uploader_id': 'NjI1MjI2MjU', + 'description': 'やっぱりキーを下げて正解だった感じ。リベンジ成功ということで。', + 'uploader': '箱の「中の人」', + 'upload_date': '20210815', + 'view_count': 5, + 'like_count': 3, + 'track': '心みだれて〜say it with flowers〜(生音)', + 'artist': '小林明子', + } + }] diff --git a/yt_dlp/extractor/daum.py b/yt_dlp/extractor/daum.py new file mode 100644 index 000000000..8aa2af9a8 --- /dev/null +++ b/yt_dlp/extractor/daum.py @@ -0,0 +1,265 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, +) +from ..utils import parse_qs + + +class DaumBaseIE(InfoExtractor): + _KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/' + + +class DaumIE(DaumBaseIE): + _VALID_URL = r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P<id>[^?#&]+)' + IE_NAME = 'daum.net' + + _TESTS = [{ + 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', + 'info_dict': { + 'id': 'vab4dyeDBysyBssyukBUjBz', + 'ext': 'mp4', + 'title': '마크 헌트 vs 안토니오 실바', + 'description': 'Mark Hunt vs Antonio Silva', + 'upload_date': '20131217', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'duration': 2117, + 'view_count': int, + 'comment_count': int, + 'uploader_id': 186139, + 'uploader': '콘간지', + 'timestamp': 1387310323, + }, + }, { + 'url': 'http://m.tvpot.daum.net/v/65139429', + 'info_dict': { + 'id': '65139429', + 'ext': 'mp4', + 'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', + 'description': 'md5:79794514261164ff27e36a21ad229fc5', + 'upload_date': '20150118', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'duration': 154, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MBC 예능', + 'uploader_id': 132251, + 'timestamp': 1421604228, + }, + }, { + 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', + 'only_matching': True, + }, { + 'url': 'http://videofarm.daum.net/controller/player/VodPlayer.swf?vid=vwIpVpCQsT8%24&ref=', + 'info_dict': { + 'id': 'vwIpVpCQsT8$', + 'ext': 'flv', + 'title': '01-Korean War ( Trouble on the horizon )', + 'description': 'Korean War 01\r\nTrouble on the horizon\r\n전쟁의 먹구름', + 'upload_date': '20080223', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'duration': 249, + 'view_count': int, + 'comment_count': int, + 'uploader': '까칠한 墮落始祖 황비홍님의', + 'uploader_id': 560824, + 'timestamp': 1203770745, + }, + }, { + # Requires dte_type=WEB (#9972) + 'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': 's3794Uf1NZeZ1qMpGpeqeRU', + 'ext': 'mp4', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'upload_date': '20170129', + 'uploader': '쇼! 
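
Daum pages are served by Kakao's player, so the _real_extract below only normalises the id and hands off via url_result. A sketch of that normalisation, assuming only the two id shapes seen in the tests:

    from urllib.parse import unquote

    KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/'

    def kakao_embed_url(raw_id):
        video_id = unquote(raw_id)    # e.g. 'vwIpVpCQsT8%24' -> 'vwIpVpCQsT8$'
        if not video_id.isdigit():    # legacy alphanumeric vids live under '@my'
            video_id += '@my'
        return KAKAO_EMBED_BASE + video_id

    assert kakao_embed_url('65139429') == KAKAO_EMBED_BASE + '65139429'
    assert kakao_embed_url('vwIpVpCQsT8%24') == KAKAO_EMBED_BASE + 'vwIpVpCQsT8$@my'
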
음악중심', + 'uploader_id': 2653210, + 'timestamp': 1485684628, + }, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(self._match_id(url)) + if not video_id.isdigit(): + video_id += '@my' + return self.url_result( + self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) + + +class DaumClipIE(DaumBaseIE): + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P<id>\d+)' + IE_NAME = 'daum.net:clip' + _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s' + + _TESTS = [{ + 'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', + 'info_dict': { + 'id': '52554690', + 'ext': 'mp4', + 'title': 'DOTA 2GETHER 시즌2 6회 - 2부', + 'description': 'DOTA 2GETHER 시즌2 6회 - 2부', + 'upload_date': '20130831', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'duration': 3868, + 'view_count': int, + 'uploader': 'GOMeXP', + 'uploader_id': 6667, + 'timestamp': 1377911092, + }, + }, { + 'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super(DaumClipIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) + + +class DaumListIE(InfoExtractor): + def _get_entries(self, list_id, list_id_type): + name = None + entries = [] + for pagenum in itertools.count(1): + list_info = self._download_json( + 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page=%d&%s=%s' % ( + pagenum, list_id_type, list_id), list_id, 'Downloading list info - %s' % pagenum) + + entries.extend([ + self.url_result( + 'http://tvpot.daum.net/v/%s' % clip['vid']) + for clip in list_info['clip_list'] + ]) + + if not name: + name = list_info.get('playlist_bean', {}).get('name') or \ + list_info.get('potInfo', {}).get('name') + + if not list_info.get('has_more'): + break + + return name, entries + + def _check_clip(self, url, list_id): + query_dict = parse_qs(url) + if 'clipid' in query_dict: + clip_id = query_dict['clipid'][0] + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % clip_id) + return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id) + + +class DaumPlaylistIE(DaumListIE): + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View\.do|Top\.tv)\?.*?playlistid=(?P<id>[0-9]+)' + IE_NAME = 'daum.net:playlist' + _URL_TEMPLATE = 'http://tvpot.daum.net/mypot/View.do?playlistid=%s' + + _TESTS = [{ + 'note': 'Playlist url with clipid', + 'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', + 'info_dict': { + 'id': '6213966', + 'title': 'Woorissica Official', + }, + 'playlist_mincount': 181 + }, { + 'note': 'Playlist url with clipid - noplaylist', + 'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', + 'info_dict': { + 'id': '73806844', + 'ext': 'mp4', + 'title': '151017 Airport', + 'upload_date': '20160117', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + } + }] + + @classmethod + def suitable(cls, url): + return False if DaumUserIE.suitable(url) else super(DaumPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + list_id = self._match_id(url) + + clip_result = self._check_clip(url, 
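
_get_entries above pages through GetClipInfo.do until the API stops reporting has_more. The same loop, compacted into a standalone sketch with fetch(page) as an assumed stand-in for the JSON download:

    import itertools

    def collect_clip_urls(fetch):
        urls = []
        for pagenum in itertools.count(1):
            list_info = fetch(pagenum)
            urls.extend(
                'http://tvpot.daum.net/v/%s' % clip['vid']
                for clip in list_info['clip_list'])
            if not list_info.get('has_more'):
                break
        return urls
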
list_id) + if clip_result: + return clip_result + + name, entries = self._get_entries(list_id, 'playlistid') + + return self.playlist_result(entries, list_id, name) + + +class DaumUserIE(DaumListIE): + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View|Top)\.(?:do|tv)\?.*?ownerid=(?P<id>[0-9a-zA-Z]+)' + IE_NAME = 'daum.net:user' + + _TESTS = [{ + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0', + 'info_dict': { + 'id': 'o2scDLIVbHc0', + 'title': '마이 리틀 텔레비전', + }, + 'playlist_mincount': 213 + }, { + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&clipid=73801156', + 'info_dict': { + 'id': '73801156', + 'ext': 'mp4', + 'title': '[미공개] 김구라, 오만석이 부릅니다 \'오케피\' - 마이 리틀 텔레비전 20160116', + 'upload_date': '20160117', + 'description': 'md5:5e91d2d6747f53575badd24bd62b9f36' + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + } + }, { + 'note': 'Playlist url has ownerid and playlistid, playlistid takes precedence', + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&playlistid=6196631', + 'info_dict': { + 'id': '6196631', + 'title': '마이 리틀 텔레비전 - 20160109', + }, + 'playlist_count': 11 + }, { + 'url': 'http://tvpot.daum.net/mypot/Top.do?ownerid=o2scDLIVbHc0', + 'only_matching': True, + }, { + 'url': 'http://m.tvpot.daum.net/mypot/Top.tv?ownerid=45x1okb1If50&playlistid=3569733', + 'only_matching': True, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + clip_result = self._check_clip(url, list_id) + if clip_result: + return clip_result + + query_dict = parse_qs(url) + if 'playlistid' in query_dict: + playlist_id = query_dict['playlistid'][0] + return self.url_result(DaumPlaylistIE._URL_TEMPLATE % playlist_id, 'DaumPlaylist') + + name, entries = self._get_entries(list_id, 'ownerid') + + return self.playlist_result(entries, list_id, name) diff --git a/yt_dlp/extractor/dbtv.py b/yt_dlp/extractor/dbtv.py new file mode 100644 index 000000000..8e73176a6 --- /dev/null +++ b/yt_dlp/extractor/dbtv.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DBTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' + _TESTS = [{ + 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', + 'md5': 'b8f850ba1860adbda668d367f9b77699', + 'info_dict': { + 'id': 'PynxJnNWChE', + 'ext': 'mp4', + 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', + 'description': 'md5:49cc8370e7d66e8a2ef15c3b4631fd3f', + 'thumbnail': r're:https?://.*\.jpg', + 'upload_date': '20160916', + 'duration': 69, + 'uploader_id': 'UCk5pvsyZJoYJBd7_oFPTlRQ', + 'uploader': 'Dagbladet', + }, + 'add_ie': ['Youtube'] + }, { + 'url': 'https://www.dagbladet.no/video/embed/xlGmyIeN9Jo/?autoplay=false', + 'only_matching': True, + }, { + 'url': 'https://www.dagbladet.no/video/truer-iran-bor-passe-dere/PalfB2Cw', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', + webpage)] + + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).groups() + info = { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': display_id, + } + if len(video_id) == 11: + info.update({ + 'url': video_id, + 'ie_key': 'Youtube', + }) + else: + 
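# An 11-character id matches YouTube's video-id alphabet (per _VALID_URL
# above), so it was delegated to the YouTube extractor; the remaining
# 8-character ids are treated as JWPlatform media ids and delegated with
# the 'jwplatform:' prefix below. The url_transparent result keeps dbtv's
# own id/display_id while the target extractor supplies the formats.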
info.update({ + 'url': 'jwplatform:' + video_id, + 'ie_key': 'JWPlatform', + }) + return info diff --git a/youtube_dl/extractor/dctp.py b/yt_dlp/extractor/dctp.py index e700f8d86..e700f8d86 100644 --- a/youtube_dl/extractor/dctp.py +++ b/yt_dlp/extractor/dctp.py diff --git a/yt_dlp/extractor/deezer.py b/yt_dlp/extractor/deezer.py new file mode 100644 index 000000000..7ba02e552 --- /dev/null +++ b/yt_dlp/extractor/deezer.py @@ -0,0 +1,146 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + orderedSet, +) + + +class DeezerBaseInfoExtractor(InfoExtractor): + def get_data(self, url): + if not self.get_param('test'): + self.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!') + + mobj = self._match_valid_url(url) + data_id = mobj.group('id') + + webpage = self._download_webpage(url, data_id) + geoblocking_msg = self._html_search_regex( + r'<p class="soon-txt">(.*?)</p>', webpage, 'geoblocking message', + default=None) + if geoblocking_msg is not None: + raise ExtractorError( + 'Deezer said: %s' % geoblocking_msg, expected=True) + + data_json = self._search_regex( + (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>', + r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'), + webpage, 'data JSON') + data = json.loads(data_json) + return data_id, webpage, data + + +class DeezerPlaylistIE(DeezerBaseInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?playlist/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.deezer.com/playlist/176747451', + 'info_dict': { + 'id': '176747451', + 'title': 'Best!', + 'uploader': 'anonymous', + 'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$', + }, + 'playlist_count': 29, + } + + def _real_extract(self, url): + playlist_id, webpage, data = self.get_data(url) + + playlist_title = data.get('DATA', {}).get('TITLE') + playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME') + playlist_thumbnail = self._search_regex( + r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage, + 'playlist thumbnail') + + entries = [] + for s in data.get('SONGS', {}).get('data'): + formats = [{ + 'format_id': 'preview', + 'url': s.get('MEDIA', [{}])[0].get('HREF'), + 'preference': -100, # Only the first 30 seconds + 'ext': 'mp3', + }] + self._sort_formats(formats) + artists = ', '.join( + orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) + entries.append({ + 'id': s.get('SNG_ID'), + 'duration': int_or_none(s.get('DURATION')), + 'title': '%s - %s' % (artists, s.get('SNG_TITLE')), + 'uploader': s.get('ART_NAME'), + 'uploader_id': s.get('ART_ID'), + 'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0, + 'formats': formats, + }) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'uploader': playlist_uploader, + 'thumbnail': playlist_thumbnail, + 'entries': entries, + } + + +class DeezerAlbumIE(DeezerBaseInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?album/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.deezer.com/fr/album/67505622', + 'info_dict': { + 'id': '67505622', + 'title': 'Last Week', + 'uploader': 'Home Brew', + 'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$', + }, + 'playlist_count': 7, + } + + def _real_extract(self, url): + album_id, webpage, data = self.get_data(url) + + album_title = data.get('DATA', {}).get('ALB_TITLE') + album_uploader = data.get('DATA', {}).get('ART_NAME') + 
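
get_data above scrapes Deezer's embedded app state rather than calling an API. A minimal sketch of that extraction step:

    import json
    import re

    def extract_app_state(webpage):
        m = re.search(r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>', webpage)
        return json.loads(m.group(1)) if m else None

    page = '<script>window.__DZR_APP_STATE__ = {"DATA": {"TITLE": "Best!"}}</script>'
    assert extract_app_state(page)['DATA']['TITLE'] == 'Best!'
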
album_thumbnail = self._search_regex( + r'<img id="naboo_album_image".*?src="([^"]+)"', webpage, + 'album thumbnail') + + entries = [] + for s in data.get('SONGS', {}).get('data'): + formats = [{ + 'format_id': 'preview', + 'url': s.get('MEDIA', [{}])[0].get('HREF'), + 'preference': -100, # Only the first 30 seconds + 'ext': 'mp3', + }] + self._sort_formats(formats) + artists = ', '.join( + orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) + entries.append({ + 'id': s.get('SNG_ID'), + 'duration': int_or_none(s.get('DURATION')), + 'title': '%s - %s' % (artists, s.get('SNG_TITLE')), + 'uploader': s.get('ART_NAME'), + 'uploader_id': s.get('ART_ID'), + 'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0, + 'formats': formats, + 'track': s.get('SNG_TITLE'), + 'track_number': int_or_none(s.get('TRACK_NUMBER')), + 'track_id': s.get('SNG_ID'), + 'artist': album_uploader, + 'album': album_title, + 'album_artist': album_uploader, + }) + + return { + '_type': 'playlist', + 'id': album_id, + 'title': album_title, + 'uploader': album_uploader, + 'thumbnail': album_thumbnail, + 'entries': entries, + } diff --git a/youtube_dl/extractor/defense.py b/yt_dlp/extractor/defense.py index 9fe144e14..9fe144e14 100644 --- a/youtube_dl/extractor/defense.py +++ b/yt_dlp/extractor/defense.py diff --git a/youtube_dl/extractor/democracynow.py b/yt_dlp/extractor/democracynow.py index 5c9c0ecdc..5c9c0ecdc 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/yt_dlp/extractor/democracynow.py diff --git a/yt_dlp/extractor/dfb.py b/yt_dlp/extractor/dfb.py new file mode 100644 index 000000000..97f70fc7b --- /dev/null +++ b/yt_dlp/extractor/dfb.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class DFBIE(InfoExtractor): + IE_NAME = 'tv.dfb.de' + _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)' + + _TEST = { + 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', + 'md5': 'ac0f98a52a330f700b4b3034ad240649', + 'info_dict': { + 'id': '11633', + 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', + 'ext': 'mp4', + 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', + 'upload_date': '20150714', + }, + } + + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).groups() + + player_info = self._download_xml( + 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, + display_id) + video_info = player_info.find('video') + stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) + + formats = [] + # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats + for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'): + stream_access_info = self._download_xml(sa_url, display_id) + token_el = stream_access_info.find('token') + manifest_url = token_el.attrib['url'] + '?' 
+ 'hdnea=' + token_el.attrib['auth'] + if '.f4m' in manifest_url: + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.2.0', + display_id, f4m_id='hds', fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + manifest_url, display_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': video_info.find('title').text, + 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, + 'upload_date': unified_strdate(video_info.find('time_date').text), + 'formats': formats, + } diff --git a/youtube_dl/extractor/dhm.py b/yt_dlp/extractor/dhm.py index aee72a6ed..aee72a6ed 100644 --- a/youtube_dl/extractor/dhm.py +++ b/yt_dlp/extractor/dhm.py diff --git a/youtube_dl/extractor/digg.py b/yt_dlp/extractor/digg.py index 913c1750f..913c1750f 100644 --- a/youtube_dl/extractor/digg.py +++ b/yt_dlp/extractor/digg.py diff --git a/yt_dlp/extractor/digiteka.py b/yt_dlp/extractor/digiteka.py new file mode 100644 index 000000000..d63204778 --- /dev/null +++ b/yt_dlp/extractor/digiteka.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class DigitekaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?(?:digiteka\.net|ultimedia\.com)/ + (?: + deliver/ + (?P<embed_type> + generic| + musique + ) + (?:/[^/]+)*/ + (?: + src| + article + )| + default/index/video + (?P<site_type> + generic| + music + ) + /id + )/(?P<id>[\d+a-z]+)''' + _TESTS = [{ + # news + 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', + 'md5': '276a0e49de58c7e85d32b057837952a2', + 'info_dict': { + 'id': 's8uk0r', + 'ext': 'mp4', + 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 74, + 'upload_date': '20150317', + 'timestamp': 1426604939, + 'uploader_id': '3fszv', + }, + }, { + # music + 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', + 'md5': '2ea3513813cf230605c7e2ffe7eca61c', + 'info_dict': { + 'id': 'xvpfp8', + 'ext': 'mp4', + 'title': 'Two - C\'est La Vie (clip)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 233, + 'upload_date': '20150224', + 'timestamp': 1424760500, + 'uploader_id': '3rfzk', + }, + }, { + 'url': 'https://www.digiteka.net/deliver/generic/iframe/mdtk/01637594/src/lqm3kl/zone/1/showtitle/1/autoplay/yes', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + video_type = mobj.group('embed_type') or mobj.group('site_type') + if video_type == 'music': + video_type = 'musique' + + deliver_info = self._download_json( + 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type), + video_id) + + yt_id = deliver_info.get('yt_id') + if yt_id: + return self.url_result(yt_id, 'Youtube') + + jwconf = deliver_info['jwconf'] + + formats = [] + for source in jwconf['playlist'][0]['sources']: + formats.append({ + 'url': source['file'], + 'format_id': source.get('label'), + }) + + self._sort_formats(formats) + + title = deliver_info['title'] + thumbnail = 
jwconf.get('image') + duration = int_or_none(deliver_info.get('duration')) + timestamp = int_or_none(deliver_info.get('release_time')) + uploader_id = deliver_info.get('owner_id') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'formats': formats, + } diff --git a/yt_dlp/extractor/discovery.py b/yt_dlp/extractor/discovery.py new file mode 100644 index 000000000..fd3ad75c7 --- /dev/null +++ b/yt_dlp/extractor/discovery.py @@ -0,0 +1,117 @@ +from __future__ import unicode_literals + +import random +import string + +from .discoverygo import DiscoveryGoBaseIE +from ..compat import compat_urllib_parse_unquote +from ..utils import ExtractorError +from ..compat import compat_HTTPError + + +class DiscoveryIE(DiscoveryGoBaseIE): + _VALID_URL = r'''(?x)https?:// + (?P<site> + go\.discovery| + www\. + (?: + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc + )| + watch\. + (?: + hgtv| + foodnetwork| + travelchannel| + diynetwork| + cookingchanneltv| + motortrend + ) + )\.com/tv-shows/(?P<show_slug>[^/]+)/(?:video|full-episode)s/(?P<id>[^./?#]+)''' + _TESTS = [{ + 'url': 'https://go.discovery.com/tv-shows/cash-cab/videos/riding-with-matthew-perry', + 'info_dict': { + 'id': '5a2f35ce6b66d17a5026e29e', + 'ext': 'mp4', + 'title': 'Riding with Matthew Perry', + 'description': 'md5:a34333153e79bc4526019a5129e7f878', + 'duration': 84, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + } + }, { + 'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision', + 'only_matching': True, + }, { + 'url': 'https://go.discovery.com/tv-shows/alaskan-bush-people/videos/follow-your-own-road', + 'only_matching': True, + }, { + # using `show_slug` is important to get the correct video data + 'url': 'https://www.sciencechannel.com/tv-shows/mythbusters-on-science/full-episodes/christmas-special', + 'only_matching': True, + }] + _GEO_COUNTRIES = ['US'] + _GEO_BYPASS = False + _API_BASE_URL = 'https://api.discovery.com/v1/' + + def _real_extract(self, url): + site, show_slug, display_id = self._match_valid_url(url).groups() + + access_token = None + cookies = self._get_cookies(url) + + # prefer Affiliate Auth Token over Anonymous Auth Token + auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn') + if auth_storage_cookie and auth_storage_cookie.value: + auth_storage = self._parse_json(compat_urllib_parse_unquote( + compat_urllib_parse_unquote(auth_storage_cookie.value)), + display_id, fatal=False) or {} + access_token = auth_storage.get('a') or auth_storage.get('access_token') + + if not access_token: + access_token = self._download_json( + 'https://%s.com/anonymous' % site, display_id, + 'Downloading token JSON metadata', query={ + 'authRel': 'authorization', + 'client_id': '3020a40c2356a645b4b4', + 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'redirectUri': 'https://www.discovery.com/', + })['access_token'] + + headers = self.geo_verification_headers() + headers['Authorization'] = 'Bearer ' + access_token + + try: + video = self._download_json( + self._API_BASE_URL + 'content/videos', + display_id, 'Downloading content JSON metadata', + headers=headers, query={ + 'embed': 'show.name', + 'fields': 'authenticated,description.detailed,duration,episodeNumber,id,name,parental.rating,season.number,show,tags', + 'slug': display_id, + 'show_slug': show_slug, + })[0] + 
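
When no eosAf/eosAn cookie is present, the extractor above falls back to an anonymous token. A sketch of that handshake using plain urllib in place of the compat helpers; endpoint and parameter values are those sent by the code above:

    import json
    import random
    import string
    import urllib.parse
    import urllib.request

    def anonymous_token(site):
        query = urllib.parse.urlencode({
            'authRel': 'authorization',
            'client_id': '3020a40c2356a645b4b4',
            'nonce': ''.join(random.choice(string.ascii_letters) for _ in range(32)),
            'redirectUri': 'https://www.discovery.com/',
        })
        with urllib.request.urlopen('https://%s.com/anonymous?%s' % (site, query)) as resp:
            return json.load(resp)['access_token']

    # subsequent API calls then send {'Authorization': 'Bearer ' + token}
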
video_id = video['id'] + stream = self._download_json( + self._API_BASE_URL + 'streaming/video/' + video_id, + display_id, 'Downloading streaming JSON metadata', headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + e_description = self._parse_json( + e.cause.read().decode(), display_id)['description'] + if 'resource not available for country' in e_description: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + if 'Authorized Networks' in e_description: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.', expected=True) + raise ExtractorError(e_description) + raise + + return self._extract_video_info(video, stream, display_id) diff --git a/youtube_dl/extractor/discoverygo.py b/yt_dlp/extractor/discoverygo.py index 9e7b14a7d..9e7b14a7d 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/yt_dlp/extractor/discoverygo.py diff --git a/yt_dlp/extractor/discoverynetworks.py b/yt_dlp/extractor/discoverynetworks.py new file mode 100644 index 000000000..f43c87160 --- /dev/null +++ b/yt_dlp/extractor/discoverynetworks.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .dplay import DPlayIE + + +class DiscoveryNetworksDeIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', + 'info_dict': { + 'id': '78867', + 'ext': 'mp4', + 'title': 'Die Welt da draußen', + 'description': 'md5:61033c12b73286e409d99a41742ef608', + 'timestamp': 1554069600, + 'upload_date': '20190331', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', + 'only_matching': True, + }, { + 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', + 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, programme, alternate_id = self._match_valid_url(url).groups() + country = 'GB' if domain == 'dplay.co.uk' else 'DE' + realm = 'questuk' if country == 'GB' else domain.replace('.', '') + return self._get_disco_api_info( + url, '%s/%s' % (programme, alternate_id), + 'sonic-eu1-prod.disco-api.com', realm, country) diff --git a/yt_dlp/extractor/discoveryplusindia.py b/yt_dlp/extractor/discoveryplusindia.py new file mode 100644 index 000000000..51801402c --- /dev/null +++ b/yt_dlp/extractor/discoveryplusindia.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from ..compat import compat_str +from ..utils import try_get +from .common import InfoExtractor +from .dplay import DPlayIE + + +class DiscoveryPlusIndiaIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' 
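
In DiscoveryNetworksDeIE above, the domain alone determines both the country and the disco-api realm:

    def realm_for(domain):
        country = 'GB' if domain == 'dplay.co.uk' else 'DE'
        realm = 'questuk' if country == 'GB' else domain.replace('.', '')
        return country, realm

    assert realm_for('tlc.de') == ('DE', 'tlcde')
    assert realm_for('dmax.de') == ('DE', 'dmaxde')
    assert realm_for('dplay.co.uk') == ('GB', 'questuk')
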
+ DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', + 'info_dict': { + 'id': '27104', + 'ext': 'mp4', + 'display_id': 'how-do-they-do-it/fugu-and-more', + 'title': 'Fugu and More', + 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', + 'duration': 1319, + 'timestamp': 1582309800, + 'upload_date': '20200221', + 'series': 'How Do They Do It?', + 'season_number': 8, + 'episode_number': 2, + 'creator': 'Discovery Channel', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'skip': 'Cookies (not necessarily logged in) are needed' + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['x-disco-params'] = 'realm=%s' % realm + headers['x-disco-client'] = 'WEB:UNKNOWN:dplus-india:17.0.0' + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in') + + +class DiscoveryPlusIndiaShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', + 'playlist_mincount': 140, + 'info_dict': { + 'id': 'how-do-they-do-it', + }, + }] + + def _entries(self, show_name): + headers = { + 'x-disco-client': 'WEB:UNKNOWN:dplus-india:prod', + 'x-disco-params': 'realm=dplusindia', + 'referer': 'https://www.discoveryplus.in/', + } + show_url = 'https://ap2-prod-direct.discoveryplus.in/cms/routes/show/{}?include=default'.format(show_name) + show_json = self._download_json(show_url, + video_id=show_name, + headers=headers)['included'][4]['attributes']['component'] + show_id = show_json['mandatoryParams'].split('=')[-1] + season_url = 'https://ap2-prod-direct.discoveryplus.in/content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' + for season in show_json['filters'][0]['options']: + season_id = season['id'] + total_pages, page_num = 1, 0 + while page_num < total_pages: + season_json = self._download_json(season_url.format(season_id, show_id, compat_str(page_num + 1)), + video_id=show_id, headers=headers, + note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) + if page_num == 0: + total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 + episodes_json = season_json['data'] + for episode in episodes_json: + video_id = episode['attributes']['path'] + yield self.url_result( + 'https://discoveryplus.in/videos/%s' % video_id, + ie=DiscoveryPlusIndiaIE.ie_key(), video_id=video_id) + page_num += 1 + + def _real_extract(self, url): + show_name = self._match_valid_url(url).group('show_name') + return self.playlist_result(self._entries(show_name), playlist_id=show_name) diff --git a/youtube_dl/extractor/discoveryvr.py b/yt_dlp/extractor/discoveryvr.py index cb63c2649..cb63c2649 100644 --- a/youtube_dl/extractor/discoveryvr.py +++ b/yt_dlp/extractor/discoveryvr.py diff --git a/yt_dlp/extractor/disney.py b/yt_dlp/extractor/disney.py new file mode 100644 index 000000000..f018cbe9d 
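
The _entries generator above learns totalPages from the first response, then walks the remaining pages. The same loop reduced to a standalone sketch, with fetch_page(n) as an assumed stand-in for the season JSON download:

    def iter_episode_paths(fetch_page):
        total_pages, page_num = 1, 0
        while page_num < total_pages:
            season_json = fetch_page(page_num + 1)
            if page_num == 0:
                total_pages = (season_json.get('meta') or {}).get('totalPages') or 1
            for episode in season_json['data']:
                yield episode['attributes']['path']
            page_num += 1
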
--- /dev/null +++ b/yt_dlp/extractor/disney.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + compat_str, + determine_ext, + update_url_query, +) + + +class DisneyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))''' + _TESTS = [{ + # Disney.EmbedVideo + 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', + 'info_dict': { + 'id': '545ed1857afee5a0ec239977', + 'ext': 'mp4', + 'title': 'Moana - Trailer', + 'description': 'A fun adventure for the entire Family! Bring home Moana on Digital HD Feb 21 & Blu-ray March 7', + 'upload_date': '20170112', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # Grill.burger + 'url': 'http://www.starwars.com/video/rogue-one-a-star-wars-story-intro-featurette', + 'info_dict': { + 'id': '5454e9f4e9804a552e3524c8', + 'ext': 'mp4', + 'title': '"Intro" Featurette: Rogue One: A Star Wars Story', + 'upload_date': '20170104', + 'description': 'Go behind-the-scenes of Rogue One: A Star Wars Story in this featurette with Director Gareth Edwards and the cast of the film.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2', + 'only_matching': True, + }, { + 'url': 'http://video.en.disneyme.com/watch/future-worm/robo-carp-2001-544b66002aa7353cdd3f5114', + 'only_matching': True, + }, { + 'url': 'http://video.disneyturkiye.com.tr/izle/7c-7-cuceler/kimin-sesi-zaten-5456f3d015f6b36c8afdd0e2', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.disney.com/embed/546a4798ddba3d1612e4005d', + 'only_matching': True, + }, { + 'url': 'http://www.starwars.com/embed/54690d1e6c42e5f09a0fb097', + 'only_matching': True, + }, { + 'url': 'http://spiderman.marvelkids.com/embed/522900d2ced3c565e4cc0677', + 'only_matching': True, + }, { + 'url': 'http://spiderman.marvelkids.com/videos/contest-of-champions-part-four-clip-1', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo', + 'only_matching': True, + }, { + 'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id, display_id = self._match_valid_url(url).groups() + if not video_id: + webpage = self._download_webpage(url, display_id) + grill = re.sub(r'"\s*\+\s*"', '', self._search_regex( + r'Grill\.burger\s*=\s*({.+})\s*:', + webpage, 'grill data')) + page_data = next(s for s in self._parse_json(grill, display_id)['stack'] if s.get('type') == 'video') + video_data = page_data['data'][0] + else: + webpage = self._download_webpage( + 'http://%s/embed/%s' % (domain, video_id), video_id) + page_data = self._parse_json(self._search_regex( + r'Disney\.EmbedVideo\s*=\s*({.+});', + webpage, 'embed data'), video_id) + video_data = page_data['video'] + + for external in video_data.get('externals', []): + if external.get('source') == 'vevo': + return self.url_result('vevo:' + 
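
Grill.burger above is serialised with JavaScript string concatenation, so the extractor collapses the '" + "' seams before handing the blob to the JSON parser. A verified sketch of that step:

    import json
    import re

    def parse_js_concat_json(js_blob):
        # '"Rogue " + "One"' becomes '"Rogue One"' before json.loads sees it
        return json.loads(re.sub(r'"\s*\+\s*"', '', js_blob))

    assert parse_js_concat_json('{"title": "Rogue " + "One"}')['title'] == 'Rogue One'
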
external['data_id'], 'Vevo') + + video_id = video_data['id'] + title = video_data['title'] + + formats = [] + for flavor in video_data.get('flavors', []): + flavor_format = flavor.get('format') + flavor_url = flavor.get('url') + if not flavor_url or not re.match(r'https?://', flavor_url) or flavor_format == 'mp4_access': + continue + tbr = int_or_none(flavor.get('bitrate')) + if tbr == 99999: + # wrong ks(Kaltura Signature) causes 404 Error + flavor_url = update_url_query(flavor_url, {'ks': ''}) + m3u8_formats = self._extract_m3u8_formats( + flavor_url, video_id, 'mp4', + m3u8_id=flavor_format, fatal=False) + for f in m3u8_formats: + # Apple FairPlay + if '/fpshls/' in f['url']: + continue + formats.append(f) + continue + format_id = [] + if flavor_format: + format_id.append(flavor_format) + if tbr: + format_id.append(compat_str(tbr)) + ext = determine_ext(flavor_url) + if flavor_format == 'applehttp' or ext == 'm3u8': + ext = 'mp4' + width = int_or_none(flavor.get('width')) + height = int_or_none(flavor.get('height')) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': flavor_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'ext': ext, + 'vcodec': 'none' if (width == 0 and height == 0) else None, + }) + if not formats and video_data.get('expired'): + self.raise_no_formats( + '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), + expected=True) + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + caption_format = caption.get('format') + if not caption_url or caption_format.startswith('unknown'): + continue + subtitles.setdefault(caption.get('language', 'en'), []).append({ + 'url': caption_url, + 'ext': { + 'webvtt': 'vtt', + }.get(caption_format, caption_format), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description') or video_data.get('short_desc'), + 'thumbnail': video_data.get('thumb') or video_data.get('thumb_secure'), + 'duration': int_or_none(video_data.get('duration_sec')), + 'upload_date': unified_strdate(video_data.get('publish_date')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/dispeak.py b/yt_dlp/extractor/dispeak.py new file mode 100644 index 000000000..be7ad1202 --- /dev/null +++ b/yt_dlp/extractor/dispeak.py @@ -0,0 +1,131 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + remove_end, + xpath_element, + xpath_text, +) + + +class DigitallySpeakingIE(InfoExtractor): + _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' + + _TESTS = [{ + # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface + 'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml', + 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', + 'info_dict': { + 'id': '840376_BQRC', + 'ext': 'mp4', + 'title': 'Tenacious Design and The Interface of \'Destiny\'', + }, + }, { + # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC + 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', + 'only_matching': True, + }, { + # From http://www.gdcvault.com/play/1013700/Advanced-Material + 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', + 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624, empty speakerVideo + 'url': 
'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, + }] + + def _parse_mp4(self, metadata): + video_formats = [] + video_root = None + + mp4_video = xpath_text(metadata, './mp4video', default=None) + if mp4_video is not None: + mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video) + video_root = mobj.group('root') + if video_root is None: + http_host = xpath_text(metadata, 'httpHost', default=None) + if http_host: + video_root = 'http://%s/' % http_host + if video_root is None: + # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js + # Works for GPUTechConf, too + video_root = 'http://s3-2u.digitallyspeaking.com/' + + formats = metadata.findall('./MBRVideos/MBRVideo') + if not formats: + return None + for a_format in formats: + stream_name = xpath_text(a_format, 'streamName', fatal=True) + video_path = re.match(r'mp4\:(?P<path>.*)', stream_name).group('path') + url = video_root + video_path + bitrate = xpath_text(a_format, 'bitrate') + tbr = int_or_none(bitrate) + vbr = int_or_none(self._search_regex( + r'-(\d+)\.mp4', video_path, 'vbr', default=None)) + abr = tbr - vbr if tbr and vbr else None + video_formats.append({ + 'format_id': bitrate, + 'url': url, + 'tbr': tbr, + 'vbr': vbr, + 'abr': abr, + }) + return video_formats + + def _parse_flv(self, metadata): + formats = [] + akamai_url = xpath_text(metadata, './akamaiHost', fatal=True) + audios = metadata.findall('./audios/audio') + for audio in audios: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(audio.get('url'), '.flv'), + 'ext': 'flv', + 'vcodec': 'none', + 'quality': 1, + 'format_id': audio.get('code'), + }) + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(video_path, '.flv'), + 'ext': 'flv', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'format_id': format_id, + }) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + + xml_description = self._download_xml(url, video_id) + metadata = xpath_element(xml_description, 'metadata') + + video_formats = self._parse_mp4(metadata) + if video_formats is None: + video_formats = self._parse_flv(metadata) + + return { + 'id': video_id, + 'formats': video_formats, + 'title': xpath_text(metadata, 'title', fatal=True), + 'duration': parse_duration(xpath_text(metadata, 'endTime')), + 'creator': xpath_text(metadata, 'speaker'), + } diff --git a/yt_dlp/extractor/dlive.py b/yt_dlp/extractor/dlive.py new file mode 100644 index 000000000..90462c0ab --- /dev/null +++ b/yt_dlp/extractor/dlive.py @@ -0,0 +1,96 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class DLiveVODIE(InfoExtractor): + IE_NAME = 'dlive:vod' + _VALID_URL = r'https?://(?:www\.)?dlive\.tv/p/(?P<uploader_id>.+?)\+(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://dlive.tv/p/pdp+3mTzOl4WR', + 
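
_parse_mp4 above reads the total bitrate from the XML, recovers the video bitrate from the '-<vbr>.mp4' filename suffix, and infers the audio bitrate as the difference. Self-contained sketch:

    import re

    def split_bitrates(bitrate_text, video_path):
        tbr = int(bitrate_text) if bitrate_text and bitrate_text.isdigit() else None
        m = re.search(r'-(\d+)\.mp4', video_path)
        vbr = int(m.group(1)) if m else None
        abr = tbr - vbr if tbr and vbr else None
        return tbr, vbr, abr

    assert split_bitrates('850', 'ubm/gdc/speaker1-650.mp4') == (850, 650, 200)
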
'info_dict': { + 'id': '3mTzOl4WR', + 'ext': 'mp4', + 'title': 'Minecraft with james charles epic', + 'upload_date': '20190701', + 'timestamp': 1562011015, + 'uploader_id': 'pdp', + } + }, { + 'url': 'https://dlive.tv/p/pdpreplay+D-RD-xSZg', + 'only_matching': True, + }] + + def _real_extract(self, url): + uploader_id, vod_id = self._match_valid_url(url).groups() + broadcast = self._download_json( + 'https://graphigo.prd.dlive.tv/', vod_id, + data=json.dumps({'query': '''query { + pastBroadcast(permlink:"%s+%s") { + content + createdAt + length + playbackUrl + title + thumbnailUrl + viewCount + } +}''' % (uploader_id, vod_id)}).encode())['data']['pastBroadcast'] + title = broadcast['title'] + formats = self._extract_m3u8_formats( + broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + return { + 'id': vod_id, + 'title': title, + 'uploader_id': uploader_id, + 'formats': formats, + 'description': broadcast.get('content'), + 'thumbnail': broadcast.get('thumbnailUrl'), + 'timestamp': int_or_none(broadcast.get('createdAt'), 1000), + 'view_count': int_or_none(broadcast.get('viewCount')), + } + + +class DLiveStreamIE(InfoExtractor): + IE_NAME = 'dlive:stream' + _VALID_URL = r'https?://(?:www\.)?dlive\.tv/(?!p/)(?P<id>[\w.-]+)' + + def _real_extract(self, url): + display_name = self._match_id(url) + user = self._download_json( + 'https://graphigo.prd.dlive.tv/', display_name, + data=json.dumps({'query': '''query { + userByDisplayName(displayname:"%s") { + livestream { + content + createdAt + title + thumbnailUrl + watchingCount + } + username + } +}''' % display_name}).encode())['data']['userByDisplayName'] + livestream = user['livestream'] + title = livestream['title'] + username = user['username'] + formats = self._extract_m3u8_formats( + 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, + display_name, 'mp4') + self._sort_formats(formats) + return { + 'id': display_name, + 'title': self._live_title(title), + 'uploader': display_name, + 'uploader_id': username, + 'formats': formats, + 'description': livestream.get('content'), + 'thumbnail': livestream.get('thumbnailUrl'), + 'is_live': True, + 'timestamp': int_or_none(livestream.get('createdAt'), 1000), + 'view_count': int_or_none(livestream.get('watchingCount')), + } diff --git a/yt_dlp/extractor/doodstream.py b/yt_dlp/extractor/doodstream.py new file mode 100644 index 000000000..2c9ea6898 --- /dev/null +++ b/yt_dlp/extractor/doodstream.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import string +import random +import time + +from .common import InfoExtractor + + +class DoodStreamIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'http://dood.to/e/5s1wmbdacezb', + 'md5': '4568b83b31e13242b3f1ff96c55f0595', + 'info_dict': { + 'id': '5s1wmbdacezb', + 'ext': 'mp4', + 'title': 'Kat Wonders - Monthly May 2020', + 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', + } + }, { + 'url': 'https://dood.to/d/jzrxn12t2s7n', + 'md5': '3207e199426eca7c2aa23c2872e6728a', + 'info_dict': { + 'id': 'jzrxn12t2s7n', + 'ext': 'mp4', + 'title': 'Stacy Cruz Cute ALLWAYSWELL', + 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if '/d/' 
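
Both DLive extractors above talk to the same GraphQL endpoint with a plain JSON POST. A urllib sketch of the call shape (query abbreviated):

    import json
    import urllib.request

    def dlive_graphql(query):
        req = urllib.request.Request(
            'https://graphigo.prd.dlive.tv/',
            data=json.dumps({'query': query}).encode(),
            headers={'Content-Type': 'application/json'})
        with urllib.request.urlopen(req) as resp:
            return json.load(resp)['data']

    # e.g. dlive_graphql('query { userByDisplayName(displayname:"pdp") { username } }')
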
in url: + url = "https://dood.to" + self._html_search_regex( + r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta(['og:title', 'twitter:title'], + webpage, default=None) + thumb = self._html_search_meta(['og:image', 'twitter:image'], + webpage, default=None) + token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token') + description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, default=None) + auth_url = 'https://dood.to' + self._html_search_regex( + r'(/pass_md5.*?)\'', webpage, 'pass_md5') + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0', + 'referer': url + } + + webpage = self._download_webpage(auth_url, video_id, headers=headers) + final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000)) + + return { + 'id': video_id, + 'title': title, + 'url': final_url, + 'http_headers': headers, + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumb, + } diff --git a/youtube_dl/extractor/dotsub.py b/yt_dlp/extractor/dotsub.py index 148605c0b..148605c0b 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/yt_dlp/extractor/dotsub.py diff --git a/youtube_dl/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index 9757f4422..9757f4422 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py new file mode 100644 index 000000000..e0e446b87 --- /dev/null +++ b/yt_dlp/extractor/dplay.py @@ -0,0 +1,431 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class DPlayIE(InfoExtractor): + _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)' + _VALID_URL = r'''(?x)https?:// + (?P<domain> + (?:www\.)?(?P<host>d + (?: + play\.(?P<country>dk|fi|jp|se|no)| + iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no) + ) + )| + (?P<subdomain_country>es|it)\.dplay\.com + )/[^/]+''' + _PATH_REGEX + + _TESTS = [{ + # non geo restricted, via secure api, unsigned download hls URL + 'url': 'https://www.dplay.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'info_dict': { + 'id': '13628', + 'display_id': 'nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'ext': 'mp4', + 'title': 'Svensken lär sig njuta av livet', + 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', + 'duration': 2649.856, + 'timestamp': 1365453720, + 'upload_date': '20130408', + 'creator': 'Kanal 5', + 'series': 'Nugammalt - 77 händelser som format Sverige', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + # geo restricted, via secure api, unsigned download hls URL + 'url': 'http://www.dplay.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'info_dict': { + 'id': '104465', + 'display_id': 'ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'ext': 'mp4', + 'title': 'Ted Bundy: Mind Of A Monster', + 'description': 'md5:8b780f6f18de4dae631668b8a9637995', + 'duration': 5290.027, + 'timestamp': 
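
DoodStreamIE above assembles its final media URL from three parts: the prefix returned by the pass_md5 endpoint, a random 10-character tail, and the page token plus a millisecond expiry timestamp. Sketch of that assembly:

    import random
    import string
    import time

    def build_dood_url(md5_prefix, token):
        tail = ''.join(
            random.choice(string.ascii_letters + string.digits) for _ in range(10))
        return '%s%s?token=%s&expiry=%d' % (md5_prefix, tail, token, int(time.time() * 1000))
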
1570694400, + 'upload_date': '20191010', + 'creator': 'ID - Investigation Discovery', + 'series': 'Ted Bundy: Mind Of A Monster', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + # disco-api + 'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'info_dict': { + 'id': '40206', + 'display_id': 'i-kongens-klr/sesong-1-episode-7', + 'ext': 'mp4', + 'title': 'Episode 7', + 'description': 'md5:e3e1411b2b9aebeea36a6ec5d50c60cf', + 'duration': 2611.16, + 'timestamp': 1516726800, + 'upload_date': '20180123', + 'series': 'I kongens klær', + 'season_number': 1, + 'episode_number': 7, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', + 'md5': '2b808ffb00fc47b884a172ca5d13053c', + 'info_dict': { + 'id': '6918', + 'display_id': 'biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'ext': 'mp4', + 'title': 'Luigi Di Maio: la psicosi di Stanislawskij', + 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'upload_date': '20160524', + 'timestamp': 1464076800, + 'series': 'Biografie imbarazzanti', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + }, { + 'url': 'https://es.dplay.com/dmax/la-fiebre-del-oro/temporada-8-episodio-1/', + 'info_dict': { + 'id': '21652', + 'display_id': 'la-fiebre-del-oro/temporada-8-episodio-1', + 'ext': 'mp4', + 'title': 'Episodio 1', + 'description': 'md5:b9dcff2071086e003737485210675f69', + 'thumbnail': r're:^https?://.*\.png', + 'upload_date': '20180709', + 'timestamp': 1531173540, + 'series': 'La fiebre del oro', + 'season_number': 8, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dplay.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, + }, { + 'url': 'https://www.dplay.jp/video/gold-rush/24086', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, + }] + + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. 
You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = 'Bearer ' + self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): + geo_countries = [country.upper()] + self._initialize_geo_bypass({ + 'countries': geo_countries, + }) + disco_base = 'https://%s/' % disco_host + headers = { + 'Referer': url, + } + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'].strip() + formats = [] + try: + streaming = self._download_video_playback_info( + disco_base, video_id, headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._process_errors(e, geo_countries) + raise + for format_dict in streaming: + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + format_id = format_dict.get('type') + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id='dash', fatal=False)) + elif format_id == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + creator = series = None + tags = [] + thumbnails = [] + included = video.get('included') or [] + if isinstance(included, list): + for e in included: + attributes = e.get('attributes') + if not attributes: + continue + e_type = e.get('type') + if e_type == 'channel': + creator = attributes.get('name') + elif e_type == 'image': + src = attributes.get('src') + if src: + thumbnails.append({ + 'url': src, + 'width': int_or_none(attributes.get('width')), + 'height': int_or_none(attributes.get('height')), + }) + if e_type == 'show': + series = attributes.get('name') + elif e_type == 'tag': + name = attributes.get('name') + if name: + tags.append(name) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': strip_or_none(info.get('description')), + 'duration': float_or_none(info.get('videoDuration'), 1000), + 'timestamp': 
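
_get_disco_api_info above is a two-step handshake: fetch a realm-scoped bearer token, then request videoPlaybackInfo with it. Reduced to a urllib sketch (error handling and geo bypass omitted):

    import json
    import urllib.parse
    import urllib.request

    def disco_streams(disco_base, realm, video_id):
        token_url = disco_base + 'token?' + urllib.parse.urlencode({'realm': realm})
        with urllib.request.urlopen(token_url) as resp:
            token = json.load(resp)['data']['attributes']['token']
        req = urllib.request.Request(
            disco_base + 'playback/videoPlaybackInfo/' + video_id,
            headers={'Authorization': 'Bearer ' + token})
        with urllib.request.urlopen(req) as resp:
            return json.load(resp)['data']['attributes']['streaming']
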
unified_timestamp(info.get('publishStart')),
+            'series': series,
+            'season_number': int_or_none(info.get('seasonNumber')),
+            'episode_number': int_or_none(info.get('episodeNumber')),
+            'creator': creator,
+            'tags': tags,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        display_id = mobj.group('id')
+        domain = mobj.group('domain')
+        # str.lstrip('www.') would strip a *set of characters*, not the
+        # prefix, so check for the prefix explicitly
+        if domain.startswith('www.'):
+            domain = domain[len('www.'):]
+        country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country')
+        host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
+        return self._get_disco_api_info(
+            url, display_id, host, 'dplay' + country, country)
+
+
+class HGTVDeIE(DPlayIE):
+    _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+    _TESTS = [{
+        'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
+        'info_dict': {
+            'id': '151205',
+            'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
+            'ext': 'mp4',
+            'title': 'Wer braucht schon eine Toilette',
+            'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
+            'duration': 1177.024,
+            'timestamp': 1595705400,
+            'upload_date': '20200725',
+            'creator': 'HGTV',
+            'series': 'Tiny House - klein, aber oho',
+            'season_number': 3,
+            'episode_number': 3,
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        return self._get_disco_api_info(
+            url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
+
+
+class DiscoveryPlusIE(DPlayIE):
+    _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
+        'info_dict': {
+            'id': '1140794',
+            'display_id': 'property-brothers-forever-home/food-and-family',
+            'ext': 'mp4',
+            'title': 'Food and Family',
+            'description': 'The brothers help a Richmond family expand their single-level home.',
+            'duration': 2583.113,
+            'timestamp': 1609304400,
+            'upload_date': '20201230',
+            'creator': 'HGTV',
+            'series': 'Property Brothers: Forever Home',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'skip': 'Available for Premium users',
+    }]
+
+    _PRODUCT = 'dplus_us'
+    _API_URL = 'us1-prod-direct.discoveryplus.com'
+
+    def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+        headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6'
+
+    def _download_video_playback_info(self, disco_base, video_id, headers):
+        return self._download_json(
+            disco_base + 'playback/v3/videoPlaybackInfo',
+            video_id, headers=headers, data=json.dumps({
+                'deviceInfo': {
+                    'adBlocker': False,
+                },
+                'videoId': video_id,
+                'wisteriaProperties': {
+                    'platform': 'desktop',
+                    'product': self._PRODUCT,
+                },
+            }).encode('utf-8'))['data']['attributes']['streaming']
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        return self._get_disco_api_info(
+            url, display_id, self._API_URL, 'go', 'us')
+
+
+class ScienceChannelIE(DiscoveryPlusIE):
+    _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayIE._PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
+        'info_dict': {
+            'id': '2842849',
+            'display_id': 'strangest-things-science-atve-us/nazi-mystery-machine',
+            'ext': 'mp4',
+            'title': 'Nazi Mystery Machine',
+            'description': 'Experts investigate the secrets of a revolutionary encryption machine.',
'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }] + + _PRODUCT = 'sci' + _API_URL = 'us1-prod-direct.sciencechannel.com' + + +class DIYNetworkIE(DiscoveryPlusIE): + _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas', + 'info_dict': { + 'id': '2309730', + 'display_id': 'pool-kings-diy-network/bringing-beach-life-to-texas', + 'ext': 'mp4', + 'title': 'Bringing Beach Life to Texas', + 'description': 'The Pool Kings give a family a day at the beach in their own backyard.', + 'season_number': 10, + 'episode_number': 2, + }, + 'skip': 'Available for Premium users', + }] + + _PRODUCT = 'diy' + _API_URL = 'us1-prod-direct.watch.diynetwork.com' + + +class AnimalPlanetIE(DiscoveryPlusIE): + _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown', + 'info_dict': { + 'id': '3338923', + 'display_id': 'north-woods-law-animal-planet/squirrel-showdown', + 'ext': 'mp4', + 'title': 'Squirrel Showdown', + 'description': 'A woman is suspected of being in possession of flying squirrel kits.', + 'season_number': 16, + 'episode_number': 11, + }, + 'skip': 'Available for Premium users', + }] + + _PRODUCT = 'apl' + _API_URL = 'us1-prod-direct.animalplanet.com' diff --git a/yt_dlp/extractor/drbonanza.py b/yt_dlp/extractor/drbonanza.py new file mode 100644 index 000000000..ea0f06d3d --- /dev/null +++ b/yt_dlp/extractor/drbonanza.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + parse_duration, + unescapeHTML, +) + + +class DRBonanzaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-', + 'info_dict': { + 'id': '40312', + 'display_id': 'matador---0824-komme-fremmede-', + 'ext': 'mp4', + 'title': 'MATADOR - 08:24. 
"Komme fremmede".', + 'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + 'duration': 4613, + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + info = self._parse_html5_media_entries( + url, webpage, display_id, m3u8_id='hls', + m3u8_entry_protocol='m3u8_native')[0] + self._sort_formats(info['formats']) + + asset = self._parse_json( + self._search_regex( + r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset'), + display_id, transform_source=js_to_json) + + title = unescapeHTML(asset['AssetTitle']).strip() + + def extract(field): + return self._search_regex( + r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % field, + webpage, field, default=None) + + info.update({ + 'id': asset.get('AssetId') or video_id, + 'display_id': display_id, + 'title': title, + 'description': extract('Programinfo'), + 'duration': parse_duration(extract('Tid')), + 'thumbnail': asset.get('AssetImageUrl'), + }) + return info diff --git a/youtube_dl/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py index 5a07c18f4..5a07c18f4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/yt_dlp/extractor/dreisat.py diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py new file mode 100644 index 000000000..6a7d050aa --- /dev/null +++ b/yt_dlp/extractor/dropbox.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import url_basename + + +class DropboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*' + _TESTS = [ + { + 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0', + 'info_dict': { + 'id': 'nelirfsxnmcfbfh', + 'ext': 'mp4', + 'title': 'youtube-dl test video \'ä"BaW_jenozKc' + } + }, { + 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + fn = compat_urllib_parse_unquote(url_basename(url)) + title = os.path.splitext(fn)[0] + video_url = re.sub(r'[?&]dl=0', '', url) + video_url += ('?' if '?' not in video_url else '&') + 'dl=1' + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } diff --git a/yt_dlp/extractor/drtuber.py b/yt_dlp/extractor/drtuber.py new file mode 100644 index 000000000..540b86a16 --- /dev/null +++ b/yt_dlp/extractor/drtuber.py @@ -0,0 +1,112 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + NO_DEFAULT, + parse_duration, + str_to_int, +) + + +class DrTuberIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' 
+ _TESTS = [{ + 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', + 'md5': '93e680cf2536ad0dfb7e74d94a89facd', + 'info_dict': { + 'id': '1740434', + 'display_id': 'hot-perky-blonde-naked-golf', + 'ext': 'mp4', + 'title': 'hot perky blonde naked golf', + 'like_count': int, + 'comment_count': int, + 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + }, { + 'url': 'http://www.drtuber.com/embed/489939', + 'only_matching': True, + }, { + 'url': 'http://m.drtuber.com/video/3893529/lingerie-blowjob-from-beautiful-teen', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', + webpage) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage( + 'http://www.drtuber.com/video/%s' % video_id, display_id) + + video_data = self._download_json( + 'http://www.drtuber.com/player_config_json/', video_id, query={ + 'vid': video_id, + 'embed': 0, + 'aid': 0, + 'domain_id': 0, + }) + + formats = [] + for format_id, video_url in video_data['files'].items(): + if video_url: + formats.append({ + 'format_id': format_id, + 'quality': 2 if format_id == 'hq' else 1, + 'url': video_url + }) + self._sort_formats(formats) + + duration = int_or_none(video_data.get('duration')) or parse_duration( + video_data.get('duration_format')) + + title = self._html_search_regex( + (r'<h1[^>]+class=["\']title[^>]+>([^<]+)', + r'<title>([^<]+)\s*@\s+DrTuber', + r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', + r'<p[^>]+class="title_substrate">([^<]+)</p>', + r'<title>([^<]+) - \d+'), + webpage, 'title') + + thumbnail = self._html_search_regex( + r'poster="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + def extract_count(id_, name, default=NO_DEFAULT): + return str_to_int(self._html_search_regex( + r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, + webpage, '%s count' % name, default=default, fatal=False)) + + like_count = extract_count('rate_likes', 'like') + dislike_count = extract_count('rate_dislikes', 'dislike', default=None) + comment_count = extract_count('comments_count', 'comment') + + cats_str = self._search_regex( + r'<div[^>]+class="categories_list">(.+?)</div>', + webpage, 'categories', fatal=False) + categories = [] if not cats_str else re.findall( + r'<a title="([^"]+)"', cats_str) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'thumbnail': thumbnail, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': self._rta_search(webpage), + 'duration': duration, + } diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py new file mode 100644 index 000000000..7bb15f8d4 --- /dev/null +++ b/yt_dlp/extractor/drtv.py @@ -0,0 +1,355 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import binascii +import hashlib +import re + + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + bytes_to_intlist, + ExtractorError, + int_or_none, + intlist_to_bytes, + float_or_none, + mimetype2ext, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + url_or_none, +) + + +class 
DRTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ + ) + (?P<id>[\da-z_-]+) + ''' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['DK'] + IE_NAME = 'drtv' + _TESTS = [{ + 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', + 'md5': '25e659cccc9a2ed956110a299fdf5983', + 'info_dict': { + 'id': 'klassen-darlig-taber-10', + 'ext': 'mp4', + 'title': 'Klassen - Dårlig taber (10)', + 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', + 'timestamp': 1539085800, + 'upload_date': '20181009', + 'duration': 606.84, + 'series': 'Klassen', + 'season': 'Klassen I', + 'season_number': 1, + 'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b', + 'episode': 'Episode 10', + 'episode_number': 10, + 'release_year': 2016, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + # embed + 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', + 'info_dict': { + 'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6', + 'ext': 'mp4', + 'title': 'christiania pusher street ryddes drdkrjpo', + 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', + 'timestamp': 1472800279, + 'upload_date': '20160902', + 'duration': 131.4, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + # with SignLanguage formats + 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', + 'info_dict': { + 'id': 'historien-om-danmark-stenalder', + 'ext': 'mp4', + 'title': 'Historien om Danmark: Stenalder', + 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', + 'timestamp': 1546628400, + 'upload_date': '20190104', + 'duration': 3502.56, + 'formats': 'mincount:20', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', + 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', + 'info_dict': { + 'id': '00951930010', + 'ext': 'mp4', + 'title': 'Bonderøven (1:8)', + 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', + 'timestamp': 1546542000, + 'upload_date': '20190103', + 'duration': 2576.6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769', + 'only_matching': True, + }, { + 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', + 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/program/jagten_220924', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>Programmet er ikke længere tilgængeligt' in webpage: + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + + video_id = self._search_regex( + (r'data-(?:material-identifier|episode-slug)="([^"]+)"', + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), + webpage, 'video id', default=None) + + if not video_id: + video_id = self._search_regex( + r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', + webpage, 'urn', default=None) + if video_id: + video_id = compat_urllib_parse_unquote(video_id) + + _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard' + query = {'expanded': 'true'} + + if video_id: + programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) + else: + programcard_url = 
_PROGRAMCARD_BASE + page = self._parse_json( + self._search_regex( + r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage, + 'data'), '1')['cache']['page'] + page = page[list(page.keys())[0]] + item = try_get( + page, (lambda x: x['item'], lambda x: x['entries'][0]['item']), + dict) + video_id = item['customId'].split(':')[-1] + query['productionnumber'] = video_id + + data = self._download_json( + programcard_url, video_id, 'Downloading video JSON', query=query) + + title = str_or_none(data.get('Title')) or re.sub( + r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '', + self._og_search_title(webpage)) + description = self._og_search_description( + webpage, default=None) or data.get('Description') + + timestamp = unified_timestamp( + data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime')) + + thumbnail = None + duration = None + + restricted_to_denmark = False + + formats = [] + subtitles = {} + + assets = [] + primary_asset = data.get('PrimaryAsset') + if isinstance(primary_asset, dict): + assets.append(primary_asset) + secondary_assets = data.get('SecondaryAssets') + if isinstance(secondary_assets, list): + for secondary_asset in secondary_assets: + if isinstance(secondary_asset, dict): + assets.append(secondary_asset) + + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) + + def decrypt_uri(e): + n = int(e[2:10], 16) + a = e[10 + n:] + data = bytes_to_intlist(hex_to_bytes(e[10:10 + n])) + key = bytes_to_intlist(hashlib.sha256( + ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()) + iv = bytes_to_intlist(hex_to_bytes(a)) + decrypted = aes_cbc_decrypt(data, key, iv) + return intlist_to_bytes( + decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0] + + for asset in assets: + kind = asset.get('Kind') + if kind == 'Image': + thumbnail = url_or_none(asset.get('Uri')) + elif kind in ('VideoResource', 'AudioResource'): + duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) + restricted_to_denmark = asset.get('RestrictedToDenmark') + asset_target = asset.get('Target') + for link in asset.get('Links', []): + uri = link.get('Uri') + if not uri: + encrypted_uri = link.get('EncryptedUri') + if not encrypted_uri: + continue + try: + uri = decrypt_uri(encrypted_uri) + except Exception: + self.report_warning( + 'Unable to decrypt EncryptedUri', video_id) + continue + uri = url_or_none(uri) + if not uri: + continue + target = link.get('Target') + format_id = target or '' + if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): + preference = -1 + format_id += '-%s' % asset_target + elif asset_target == 'Default': + preference = 1 + else: + preference = None + if target == 'HDS': + f4m_formats = self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id, fatal=False) + if kind == 'AudioResource': + for f in f4m_formats: + f['vcodec'] = 'none' + formats.extend(f4m_formats) + elif target == 'HLS': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', entry_protocol='m3u8_native', + quality=preference, m3u8_id=format_id, + fatal=False)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': int_or_none(bitrate), + 'ext': link.get('FileFormat'), + 'vcodec': 'none' if kind == 'AudioResource' else None, + 'quality': preference, + }) + subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist') + if isinstance(subtitles_list, list): + LANGS = { + 'Danish': 'da', + 
} + for subs in subtitles_list: + if not isinstance(subs, dict): + continue + sub_uri = url_or_none(subs.get('Uri')) + if not sub_uri: + continue + lang = subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': sub_uri, + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) + + if not formats and restricted_to_denmark: + self.raise_geo_restricted( + 'Unfortunately, DR is not allowed to show this program outside Denmark.', + countries=self._GEO_COUNTRIES) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'series': str_or_none(data.get('SeriesTitle')), + 'season': str_or_none(data.get('SeasonTitle')), + 'season_number': int_or_none(data.get('SeasonNumber')), + 'season_id': str_or_none(data.get('SeasonUrn')), + 'episode': str_or_none(data.get('EpisodeTitle')), + 'episode_number': int_or_none(data.get('EpisodeNumber')), + 'release_year': int_or_none(data.get('ProductionYear')), + } + + +class DRTVLiveIE(InfoExtractor): + IE_NAME = 'drtv:live' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' + _GEO_COUNTRIES = ['DK'] + _TEST = { + 'url': 'https://www.dr.dk/tv/live/dr1', + 'info_dict': { + 'id': 'dr1', + 'ext': 'mp4', + 'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_data = self._download_json( + 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, + channel_id) + title = self._live_title(channel_data['Title']) + + formats = [] + for streaming_server in channel_data.get('StreamingServers', []): + server = streaming_server.get('Server') + if not server: + continue + link_type = streaming_server.get('LinkType') + for quality in streaming_server.get('Qualities', []): + for stream in quality.get('Streams', []): + stream_path = stream.get('Stream') + if not stream_path: + continue + stream_url = update_url_query( + '%s/%s' % (server, stream_path), {'b': ''}) + if link_type == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, channel_id, 'mp4', + m3u8_id=link_type, fatal=False, live=True)) + elif link_type == 'HDS': + formats.extend(self._extract_f4m_formats(update_url_query( + '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), + channel_id, f4m_id=link_type, fatal=False)) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': title, + 'thumbnail': channel_data.get('PrimaryImageUri'), + 'formats': formats, + 'is_live': True, + } diff --git a/yt_dlp/extractor/dtube.py b/yt_dlp/extractor/dtube.py new file mode 100644 index 000000000..ad247b7dd --- /dev/null +++ b/yt_dlp/extractor/dtube.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +from socket import timeout + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class DTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})' + _TEST = { + 'url': 'https://d.tube/#!/v/broncnutz/x380jtr1', + 'md5': '9f29088fa08d699a7565ee983f56a06e', + 'info_dict': { + 'id': 'x380jtr1', + 'ext': 'mp4', + 'title': 'Lefty 3-Rings is Back Baby!! 
NCAA Picks', + 'description': 'md5:60be222088183be3a42f196f34235776', + 'uploader_id': 'broncnutz', + 'upload_date': '20190107', + 'timestamp': 1546854054, + }, + 'params': { + 'format': '480p', + }, + } + + def _real_extract(self, url): + uploader_id, video_id = self._match_valid_url(url).groups() + result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({ + 'jsonrpc': '2.0', + 'method': 'get_content', + 'params': [uploader_id, video_id], + }).encode())['result'] + + metadata = json.loads(result['json_metadata']) + video = metadata['video'] + content = video['content'] + info = video.get('info', {}) + title = info.get('title') or result['title'] + + def canonical_url(h): + if not h: + return None + return 'https://video.dtube.top/ipfs/' + h + + formats = [] + for q in ('240', '480', '720', '1080', ''): + video_url = canonical_url(content.get('video%shash' % q)) + if not video_url: + continue + format_id = (q + 'p') if q else 'Source' + try: + self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) + self._downloader._opener.open(video_url, timeout=5).close() + except timeout: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, format_id)) + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'height': int_or_none(q), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('description'), + 'thumbnail': canonical_url(info.get('snaphash')), + 'tags': content.get('tags') or metadata.get('tags'), + 'duration': info.get('duration'), + 'formats': formats, + 'timestamp': parse_iso8601(result.get('created')), + 'uploader_id': uploader_id, + } diff --git a/yt_dlp/extractor/duboku.py b/yt_dlp/extractor/duboku.py new file mode 100644 index 000000000..a87597873 --- /dev/null +++ b/yt_dlp/extractor/duboku.py @@ -0,0 +1,242 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + clean_html, + extract_attributes, + ExtractorError, + get_elements_by_class, + int_or_none, + js_to_json, + smuggle_url, + unescapeHTML, +) + + +def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): + """Return the content of the tag with the specified attribute in the passed HTML document""" + + if tag is None: + tag = '[a-zA-Z0-9:._-]+' + if attribute is None: + attribute = '' + else: + attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute) + if value is None: + value = '' + else: + value = re.escape(value) if escape_value else value + value = '=[\'"]?(?P<value>%s)[\'"]?' % value + + retlist = [] + for m in re.finditer(r'''(?xs) + <(?P<tag>%s) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + %s%s + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s*> + (?P<content>.*?) 
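+            # ((?x) verbose mode permits this comment: the \1 backreference
+            # below only closes the same tag name that the opening
+            # (?P<tag>...) group captured)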
+ </\1> + ''' % (tag, attribute, value), html): + retlist.append(m) + + return retlist + + +def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): + retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value) + return retval[0] if retval else None + + +class DubokuIE(InfoExtractor): + IE_NAME = 'duboku' + IE_DESC = 'www.duboku.co' + + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' + _TESTS = [{ + 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'info_dict': { + 'id': '1575-1-1', + 'ext': 'ts', + 'series': '白色月光', + 'title': 'contains:白色月光', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }, { + 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'info_dict': { + 'id': '1588-1-1', + 'ext': 'ts', + 'series': '亲爱的自己', + 'title': 'contains:预告片', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script' + + def _real_extract(self, url): + video_id = self._match_id(url) + temp = video_id.split('-') + series_id = temp[0] + season_id = temp[1] + episode_id = temp[2] + + webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id + webpage_html = self._download_webpage(webpage_url, video_id) + + # extract video url + + player_data = self._search_regex( + self._PLAYER_DATA_PATTERN, webpage_html, 'player_data') + player_data = self._parse_json(player_data, video_id, js_to_json) + + # extract title + + temp = get_elements_by_class('title', webpage_html) + series_title = None + title = None + for html in temp: + mobj = re.search(r'<a\s+.*>(.*)</a>', html) + if mobj: + href = extract_attributes(mobj.group(0)).get('href') + if href: + mobj1 = re.search(r'/(\d+)\.html', href) + if mobj1 and mobj1.group(1) == series_id: + series_title = clean_html(mobj.group(0)) + series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title) + title = clean_html(html) + title = re.sub(r'[\s\r\n\t]+', ' ', title) + break + + data_url = player_data.get('url') + if not data_url: + raise ExtractorError('Cannot find url in player_data') + data_from = player_data.get('from') + + # if it is an embedded iframe, maybe it's an external source + if data_from == 'iframe': + # use _type url_transparent to retain the meaningful details + # of the video. 
+ return { + '_type': 'url_transparent', + 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'id': video_id, + 'title': title, + 'series': series_title, + 'season_number': int_or_none(season_id), + 'season_id': season_id, + 'episode_number': int_or_none(episode_id), + 'episode_id': episode_id, + } + + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': title, + 'series': series_title, + 'season_number': int_or_none(season_id), + 'season_id': season_id, + 'episode_number': int_or_none(episode_id), + 'episode_id': episode_id, + 'formats': formats, + 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} + } + + +class DubokuPlaylistIE(InfoExtractor): + IE_NAME = 'duboku:list' + IE_DESC = 'www.duboku.co entire series' + + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*' + _TESTS = [{ + 'url': 'https://www.duboku.co/voddetail/1575.html', + 'info_dict': { + 'id': 'startswith:1575', + 'title': '白色月光', + }, + 'playlist_count': 12, + }, { + 'url': 'https://www.duboku.co/voddetail/1554.html', + 'info_dict': { + 'id': 'startswith:1554', + 'title': '以家人之名', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', + 'info_dict': { + 'id': '1554#playlist2', + 'title': '以家人之名', + }, + 'playlist_mincount': 27, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + series_id = mobj.group('id') + fragment = compat_urlparse.urlparse(url).fragment + + webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_html = self._download_webpage(webpage_url, series_id) + + # extract title + + title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title') + title = unescapeHTML(title.group('content')) if title else None + if not title: + title = self._html_search_meta('keywords', webpage_html) + if not title: + title = _get_element_by_tag_and_attrib(webpage_html, 'title') + title = unescapeHTML(title.group('content')) if title else None + + # extract playlists + + playlists = {} + for div in _get_elements_by_tag_and_attrib( + webpage_html, attribute='id', value='playlist\\d+', escape_value=False): + playlist_id = div.group('value') + playlist = [] + for a in _get_elements_by_tag_and_attrib( + div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False): + playlist.append({ + 'href': unescapeHTML(a.group('value')), + 'title': unescapeHTML(a.group('content')) + }) + playlists[playlist_id] = playlist + + # select the specified playlist if url fragment exists + playlist = None + playlist_id = None + if fragment: + playlist = playlists.get(fragment) + playlist_id = fragment + else: + first = next(iter(playlists.items()), None) + if first: + (playlist_id, playlist) = first + if not playlist: + raise ExtractorError( + 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist') + + # return url results + return self.playlist_result([ + self.url_result( + compat_urlparse.urljoin('https://www.duboku.co', x['href']), + ie=DubokuIE.ie_key(), video_title=x.get('title')) + for x in playlist], series_id + '#' + playlist_id, title) diff --git a/youtube_dl/extractor/dumpert.py b/yt_dlp/extractor/dumpert.py index d9d9afdec..d9d9afdec 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/yt_dlp/extractor/dumpert.py diff --git a/youtube_dl/extractor/dvtv.py b/yt_dlp/extractor/dvtv.py index de7f6d670..de7f6d670 100644 
--- a/youtube_dl/extractor/dvtv.py +++ b/yt_dlp/extractor/dvtv.py diff --git a/yt_dlp/extractor/dw.py b/yt_dlp/extractor/dw.py new file mode 100644 index 000000000..6eaee07b4 --- /dev/null +++ b/yt_dlp/extractor/dw.py @@ -0,0 +1,110 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + url_or_none, +) +from ..compat import compat_urlparse + + +class DWIE(InfoExtractor): + IE_NAME = 'dw' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)' + _TESTS = [{ + # video + 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', + 'md5': 'fb9dfd9520811d3ece80f04befd73428', + 'info_dict': { + 'id': '19112290', + 'ext': 'mp4', + 'title': 'Intelligent light', + 'description': 'md5:90e00d5881719f2a6a5827cb74985af1', + 'upload_date': '20160605', + } + }, { + # audio + 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', + 'md5': '2814c9a1321c3a51f8a7aeb067a360dd', + 'info_dict': { + 'id': '19111941', + 'ext': 'mp3', + 'title': 'WorldLink: My business', + 'description': 'md5:bc9ca6e4e063361e21c920c53af12405', + 'upload_date': '20160311', + } + }, { + # DW documentaries, only last for one or two weeks + 'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798', + 'md5': '56b6214ef463bfb9a3b71aeb886f3cf1', + 'info_dict': { + 'id': '19274438', + 'ext': 'mp4', + 'title': 'Welcome to the 90s – Hip Hop', + 'description': 'Welcome to the 90s - The Golden Decade of Hip Hop', + 'upload_date': '20160521', + }, + 'skip': 'Video removed', + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs['media_title'] + media_id = hidden_inputs.get('media_id') or media_id + + direct_url = url_or_none(hidden_inputs.get('file_name')) + if direct_url: + formats = [{'url': hidden_inputs['file_name']}] + else: + formats = self._extract_smil_formats( + 'http://www.dw.com/smil/v-%s' % media_id, media_id, + transform_source=lambda s: s.replace( + 'rtmp://tv-od.dw.de/flash/', + 'http://tv-download.dw.de/dwtv_video/flv/')) + self._sort_formats(formats) + + upload_date = hidden_inputs.get('display_date') + if not upload_date: + upload_date = self._html_search_regex( + r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage, + 'upload date', default=None) + upload_date = unified_strdate(upload_date) + + return { + 'id': media_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': hidden_inputs.get('preview_image'), + 'duration': int_or_none(hidden_inputs.get('file_duration')), + 'upload_date': upload_date, + 'formats': formats, + } + + +class DWArticleIE(InfoExtractor): + IE_NAME = 'dw:article' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)' + _TEST = { + 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', + 'md5': '8ca657f9d068bbef74d6fc38b97fc869', + 'info_dict': { + 'id': '19105868', + 'ext': 'mp4', + 'title': 'The harsh life of refugees in Idomeni', + 'description': 'md5:196015cc7e48ebf474db9399420043c7', + 'upload_date': '20160310', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + hidden_inputs = self._hidden_inputs(webpage) + media_id = hidden_inputs['media_id'] + media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, 
webpage, 'media url') + media_url = compat_urlparse.urljoin(url, media_path) + return self.url_result(media_url, 'DW', media_id) diff --git a/yt_dlp/extractor/eagleplatform.py b/yt_dlp/extractor/eagleplatform.py new file mode 100644 index 000000000..f86731a0c --- /dev/null +++ b/yt_dlp/extractor/eagleplatform.py @@ -0,0 +1,206 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + unsmuggle_url, + url_or_none, +) + + +class EaglePlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + eagleplatform:(?P<custom_host>[^/]+):| + https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id= + ) + (?P<id>\d+) + ''' + _TESTS = [{ + # http://lenta.ru/news/2015/03/06/navalny/ + 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + }, { + # http://muz-tv.ru/play/7129/ + # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true + 'url': 'eagleplatform:media.clipyou.ru:12820', + 'md5': '358597369cf8ba56675c1df15e7af624', + 'info_dict': { + 'id': '12820', + 'ext': 'mp4', + 'title': "'O Sole Mio", + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 216, + 'view_count': int, + }, + 'skip': 'Georestricted', + }, { + # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) + 'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # Regular iframe embedding + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', + webpage) + if mobj is not None: + return mobj.group('url') + PLAYER_JS_RE = r''' + <script[^>]+ + src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) + .+? + ''' + # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + %s + <div[^>]+ + class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+ + data-id=["\'](?P<id>\d+) + ''' % PLAYER_JS_RE, webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + # Generalization of "Javascript code usage", "Combined usage" and + # "Usage without attaching to DOM" embeddings (see + # http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + %s + <script> + .+? + new\s+EaglePlayer\( + (?:[^,]+\s*,\s*)? + { + .+? + \bid\s*:\s*["\']?(?P<id>\d+) + .+? + } + \s*\) + .+? 
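+                # ((?x) verbose mode permits this comment: the non-greedy .+?
+                # above only bridges the rest of the script body up to the
+                # closing </script> tag, so just the id capture is kept)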
+ </script> + ''' % PLAYER_JS_RE, webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + + @staticmethod + def _handle_error(response): + status = int_or_none(response.get('status', 200)) + if status != 200: + raise ExtractorError(' '.join(response['errors']), expected=True) + + def _download_json(self, url_or_request, video_id, *args, **kwargs): + try: + response = super(EaglePlatformIE, self)._download_json( + url_or_request, video_id, *args, **kwargs) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError): + response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + self._handle_error(response) + raise + return response + + def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'): + return self._download_json(url_or_request, video_id, note)['data'][0] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = self._match_valid_url(url) + host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + + headers = {} + query = { + 'id': video_id, + } + + referrer = smuggled_data.get('referrer') + if referrer: + headers['Referer'] = referrer + query['referrer'] = referrer + + player_data = self._download_json( + 'http://%s/api/player_data' % host, video_id, + headers=headers, query=query) + + media = player_data['data']['playlist']['viewports'][0]['medialist'][0] + + title = media['title'] + description = media.get('description') + thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:') + duration = int_or_none(media.get('duration')) + view_count = int_or_none(media.get('views')) + + age_restriction = media.get('age_restriction') + age_limit = None + if age_restriction: + age_limit = 0 if age_restriction == 'allow_all' else 18 + + secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:') + + formats = [] + + m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + + m3u8_formats_dict = {} + for f in m3u8_formats: + if f.get('height') is not None: + m3u8_formats_dict[f['height']] = f + + mp4_data = self._download_json( + # Secure mp4 URL is constructed according to Player.prototype.mp4 from + # http://lentaru.media.eagleplatform.com/player/player.js + re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4s', secure_m3u8), + video_id, 'Downloading mp4 JSON', fatal=False) + if mp4_data: + for format_id, format_url in mp4_data.get('data', {}).items(): + if not url_or_none(format_url): + continue + height = int_or_none(format_id) + if height is not None and m3u8_formats_dict.get(height): + f = m3u8_formats_dict[height].copy() + f.update({ + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + else: + f = { + 'format_id': 'http-%s' % format_id, + 'height': int_or_none(format_id), + } + f['url'] = format_url + formats.append(f) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ebaumsworld.py b/yt_dlp/extractor/ebaumsworld.py index c97682cd3..c97682cd3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/yt_dlp/extractor/ebaumsworld.py diff --git a/youtube_dl/extractor/echomsk.py 
b/yt_dlp/extractor/echomsk.py index 6b7cc652f..6b7cc652f 100644 --- a/youtube_dl/extractor/echomsk.py +++ b/yt_dlp/extractor/echomsk.py diff --git a/yt_dlp/extractor/egghead.py b/yt_dlp/extractor/egghead.py new file mode 100644 index 000000000..f6b50e7c2 --- /dev/null +++ b/yt_dlp/extractor/egghead.py @@ -0,0 +1,139 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource, fatal=fatal) + + +class EggheadCourseIE(EggheadBaseIE): + IE_DESC = 'egghead.io course' + IE_NAME = 'egghead:course' + _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', + 'playlist_count': 29, + 'info_dict': { + 'id': '432655', + 'title': 'Professor Frisby Introduces Composable Functional JavaScript', + 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', + }, + }, { + 'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') + + entries = [] + for lesson in lessons: + lesson_url = url_or_none(lesson.get('http_url')) + if not lesson_url: + continue + lesson_id = lesson.get('id') + if lesson_id: + lesson_id = compat_str(lesson_id) + entries.append(self.url_result( + lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) + + course = self._call_api( + series_path, playlist_id, 'course', False) or {} + + playlist_id = course.get('id') + if playlist_id: + playlist_id = compat_str(playlist_id) + + return self.playlist_result( + entries, playlist_id, course.get('title'), + course.get('description')) + + +class EggheadLessonIE(EggheadBaseIE): + IE_DESC = 'egghead.io lesson' + IE_NAME = 'egghead:lesson' + _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', + 'info_dict': { + 'id': '1196', + 'display_id': 'javascript-linear-data-flow-with-container-style-types-box', + 'ext': 'mp4', + 'title': 'Create linear data flow with container style types (Box)', + 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', + 'thumbnail': r're:^https?:.*\.jpg$', + 'timestamp': 1481296768, + 'upload_date': '20161209', + 'duration': 304, + 'view_count': 0, + 'tags': 'count:2', + }, + 'params': { + 'skip_download': True, + 'format': 'bestvideo', + }, + }, { + 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', + 'only_matching': True, + }, { + 'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') + + lesson_id = compat_str(lesson['id']) + title = lesson['title'] + + formats = [] + 
for _, format_url in lesson['media_urls'].items(): + format_url = url_or_none(format_url) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, lesson_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, lesson_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + self._sort_formats(formats) + + return { + 'id': lesson_id, + 'display_id': display_id, + 'title': title, + 'description': lesson.get('summary'), + 'thumbnail': lesson.get('thumb_nail'), + 'timestamp': unified_timestamp(lesson.get('published_at')), + 'duration': int_or_none(lesson.get('duration')), + 'view_count': int_or_none(lesson.get('plays_count')), + 'tags': try_get(lesson, lambda x: x['tag_list'], list), + 'series': try_get( + lesson, lambda x: x['series']['title'], compat_str), + 'formats': formats, + } diff --git a/youtube_dl/extractor/ehow.py b/yt_dlp/extractor/ehow.py index b1cd4f5d4..b1cd4f5d4 100644 --- a/youtube_dl/extractor/ehow.py +++ b/yt_dlp/extractor/ehow.py diff --git a/youtube_dl/extractor/eighttracks.py b/yt_dlp/extractor/eighttracks.py index 9a44f89f3..9a44f89f3 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/yt_dlp/extractor/eighttracks.py diff --git a/yt_dlp/extractor/einthusan.py b/yt_dlp/extractor/einthusan.py new file mode 100644 index 000000000..7af279a53 --- /dev/null +++ b/yt_dlp/extractor/einthusan.py @@ -0,0 +1,110 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, + compat_urlparse, +) +from ..utils import ( + extract_attributes, + ExtractorError, + get_elements_by_class, + urlencode_postdata, +) + + +class EinthusanIE(InfoExtractor): + _VALID_URL = r'https?://(?P<host>einthusan\.(?:tv|com|ca))/movie/watch/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://einthusan.tv/movie/watch/9097/', + 'md5': 'ff0f7f2065031b8a2cf13a933731c035', + 'info_dict': { + 'id': '9097', + 'ext': 'mp4', + 'title': 'Ae Dil Hai Mushkil', + 'description': 'md5:33ef934c82a671a94652a9b4e54d931b', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi', + 'only_matching': True, + }, { + 'url': 'https://einthusan.com/movie/watch/9097/', + 'only_matching': True, + }, { + 'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi', + 'only_matching': True, + }] + + # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js + def _decrypt(self, encrypted_data, video_id): + return self._parse_json(compat_b64decode(( + encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] + )).decode('utf-8'), video_id) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h3>([^<]+)</h3>', webpage, 'title') + + player_params = extract_attributes(self._search_regex( + r'(<section[^>]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters')) + + page_id = self._html_search_regex( + '<html[^>]+data-pageid="([^"]+)"', webpage, 'page ID') + video_data = self._download_json( + 'https://%s/ajax/movie/watch/%s/' % (host, video_id), video_id, + data=urlencode_postdata({ + 'xEvent': 'UIVideoPlayer.PingOutcome', + 'xJson': json.dumps({ + 'EJOutcomes': player_params['data-ejpingables'], + 'NativeHLS': 
False + }), + 'arcVersion': 3, + 'appVersion': 59, + 'gorilla.csrf.Token': page_id, + }))['Data'] + + if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'): + raise ExtractorError( + 'Download rate reached. Please try again later.', expected=True) + + ej_links = self._decrypt(video_data['EJLinks'], video_id) + + formats = [] + + m3u8_url = ej_links.get('HLSLink') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) + + mp4_url = ej_links.get('MP4Link') + if mp4_url: + formats.append({ + 'url': mp4_url, + }) + + self._sort_formats(formats) + + description = get_elements_by_class('synopsis', webpage)[0] + thumbnail = self._html_search_regex( + r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''', + webpage, 'thumbnail url', fatal=False, group='url') + if thumbnail is not None: + thumbnail = compat_urlparse.urljoin(url, thumbnail) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + } diff --git a/youtube_dl/extractor/eitb.py b/yt_dlp/extractor/eitb.py index ee5ead18b..ee5ead18b 100644 --- a/youtube_dl/extractor/eitb.py +++ b/yt_dlp/extractor/eitb.py diff --git a/youtube_dl/extractor/ellentube.py b/yt_dlp/extractor/ellentube.py index 544473274..544473274 100644 --- a/youtube_dl/extractor/ellentube.py +++ b/yt_dlp/extractor/ellentube.py diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py new file mode 100644 index 000000000..eefba4e24 --- /dev/null +++ b/yt_dlp/extractor/elonet.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + base_url, + ExtractorError, + try_get, +) +from ..compat import compat_str + + +class ElonetIE(InfoExtractor): + _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)' + _TESTS = [{ + # m3u8 with subtitles + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', + 'md5': '8efc954b96c543711707f87de757caea', + 'info_dict': { + 'id': '107867', + 'ext': 'mp4', + 'title': 'Valkoinen peura', + 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', + 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', + }, + }, { + # DASH with subtitles + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', + 'info_dict': { + 'id': '116539', + 'ext': 'mp4', + 'title': 'Minulla on tiikeri', + 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. 
Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', + 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<meta .*property="og:title" .*content="(.+?)"', webpage, 'title') + description = self._html_search_regex( + r'<meta .*property="og:description" .*content="(.+?)"', webpage, 'description') + thumbnail = self._html_search_regex( + r'<meta .*property="og:image" .*content="(.+?)"', webpage, 'thumbnail') + + json_s = self._html_search_regex( + r'data-video-sources="(.+?)"', webpage, 'json') + src = try_get( + self._parse_json(json_s, video_id), + lambda x: x[0]["src"], compat_str) + formats = [] + subtitles = {} + if re.search(r'\.m3u8\??', src): + res = self._download_webpage_handle( + # elonet servers have certificate problems + src.replace('https:', 'http:'), video_id, + note='Downloading m3u8 information', + errnote='Failed to download m3u8 information') + if res: + doc, urlh = res + url = urlh.geturl() + formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) + for f in formats: + f['ext'] = 'mp4' + elif re.search(r'\.mpd\??', src): + res = self._download_xml_handle( + src, video_id, + note='Downloading MPD manifest', + errnote='Failed to download MPD manifest') + if res: + doc, urlh = res + url = base_url(urlh.geturl()) + formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) + else: + raise ExtractorError("Unknown streaming format") + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/elpais.py b/yt_dlp/extractor/elpais.py index b89f6db62..b89f6db62 100644 --- a/youtube_dl/extractor/elpais.py +++ b/yt_dlp/extractor/elpais.py diff --git a/youtube_dl/extractor/embedly.py b/yt_dlp/extractor/embedly.py index a5820b21e..a5820b21e 100644 --- a/youtube_dl/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py diff --git a/youtube_dl/extractor/engadget.py b/yt_dlp/extractor/engadget.py index 65635c18b..65635c18b 100644 --- a/youtube_dl/extractor/engadget.py +++ b/yt_dlp/extractor/engadget.py diff --git a/yt_dlp/extractor/epicon.py b/yt_dlp/extractor/epicon.py new file mode 100644 index 000000000..b4e544d4f --- /dev/null +++ b/yt_dlp/extractor/epicon.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class EpiconIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.epicon.in/documentaries/air-battle-of-srinagar', + 'info_dict': { + 'id': 'air-battle-of-srinagar', + 'ext': 'mp4', + 'title': 'Air Battle of Srinagar', + 'description': 'md5:c4de2013af9bc05ae4392e4115d518d7', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.epicon.in/movies/krit', + 'info_dict': { + 'id': 'krit', + 'ext': 'mp4', + 'title': 'Krit', + 'description': 'md5:c12b35dad915d48ccff7f013c79bab4a', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.epicon.in/tv-shows/paapnaashini-ganga/season-1/vardaan', + 'info_dict': { + 'id': 'vardaan', + 'ext': 'mp4', + 'title': 'Paapnaashini Ganga - Season 1 - Ep 1 - VARDAAN', + 'description': 
'md5:f517058c3d0402398eefa6242f4dd6ae',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'https://www.epicon.in/movies/jayadev',
+        'info_dict': {
+            'id': 'jayadev',
+            'ext': 'mp4',
+            'title': 'Jayadev',
+            'description': 'md5:09e349eecd8e585a3b6466904f19df6c',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        cid = self._search_regex(r'class="mylist-icon iconclick" id="(\d+)', webpage, 'cid')
+        headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}
+        data = f'cid={cid}&action=st&type=video'.encode()
+        # _download_json already returns the decoded JSON object; wrapping it
+        # in a second _parse_json call would fail on the resulting dict
+        data_json = self._download_json('https://www.epicon.in/ajaxplayer/', id, headers=headers, data=data)
+
+        if not data_json['success']:
+            raise ExtractorError(data_json['message'], expected=True)
+
+        title = self._search_regex(r'setplaytitle="([^"]+)', webpage, 'title')
+        description = self._og_search_description(webpage) or None
+        thumbnail = self._og_search_thumbnail(webpage) or None
+        formats = self._extract_m3u8_formats(data_json['url']['video_url'], id)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for subtitle in data_json.get('subtitles', []):
+            sub_url = subtitle.get('file')
+            if not sub_url:
+                continue
+            subtitles.setdefault(subtitle.get('lang', 'English'), []).append({
+                'url': self._proto_relative_url(sub_url),
+            })
+
+        return {
+            'id': id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'subtitles': subtitles,
+        }
+
+
+class EpiconSeriesIE(InfoExtractor):
+    _VALID_URL = r'(?!.*season)(?:https?://)(?:www\.)?epicon\.in/tv-shows/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.epicon.in/tv-shows/1-of-something',
+        'playlist_mincount': 5,
+        'info_dict': {
+            'id': '1-of-something',
+        },
+    }, {
+        'url': 'https://www.epicon.in/tv-shows/eco-india-english',
+        'playlist_mincount': 76,
+        'info_dict': {
+            'id': 'eco-india-english',
+        },
+    }, {
+        'url': 'https://www.epicon.in/tv-shows/s/',
+        'playlist_mincount': 25,
+        'info_dict': {
+            'id': 's',
+        },
+    }, {
+        'url': 'https://www.epicon.in/tv-shows/ekaant',
+        'playlist_mincount': 38,
+        'info_dict': {
+            'id': 'ekaant',
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        episodes = re.findall(r'ct-tray-url="(tv-shows/%s/[^"]+)' % id, webpage)
+        entries = [self.url_result('https://www.epicon.in/%s' % episode, ie=EpiconIE.ie_key()) for episode in episodes]
+        return self.playlist_result(entries, playlist_id=id)
diff --git a/yt_dlp/extractor/eporner.py b/yt_dlp/extractor/eporner.py
new file mode 100644
index 000000000..25a0d9799
--- /dev/null
+++ b/yt_dlp/extractor/eporner.py
@@ -0,0 +1,131 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+    encode_base_n,
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+    parse_duration,
+    str_to_int,
+    url_or_none,
+)
+
+
+class EpornerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?'
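+    # Covers the legacy /hd-porn/<id>/<slug>/ and /embed/<id> layouts as well
+    # as the newer /video-<id>/<slug> layout, e.g.
+    #   http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/
+    #   https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/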
+ _TESTS = [{ + 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', + 'md5': '39d486f046212d8e1b911c52ab4691f8', + 'info_dict': { + 'id': 'qlDUmNsj6VS', + 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', + 'ext': 'mp4', + 'title': 'Infamous Tiffany Teen Strip Tease Video', + 'description': 'md5:764f39abf932daafa37485eb46efa152', + 'timestamp': 1232520922, + 'upload_date': '20090121', + 'duration': 1838, + 'view_count': int, + 'age_limit': 18, + }, + 'params': { + 'proxy': '127.0.0.1:8118' + } + }, { + # New (May 2016) URL layout + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', + 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/embed/3YRUtzMcWn0', + 'only_matching': True, + }, { + 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage, urlh = self._download_webpage_handle(url, display_id) + + video_id = self._match_id(urlh.geturl()) + + hash = self._search_regex( + r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') + + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<title>(.+?) - EPORNER', webpage, 'title') + + # Reverse engineered from vjs.js + def calc_hash(s): + return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) + + video = self._download_json( + 'http://www.eporner.com/xhr/video/%s' % video_id, + display_id, note='Downloading video JSON', + query={ + 'hash': calc_hash(hash), + 'device': 'generic', + 'domain': 'www.eporner.com', + 'fallback': 'false', + }) + + if video.get('available') is False: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, video['message']), expected=True) + + sources = video['sources'] + + formats = [] + for kind, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_dict in formats_dict.items(): + if not isinstance(format_dict, dict): + continue + src = url_or_none(format_dict.get('src')) + if not src or not src.startswith('http'): + continue + if kind == 'hls': + formats.extend(self._extract_m3u8_formats( + src, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + fps = int_or_none(self._search_regex( + r'(\d+)fps', format_id, 'fps', default=None)) + + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + 'fps': fps, + }) + self._sort_formats(formats) + + json_ld = self._search_json_ld(webpage, display_id, default={}) + + duration = parse_duration(self._html_search_meta( + 'duration', webpage, default=None)) + view_count = str_to_int(self._search_regex( + r'id=["\']cinemaviews1["\'][^>]*>\s*([0-9,]+)', + webpage, 'view count', default=None)) + + return merge_dicts(json_ld, { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + }) diff --git a/yt_dlp/extractor/eroprofile.py b/yt_dlp/extractor/eroprofile.py new file mode 100644 index 000000000..a8396f1d3 --- /dev/null +++ b/yt_dlp/extractor/eroprofile.py @@ -0,0 +1,131 @@ +from __future__ import unicode_literals + +import re + +from .common import 
InfoExtractor
+from ..compat import compat_urllib_parse_urlencode
+from ..utils import (
+    ExtractorError,
+    merge_dicts,
+)
+
+
+class EroProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
+    _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?'
+    _NETRC_MACHINE = 'eroprofile'
+    _TESTS = [{
+        'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
+        'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
+        'info_dict': {
+            'id': '3733775',
+            'display_id': 'sexy-babe-softcore',
+            'ext': 'm4v',
+            'title': 'sexy babe softcore',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'skip': 'Video not found',
+    }, {
+        'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
+        'md5': '1baa9602ede46ce904c431f5418d8916',
+        'info_dict': {
+            'id': '1133519',
+            'ext': 'm4v',
+            'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'skip': 'Requires login',
+    }]
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        query = compat_urllib_parse_urlencode({
+            'username': username,
+            'password': password,
+            'url': 'http://www.eroprofile.com/',
+        })
+        login_url = self._LOGIN_URL + query
+        login_page = self._download_webpage(login_url, None, note=False)
+
+        m = re.search(r'Your username or password was incorrect\.', login_page)
+        if m:
+            raise ExtractorError(
+                'Wrong username and/or password.', expected=True)
+
+        self.report_login()
+        redirect_url = self._search_regex(
+            r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url')
+        self._download_webpage(redirect_url, None, note=False)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        m = re.search(r'You must be logged in to view this video\.', webpage)
+        if m:
+            self.raise_login_required('This video requires login')
+
+        video_id = self._search_regex(
+            [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
+            webpage, 'video id', default=None)
+
+        title = self._html_search_regex(
+            (r'Title:</th><td>([^<]+)</td>', r'<h1[^>]*>(.+?)</h1>'),
+            webpage, 'title')
+
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'age_limit': 18,
+        })
+
+
+class EroProfileAlbumIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/album/(?P<id>[^/]+)'
+    IE_NAME = 'EroProfile:album'
+
+    _TESTS = [{
+        'url': 'https://www.eroprofile.com/m/videos/album/BBW-2-893',
+        'info_dict': {
+            'id': 'BBW-2-893',
+            'title': 'BBW 2'
+        },
+        'playlist_mincount': 486,
+    },
+    ]
+
+    def _extract_from_page(self, page):
+        for url in re.findall(r'href=".*?(/m/videos/view/[^"]+)"', page):
+            yield self.url_result(f'https://www.eroprofile.com{url}', EroProfileIE.ie_key())
+
+    def _entries(self, playlist_id, first_page):
+        yield from self._extract_from_page(first_page)
+
+        page_urls = re.findall(rf'href=".*?(/m/videos/album/{playlist_id}\?pnum=(\d+))"', first_page)
+        # default=1 avoids a ValueError on single-page albums, which have no pnum links
+        max_page = max((int(n) for _, n in page_urls), default=1)
+
+        for n in range(2, max_page + 1):
+            url = f'https://www.eroprofile.com/m/videos/album/{playlist_id}?pnum={n}'
+            yield from self._extract_from_page(
+                self._download_webpage(url, playlist_id,
+                                       note=f'Downloading playlist page {n - 1}'))
+
+    def 
_real_extract(self, url): + playlist_id = self._match_id(url) + first_page = self._download_webpage(url, playlist_id, note='Downloading playlist') + playlist_title = self._search_regex( + r'<title>Album: (.*) - EroProfile</title>', first_page, 'playlist_title') + + return self.playlist_result(self._entries(playlist_id, first_page), playlist_id, playlist_title) diff --git a/youtube_dl/extractor/escapist.py b/yt_dlp/extractor/escapist.py index 4cd815ebc..4cd815ebc 100644 --- a/youtube_dl/extractor/escapist.py +++ b/yt_dlp/extractor/escapist.py diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py new file mode 100644 index 000000000..d4a66c29f --- /dev/null +++ b/yt_dlp/extractor/espn.py @@ -0,0 +1,238 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .once import OnceIE +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + unified_timestamp, +) + + +class ESPNIE(OnceIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + (?: + (?:(?:\w+\.)+)?espn\.go| + (?:www\.)?espn + )\.com/ + (?: + (?: + video/(?:clip|iframe/twitter)| + watch/player + ) + (?: + .*?\?.*?\bid=| + /_/id/ + )| + [^/]+/video/ + ) + )| + (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ + ) + (?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'http://espn.go.com/video/clip?id=10365079', + 'info_dict': { + 'id': '10365079', + 'ext': 'mp4', + 'title': '30 for 30 Shorts: Judging Jewell', + 'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f', + 'timestamp': 1390936111, + 'upload_date': '20140128', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://broadband.espn.go.com/video/clip?id=18910086', + 'info_dict': { + 'id': '18910086', + 'ext': 'mp4', + 'title': 'Kyrie spins around defender for two', + 'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b', + 'timestamp': 1489539155, + 'upload_date': '20170315', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672', + 'only_matching': True, + }, { + 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?id=19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player/_/id/19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/video/clip?id=10365079', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/video/clip/_/id/17989860', + 'only_matching': True, + }, { + 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls', + 'only_matching': True, + }, { + 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + clip = self._download_json( + 'http://api-app.espn.com/v1/video/clips/%s' % video_id, + video_id)['videos'][0] + + title = clip['headline'] + + format_urls = set() + formats = [] + + def traverse_source(source, base_source_id=None): + for source_id, source in 
source.items(): + if source_id == 'alert': + continue + elif isinstance(source, compat_str): + extract_source(source, base_source_id) + elif isinstance(source, dict): + traverse_source( + source, + '%s-%s' % (base_source_id, source_id) + if base_source_id else source_id) + + def extract_source(source_url, source_id=None): + if source_url in format_urls: + return + format_urls.add(source_url) + ext = determine_ext(source_url) + if OnceIE.suitable(source_url): + formats.extend(self._extract_once_formats(source_url)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, video_id, f4m_id=source_id, fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=source_id, fatal=False)) + else: + f = { + 'url': source_url, + 'format_id': source_id, + } + mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'fps': int(mobj.group(2)), + 'tbr': int(mobj.group(3)), + }) + if source_id == 'mezzanine': + f['quality'] = 1 + formats.append(f) + + links = clip.get('links', {}) + traverse_source(links.get('source', {})) + traverse_source(links.get('mobile', {})) + self._sort_formats(formats) + + description = clip.get('caption') or clip.get('description') + thumbnail = clip.get('thumbnail') + duration = int_or_none(clip.get('duration')) + timestamp = unified_timestamp(clip.get('originalPublishDate')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } + + +class ESPNArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://espn.go.com/nba/recap?gameId=400793786', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P<id>\d+)', + webpage, 'video id', group='id') + + return self.url_result( + 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) + + +class FiveThirtyEightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/', + 'info_dict': { + 'id': '56032156', + 'ext': 'flv', + 'title': 'FiveThirtyEight: The Raiders can still make the playoffs', + 'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = 
self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + embed_url = self._search_regex( + r'<iframe[^>]+src=["\'](https?://fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/\d+)', + webpage, 'embed url') + + return self.url_result(embed_url, 'AbcNewsVideo') diff --git a/youtube_dl/extractor/esri.py b/yt_dlp/extractor/esri.py index e9dcaeb1d..e9dcaeb1d 100644 --- a/youtube_dl/extractor/esri.py +++ b/yt_dlp/extractor/esri.py diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py new file mode 100644 index 000000000..60ab2ce13 --- /dev/null +++ b/yt_dlp/extractor/europa.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + orderedSet, + parse_duration, + parse_qs, + qualities, + unified_strdate, + xpath_text +) + + +class EuropaIE(InfoExtractor): + _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', + 'md5': '574f080699ddd1e19a675b0ddf010371', + 'info_dict': { + 'id': 'I107758', + 'ext': 'mp4', + 'title': 'TRADE - Wikileaks on TTIP', + 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20150811', + 'duration': 34, + 'view_count': int, + 'formats': 'mincount:3', + } + }, { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', + 'only_matching': True, + }, { + 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + playlist = self._download_xml( + 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id) + + def get_item(type_, preference): + items = {} + for item in playlist.findall('./info/%s/item' % type_): + lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + if lang and label: + items[lang] = label.strip() + for p in preference: + if items.get(p): + return items[p] + + query = parse_qs(url) + preferred_lang = query.get('sitelang', ('en', ))[0] + + preferred_langs = orderedSet((preferred_lang, 'en', 'int')) + + title = get_item('title', preferred_langs) or video_id + description = get_item('description', preferred_langs) + thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') + upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) + duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) + view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) + + language_preference = qualities(preferred_langs[::-1]) + + formats = [] + for file_ in playlist.findall('./files/file'): + video_url = xpath_text(file_, './url') + if not video_url: + continue + lang = xpath_text(file_, './lg') + formats.append({ + 'url': video_url, + 'format_id': lang, + 'format_note': xpath_text(file_, './lglabel'), + 'language_preference': language_preference(lang) + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'formats': formats + } diff --git a/yt_dlp/extractor/euscreen.py b/yt_dlp/extractor/euscreen.py new file mode 100644 index 000000000..3980c2349 --- /dev/null +++ 
b/yt_dlp/extractor/euscreen.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + parse_duration, + js_to_json, +) + + +class EUScreenIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?euscreen\.eu/item.html\?id=(?P<id>[^&?$/]+)' + + _TESTS = [{ + 'url': 'https://euscreen.eu/item.html?id=EUS_0EBCBF356BFC4E12A014023BA41BD98C', + 'info_dict': { + 'id': 'EUS_0EBCBF356BFC4E12A014023BA41BD98C', + 'ext': 'mp4', + 'title': "L'effondrement du stade du Heysel", + 'alt_title': 'Collapse of the Heysel Stadium', + 'duration': 318.0, + 'description': 'md5:f0ffffdfce6821139357a1b8359d6152', + 'series': 'JA2 DERNIERE', + 'episode': '-', + 'uploader': 'INA / France', + 'thumbnail': 'http://images3.noterik.com/domain/euscreenxl/user/eu_ina/video/EUS_0EBCBF356BFC4E12A014023BA41BD98C/image.jpg' + }, + 'params': {'skip_download': True} + }] + + _payload = b'<fsxml><screen><properties><screenId>-1</screenId></properties><capabilities id="1"><properties><platform>Win32</platform><appcodename>Mozilla</appcodename><appname>Netscape</appname><appversion>5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36</appversion><useragent>Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36</useragent><cookiesenabled>true</cookiesenabled><screenwidth>784</screenwidth><screenheight>758</screenheight><orientation>undefined</orientation><smt_browserid>Sat, 07 Oct 2021 08:56:50 GMT</smt_browserid><smt_sessionid>1633769810758</smt_sessionid></properties></capabilities></screen></fsxml>' + + def _real_extract(self, url): + id = self._match_id(url) + args_for_js_request = self._download_webpage( + 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem', + id, data=self._payload, query={'actionlist': 'itempage', 'id': id}) + info_js = self._download_webpage( + 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem', + id, data=args_for_js_request.replace('screenid', 'screenId').encode()) + video_json = self._parse_json( + self._search_regex(r'setVideo\(({.+})\)\(\$end\$\)put', info_js, 'Video JSON'), + id, transform_source=js_to_json) + meta_json = self._parse_json( + self._search_regex(r'setData\(({.+})\)\(\$end\$\)', info_js, 'Metadata JSON'), + id, transform_source=js_to_json) + formats = [{ + 'url': source['src'], + } for source in video_json.get('sources', [])] + self._sort_formats(formats) + + return { + 'id': id, + 'title': meta_json.get('originalTitle'), + 'alt_title': meta_json.get('title'), + 'duration': parse_duration(meta_json.get('duration')), + 'description': '%s\n%s' % (meta_json.get('summaryOriginal', ''), meta_json.get('summaryEnglish', '')), + 'series': meta_json.get('series') or meta_json.get('seriesEnglish'), + 'episode': meta_json.get('episodeNumber'), + 'uploader': meta_json.get('provider'), + 'thumbnail': meta_json.get('screenshot') or video_json.get('screenshot'), + 'formats': formats, + } diff --git a/yt_dlp/extractor/everyonesmixtape.py b/yt_dlp/extractor/everyonesmixtape.py new file mode 100644 index 000000000..80cb032be --- /dev/null +++ b/yt_dlp/extractor/everyonesmixtape.py @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + sanitized_Request, +) + + +class EveryonesMixtapeIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$' + + _TESTS = [{ + 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', + 'info_dict': { + 'id': '5bfseWNmlds', + 'ext': 'mp4', + 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", + 'uploader': 'FKR.TV', + 'uploader_id': 'frenchkissrecords', + 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", + 'upload_date': '20081015' + }, + 'params': { + 'skip_download': True, # This is simply YouTube + } + }, { + 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', + 'info_dict': { + 'id': 'm7m0jJAbMQi', + 'title': 'Driving', + }, + 'playlist_count': 24 + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + playlist_id = mobj.group('id') + + pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id + pllist_req = sanitized_Request(pllist_url) + pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') + + playlist_list = self._download_json( + pllist_req, playlist_id, note='Downloading playlist metadata') + try: + playlist_no = next(playlist['id'] + for playlist in playlist_list + if playlist['code'] == playlist_id) + except StopIteration: + raise ExtractorError('Playlist id not found') + + pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no + pl_req = sanitized_Request(pl_url) + pl_req.add_header('X-Requested-With', 'XMLHttpRequest') + playlist = self._download_json( + pl_req, playlist_id, note='Downloading playlist info') + + entries = [{ + '_type': 'url', + 'url': t['url'], + 'title': t['title'], + } for t in playlist['tracks']] + + if mobj.group('songnr'): + songnr = int(mobj.group('songnr')) - 1 + return entries[songnr] + + playlist_title = playlist['mixData']['name'] + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': entries, + } diff --git a/youtube_dl/extractor/expotv.py b/yt_dlp/extractor/expotv.py index 95a897782..95a897782 100644 --- a/youtube_dl/extractor/expotv.py +++ b/yt_dlp/extractor/expotv.py diff --git a/youtube_dl/extractor/expressen.py b/yt_dlp/extractor/expressen.py index dc8b855d2..dc8b855d2 100644 --- a/youtube_dl/extractor/expressen.py +++ b/yt_dlp/extractor/expressen.py diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py new file mode 100644 index 000000000..f4f817fcb --- /dev/null +++ b/yt_dlp/extractor/extractors.py @@ -0,0 +1,1880 @@ +# flake8: noqa +from __future__ import unicode_literals + +from .abc import ( + ABCIE, + ABCIViewIE, +) +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .adn import ADNIE +from .adobeconnect import AdobeConnectIE +from .adobetv import ( + AdobeTVEmbedIE, + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import ( + AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, + HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, +) +from .afreecatv import AfreecaTVIE +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE +from 
.amara import AmaraIE +from .alura import ( + AluraIE, + AluraCourseIE +) +from .amcnetworks import AMCNetworksIE +from .animelab import ( + AnimeLabIE, + AnimeLabShowsIE, +) +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) +from .animeondemand import AnimeOnDemandIE +from .anvato import AnvatoIE +from .aol import AolIE +from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE +from .apa import APAIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .applepodcasts import ApplePodcastsIE +from .archiveorg import ( + ArchiveOrgIE, + YoutubeWebArchiveIE, +) +from .arcpublishing import ArcPublishingIE +from .arkena import ArkenaIE +from .ard import ( + ARDBetaMediathekIE, + ARDIE, + ARDMediathekIE, +) +from .arte import ( + ArteTVIE, + ArteTVEmbedIE, + ArteTVPlaylistIE, +) +from .arnes import ArnesIE +from .asiancrush import ( + AsianCrushIE, + AsianCrushPlaylistIE, +) +from .atresplayer import AtresPlayerIE +from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .audius import ( + AudiusIE, + AudiusTrackIE, + AudiusPlaylistIE, + AudiusProfileIE, +) +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .azmedien import AZMedienIE +from .baidu import BaiduVideoIE +from .bandaichannel import BandaiChannelIE +from .bandcamp import ( + BandcampIE, + BandcampAlbumIE, + BandcampWeeklyIE, + BandcampMusicIE, +) +from .bannedvideo import BannedVideoIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, + BBCCoUkPlaylistIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE +from .beatport import BeatportIE +from .bet import BetIE +from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE +from .bigflix import BigflixIE +from .bild import BildIE +from .bilibili import ( + BiliBiliIE, + BiliBiliSearchIE, + BilibiliCategoryIE, + BiliBiliBangumiIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, + BiliBiliPlayerIE, + BilibiliChannelIE, + BiliIntlIE, + BiliIntlSeriesIE, +) +from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) +from .biqle import BIQLEIE +from .blackboardcollaborate import BlackboardCollaborateIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE +from .bostonglobe import BostonGlobeIE +from .box import BoxIE +from .bpb import BpbIE +from .br import ( + BRIE, + BRMediathekIE, +) +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .businessinsider import BusinessInsiderIE +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .cam4 import CAM4IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .cammodels import CamModelsIE +from .camwithher import CamWithHerIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import ( + CanvasIE, + CanvasEenIE, + VrtNUIE, 
+ DagelijkseKostIE, +) +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) +from .cartoonnetwork import CartoonNetworkIE +from .cbc import ( + CBCIE, + CBCPlayerIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, +) +from .cbs import CBSIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsEmbedIE, + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) +from .ccc import ( + CCCIE, + CCCPlaylistIE, +) +from .ccma import CCMAIE +from .cctv import CCTVIE +from .cda import CDAIE +from .ceskatelevize import ( + CeskaTelevizeIE, + CeskaTelevizePoradyIE, +) +from .cgtn import CGTNIE +from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chingari import ( + ChingariIE, + ChingariUserIE, +) +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemax import CinemaxIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) +from .ciscowebex import CiscoWebexIE +from .cjsw import CJSWIE +from .cliphunter import CliphunterIE +from .clippit import ClippitIE +from .cliprs import ClipRsIE +from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, +) +from .coub import CoubIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, +) +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import ( + MmsIE, + RtmpIE, + ViewSourceIE, +) +from .condenast import CondeNastIE +from .contv import CONtvIE +from .corus import CorusIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .crooksandliars import CrooksAndLiarsIE +from .crunchyroll import ( + CrunchyrollIE, + CrunchyrollShowPlaylistIE, + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, +) +from .cspan import CSpanIE +from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionIE, +) +from .cwtv import CWTVIE +from .dailymail import DailyMailIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .dbtv import DBTVIE +from .dctp import DctpTvIE +from .deezer import ( + DeezerPlaylistIE, + DeezerAlbumIE, +) +from .democracynow import DemocracynowIE +from .dfb import DFBIE +from .dhm import DHMIE +from .digg import DiggIE +from .discoveryplusindia import ( + DiscoveryPlusIndiaIE, + DiscoveryPlusIndiaShowIE, +) +from .dotsub import DotsubIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, + ScienceChannelIE, + DIYNetworkIE, + AnimalPlanetIE +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) +from .dtube import DTubeIE +from .dvtv import DVTVIE +from .duboku import 
( + DubokuIE, + DubokuPlaylistIE +) +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .discovery import DiscoveryIE +from .discoverygo import ( + DiscoveryGoIE, + DiscoveryGoPlaylistIE, +) +from .discoverynetworks import DiscoveryNetworksDeIE +from .discoveryvr import DiscoveryVRIE +from .disney import DisneyIE +from .dispeak import DigitallySpeakingIE +from .doodstream import DoodStreamIE +from .dropbox import DropboxIE +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE +from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentube import ( + EllenTubeIE, + EllenTubeVideoIE, + EllenTubePlaylistIE, +) +from .elonet import ElonetIE +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) +from .eporner import EpornerIE +from .eroprofile import ( + EroProfileIE, + EroProfileAlbumIE, +) +from .escapist import EscapistIE +from .espn import ( + ESPNIE, + ESPNArticleIE, + FiveThirtyEightIE, +) +from .esri import EsriVideoIE +from .europa import EuropaIE +from .euscreen import EUScreenIE +from .expotv import ExpoTVIE +from .expressen import ExpressenIE +from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, +) +from .fancode import ( + FancodeVodIE, + FancodeLiveIE +) + +from .faz import FazIE +from .fc2 import ( + FC2IE, + FC2EmbedIE, +) +from .fczenit import FczenitIE +from .filmmodu import FilmmoduIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) +from .filmweb import FilmwebIE +from .firsttv import FirstTVIE +from .fivemin import FiveMinIE +from .fivetv import FiveTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .formula1 import Formula1IE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) +from .fox import FOXIE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) +from .foxgay import FoxgayIE +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, +) +from .foxsports import FoxSportsIE +from .franceculture import FranceCultureIE +from .franceinter import FranceInterIE +from .francetv import ( + FranceTVIE, + FranceTVSiteIE, + FranceTVInfoIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .freshlive import FreshLiveIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) +from .fujitv import FujiTVFODPlus7IE +from .funimation import ( + FunimationIE, + FunimationPageIE, + FunimationShowIE, +) +from .funk import FunkIE +from .fusion import FusionIE +from .gab import GabTVIE +from .gaia import GaiaIE +from .gameinformer import GameInformerIE +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gaskrank import GaskrankIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE +from .generic import GenericIE +from .gettr import GettrIE +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .go import GoIE +from .godtube import GodTubeIE +from .golem import 
GolemIE +from .googledrive import GoogleDriveIE +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) +from .googlesearch import GoogleSearchIE +from .gopro import GoProIE +from .goshgay import GoshgayIE +from .gotostage import GoToStageIE +from .gputechconf import GPUTechConfIE +from .gronkh import GronkhIE +from .groupon import GrouponIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVComShowIE +from .hketv import HKETVIE +from .hidive import HiDiveIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE +from .hornbunny import HornBunnyIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import ( + HotStarIE, + HotStarPlaylistIE, + HotStarSeriesIE, +) +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .hrfensehen import HRFernsehenIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) +from .huajiao import HuajiaoIE +from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, + HungamaAlbumPlaylistIE, +) +from .hypem import HypemIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) +from .ign import ( + IGNIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, + ImgurGalleryIE, +) +from .ina import InaIE +from .inc import IncIE +from .indavideo import IndavideoEmbedIE +from .infoq import InfoQIE +from .instagram import ( + InstagramIE, + InstagramUserIE, + InstagramTagIE, +) +from .internazionale import InternazionaleIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import IPrimaIE +from .iqiyi import IqiyiIE +from .ir90tv import Ir90TvIE +from .itv import ( + ITVIE, + ITVBTCCIE, +) +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .iwara import IwaraIE +from .izlesene import IzleseneIE +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .joj import JojIE +from .jwplatform import JWPlatformIE +from .kakao import KakaoIE +from .kaltura import KalturaIE +from .kankan import KankanIE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .ketnet import KetnetIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) +from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE +from .kinopoisk import KinoPoiskIE +from .konserthusetplay import KonserthusetPlayIE +from .koo import KooIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import ( + LA7IE, + LA7PodcastEpisodeIE, + LA7PodcastIE, +) +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, + EHFTVIE, + ITTFIE, +) +from .lbry import ( + LBRYIE, + LBRYChannelIE, +) +from .lci import LCIIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) +from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .lego import LEGOIE +from .lemonde 
import LemondeIE +from .lenta import LentaIE +from .libraryofcongress import LibraryOfCongressIE +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .line import ( + LineTVIE, + LineLiveIE, + LineLiveChannelIE, +) +from .linkedin import ( + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .linuxacademy import LinuxAcademyIE +from .litv import LiTVIE +from .livejournal import LiveJournalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .lnkgo import LnkGoIE +from .localnews8 import LocalNews8IE +from .lovehomeporn import LoveHomePornIE +from .lrt import LRTIE +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .magentamusik360 import MagentaMusik360IE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) +from .malltv import MallTVIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) +from .manoto import ( + ManotoTVIE, + ManotoTVShowIE, + ManotoTVLiveIE, +) +from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) +from .massengeschmacktv import MassengeschmackTVIE +from .matchtv import MatchTVIE +from .mdr import MDRIE +from .medaltv import MedalTVIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import MediasetIE +from .mediasite import ( + MediasiteIE, + MediasiteCatalogIE, + MediasiteNamedCatalogIE, +) +from .medici import MediciIE +from .megaphone import MegaphoneIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE +from .meta import METAIE +from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) +from .mildom import ( + MildomIE, + MildomVodIE, + MildomUserVodIE, +) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mirrativ import ( + MirrativIE, + MirrativUserIE, +) +from .mit import TechTVMITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, +) +from .mlb import ( + MLBIE, + MLBVideoIE, +) +from .mnet import MnetIE +from .moevideo import MoeVideoIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) +from .mojvideo import MojvideoIE +from .morningstar import MorningstarIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviezine import MoviezineIE +from .movingimage import MovingImageIE +from .msn import MSNIE +from .mtv import ( + MTVIE, + MTVVideoIE, + MTVServicesEmbeddedIE, + MTVDEIE, + MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, +) +from .muenchentv import MuenchenTVIE +from .musescore import MuseScoreIE +from .mwave import MwaveIE, MwaveMeetGreetIE +from .mxplayer import ( + MxplayerIE, + MxplayerShowIE, +) +from .mychannels import MyChannelsIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) +from .myvideoge import MyVideoGeIE +from .myvidster import MyVidsterIE +from .n1 import N1InfoIIE, N1InfoAssetIE 
+from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) +from .naver import ( + NaverIE, + NaverLiveIE, +) +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) +from .nbc import ( + NBCIE, + NBCNewsIE, + NBCOlympicsIE, + NBCOlympicsStreamIE, + NBCSportsIE, + NBCSportsStreamIE, + NBCSportsVPlayerIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .nebula import NebulaIE +from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, + NewgroundsUserIE, +) +from .newstube import NewstubeIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, + NextTVIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nfhsnetwork import NFHSNetworkIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, +) +from .nhl import NHLIE +from .nick import ( + NickIE, + NickBrIE, + NickDeIE, + NickNightIE, + NickRuIE, +) + +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, +) +from .ninecninemedia import NineCNineMediaIE +from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE +from .nitter import NitterIE +from .njpwworld import NJPWWorldIE +from .nobelprize import NobelPrizeIE +from .nonktube import NonkTubeIE +from .noovo import NoovoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) +from .novaplay import NovaPlayIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .noz import NozIE +from .npo import ( + AndereTijdenIE, + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, + NRKTVDirekteIE, + NRKRadioPodkastIE, + NRKTVEpisodeIE, + NRKTVEpisodesIE, + NRKTVSeasonIE, + NRKTVSeriesIE, +) +from .nrl import NRLTVIE +from .ntvcojp import NTVCoJpCUIE +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, + NYTimesCookingIE, +) +from .nuvid import NuvidIE +from .nzherald import NZHeraldIE +from .nzz import NZZIE +from .odatv import OdaTVIE +from .odnoklassniki import OdnoklassnikiIE +from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE +from .ondemandkorea import OnDemandKoreaIE +from .onet import ( + OnetIE, + OnetChannelIE, + OnetMVPIE, + OnetPlIE, +) +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .openrec import ( + OpenRecIE, + OpenRecCaptureIE, +) +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFFM4IE, + ORFFM4StoryIE, + ORFOE1IE, + ORFOE3IE, + ORFNOEIE, + ORFWIEIE, + ORFBGLIE, + ORFOOEIE, + ORFSTMIE, + ORFKTNIE, + ORFSBGIE, + ORFTIRIE, + ORFVBGIE, + ORFIPTVIE, +) +from .outsidetv import OutsideTVIE +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) +from .palcomp3 import ( + PalcoMP3IE, + 
PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) +from .pandoratv import PandoraTVIE +from .paramountplus import ( + ParamountPlusIE, + ParamountPlusSeriesIE, +) +from .parliamentliveuk import ParliamentLiveUKIE +from .parlview import ParlviewIE +from .patreon import ( + PatreonIE, + PatreonUserIE +) +from .pbs import PBSIE +from .pearvideo import PearVideoIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peloton import ( + PelotonIE, + PelotonLiveIE +) +from .people import PeopleIE +from .performgroup import PerformGroupIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) +from .piksel import PikselIE +from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) +from .pladform import PladformIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) +from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE +from .plays import PlaysTVIE +from .playstuff import PlayStuffIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .plutotv import PlutoTVIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from .podomatic import PodomaticIE +from .pokemon import ( + PokemonIE, + PokemonWatchIE, +) +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioCategoryIE, +) +from .popcorntimes import PopcorntimesIE +from .popcorntv import PopcornTVIE +from .porn91 import Porn91IE +from .porncom import PornComIE +from .pornflip import PornFlipIE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubUserIE, + PornHubPlaylistIE, + PornHubPagedVideoListIE, + PornHubUserVideosUploadIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) +from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE +from .prosiebensat1 import ProSiebenSat1IE +from .puls4 import Puls4IE +from .pyvideo import PyvideoIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .r7 import ( + R7IE, + R7ArticleIE, +) +from .radiko import RadikoIE, RadikoRadioIE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import RadioFranceIE +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) +from .rai import ( + RaiPlayIE, + RaiPlayLiveIE, + RaiPlayPlaylistIE, + RaiIE, +) +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) +from .rbmaradio import RBMARadioIE +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) +from .rcti import ( + RCTIPlusIE, + RCTIPlusSeriesIE, + RCTIPlusTVIE, +) +from .rds import RDSIE +from .redbulltv import ( + RedBullTVIE, + RedBullEmbedIE, + RedBullTVRrnContentIE, + RedBullIE, +) +from .reddit import ( + RedditIE, + RedditRIE, +) +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) +from .restudy import RestudyIE +from .reuters import ReutersIE +from .reverbnation import ReverbNationIE +from .rice import RICEIE +from .rmcdecouverte import RMCDecouverteIE +from .ro220 import Ro220IE 
+from .rockstargames import RockstarGamesIE
+from .roosterteeth import RoosterTeethIE
+from .rottentomatoes import RottenTomatoesIE
+from .roxwel import RoxwelIE
+from .rozhlas import RozhlasIE
+from .rtbf import RTBFIE
+from .rte import RteIE, RteRadioIE
+from .rtlnl import RtlNlIE
+from .rtl2 import (
+    RTL2IE,
+    RTL2YouIE,
+    RTL2YouSeriesIE,
+)
+from .rtp import RTPIE
+from .rts import RTSIE
+from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVETelevisionIE
+from .rtvnh import RTVNHIE
+from .rtvs import RTVSIE
+from .ruhd import RUHDIE
+from .rumble import (
+    RumbleEmbedIE,
+    RumbleChannelIE,
+)
+from .rutube import (
+    RutubeIE,
+    RutubeChannelIE,
+    RutubeEmbedIE,
+    RutubeMovieIE,
+    RutubePersonIE,
+    RutubePlaylistIE,
+)
+from .rutv import RUTVIE
+from .ruutu import RuutuIE
+from .ruv import RuvIE
+from .safari import (
+    SafariIE,
+    SafariApiIE,
+    SafariCourseIE,
+)
+from .saitosan import SaitosanIE
+from .samplefocus import SampleFocusIE
+from .sapo import SapoIE
+from .savefrom import SaveFromIE
+from .sbs import SBSIE
+from .screencast import ScreencastIE
+from .screencastomatic import ScreencastOMaticIE
+from .scrippsnetworks import (
+    ScrippsNetworksWatchIE,
+    ScrippsNetworksIE,
+)
+from .scte import (
+    SCTEIE,
+    SCTECourseIE,
+)
+from .seeker import SeekerIE
+from .senateisvp import SenateISVPIE
+from .sendtonews import SendtoNewsIE
+from .servus import ServusIE
+from .sevenplus import SevenPlusIE
+from .sexu import SexuIE
+from .seznamzpravy import (
+    SeznamZpravyIE,
+    SeznamZpravyArticleIE,
+)
+from .shahid import (
+    ShahidIE,
+    ShahidShowIE,
+)
+from .shared import (
+    SharedIE,
+    VivoIE,
+)
+from .shemaroome import ShemarooMeIE
+from .showroomlive import ShowRoomLiveIE
+from .simplecast import (
+    SimplecastIE,
+    SimplecastEpisodeIE,
+    SimplecastPodcastIE,
+)
+from .sina import SinaIE
+from .sixplay import SixPlayIE
+from .skyit import (
+    SkyItPlayerIE,
+    SkyItVideoIE,
+    SkyItVideoLiveIE,
+    SkyItIE,
+    SkyItAcademyIE,
+    SkyItArteIE,
+    CieloTVItIE,
+    TV8ItIE,
+)
+from .skylinewebcams import SkylineWebcamsIE
+from .skynewsarabia import (
+    SkyNewsArabiaIE,
+    SkyNewsArabiaArticleIE,
+)
+from .skynewsau import SkyNewsAUIE
+from .sky import (
+    SkyNewsIE,
+    SkySportsIE,
+    SkySportsNewsIE,
+)
+from .slideshare import SlideshareIE
+from .slideslive import SlidesLiveIE
+from .slutload import SlutloadIE
+from .snotr import SnotrIE
+from .sohu import SohuIE
+from .sonyliv import (
+    SonyLIVIE,
+    SonyLIVSeriesIE,
+)
+from .soundcloud import (
+    SoundcloudEmbedIE,
+    SoundcloudIE,
+    SoundcloudSetIE,
+    SoundcloudUserIE,
+    SoundcloudTrackStationIE,
+    SoundcloudPlaylistIE,
+    SoundcloudSearchIE,
+)
+from .soundgasm import (
+    SoundgasmIE,
+    SoundgasmProfileIE
+)
+from .southpark import (
+    SouthParkIE,
+    SouthParkDeIE,
+    SouthParkDkIE,
+    SouthParkEsIE,
+    SouthParkNlIE
+)
+from .sovietscloset import (
+    SovietsClosetIE,
+    SovietsClosetPlaylistIE
+)
+from .spankbang import (
+    SpankBangIE,
+    SpankBangPlaylistIE,
+)
+from .spankwire import SpankwireIE
+from .spiegel import SpiegelIE
+from .spike import (
+    BellatorIE,
+    ParamountNetworkIE,
+)
+from .stitcher import (
+    StitcherIE,
+    StitcherShowIE,
+)
+from .sport5 import Sport5IE
+from .sportbox import SportBoxIE
+from .sportdeutschland import SportDeutschlandIE
+from .spotify import (
+    SpotifyIE,
+    SpotifyShowIE,
+)
+from .spreaker import (
+    SpreakerIE,
+    SpreakerPageIE,
+    SpreakerShowIE,
+    SpreakerShowPageIE,
+)
+from .springboardplatform import SpringboardPlatformIE
+from .sprout import SproutIE
+from .srgssr 
import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE +from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) +from .streamable import StreamableIE +from .streamanity import StreamanityIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE +from .stv import STVPlayerIE +from .sunporno import SunPornoIE +from .sverigesradio import ( + SverigesRadioEpisodeIE, + SverigesRadioPublicationIE, +) +from .svt import ( + SVTIE, + SVTPageIE, + SVTPlayIE, + SVTSeriesIE, +) +from .swrmediathek import SWRMediathekIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import ( + TagesschauPlayerIE, + TagesschauIE, +) +from .tass import TassIE +from .tbs import TBSIE +from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .teamtreehouse import TeamTreeHouseIE +from .techtalks import TechTalksIE +from .ted import TEDIE +from .tele5 import Tele5IE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telemb import TeleMBIE +from .telemundo import TelemundoIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecSquatIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, + TeleQuebecVideoIE, +) +from .teletask import TeleTaskIE +from .telewebion import TelewebionIE +from .tennistv import TennisTVIE +from .tenplay import TenPlayIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .tfo import TFOIE +from .theintercept import TheInterceptIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thescene import TheSceneIE +from .thestar import TheStarIE +from .thesun import TheSunIE +from .theta import ( + ThetaVideoIE, + ThetaStreamIE, +) +from .theweatherchannel import TheWeatherChannelIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .thisoldhouse import ThisOldHouseIE +from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, + DouyinIE, +) +from .tinypic import TinyPicIE +from .tmz import TMZIE +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ( + ToggleIE, + MeWatchIE, +) +from .tokentube import ( + TokentubeIE, + TokentubeChannelIE +) +from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, +) +from .trunews import TruNewsIE +from .trutv import TruTVIE +from .tube8 import Tube8IE +from .tubitv import ( + TubiTvIE, + TubiTvShowIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .tunepk import TunePkIE +from .turbo import TurboIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, + KatsomoIE, + MTVUutisetArticleIE, +) +from .tv2dk import ( + TV2DKIE, + TV2DKBornholmPlayIE, +) +from .tv2hu import ( + TV2HuIE, + 
TV2HuSeriesIE, +) +from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tver import TVerIE +from .tvigle import TvigleIE +from .tvland import TVLandIE +from .tvn24 import TVN24IE +from .tvnet import TVNetIE +from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowFilmIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, + TVNowShowIE, +) +from .tvp import ( + TVPEmbedIE, + TVPIE, + TVPWebsiteIE, +) +from .tvplay import ( + TVPlayIE, + ViafreeIE, + TVPlayHomeIE, +) +from .tvplayer import TVPlayerIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import ( + TwitCastingIE, + TwitCastingLiveIE, + TwitCastingUserIE, +) +from .twitch import ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchStreamIE, + TwitchClipsIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, + TwitterBroadcastIE, + TwitterShortenerIE, +) +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .ufctv import ( + UFCTVIE, + UFCArabiaIE, +) +from .ukcolumn import UkColumnIE +from .uktvplay import UKTVPlayIE +from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) +from .umg import UMGDeIE +from .unistra import UnistraIE +from .unity import UnityIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) +from .urort import UrortIE +from .urplay import URPlayIE +from .usanetwork import USANetworkIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) +from .utreon import UtreonIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veo import VeoIE +from .veoh import VeohIE +from .vesti import VestiIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceArticleIE, + ViceShowIE, +) +from .vidbit import VidbitIE +from .viddler import ViddlerIE +from .videa import VideaIE +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopress import VideoPressIE +from .vidio import ( + VidioIE, + VidioPremierIE, + VidioLiveIE +) +from .vidlii import VidLiiIE +from .vier import VierIE, VierVideosIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoReviewIE, + VimeoUserIE, + VimeoWatchLaterIE, + VHXEmbedIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) +from .viqeo import ViqeoIE +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, + VKWallPostIE, +) +from .vlive import ( + VLiveIE, + VLivePostIE, + VLiveChannelIE, +) +from .vodlocker import VodlockerIE +from .vodpl import VODPlIE +from .vodplatform 
import VODPlatformIE +from .voicerepublic import VoiceRepublicIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) +from .voot import ( + VootIE, + VootSeriesIE, +) +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) +from .vrt import VRTIE +from .vrak import VrakIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) +from .vshare import VShareIE +from .vtm import VTMIE +from .medialaan import MedialaanIE +from .vube import VubeIE +from .vuclip import VuClipIE +from .vupload import VuploadIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) +from .vyborymos import VyboryMosIE +from .vzaar import VzaarIE +from .wakanim import WakanimIE +from .walla import WallaIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) +from .wat import WatIE +from .watchbox import WatchBoxIE +from .watchindianporn import WatchIndianPornIE +from .wdr import ( + WDRIE, + WDRPageIE, + WDRElefantIE, + WDRMobileIE, +) +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) +from .weiqitv import WeiqiTVIE +from .wimtv import WimTVIE +from .whowatch import WhoWatchIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, +) +from .worldstarhiphop import WorldStarHipHopIE +from .wsj import ( + WSJIE, + WSJArticleIE, +) +from .wwe import WWEIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, + XHamsterUserIE, +) +from .xiami import ( + XiamiSongIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, + YahooGyaOPlayerIE, + YahooGyaOIE, + YahooJapanNewsIE, +) +from .yandexdisk import YandexDiskIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, +) +from .yandexvideo import ( + YandexVideoIE, + ZenYandexIE, + ZenYandexChannelIE, +) +from .yapfiles import YapFilesIE +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) +from .youporn import YouPornIE +from .yourporn import YourPornIE +from .yourupload import YourUploadIE +from .youtube import ( + YoutubeIE, + YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, +) +from .zapiks import ZapiksIE +from .zattoo import ( + BBVTVIE, + EinsUndEinsTVIE, + EWETVIE, + GlattvisionTVIE, + MNetTVIE, + MyVisionTVIE, + NetPlusIE, + OsnatelTVIE, + QuantumTVIE, + QuicklineIE, + QuicklineLiveIE, + SaltTVIE, + SAKTVIE, + VTXTVIE, + WalyTVIE, + ZattooIE, + ZattooLiveIE, +) +from .zdf import ZDFIE, ZDFChannelIE +from .zee5 import ( + Zee5IE, + Zee5SeriesIE, +) +from .zhihu import ZhihuIE +from .zingmp3 import ( + 
ZingMp3IE, + ZingMp3AlbumIE, +) +from .zoom import ZoomIE +from .zype import ZypeIE diff --git a/youtube_dl/extractor/extremetube.py b/yt_dlp/extractor/extremetube.py index acd4090fa..acd4090fa 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/yt_dlp/extractor/extremetube.py diff --git a/youtube_dl/extractor/eyedotv.py b/yt_dlp/extractor/eyedotv.py index f62ddebae..f62ddebae 100644 --- a/youtube_dl/extractor/eyedotv.py +++ b/yt_dlp/extractor/eyedotv.py diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py new file mode 100644 index 000000000..44d3dc0d7 --- /dev/null +++ b/yt_dlp/extractor/facebook.py @@ -0,0 +1,748 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, +) +from ..utils import ( + clean_html, + error_to_compat_str, + ExtractorError, + float_or_none, + get_element_by_id, + int_or_none, + js_to_json, + limit_length, + merge_dicts, + network_exceptions, + parse_count, + qualities, + sanitized_Request, + try_get, + urlencode_postdata, + urljoin, +) + + +class FacebookIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https?:// + (?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/ + (?:[^#]*?\#!/)? + (?: + (?: + video/video\.php| + photo\.php| + video\.php| + video/embed| + story\.php| + watch(?:/live)?/? + )\?(?:.*?)(?:v|video_id|story_fbid)=| + [^/]+/videos/(?:[^/]+/)?| + [^/]+/posts/| + groups/[^/]+/permalink/| + watchparty/ + )| + facebook: + ) + (?P<id>[0-9]+) + ''' + _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' + _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' + _NETRC_MACHINE = 'facebook' + IE_NAME = 'facebook' + + _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' + + _TESTS = [{ + 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', + 'md5': '6a40d33c0eccbb1af76cf0485a052659', + 'info_dict': { + 'id': '637842556329505', + 'ext': 'mp4', + 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', + 'uploader': 'Tennis on Facebook', + 'upload_date': '20140908', + 'timestamp': 1410199200, + }, + 'skip': 'Requires logging in', + }, { + # data.video + 'url': 'https://www.facebook.com/video.php?v=274175099429670', + 'info_dict': { + 'id': '274175099429670', + 'ext': 'mp4', + 'title': 'Asif Nawab Butt', + 'description': 'Asif Nawab Butt', + 'uploader': 'Asif Nawab Butt', + 'upload_date': '20140506', + 'timestamp': 1399398998, + 'thumbnail': r're:^https?://.*', + }, + 'expected_warnings': [ + 'title' + ] + }, { + 'note': 'Video with DASH manifest', + 'url': 'https://www.facebook.com/video.php?v=957955867617029', + 'md5': 'b2c28d528273b323abe5c6ab59f0f030', + 'info_dict': { + 'id': '957955867617029', + 'ext': 'mp4', + 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', + 'uploader': 'Demy de Zeeuw', + 'upload_date': '20160110', + 'timestamp': 1452431627, + }, + 'skip': 'Requires logging in', + }, { + 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', + 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', + 'info_dict': { + 'id': '544765982287235', + 
'ext': 'mp4', + 'title': '"What are you doing running in the snow?"', + 'uploader': 'FailArmy', + }, + 'skip': 'Video gone', + }, { + 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', + 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', + 'info_dict': { + 'id': '1035862816472149', + 'ext': 'mp4', + 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', + 'uploader': 'S. Saint', + }, + 'skip': 'Video gone', + }, { + 'note': 'swf params escaped', + 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', + 'md5': '97ba073838964d12c70566e0085c2b91', + 'info_dict': { + 'id': '10153664894881749', + 'ext': 'mp4', + 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...', + 'thumbnail': r're:^https?://.*', + 'timestamp': 1456259628, + 'upload_date': '20160223', + 'uploader': 'Barack Obama', + }, + 'skip': 'Gif on giphy.com gone', + }, { + # have 1080P, but only up to 720p in swf params + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', + 'md5': '3f3798adb2b73423263e59376f1f5eb7', + 'info_dict': { + 'id': '10155529876156509', + 'ext': 'mp4', + 'title': 'Holocaust survivor becomes US citizen', + 'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f', + 'timestamp': 1477818095, + 'upload_date': '20161030', + 'uploader': 'CNN', + 'thumbnail': r're:^https?://.*', + 'view_count': int, + }, + }, { + # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', + 'info_dict': { + 'id': '1417995061575415', + 'ext': 'mp4', + 'title': 'Yaroslav Korpan - Довгоочікуване відео', + 'description': 'Довгоочікуване відео', + 'timestamp': 1486648771, + 'upload_date': '20170209', + 'uploader': 'Yaroslav Korpan', + 'uploader_id': '100000948048708', + }, + 'params': { + 'skip_download': True, + }, + }, { + # FIXME + 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', + 'info_dict': { + 'id': '1072691702860471', + 'ext': 'mp4', + 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d', + 'timestamp': 1477305000, + 'upload_date': '20161024', + 'uploader': 'La Guía Del Varón', + 'thumbnail': r're:^https?://.*', + }, + 'params': { + 'skip_download': True, + }, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', + 'info_dict': { + 'id': '202882990186699', + 'ext': 'mp4', + 'title': 'Elisabeth Ahtn - Hello? Yes your uber ride is here\n* Jukin...', + 'description': 'Hello? 
Yes your uber ride is here\n* Jukin Media Verified *\nFind this video and others like it by visiting...', + 'timestamp': 1486035513, + 'upload_date': '20170202', + 'uploader': 'Elisabeth Ahtn', + 'uploader_id': '100013949973717', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.facebook.com/video.php?v=10204634152394104', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', + 'only_matching': True, + }, { + # data.mediaset.currMedia.edges + 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', + 'only_matching': True, + }, { + # data.video.story.attachments[].media + 'url': 'facebook:544765982287235', + 'only_matching': True, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', + 'only_matching': True, + }, { + # data.video.creation_story.attachments[].media + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, + }, { + # data.video + 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670', + 'only_matching': True, + }, { + # no title + 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', + 'only_matching': True, + }, { + # data.video + 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', + 'info_dict': { + 'id': '359649331226507', + 'ext': 'mp4', + 'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1', + 'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', + 'timestamp': 1527084179, + 'upload_date': '20180523', + 'uploader': 'ESL One Dota 2', + 'uploader_id': '234218833769558', + }, + 'params': { + 'skip_download': True, + }, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', + 'info_dict': { + 'id': '106560053808006', + }, + 'playlist_count': 2, + }, { + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/watch/?v=647537299265662', + 'only_matching': True, + }, { + # FIXME: https://github.com/yt-dlp/yt-dlp/issues/542 + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', + 'info_dict': { + 'id': '10157667649866271', + }, + 'playlist_count': 3, + }, { + # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', + 'info_dict': { + 'id': '117576630041613', + 'ext': 'mp4', + # TODO: title can be extracted from video page + 'title': 'Facebook video #117576630041613', + 'uploader_id': '189393014416438', + 'upload_date': '20201123', + 'timestamp': 1606162592, + }, + 'skip': 'Requires logging in', + }, { + # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media + 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', + 'info_dict': { + 'id': '211567722618337', + 'ext': 'mp4', + 'title': 'Facebook video #211567722618337', + 'uploader_id': '127875227654254', + 'upload_date': '20161122', + 'timestamp': 1479793574, + 
}, + 'skip': 'No video', + }, { + # data.video.creation_story.attachments[].media + 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/watchparty/211641140192478', + 'info_dict': { + 'id': '211641140192478', + }, + 'playlist_count': 1, + 'skip': 'Requires logging in', + }] + _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' + _api_config = { + 'graphURI': '/api/graphql/' + } + + @staticmethod + def _extract_urls(webpage): + urls = [] + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', + webpage): + urls.append(mobj.group('url')) + # Facebook API embed + # see https://developers.facebook.com/docs/plugins/embedded-video-player + for mobj in re.finditer(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): + urls.append(mobj.group('url')) + return urls + + def _login(self): + useremail, password = self._get_login_info() + if useremail is None: + return + + login_page_req = sanitized_Request(self._LOGIN_URL) + self._set_cookie('facebook.com', 'locale', 'en_US') + login_page = self._download_webpage(login_page_req, None, + note='Downloading login page', + errnote='Unable to download login page') + lsd = self._search_regex( + r'<input type="hidden" name="lsd" value="([^"]*)"', + login_page, 'lsd') + lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') + + login_form = { + 'email': useremail, + 'pass': password, + 'lsd': lsd, + 'lgnrnd': lgnrnd, + 'next': 'http://facebook.com/home.php', + 'default_persistent': '0', + 'legacy_return': '1', + 'timezone': '-60', + 'trynum': '1', + } + request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + try: + login_results = self._download_webpage(request, None, + note='Logging in', errnote='unable to fetch login page') + if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>', + login_results, 'login error', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + self.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). 
Check credentials or wait.') + return + + fb_dtsg = self._search_regex( + r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) + h = self._search_regex( + r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) + + if not fb_dtsg or not h: + return + + check_form = { + 'fb_dtsg': fb_dtsg, + 'h': h, + 'name_action_selected': 'dont_save', + } + check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) + check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_response = self._download_webpage(check_req, None, + note='Confirming login') + if re.search(r'id="checkpointSubmitButton"', check_response) is not None: + self.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.') + except network_exceptions as err: + self.report_warning('unable to log in: %s' % error_to_compat_str(err)) + return + + def _real_initialize(self): + self._login() + + def _extract_from_url(self, url, video_id): + webpage = self._download_webpage( + url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) + + def extract_metadata(webpage): + video_title = self._html_search_regex( + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, + 'title', default=None) + if not video_title: + video_title = self._html_search_regex( + r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', + webpage, 'alternative title', default=None) + if not video_title: + video_title = self._html_search_meta( + ['og:title', 'twitter:title', 'description'], + webpage, 'title', default=None) + if video_title: + video_title = limit_length(video_title, 80) + else: + video_title = 'Facebook video #%s' % video_id + description = self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default=None) + uploader = clean_html(get_element_by_id( + 'fbPhotoPageAuthorName', webpage)) or self._search_regex( + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', + default=None) or self._og_search_title(webpage, fatal=False) + timestamp = int_or_none(self._search_regex( + r'<abbr[^>]+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None)) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) + # some webpages contain unretrievable thumbnail urls + # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1 + # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ + if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): + thumbnail = None + view_count = parse_count(self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None)) + info_dict = { + 'title': video_title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'thumbnail': thumbnail, + 'view_count': view_count, + } + info_json_ld = self._search_json_ld(webpage, video_id, default={}) + if info_json_ld.get('title'): + info_json_ld['title'] = limit_length( + re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80) + return merge_dicts(info_json_ld, info_dict) + + video_data = None + + def extract_video_data(instances): + video_data = [] + for item in instances: + if try_get(item, lambda x: x[1][0]) == 'VideoConfig': + video_item = item[2][0] + if video_item.get('video_id'): + video_data.append(video_item['videoData']) + return video_data + + 
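# Each handleServerJS instance appears to be shaped like + # [_, ['VideoConfig'], [{'video_id': ..., 'videoData': [...]}]] (shape inferred + # from the indexing in extract_video_data above), so the helper collects the + # 'videoData' list of every VideoConfig entry it finds. +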
server_js_data = self._parse_json(self._search_regex( + [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], + webpage, 'server js data', default='{}'), video_id, fatal=False) + + if server_js_data: + video_data = extract_video_data(server_js_data.get('instances', [])) + + def extract_from_jsmods_instances(js_data): + if js_data: + return extract_video_data(try_get( + js_data, lambda x: x['jsmods']['instances'], list) or []) + + def extract_dash_manifest(video, formats): + dash_manifest = video.get('dash_manifest') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + + def process_formats(formats): + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. + for f in formats: + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + + self._sort_formats(formats, ('res', 'quality')) + + def extract_relay_data(_filter): + return self._parse_json(self._search_regex( + r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + + def extract_relay_prefetched_data(_filter): + replay_data = extract_relay_data(_filter) + for require in (replay_data.get('require') or []): + if require[0] == 'RelayPrefetchedStreamCache': + return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + + if not video_data: + server_js_data = self._parse_json(self._search_regex([ + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, + r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX + ], webpage, 'js data', default='{}'), video_id, js_to_json, False) + video_data = extract_from_jsmods_instances(server_js_data) + + if not video_data: + data = extract_relay_prefetched_data( + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + if data: + entries = [] + + def parse_graphql_video(video): + formats = [] + q = qualities(['sd', 'hd']) + for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: + playable_url = video.get('playable_url' + suffix) + if not playable_url: + continue + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) + extract_dash_manifest(video, formats) + process_formats(formats) + v_id = video.get('videoId') or video.get('id') or video_id + info = { + 'id': v_id, + 'formats': formats, + 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'timestamp': int_or_none(video.get('publish_time')), + 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + } + description = try_get(video, lambda x: x['savable_description']['text']) + title = video.get('name') + if title: + info.update({ + 'title': title, + 'description': description, + }) + else: + info['title'] = description or 'Facebook video #%s' % v_id + entries.append(info) + + def parse_attachment(attachment, key='media'): + media = attachment.get(key) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) + + nodes = data.get('nodes') or [] + node = data.get('node') or {} + if not nodes and node: + nodes.append(node) + for node in nodes: + story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} + attachments = try_get(story, [ + lambda x: x['attached_story']['attachments'], + 
lambda x: x['attachments'] + ], list) or [] + for attachment in attachments: + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) + + edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] + for edge in edges: + parse_attachment(edge, key='node') + + video = data.get('video') or {} + if video: + attachments = try_get(video, [ + lambda x: x['story']['attachments'], + lambda x: x['creation_story']['attachments'] + ], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: + parse_graphql_video(video) + + if len(entries) > 1: + return self.playlist_result(entries, video_id) + + video_info = entries[0] + webpage_info = extract_metadata(webpage) + # honor precise duration in video info + if video_info.get('duration'): + webpage_info['duration'] = video_info['duration'] + return merge_dicts(webpage_info, video_info) + + if not video_data: + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) + if m_msg is not None: + raise ExtractorError( + 'The video is not available, Facebook said: "%s"' % m_msg.group(1), + expected=True) + elif any(p in webpage for p in ( + '>You must log in to continue', + 'id="login_form"', + 'id="loginbutton"')): + self.raise_login_required() + + if not video_data and '/watchparty/' in url: + post_data = { + 'doc_id': 3731964053542869, + 'variables': json.dumps({ + 'livingRoomID': video_id, + }), + } + + prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') + if prefetched_data: + lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) + if lsd: + post_data[lsd['name']] = lsd['value'] + + relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') + for define in (relay_data.get('define') or []): + if define[0] == 'RelayAPIConfigDefaults': + self._api_config = define[2] + + living_room = self._download_json( + urljoin(url, self._api_config['graphURI']), video_id, + data=urlencode_postdata(post_data))['data']['living_room'] + + entries = [] + for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): + video = try_get(edge, lambda x: x['node']['video']) or {} + v_id = video.get('id') + if not v_id: + continue + v_id = compat_str(v_id) + entries.append(self.url_result( + self._VIDEO_PAGE_TEMPLATE % v_id, + self.ie_key(), v_id, video.get('name'))) + + return self.playlist_result(entries, video_id) + + if not video_data: + # Video info not in first request, do a secondary request using + # tahoe player specific URL + tahoe_data = self._download_webpage( + self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, + data=urlencode_postdata({ + '__a': 1, + '__pc': self._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._search_regex( + r'client_revision["\']\s*:\s*(\d+),', webpage, + 'client revision', default='3944515'), + 'fb_dtsg': self._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + webpage, 'dtsg token', default=''), + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + 'tahoe js data', default='{}'), + video_id, fatal=False) + video_data = 
extract_from_jsmods_instances(tahoe_js_data) + + if not video_data: + raise ExtractorError('Cannot parse data') + + if len(video_data) > 1: + entries = [] + for v in video_data: + video_url = v[0].get('video_url') + if not video_url: + continue + entries.append(self.url_result(urljoin( + url, video_url), self.ie_key(), v[0].get('video_id'))) + return self.playlist_result(entries, video_id) + video_data = video_data[0] + + formats = [] + subtitles = {} + for f in video_data: + format_id = f['stream_type'] + if f and isinstance(f, dict): + f = [f] + if not f or not isinstance(f, list): + continue + for quality in ('sd', 'hd'): + for src_type in ('src', 'src_no_ratelimit'): + src = f[0].get('%s_%s' % (quality, src_type)) + if src: + preference = -10 if format_id == 'progressive' else -1 + if quality == 'hd': + preference += 5 + formats.append({ + 'format_id': '%s_%s_%s' % (format_id, quality, src_type), + 'url': src, + 'quality': preference, + 'height': 720 if quality == 'hd' else None + }) + extract_dash_manifest(f[0], formats) + subtitles_src = f[0].get('subtitles_src') + if subtitles_src: + subtitles.setdefault('en', []).append({'url': subtitles_src}) + + process_formats(formats) + + info_dict = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + info_dict.update(extract_metadata(webpage)) + + return info_dict + + def _real_extract(self, url): + video_id = self._match_id(url) + + real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url + return self._extract_from_url(real_url, video_id) + + +class FacebookPluginsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)' + + _TESTS = [{ + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', + 'md5': '5954e92cdfe51fe5782ae9bda7058a07', + 'info_dict': { + 'id': '10154383743583686', + 'ext': 'mp4', + 'title': 'What to do during the haze?', + 'uploader': 'Gov.sg', + 'upload_date': '20160826', + 'timestamp': 1472184808, + }, + 'add_ie': [FacebookIE.ie_key()], + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + compat_urllib_parse_unquote(self._match_id(url)), + FacebookIE.ie_key()) diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py new file mode 100644 index 000000000..912feb702 --- /dev/null +++ b/yt_dlp/extractor/fancode.py @@ -0,0 +1,187 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + ExtractorError, + try_get, + mimetype2ext +) + + +class FancodeVodIE(InfoExtractor): + IE_NAME = 'fancode:vod' + + _VALID_URL = r'https?://(?:www\.)?fancode\.com/video/(?P<id>[0-9]+)\b' + + _TESTS = [{ + 'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi', + 'params': { + 'skip_download': True, + 'format': 'bestvideo' + }, + 'info_dict': { + 'id': '6249806281001', + 'ext': 'mp4', + 'title': 'Match Preview: PBKS vs MI', + 'thumbnail': r're:^https?://.*\.jpg$', + "timestamp": 1619081590, + 'view_count': int, + 'like_count': int, + 'upload_date': 
'20210422', + 'uploader_id': '6008340455001' + } + }, { + 'url': 'https://fancode.com/video/15043', + 'only_matching': True, + }] + + _ACCESS_TOKEN = None + _NETRC_MACHINE = 'fancode' + + _LOGIN_HINT = 'Use "--user refresh --password <refresh_token>" to login using a refresh token' + + headers = { + 'content-type': 'application/json', + 'origin': 'https://fancode.com', + 'referer': 'https://fancode.com', + } + + def _login(self): + # Access tokens are shortlived, so get them using the refresh token. + username, password = self._get_login_info() + if username == 'refresh' and password is not None: + self.report_login() + data = '''{ + "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}", + "variables":{ + "refreshToken":"%s" + }, + "operationName":"RefreshToken" + }''' % password + + token_json = self.download_gql('refresh token', data, "Getting the Access token") + self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken']) + if self._ACCESS_TOKEN is None: + self.report_warning('Failed to get Access token') + else: + self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN}) + elif username is not None: + self.report_warning(f'Login using username and password is not currently supported. {self._LOGIN_HINT}') + + def _real_initialize(self): + self._login() + + def _check_login_required(self, is_available, is_premium): + msg = None + if is_premium and self._ACCESS_TOKEN is None: + msg = f'This video is only available for registered users. {self._LOGIN_HINT}' + elif not is_available and self._ACCESS_TOKEN is not None: + msg = 'This video isn\'t available to the current logged in account' + if msg: + self.raise_login_required(msg, metadata_available=True, method=None) + + def download_gql(self, variable, data, note, fatal=False, headers=headers): + return self._download_json( + 'https://www.fancode.com/graphql', variable, + data=data.encode(), note=note, + headers=headers, fatal=fatal) + + def _real_extract(self, url): + + BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' + video_id = self._match_id(url) + + brightcove_user_id = '6008340455001' + data = '''{ + "query":"query Video($id: Int\\u0021, $filter: SegmentFilter) { media(id: $id, filter: $filter) { id contentId title contentId publishedTime totalViews totalUpvotes provider thumbnail { src } mediaSource {brightcove } duration isPremium isUserEntitled tags duration }}", + "variables":{ + "id":%s, + "filter":{ + "contentDataType":"DEFAULT" + } + }, + "operationName":"Video" + }''' % video_id + + metadata_json = self.download_gql(video_id, data, note='Downloading metadata') + + media = try_get(metadata_json, lambda x: x['data']['media'], dict) or {} + brightcove_video_id = try_get(media, lambda x: x['mediaSource']['brightcove'], compat_str) + + if brightcove_video_id is None: + raise ExtractorError('Unable to extract brightcove Video ID') + + is_premium = media.get('isPremium') + + self._check_login_required(media.get('isUserEntitled'), is_premium) + + return { + '_type': 'url_transparent', + 'url': BRIGHTCOVE_URL_TEMPLATE % (brightcove_user_id, brightcove_video_id), + 'ie_key': 'BrightcoveNew', + 'id': video_id, + 'title': media['title'], + 'like_count': media.get('totalUpvotes'), + 'view_count': media.get('totalViews'), + 'tags': media.get('tags'), + 'release_timestamp': parse_iso8601(media.get('publishedTime')), + 'availability': 
self._availability(needs_premium=is_premium), + } + + +class FancodeLiveIE(FancodeVodIE): + IE_NAME = 'fancode:live' + + _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P<id>[0-9]+).+' + + _TESTS = [{ + 'url': 'https://fancode.com/match/35328/cricket-fancode-ecs-hungary-2021-bub-vs-blb?slug=commentary', + 'info_dict': { + 'id': '35328', + 'ext': 'mp4', + 'title': 'BUB vs BLB', + "timestamp": 1624863600, + 'is_live': True, + 'upload_date': '20210628', + }, + 'skip': 'Ended' + }, { + 'url': 'https://fancode.com/match/35328/', + 'only_matching': True, + }, { + 'url': 'https://fancode.com/match/35567?slug=scorecard', + 'only_matching': True, + }] + + def _real_extract(self, url): + + id = self._match_id(url) + data = '''{ + "query":"query MatchResponse($id: Int\\u0021, $isLoggedIn: Boolean\\u0021) { match: matchWithScores(id: $id) { id matchDesc mediaId videoStreamId videoStreamUrl { ...VideoSource } liveStreams { videoStreamId videoStreamUrl { ...VideoSource } contentId } name startTime streamingStatus isPremium isUserEntitled @include(if: $isLoggedIn) status metaTags bgImage { src } sport { name slug } tour { id name } squads { name shortName } liveStreams { contentId } mediaId }}fragment VideoSource on VideoSource { title description posterUrl url deliveryType playerType}", + "variables":{ + "id":%s, + "isLoggedIn":true + }, + "operationName":"MatchResponse" + }''' % id + + info_json = self.download_gql(id, data, "Info json") + + match_info = try_get(info_json, lambda x: x['data']['match']) + + if match_info.get('streamingStatus') != "STARTED": + raise ExtractorError('The stream can\'t be accessed', expected=True) + self._check_login_required(match_info.get('isUserEntitled'), True) # all live streams are premium only + + return { + 'id': id, + 'title': match_info.get('name'), + 'formats': self._extract_akamai_formats(try_get(match_info, lambda x: x['videoStreamUrl']['url']), id), + 'ext': mimetype2ext(try_get(match_info, lambda x: x['videoStreamUrl']['deliveryType'])), + 'is_live': True, + 'release_timestamp': parse_iso8601(match_info.get('startTime')) + } diff --git a/youtube_dl/extractor/faz.py b/yt_dlp/extractor/faz.py index 312ee2aee..312ee2aee 100644 --- a/youtube_dl/extractor/faz.py +++ b/yt_dlp/extractor/faz.py diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py new file mode 100644 index 000000000..4d85e62fe --- /dev/null +++ b/yt_dlp/extractor/fc2.py @@ -0,0 +1,159 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + sanitized_Request, + urlencode_postdata, +) + + +class FC2IE(InfoExtractor): + _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)' + IE_NAME = 'fc2' + _NETRC_MACHINE = 'fc2' + _TESTS = [{ + 'url': 'http://video.fc2.com/en/content/20121103kUan1KHs', + 'md5': 'a6ebe8ebe0396518689d963774a54eb7', + 'info_dict': { + 'id': '20121103kUan1KHs', + 'ext': 'flv', + 'title': 'Boxing again with Puff', + }, + }, { + 'url': 'http://video.fc2.com/en/content/20150125cEva0hDn/', + 'info_dict': { + 'id': '20150125cEva0hDn', + 'ext': 'mp4', + }, + 'params': { + 'username': 'ytdl@yt-dl.org', + 'password': '(snip)', + }, + 'skip': 'requires actual password', + }, { + 'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF', + 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + if username is None or 
password is None: + return False + + # Log in + login_form_strs = { + 'email': username, + 'password': password, + 'done': 'video', + 'Submit': ' Login ', + } + + login_data = urlencode_postdata(login_form_strs) + request = sanitized_Request( + 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) + + login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in') + if 'mode=redirect&login=done' not in login_results: + self.report_warning('unable to log in: bad username or password') + return False + + # this is also needed + login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done') + self._download_webpage( + login_redir, None, note='Login redirect', errnote='Login redirect failed') + + return True + + def _real_extract(self, url): + video_id = self._match_id(url) + self._login() + webpage = None + if not url.startswith('fc2:'): + webpage = self._download_webpage(url, video_id) + self._downloader.cookiejar.clear_session_cookies() # must clear + self._login() + + title = 'FC2 video %s' % video_id + thumbnail = None + if webpage is not None: + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url + + mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() + + info_url = ( + 'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'. + format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E'))) + + info_webpage = self._download_webpage( + info_url, video_id, note='Downloading info page') + info = compat_urlparse.parse_qs(info_webpage) + + if 'err_code' in info: + # most of the time we can still download video even if err_code is 403 or 602 + self.report_warning( + 'Error code was: %s... but still trying' % info['err_code'][0]) + + if 'filepath' not in info: + raise ExtractorError('Cannot download file. 
Are you logged in?') + + video_url = info['filepath'][0] + '?mid=' + info['mid'][0] + title_info = info.get('title') + if title_info: + title = title_info[0] + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'flv', + 'thumbnail': thumbnail, + } + + +class FC2EmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P<query>.+)' + IE_NAME = 'fc2:embed' + + _TEST = { + 'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】', + 'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a', + 'info_dict': { + 'id': '201403223kCqB3Ez', + 'ext': 'flv', + 'title': 'プリズン・ブレイク S1-01 マイケル 【吹替】', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + query = compat_parse_qs(mobj.group('query')) + + video_id = query['i'][-1] + title = query.get('tl', ['FC2 video %s' % video_id])[0] + + sj = query.get('sj', [None])[0] + thumbnail = None + if sj: + # See thumbnailImagePath() in ServerConst.as of flv2.swf + thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( + sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) + + return { + '_type': 'url_transparent', + 'ie_key': FC2IE.ie_key(), + 'url': 'fc2:%s' % video_id, + 'title': title, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/fczenit.py b/yt_dlp/extractor/fczenit.py index 8db7c5963..8db7c5963 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/yt_dlp/extractor/fczenit.py diff --git a/yt_dlp/extractor/filmmodu.py b/yt_dlp/extractor/filmmodu.py new file mode 100644 index 000000000..2746876d5 --- /dev/null +++ b/yt_dlp/extractor/filmmodu.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class FilmmoduIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?filmmodu.org/(?P<id>[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' + _TESTS = [{ + 'url': 'https://www.filmmodu.org/f9-altyazili-izle', + 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4', + 'info_dict': { + 'id': '10804', + 'ext': 'mp4', + 'title': 'F9', + 'description': 'md5:2713f584a4d65afa2611e2948d0b953c', + 'subtitles': { + 'tr': [{ + 'ext': 'vtt', + }], + }, + 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/10804/xXHZeb1yhJvnSHPzZDqee0zfMb6.jpg', + }, + }, { + 'url': 'https://www.filmmodu.org/the-godfather-turkce-dublaj-izle', + 'md5': '109f2fcb9c941330eed133971c035c00', + 'info_dict': { + 'id': '3646', + 'ext': 'mp4', + 'title': 'Baba', + 'description': 'md5:d43fd651937cd75cc650883ebd8d8461', + 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/3646/6xKCYgH16UuwEGAyroLU6p8HLIn.jpg', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage, fatal=True) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + real_video_id = self._search_regex(r'var\s*videoId\s*=\s*\'([0-9]+)\'', webpage, 'video_id') + video_type = self._search_regex(r'var\s*videoType\s*=\s*\'([a-z]+)\'', webpage, 'video_type') + data = self._download_json('https://www.filmmodu.org/get-source', real_video_id, query={ + 'movie_id': real_video_id, + 'type': video_type, + }) + formats = [{ + 
'url': source['src'], + 'ext': 'mp4', + 'format_id': source['label'], + 'height': int_or_none(source.get('res')), + 'protocol': 'm3u8_native', + } for source in data['sources']] + + self._sort_formats(formats) + + subtitles = {} + + if data.get('subtitle'): + subtitles['tr'] = [{ + 'url': data['subtitle'], + }] + + return { + 'id': real_video_id, + 'display_id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/filmon.py b/yt_dlp/extractor/filmon.py index f775fe0ba..f775fe0ba 100644 --- a/youtube_dl/extractor/filmon.py +++ b/yt_dlp/extractor/filmon.py diff --git a/yt_dlp/extractor/filmweb.py b/yt_dlp/extractor/filmweb.py new file mode 100644 index 000000000..5e323b4f8 --- /dev/null +++ b/yt_dlp/extractor/filmweb.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class FilmwebIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?P<type>trailere|filmnytt)/article(?P<id>\d+)\.ece' + _TEST = { + 'url': 'http://www.filmweb.no/trailere/article1264921.ece', + 'md5': 'e353f47df98e557d67edaceda9dece89', + 'info_dict': { + 'id': '13033574', + 'ext': 'mp4', + 'title': 'Det som en gang var', + 'upload_date': '20160316', + 'timestamp': 1458140101, + 'uploader_id': '12639966', + 'uploader': 'Live Roaldset', + } + } + + def _real_extract(self, url): + article_type, article_id = self._match_valid_url(url).groups() + if article_type == 'filmnytt': + webpage = self._download_webpage(url, article_id) + article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id') + embed_code = self._download_json( + 'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp', + article_id, query={ + 'articleId': article_id, + })['embedCode'] + iframe_url = self._proto_relative_url(self._search_regex( + r'<iframe[^>]+src="([^"]+)', embed_code, 'iframe url')) + + return { + '_type': 'url_transparent', + 'id': article_id, + 'url': iframe_url, + 'ie_key': 'TwentyThreeVideo', + } diff --git a/yt_dlp/extractor/firsttv.py b/yt_dlp/extractor/firsttv.py new file mode 100644 index 000000000..ccad173b7 --- /dev/null +++ b/yt_dlp/extractor/firsttv.py @@ -0,0 +1,156 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + int_or_none, + qualities, + unified_strdate, + url_or_none, +) + + +class FirstTVIE(InfoExtractor): + IE_NAME = '1tv' + IE_DESC = 'Первый канал' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)' + + _TESTS = [{ + # single format + 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', + 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + 'info_dict': { + 'id': '40049', + 'ext': 'mp4', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20150212', + 'duration': 2694, + }, + }, { + # multiple formats + 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', + 'info_dict': { + 'id': '364746', + 'ext': 'mp4', + 'title': 'Весенняя аллергия. Доброе утро. 
Фрагмент выпуска от 07.04.2016', + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00', + 'info_dict': { + 'id': '14:00', + 'title': 'Выпуск новостей в 14:00 1 декабря 2016 года. Новости. Первый канал', + 'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd', + }, + 'playlist_count': 13, + }, { + 'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + playlist_url = compat_urlparse.urljoin(url, self._search_regex( + r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'playlist url', group='url')) + + parsed_url = compat_urlparse.urlparse(playlist_url) + qs = compat_urlparse.parse_qs(parsed_url.query) + item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') + + items = self._download_json(playlist_url, display_id) + + if item_ids: + items = [ + item for item in items + if item.get('uid') and compat_str(item['uid']) in item_ids] + else: + items = [items[0]] + + entries = [] + QUALITIES = ('ld', 'sd', 'hd', ) + + for item in items: + title = item['title'] + quality = qualities(QUALITIES) + formats = [] + path = None + for f in item.get('mbr', []): + src = url_or_none(f.get('src')) + if not src: + continue + tbr = int_or_none(self._search_regex( + r'_(\d{3,})\.mp4', src, 'tbr', default=None)) + if not path: + path = self._search_regex( + r'//[^/]+/(.+?)_\d+\.mp4', src, + 'm3u8 path', default=None) + formats.append({ + 'url': src, + 'format_id': f.get('name'), + 'tbr': tbr, + 'source_preference': quality(f.get('name')), + # quality metadata of http formats may be incorrect + 'preference': -10, + }) + # m3u8 URL format is reverse engineered from [1] (search for + # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru) + # is taken from [2]. + # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted + # 2. 
http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834 + if not path and len(formats) == 1: + path = self._search_regex( + r'//[^/]+/(.+?$)', formats[0]['url'], + 'm3u8 path', default=None) + if path: + if len(formats) == 1: + m3u8_path = ',' + else: + tbrs = [compat_str(t) for t in sorted(f['tbr'] for f in formats)] + m3u8_path = '_,%s,%s' % (','.join(tbrs), '.mp4') + formats.extend(self._extract_m3u8_formats( + 'http://balancer-vod.1tv.ru/%s%s.urlset/master.m3u8' + % (path, m3u8_path), + display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) + duration = int_or_none(item.get('duration') or self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', default=None)) + + entries.append({ + 'id': compat_str(item.get('id') or item['uid']), + 'thumbnail': thumbnail, + 'title': title, + 'upload_date': upload_date, + 'duration': int_or_none(duration), + 'formats': formats + }) + + title = self._html_search_regex( + (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) + description = self._html_search_regex( + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + + return self.playlist_result(entries, display_id, title, description) diff --git a/youtube_dl/extractor/fivemin.py b/yt_dlp/extractor/fivemin.py index f3f876ecd..f3f876ecd 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/yt_dlp/extractor/fivemin.py diff --git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py new file mode 100644 index 000000000..be81fccb8 --- /dev/null +++ b/yt_dlp/extractor/fivetv.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import int_or_none + + +class FiveTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?5-tv\.ru/ + (?: + (?:[^/]+/)+(?P<id>\d+)| + (?P<path>[^/?#]+)(?:[/?#])? + ) + ''' + + _TESTS = [{ + 'url': 'http://5-tv.ru/news/96814/', + 'md5': 'bbff554ad415ecf5416a2f48c22d9283', + 'info_dict': { + 'id': '96814', + 'ext': 'mp4', + 'title': 'Россияне выбрали имя для общенациональной платежной системы', + 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://5-tv.ru/video/1021729/', + 'info_dict': { + 'id': '1021729', + 'ext': 'mp4', + 'title': '3D принтер', + 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + # redirect to https://www.5-tv.ru/projects/1000095/izvestia-glavnoe/ + 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', + 'info_dict': { + 'id': 'glavnoe', + 'ext': 'mp4', + 'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'redirect to «Известия. 
Главное» project page', + }, { + 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/films/1507502/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/programs/broadcast/508713/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/angel/', + 'only_matching': True, + }, { + 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + [r'<div[^>]+?class="(?:flow)?player[^>]+?data-href="([^"]+)"', + r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], + webpage, 'video url') + + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'<title>([^<]+)</title>', webpage, 'title') + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, 'duration', default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + } diff --git a/yt_dlp/extractor/flickr.py b/yt_dlp/extractor/flickr.py new file mode 100644 index 000000000..6c82fae3c --- /dev/null +++ b/yt_dlp/extractor/flickr.py @@ -0,0 +1,116 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + qualities, +) + + +class FlickrIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', + 'md5': '164fe3fa6c22e18d448d4d5af2330f31', + 'info_dict': { + 'id': '5645318632', + 'ext': 'mpg', + 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', + 'title': 'Dark Hollow Waterfalls', + 'duration': 19, + 'timestamp': 1303528740, + 'upload_date': '20110423', + 'uploader_id': '10922353@N03', + 'uploader': 'Forest Wander', + 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', + 'comment_count': int, + 'view_count': int, + 'tags': list, + 'license': 'Attribution-ShareAlike', + } + } + _API_BASE_URL = 'https://api.flickr.com/services/rest?' 
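+ # (the trailing '?' matters: _call_api below builds each request as + # _API_BASE_URL + compat_urllib_parse_urlencode(query), with query carrying + # photo_id, method, api_key, format=json and nojsoncallback=1)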
+ # https://help.yahoo.com/kb/flickr/SLN25525.html + _LICENSES = { + '0': 'All Rights Reserved', + '1': 'Attribution-NonCommercial-ShareAlike', + '2': 'Attribution-NonCommercial', + '3': 'Attribution-NonCommercial-NoDerivs', + '4': 'Attribution', + '5': 'Attribution-ShareAlike', + '6': 'Attribution-NoDerivs', + '7': 'No known copyright restrictions', + '8': 'United States government work', + '9': 'Public Domain Dedication (CC0)', + '10': 'Public Domain Work', + } + + def _call_api(self, method, video_id, api_key, note, secret=None): + query = { + 'photo_id': video_id, + 'method': 'flickr.%s' % method, + 'api_key': api_key, + 'format': 'json', + 'nojsoncallback': 1, + } + if secret: + query['secret'] = secret + data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note) + if data['stat'] != 'ok': + raise ExtractorError(data['message']) + return data + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_key = self._download_json( + 'https://www.flickr.com/hermes_error_beacon.gne', video_id, + 'Downloading api key')['site_key'] + + video_info = self._call_api( + 'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] + if video_info['media'] == 'video': + streams = self._call_api( + 'video.getStreamInfo', video_id, api_key, + 'Downloading streams info', video_info['secret'])['streams'] + + preference = qualities( + ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) + + formats = [] + for stream in streams['stream']: + stream_type = compat_str(stream.get('type')) + formats.append({ + 'format_id': stream_type, + 'url': stream['_content'], + 'quality': preference(stream_type), + }) + self._sort_formats(formats) + + owner = video_info.get('owner', {}) + uploader_id = owner.get('nsid') + uploader_path = owner.get('path_alias') or uploader_id + uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None + + return { + 'id': video_id, + 'title': video_info['title']['_content'], + 'description': video_info.get('description', {}).get('_content'), + 'formats': formats, + 'timestamp': int_or_none(video_info.get('dateuploaded')), + 'duration': int_or_none(video_info.get('video', {}).get('duration')), + 'uploader_id': uploader_id, + 'uploader': owner.get('realname'), + 'uploader_url': uploader_url, + 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), + 'view_count': int_or_none(video_info.get('views')), + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], + 'license': self._LICENSES.get(video_info.get('license')), + } + else: + raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/folketinget.py b/yt_dlp/extractor/folketinget.py index b3df93f28..b3df93f28 100644 --- a/youtube_dl/extractor/folketinget.py +++ b/yt_dlp/extractor/folketinget.py diff --git a/youtube_dl/extractor/footyroom.py b/yt_dlp/extractor/footyroom.py index 118325b6d..118325b6d 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/yt_dlp/extractor/footyroom.py diff --git a/youtube_dl/extractor/formula1.py b/yt_dlp/extractor/formula1.py index 67662e6de..67662e6de 100644 --- a/youtube_dl/extractor/formula1.py +++ b/yt_dlp/extractor/formula1.py diff --git a/yt_dlp/extractor/fourtube.py b/yt_dlp/extractor/fourtube.py new file mode 100644 index 000000000..d4d955b6b --- /dev/null +++ b/yt_dlp/extractor/fourtube.py @@ -0,0 +1,309 @@ +from __future__ import unicode_literals + +import re + +from .common import 
InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + str_or_none, + str_to_int, + try_get, + unified_timestamp, + url_or_none, +) + + +class FourTubeBaseIE(InfoExtractor): + def _extract_formats(self, url, video_id, media_id, sources): + token_url = 'https://%s/%s/desktop/%s' % ( + self._TKN_HOST, media_id, '+'.join(sources)) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') + + if kind == 'm' or not display_id: + url = self._URL_TEMPLATE % video_id + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta('name', webpage) + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage)) + thumbnail = self._html_search_meta('thumbnailUrl', webpage) + uploader_id = self._html_search_regex( + r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">', + webpage, 'uploader id', fatal=False) + uploader = self._html_search_regex( + r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">', + webpage, 'uploader', fatal=False) + + categories_html = self._search_regex( + r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>', + webpage, 'categories', fatal=False) + categories = None + if categories_html: + categories = [ + c.strip() for c in re.findall( + r'(?s)<li><a.*?>(.*?)</a>', categories_html)] + + view_count = str_to_int(self._search_regex( + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', + webpage, 'view count', default=None)) + like_count = str_to_int(self._search_regex( + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', + webpage, 'like count', default=None)) + duration = parse_duration(self._html_search_meta('duration', webpage)) + + media_id = self._search_regex( + r'<button[^>]+data-id=(["\'])(?P<id>\d+)\1[^>]+data-quality=', webpage, + 'media id', default=None, group='id') + sources = [ + quality + for _, quality in re.findall(r'<button[^>]+data-quality=(["\'])(.+?)\1', webpage)] + if not (media_id and sources): + player_js = self._download_webpage( + self._search_regex( + r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2', + webpage, 'player JS', group='url'), + video_id, 'Downloading player JS') + params_js = self._search_regex( + r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', + player_js, 'initialization parameters') + params = self._parse_json('[%s]' % params_js, video_id) + media_id = params[0] + sources = ['%s' % p for p in params[2]] + + formats = self._extract_formats(url, video_id, media_id, sources) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'categories': categories, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'timestamp': timestamp, + 'like_count': like_count, + 'view_count': view_count, + 'duration': duration, + 
'age_limit': 18, + } + + +class FourTubeIE(FourTubeBaseIE): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video' + _TKN_HOST = 'token.4tube.com' + _TESTS = [{ + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'timestamp': 1383263892, + 'duration': 583, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + }, { + 'url': 'http://www.4tube.com/embed/209733', + 'only_matching': True, + }, { + 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'only_matching': True, + }] + + +class FuxIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.fux.com/video/%s/video' + _TKN_HOST = 'token.fux.com' + _TESTS = [{ + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'info_dict': { + 'id': '195359', + 'ext': 'mp4', + 'title': 'Awesome fucking in the kitchen ends with cum swallow', + 'uploader': 'alenci2342', + 'uploader_id': 'alenci2342', + 'upload_date': '20131230', + 'timestamp': 1388361660, + 'duration': 289, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.fux.com/embed/195359', + 'only_matching': True, + }, { + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'only_matching': True, + }] + + +class PornTubeIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' + _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TKN_HOST = 'tkn.porntube.com' + _TESTS = [{ + 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', + 'info_dict': { + 'id': '7089759', + 'ext': 'mp4', + 'title': 'Teen couple doing anal', + 'uploader': 'Alexy', + 'uploader_id': '91488', + 'upload_date': '20150606', + 'timestamp': 1433595647, + 'duration': 5052, + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/videos/squirting-teen-ballerina-ecg_1331406', + 'info_dict': { + 'id': '1331406', + 'ext': 'mp4', + 'title': 'Squirting Teen Ballerina on ECG', + 'uploader': 'Exploited College Girls', + 'uploader_id': '665', + 'channel': 'Exploited College Girls', + 'channel_id': '665', + 'upload_date': '20130920', + 'timestamp': 1379685485, + 'duration': 851, + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/embed/7089759', + 'only_matching': True, + }, { + 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + 
r'INITIALSTATE\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'data', group='value'), video_id, + transform_source=lambda x: compat_urllib_parse_unquote( + compat_b64decode(x).decode('utf-8')))['page']['video'] + + title = video['title'] + media_id = video['mediaId'] + sources = [compat_str(e['height']) + for e in video['encodings'] if e.get('height')] + formats = self._extract_formats(url, video_id, media_id, sources) + + thumbnail = url_or_none(video.get('masterThumb')) + uploader = try_get(video, lambda x: x['user']['username'], compat_str) + uploader_id = str_or_none(try_get( + video, lambda x: x['user']['id'], int)) + channel = try_get(video, lambda x: x['channel']['name'], compat_str) + channel_id = str_or_none(try_get( + video, lambda x: x['channel']['id'], int)) + like_count = int_or_none(video.get('likes')) + dislike_count = int_or_none(video.get('dislikes')) + view_count = int_or_none(video.get('playsQty')) + duration = int_or_none(video.get('durationInSeconds')) + timestamp = unified_timestamp(video.get('publishedAt')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'uploader': uploader or channel, + 'uploader_id': uploader_id or channel_id, + 'channel': channel, + 'channel_id': channel_id, + 'timestamp': timestamp, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'view_count': view_count, + 'duration': duration, + 'age_limit': 18, + } + + +class PornerBrosIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' + _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s' + _TKN_HOST = 'token.pornerbros.com' + _TESTS = [{ + 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '181369', + 'ext': 'mp4', + 'title': 'Skinny brunette takes big cock down her anal hole', + 'uploader': 'PornerBros HD', + 'uploader_id': 'pornerbros-hd', + 'upload_date': '20130130', + 'timestamp': 1359527401, + 'duration': 1224, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.pornerbros.com/embed/181369', + 'only_matching': True, + }, { + 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/fox.py b/yt_dlp/extractor/fox.py index 04f4bdba6..04f4bdba6 100644 --- a/youtube_dl/extractor/fox.py +++ b/yt_dlp/extractor/fox.py diff --git a/youtube_dl/extractor/fox9.py b/yt_dlp/extractor/fox9.py index 91f8f7b8a..91f8f7b8a 100644 --- a/youtube_dl/extractor/fox9.py +++ b/yt_dlp/extractor/fox9.py diff --git a/youtube_dl/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py index 512a10645..512a10645 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/yt_dlp/extractor/foxgay.py diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py new file mode 100644 index 000000000..18fa0a5ef --- /dev/null +++ b/yt_dlp/extractor/foxnews.py @@ -0,0 +1,127 @@ +from __future__ import unicode_literals + +import re + +from .amp import AMPIE +from .common import InfoExtractor + + +class FoxNewsIE(AMPIE): + IE_NAME = 'foxnews' + IE_DESC = 'Fox News and Fox Business Video' + _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' + _TESTS = [ + { + 'url': 
'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', + 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', + 'info_dict': { + 'id': '3937480', + 'ext': 'flv', + 'title': 'Frozen in Time', + 'description': '16-year-old girl is size of toddler', + 'duration': 265, + 'timestamp': 1304411491, + 'upload_date': '20110503', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', + 'md5': '5846c64a1ea05ec78175421b8323e2df', + 'info_dict': { + 'id': '3922535568001', + 'ext': 'mp4', + 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", + 'description': "Congressman discusses president's plan", + 'duration': 292, + 'timestamp': 1417662047, + 'upload_date': '20141204', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', + 'only_matching': True, + }, + { + 'url': 'http://video.foxbusiness.com/v/4442309889001', + 'only_matching': True, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'only_matching': True, + }, + ] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).groups() + + info = self._extract_feed_info( + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info['id'] = video_id + return info + + +class FoxNewsArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' + IE_NAME = 'foxnews:article' + + _TESTS = [{ + # data-video-id + 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'info_dict': { + 'id': '5116295019001', + 'ext': 'mp4', + 'title': 'Trump and Clinton asked to defend positions on Iraq War', + 'description': 'Veterans react on \'The Kelly File\'', + 'timestamp': 1473301045, + 'upload_date': '20160908', + }, + }, { + # iframe embed + 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'info_dict': { + 'id': '5748266721001', + 'ext': 'flv', + 'title': 'Kyle Kashuv has a positive message for the Trump White House', + 'description': 'Marjory Stoneman Douglas student disagrees with classmates.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 229, + 'timestamp': 1520594670, + 'upload_date': '20180309', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, 
display_id) + + video_id = self._html_search_regex( + r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', + webpage, 'video ID', group='id', default=None) + if video_id: + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) + + return self.url_result( + FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) diff --git a/youtube_dl/extractor/foxsports.py b/yt_dlp/extractor/foxsports.py index 2b2cb6c6f..2b2cb6c6f 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/yt_dlp/extractor/foxsports.py diff --git a/youtube_dl/extractor/franceculture.py b/yt_dlp/extractor/franceculture.py index 14f4cb489..14f4cb489 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/yt_dlp/extractor/franceculture.py diff --git a/youtube_dl/extractor/franceinter.py b/yt_dlp/extractor/franceinter.py index ae822a50e..ae822a50e 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/yt_dlp/extractor/franceinter.py diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py new file mode 100644 index 000000000..3bbab69e6 --- /dev/null +++ b/yt_dlp/extractor/francetv.py @@ -0,0 +1,391 @@ +# coding: utf-8 + +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + format_field, + parse_iso8601, + parse_qs, +) +from .dailymotion import DailymotionIE + + +class FranceTVBaseInfoExtractor(InfoExtractor): + def _make_url_result(self, video_or_full_id, catalog=None): + full_id = 'francetv:%s' % video_or_full_id + if '@' not in video_or_full_id and catalog: + full_id += '@%s' % catalog + return self.url_result( + full_id, ie=FranceTVIE.ie_key(), + video_id=video_or_full_id.split('@')[0]) + + +class FranceTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https?:// + sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\? + .*?\bidDiffusion=[^&]+| + (?: + https?://videos\.francetv\.fr/video/| + francetv: + ) + (?P<id>[^@]+)(?:@(?P<catalog>.+))? + ) + ''' + + _TESTS = [{ + # without catalog + 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', + 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', + 'info_dict': { + 'id': '162311093', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1502623500, + 'upload_date': '20170813', + }, + }, { + # with catalog + 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', + 'only_matching': True, + }, { + 'url': 'http://videos.francetv.fr/video/NI_657393@Regions', + 'only_matching': True, + }, { + 'url': 'francetv:162311093', + 'only_matching': True, + }, { + 'url': 'francetv:NI_1004933@Zouzous', + 'only_matching': True, + }, { + 'url': 'francetv:NI_983319@Info-web', + 'only_matching': True, + }, { + 'url': 'francetv:NI_983319', + 'only_matching': True, + }, { + 'url': 'francetv:NI_657393@Regions', + 'only_matching': True, + }, { + # france-3 live + 'url': 'francetv:SIM_France3', + 'only_matching': True, + }] + + def _extract_video(self, video_id, catalogue=None): + # Videos are identified by idDiffusion so catalogue part is optional. + # However when provided, some extra formats may be returned so we pass + # it if available. 
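+        # For reference, each per-device request below has the shape
+        # (illustrative values, not captured from a real session):
+        #   https://player.webservices.francetelevisions.fr/v1/videos/162311093
+        #       ?device_type=desktop&browser=chrome
+        # and returns a `video` object (streams, duration, live flag,
+        # spritesheets) plus a `meta` object (title, image, broadcast date);
+        # the fields are merged across devices, first non-empty value winning.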
+ is_live = None + videos = [] + title = None + subtitle = None + image = None + duration = None + timestamp = None + spritesheets = None + + for device_type in ('desktop', 'mobile'): + dinfo = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if not dinfo: + continue + + video = dinfo.get('video') + if video: + videos.append(video) + if duration is None: + duration = video.get('duration') + if is_live is None: + is_live = video.get('is_live') + if spritesheets is None: + spritesheets = video.get('spritesheets') + + meta = dinfo.get('meta') + if meta: + if title is None: + title = meta.get('title') + # XXX: what is meta['pre_title']? + if subtitle is None: + subtitle = meta.get('additional_title') + if image is None: + image = meta.get('image_url') + if timestamp is None: + timestamp = parse_iso8601(meta.get('broadcasted_at')) + + formats = [] + subtitles = {} + for video in videos: + format_id = video.get('format') + + video_url = None + if video.get('workflow') == 'token-akamai': + token_url = video.get('token') + if token_url: + token_json = self._download_json( + token_url, video_id, + 'Downloading signed %s manifest URL' % format_id) + if token_json: + video_url = token_json.get('url') + if not video_url: + video_url = video.get('url') + + ext = determine_ext(video_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, mpd_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif video_url.startswith('rtmp'): + formats.append({ + 'url': video_url, + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + }) + else: + if self._is_valid_url(video_url, video_id, format_id): + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + + # XXX: what is video['captions']? 
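+        # 'qtz' and 'qad' sit in the ISO 639-2 private-use range (qaa-qtz) and
+        # appear to mark audio-description tracks here; the loop below keeps
+        # them but ranks them below regular audio. This reading is inferred
+        # from the handling itself, not from any documented list of codes.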
+ + for f in formats: + if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'): + f['language_preference'] = -10 + f['format_note'] = 'audio description%s' % format_field(f, 'format_note', ', %s') + + if spritesheets: + formats.append({ + 'format_id': 'spritesheets', + 'format_note': 'storyboard', + 'acodec': 'none', + 'vcodec': 'none', + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'url': 'about:dummy', + 'fragments': [{ + 'path': sheet, + # XXX: not entirely accurate; each spritesheet seems to be + # a 10×10 grid of thumbnails corresponding to approximately + # 2 seconds of the video; the last spritesheet may be shorter + 'duration': 200, + } for sheet in spritesheets] + }) + + self._sort_formats(formats) + + if subtitle: + title += ' - %s' % subtitle + title = title.strip() + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': image, + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + catalog = mobj.group('catalog') + + if not video_id: + qs = parse_qs(url) + video_id = qs.get('idDiffusion', [None])[0] + catalog = qs.get('catalogue', [None])[0] + if not video_id: + raise ExtractorError('Invalid URL', expected=True) + + return self._extract_video(video_id, catalog) + + +class FranceTVSiteIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html' + + _TESTS = [{ + 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', + 'info_dict': { + 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', + 'ext': 'mp4', + 'title': '13h15, le dimanche... 
- Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1502623500, + 'upload_date': '20170813', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }, { + # france3 + 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', + 'only_matching': True, + }, { + # france4 + 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', + 'only_matching': True, + }, { + # france5 + 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', + 'only_matching': True, + }, { + # franceo + 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', + 'only_matching': True, + }, { + # france2 live + 'url': 'https://www.france.tv/france-2/direct.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', + 'only_matching': True, + }, { + 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/142749-rouge-sang.html', + 'only_matching': True, + }, { + # france-3 live + 'url': 'https://www.france.tv/france-3/direct.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + catalogue = None + video_id = self._search_regex( + r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'video id', default=None, group='id') + + if not video_id: + video_id, catalogue = self._html_search_regex( + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', + webpage, 'video ID').split('@') + + return self._make_url_result(video_id, catalogue) + + +class FranceTVInfoIE(FranceTVBaseInfoExtractor): + IE_NAME = 'francetvinfo.fr' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)' + + _TESTS = [{ + 'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html', + 'info_dict': { + 'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793', + 'ext': 'mp4', + 'title': 'Soir 3', + 'upload_date': '20190822', + 'timestamp': 1566510900, + 'description': 'md5:72d167097237701d6e8452ff03b83c00', + 'subtitles': { + 'fr': 'mincount:2', + }, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }, { + 'note': 'Only an image exists in initial webpage instead of the video', + 'url': 'https://www.francetvinfo.fr/sante/maladie/coronavirus/covid-19-en-inde-une-situation-catastrophique-a-new-dehli_4381095.html', + 'info_dict': { + 'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482', + 'ext': 'mp4', + 'title': 'Covid-19 : une situation catastrophique à New Dehli', + 'thumbnail': str, + 'duration': 76, + 'timestamp': 1619028518, + 'upload_date': '20210421', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }, { + 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', + 'only_matching': True, + }, { + 'url': 
'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', + 'only_matching': True, + }, { + 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', + 'only_matching': True, + }, { + # Dailymotion embed + 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', + 'md5': 'ee7f1828f25a648addc90cb2687b1f12', + 'info_dict': { + 'id': 'x4iiko0', + 'ext': 'mp4', + 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', + 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', + 'timestamp': 1467011958, + 'upload_date': '20160627', + 'uploader': 'France Inter', + 'uploader_id': 'x2q2ez', + }, + 'add_ie': ['Dailymotion'], + }, { + 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', + 'only_matching': True, + }, { + # "<figure id=" pattern (#28792) + 'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + dailymotion_urls = DailymotionIE._extract_urls(webpage) + if dailymotion_urls: + return self.playlist_result([ + self.url_result(dailymotion_url, DailymotionIE.ie_key()) + for dailymotion_url in dailymotion_urls]) + + video_id = self._search_regex( + (r'player\.load[^;]+src:\s*["\']([^"\']+)', + r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + webpage, 'video id') + + return self._make_url_result(video_id) diff --git a/youtube_dl/extractor/freesound.py b/yt_dlp/extractor/freesound.py index 138b6bc58..138b6bc58 100644 --- a/youtube_dl/extractor/freesound.py +++ b/yt_dlp/extractor/freesound.py diff --git a/youtube_dl/extractor/freespeech.py b/yt_dlp/extractor/freespeech.py index ea9c3e317..ea9c3e317 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/yt_dlp/extractor/freespeech.py diff --git a/youtube_dl/extractor/freshlive.py b/yt_dlp/extractor/freshlive.py index 72a845945..72a845945 100644 --- a/youtube_dl/extractor/freshlive.py +++ b/yt_dlp/extractor/freshlive.py diff --git a/yt_dlp/extractor/frontendmasters.py b/yt_dlp/extractor/frontendmasters.py new file mode 100644 index 000000000..40b8cb0b4 --- /dev/null +++ b/yt_dlp/extractor/frontendmasters.py @@ -0,0 +1,263 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, + url_or_none, + urlencode_postdata, +) + + +class FrontendMastersBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki' + _LOGIN_URL = 'https://frontendmasters.com/login/' + + _NETRC_MACHINE = 'frontendmasters' + + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'mid': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + def 
_real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + # Successful login + if any(p in response for p in ( + 'wp-login.php?action=logout', '>Logout')): + return + + error = self._html_search_regex( + r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<', + response, 'error message', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class FrontendMastersPageBaseIE(FrontendMastersBaseIE): + def _download_course(self, course_name, url): + return self._download_json( + '%s/courses/%s' % (self._API_BASE, course_name), course_name, + 'Downloading course JSON', headers={'Referer': url}) + + @staticmethod + def _extract_chapters(course): + chapters = [] + lesson_elements = course.get('lessonElements') + if isinstance(lesson_elements, list): + chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)] + return chapters + + @staticmethod + def _extract_lesson(chapters, lesson_id, lesson): + title = lesson.get('title') or lesson_id + display_id = lesson.get('slug') + description = lesson.get('description') + thumbnail = lesson.get('thumbnail') + + chapter_number = None + index = lesson.get('index') + element_index = lesson.get('elementIndex') + if (isinstance(index, int) and isinstance(element_index, int) + and index < element_index): + chapter_number = element_index - index + chapter = (chapters[chapter_number - 1] + if chapter_number - 1 < len(chapters) else None) + + duration = None + timestamp = lesson.get('timestamp') + if isinstance(timestamp, compat_str): + mobj = re.search( + r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})', + timestamp) + if mobj: + duration = parse_duration(mobj.group('end')) - parse_duration( + mobj.group('start')) + + return { + '_type': 'url_transparent', + 'url': 'frontendmasters:%s' % lesson_id, + 'ie_key': FrontendMastersIE.ie_key(), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'chapter': chapter, + 'chapter_number': chapter_number, + } + + +class FrontendMastersIE(FrontendMastersBaseIE): + _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', + 'info_dict': { + 'id': 'a2qogef6ba', + 'ext': 'mp4', + 'title': 'a2qogef6ba', + }, + 'skip': 'Requires FrontendMasters account credentials', + }, { + 'url': 'frontendmasters:a2qogef6ba', + 'only_matching': True, + }] + + def _real_extract(self, url): + lesson_id = self._match_id(url) + + source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) + + formats 
= []
+        for ext in ('webm', 'mp4'):
+            for quality in ('low', 'mid', 'high'):
+                resolution = self._QUALITIES[quality].copy()
+                format_id = '%s-%s' % (ext, quality)
+                # _download_json returns no value on failure (fatal=False),
+                # so guard the lookup instead of crashing on a missing source
+                source_json = self._download_json(
+                    source_url, lesson_id,
+                    'Downloading %s source JSON' % format_id, query={
+                        'f': ext,
+                        'r': resolution['height'],
+                    }, headers={
+                        'Referer': url,
+                    }, fatal=False) or {}
+                format_url = url_or_none(source_json.get('url'))
+
+                if not format_url:
+                    continue
+
+                f = resolution.copy()
+                f.update({
+                    'url': format_url,
+                    'ext': ext,
+                    'format_id': format_id,
+                })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {
+            'en': [{
+                'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id),
+            }]
+        }
+
+        return {
+            'id': lesson_id,
+            'title': lesson_id,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class FrontendMastersLessonIE(FrontendMastersPageBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)'
+    _TEST = {
+        'url': 'https://frontendmasters.com/courses/web-development/tools',
+        'info_dict': {
+            'id': 'a2qogef6ba',
+            'display_id': 'tools',
+            'ext': 'mp4',
+            'title': 'Tools',
+            'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'chapter': 'Introduction',
+            'chapter_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires FrontendMasters account credentials',
+    }
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        course_name, lesson_name = mobj.group('course_name', 'lesson_name')
+
+        course = self._download_course(course_name, url)
+
+        lesson_id, lesson = next(
+            (video_id, data)
+            for video_id, data in course['lessonData'].items()
+            if data.get('slug') == lesson_name)
+
+        chapters = self._extract_chapters(course)
+        return self._extract_lesson(chapters, lesson_id, lesson)
+
+
+class FrontendMastersCourseIE(FrontendMastersPageBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'https://frontendmasters.com/courses/web-development/',
+        'info_dict': {
+            'id': 'web-development',
+            'title': 'Introduction to Web Development',
+            'description': 'md5:9317e6e842098bf725d62360e52d49a6',
+        },
+        'playlist_count': 81,
+        'skip': 'Requires FrontendMasters account credentials',
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if FrontendMastersLessonIE.suitable(url) else super(
+            FrontendMastersBaseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_name = self._match_id(url)
+
+        course = self._download_course(course_name, url)
+
+        chapters = self._extract_chapters(course)
+
+        lessons = sorted(
+            course['lessonData'].values(), key=lambda data: data['index'])
+
+        entries = []
+        for lesson in lessons:
+            lesson_name = lesson.get('slug')
+            if not lesson_name:
+                continue
+            lesson_id = lesson.get('hash') or lesson.get('statsId')
+            entries.append(self._extract_lesson(chapters, lesson_id, lesson))
+
+        title = course.get('title')
+        description = course.get('description')
+
+        return self.playlist_result(entries, course_name, title, description)
diff --git a/youtube_dl/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py
index a02a94374..a02a94374 100644
--- a/youtube_dl/extractor/fujitv.py
+++ b/yt_dlp/extractor/fujitv.py
diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py
new file mode 100644
index 000000000..382cbe159
--- /dev/null
+++ b/yt_dlp/extractor/funimation.py
@@ -0,0 +1,352 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+import string
+
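+# Reader's note (not from upstream documentation): FunimationPageIE resolves a
+# show/episode slug to a numeric experience id via the title-api service and
+# hands off to FunimationIE, which parses the `show` JSON embedded in the
+# player page and requests stream URLs from /api/showexperience/<id>/ using a
+# random `pinst_id`; FunimationShowIE expands a show page into a playlist.
+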
+from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + orderedSet, + qualities, + str_or_none, + traverse_obj, + try_get, + urlencode_postdata, + ExtractorError, +) + + +class FunimationBaseIE(InfoExtractor): + _NETRC_MACHINE = 'funimation' + _REGION = None + _TOKEN = None + + def _get_region(self): + region_cookie = self._get_cookies('https://www.funimation.com').get('region') + region = region_cookie.value if region_cookie else self.get_param('geo_bypass_country') + return region or traverse_obj( + self._download_json( + 'https://geo-service.prd.funimationsvc.com/geo/v1/region/check', None, fatal=False, + note='Checking geo-location', errnote='Unable to fetch geo-location information'), + 'region') or 'US' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + try: + data = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + })) + return data['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['error'] + raise ExtractorError(error, expected=True) + raise + + +class FunimationPageIE(FunimationBaseIE): + IE_NAME = 'funimation:page' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:(?P<lang>[^/]+)/)?(?:shows|v)/(?P<show>[^/]+)/(?P<episode>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', + 'info_dict': { + 'id': '210050', + 'ext': 'mp4', + 'title': 'Broadcast Dub Preview', + # Other metadata is tested in FunimationIE + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'add_ie': ['Funimation'], + }, { + # Not available in US + 'url': 'https://www.funimation.com/shows/hacksign/role-play/', + 'only_matching': True, + }, { + # with lang code + 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/', + 'only_matching': True, + }, { + 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', + 'only_matching': True, + }, { + 'url': 'https://www.funimation.com/v/a-certain-scientific-railgun/super-powered-level-5', + 'only_matching': True, + }] + + def _real_initialize(self): + if not self._REGION: + FunimationBaseIE._REGION = self._get_region() + if not self._TOKEN: + FunimationBaseIE._TOKEN = self._login() + + def _real_extract(self, url): + locale, show, episode = self._match_valid_url(url).group('lang', 'show', 'episode') + + video_id = traverse_obj(self._download_json( + f'https://title-api.prd.funimationsvc.com/v1/shows/{show}/episodes/{episode}', + f'{show}_{episode}', query={ + 'deviceType': 'web', + 'region': self._REGION, + 'locale': locale or 'en' + }), ('videoList', ..., 'id'), get_all=False) + + return self.url_result(f'https://www.funimation.com/player/{video_id}', FunimationIE.ie_key(), video_id) + + +class FunimationIE(FunimationBaseIE): + _VALID_URL = r'https?://(?:www\.)?funimation\.com/player/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.funimation.com/player/210051', + 'info_dict': { + 'id': '210050', + 'display_id': 'broadcast-dub-preview', + 'ext': 'mp4', + 'title': 'Broadcast Dub Preview', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'episode': 'Broadcast Dub Preview', + 'episode_id': '210050', + 'season': 'Extras', + 'season_id': '166038', + 'season_number': 99, + 
'series': 'Attack on Titan: Junior High',
+            'description': '',
+            'duration': 155,
+        },
+        'params': {
+            'skip_download': 'm3u8',
+        },
+    }, {
+        'note': 'player_id should be extracted with the relevant compat-opt',
+        'url': 'https://www.funimation.com/player/210051',
+        'info_dict': {
+            'id': '210051',
+            'display_id': 'broadcast-dub-preview',
+            'ext': 'mp4',
+            'title': 'Broadcast Dub Preview',
+            'thumbnail': r're:https?://.*\.(?:jpg|png)',
+            'episode': 'Broadcast Dub Preview',
+            'episode_id': '210050',
+            'season': 'Extras',
+            'season_id': '166038',
+            'season_number': 99,
+            'series': 'Attack on Titan: Junior High',
+            'description': '',
+            'duration': 155,
+        },
+        'params': {
+            'skip_download': 'm3u8',
+            'compat_opts': ['seperate-video-versions'],
+        },
+    }]
+
+    def _real_initialize(self):
+        if not self._TOKEN:
+            FunimationBaseIE._TOKEN = self._login()
+
+    @staticmethod
+    def _get_experiences(episode):
+        for lang, lang_data in episode.get('languages', {}).items():
+            for video_data in lang_data.values():
+                for version, f in video_data.items():
+                    yield lang, version.title(), f
+
+    def _get_episode(self, webpage, experience_id=None, episode_id=None, fatal=True):
+        ''' Extract the episode, season and show objects given either episode/experience id '''
+        # fall back to a dict when parsing fails with fatal=False, so the
+        # .get() calls below stay safe
+        show = self._parse_json(
+            self._search_regex(
+                r'show\s*=\s*({.+?})\s*;', webpage, 'show data', fatal=fatal),
+            experience_id, transform_source=js_to_json, fatal=fatal) or {}
+        for season in show.get('seasons', []):
+            for episode in season.get('episodes', []):
+                if episode_id is not None:
+                    if str(episode.get('episodePk')) == episode_id:
+                        return episode, season, show
+                    continue
+                for _, _, f in self._get_experiences(episode):
+                    if f.get('experienceId') == experience_id:
+                        return episode, season, show
+        if fatal:
+            raise ExtractorError('Unable to find episode information')
+        else:
+            self.report_warning('Unable to find episode information')
+        return {}, {}, {}
+
+    def _real_extract(self, url):
+        initial_experience_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, initial_experience_id, note=f'Downloading player webpage for {initial_experience_id}')
+        episode, season, show = self._get_episode(webpage, experience_id=int(initial_experience_id))
+        episode_id = str(episode['episodePk'])
+        display_id = episode.get('slug') or episode_id
+
+        formats, subtitles, thumbnails, duration = [], {}, [], 0
+        requested_languages, requested_versions = self._configuration_arg('language'), self._configuration_arg('version')
+        language_preference = qualities((requested_languages or [''])[::-1])
+        source_preference = qualities((requested_versions or ['uncut', 'simulcast'])[::-1])
+        only_initial_experience = 'seperate-video-versions' in self.get_param('compat_opts', [])
+
+        for lang, version, fmt in self._get_experiences(episode):
+            experience_id = str(fmt['experienceId'])
+            if (only_initial_experience and experience_id != initial_experience_id
+                    or requested_languages and lang.lower() not in requested_languages
+                    or requested_versions and version.lower() not in requested_versions):
+                continue
+            thumbnails.append({'url': fmt.get('poster')})
+            duration = max(duration, fmt.get('duration', 0))
+            format_name = '%s %s (%s)' % (version, lang, experience_id)
+            self.extract_subtitles(
+                subtitles, experience_id, display_id=display_id, format_name=format_name,
+                episode=episode if experience_id == initial_experience_id else episode_id)
+
+            headers = {}
+            if self._TOKEN:
+                headers['Authorization'] = 'Token %s' % self._TOKEN
+            page = self._download_json(
'https://www.funimation.com/api/showexperience/%s/' % experience_id, + display_id, headers=headers, expected_status=403, query={ + 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), + }, note=f'Downloading {format_name} JSON') + sources = page.get('items') or [] + if not sources: + error = try_get(page, lambda x: x['errors'][0], dict) + if error: + self.report_warning('%s said: Error %s - %s' % ( + self.IE_NAME, error.get('code'), error.get('detail') or error.get('title'))) + else: + self.report_warning('No sources found for format') + + current_formats = [] + for source in sources: + source_url = source.get('src') + source_type = source.get('videoType') or determine_ext(source_url) + if source_type == 'm3u8': + current_formats.extend(self._extract_m3u8_formats( + source_url, display_id, 'mp4', m3u8_id='%s-%s' % (experience_id, 'hls'), fatal=False, + note=f'Downloading {format_name} m3u8 information')) + else: + current_formats.append({ + 'format_id': '%s-%s' % (experience_id, source_type), + 'url': source_url, + }) + for f in current_formats: + # TODO: Convert language to code + f.update({ + 'language': lang, + 'format_note': version, + 'source_preference': source_preference(version.lower()), + 'language_preference': language_preference(lang.lower()), + }) + formats.extend(current_formats) + self._remove_duplicate_formats(formats) + self._sort_formats(formats, ('lang', 'source')) + + return { + 'id': initial_experience_id if only_initial_experience else episode_id, + 'display_id': display_id, + 'duration': duration, + 'title': episode['episodeTitle'], + 'description': episode.get('episodeSummary'), + 'episode': episode.get('episodeTitle'), + 'episode_number': int_or_none(episode.get('episodeId')), + 'episode_id': episode_id, + 'season': season.get('seasonTitle'), + 'season_number': int_or_none(season.get('seasonId')), + 'season_id': str_or_none(season.get('seasonPk')), + 'series': show.get('showTitle'), + 'formats': formats, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + } + + def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name): + if isinstance(episode, str): + webpage = self._download_webpage( + f'https://www.funimation.com/player/{experience_id}', display_id, + fatal=False, note=f'Downloading player webpage for {format_name}') + episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False) + + for _, version, f in self._get_experiences(episode): + for source in f.get('sources'): + for text_track in source.get('textTracks'): + if not text_track.get('src'): + continue + sub_type = text_track.get('type').upper() + sub_type = sub_type if sub_type != 'FULL' else None + current_sub = { + 'url': text_track['src'], + 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type))) + } + lang = '_'.join(filter(None, ( + text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type))) + if current_sub not in subtitles.get(lang, []): + subtitles.setdefault(lang, []).append(current_sub) + return subtitles + + +class FunimationShowIE(FunimationBaseIE): + IE_NAME = 'funimation:show' + _VALID_URL = r'(?P<url>https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?P<locale>[^/]+)?/?shows/(?P<id>[^/?#&]+))/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://www.funimation.com/en/shows/sk8-the-infinity', + 'info_dict': { + 'id': 1315000, + 'title': 'SK8 the Infinity' + }, + 'playlist_count': 13, + 'params': { + 'skip_download': True, + }, + }, { + # without lang code + 'url': 
'https://www.funimation.com/shows/ouran-high-school-host-club/', + 'info_dict': { + 'id': 39643, + 'title': 'Ouran High School Host Club' + }, + 'playlist_count': 26, + 'params': { + 'skip_download': True, + }, + }] + + def _real_initialize(self): + if not self._REGION: + FunimationBaseIE._REGION = self._get_region() + + def _real_extract(self, url): + base_url, locale, display_id = self._match_valid_url(url).groups() + + show_info = self._download_json( + 'https://title-api.prd.funimationsvc.com/v2/shows/%s?region=%s&deviceType=web&locale=%s' + % (display_id, self._REGION, locale or 'en'), display_id) + items_info = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s' + % show_info.get('id'), display_id) + + vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item')) + + return { + '_type': 'playlist', + 'id': show_info['id'], + 'title': show_info['name'], + 'entries': orderedSet( + self.url_result( + '%s/%s' % (base_url, vod_item.get('episodeSlug')), FunimationPageIE.ie_key(), + vod_item.get('episodeId'), vod_item.get('episodeName')) + for vod_item in sorted(vod_items, key=lambda x: x.get('episodeOrder', -1))), + } diff --git a/yt_dlp/extractor/funk.py b/yt_dlp/extractor/funk.py new file mode 100644 index 000000000..e5e32608f --- /dev/null +++ b/yt_dlp/extractor/funk.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from .nexx import NexxIE +from ..utils import ( + int_or_none, + str_or_none, +) + + +class FunkIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', + 'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81', + 'info_dict': { + 'id': '1155821', + 'ext': 'mp4', + 'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2', + 'description': 'md5:a691d0413ef4835588c5b03ded670c1f', + 'timestamp': 1514507395, + 'upload_date': '20171229', + }, + + }, { + 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id, nexx_id = self._match_valid_url(url).groups() + video = self._download_json( + 'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id) + return { + '_type': 'url_transparent', + 'url': 'nexx:741:' + nexx_id, + 'ie_key': NexxIE.ie_key(), + 'id': nexx_id, + 'title': video.get('title'), + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'channel_id': str_or_none(video.get('channelId')), + 'display_id': display_id, + 'tags': video.get('tags'), + 'thumbnail': video.get('imageUrlLandscape'), + } diff --git a/youtube_dl/extractor/fusion.py b/yt_dlp/extractor/fusion.py index a3f44b812..a3f44b812 100644 --- a/youtube_dl/extractor/fusion.py +++ b/yt_dlp/extractor/fusion.py diff --git a/yt_dlp/extractor/fxnetworks.py b/yt_dlp/extractor/fxnetworks.py new file mode 100644 index 000000000..00e67426b --- /dev/null +++ b/yt_dlp/extractor/fxnetworks.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + int_or_none, + parse_age_limit, + smuggle_url, + update_url_query, +) + + +class FXNetworksIE(AdobePassIE): + _VALID_URL = 
r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.fxnetworks.com/video/1032565827847',
+        'md5': '8d99b97b4aa7a202f55b6ed47ea7e703',
+        'info_dict': {
+            'id': 'dRzwHC_MMqIv',
+            'ext': 'mp4',
+            'title': 'First Look: Better Things - Season 2',
+            'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.',
+            'age_limit': 14,
+            'uploader': 'NEWA-FNG-FX',
+            'upload_date': '20170825',
+            'timestamp': 1503686274,
+            'episode_number': 0,
+            'season_number': 2,
+            'series': 'Better Things',
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'http://www.simpsonsworld.com/video/716094019682',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        if 'The content you are trying to access is not available in your region.' in webpage:
+            self.raise_geo_restricted()
+        video_data = extract_attributes(self._search_regex(
+            r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data'))
+        player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None)
+        release_url = video_data['rel']
+        title = video_data['data-title']
+        rating = video_data.get('data-rating')
+        query = {
+            'mbr': 'true',
+        }
+        if player_type == 'movies':
+            query.update({
+                'manifest': 'm3u',
+            })
+        else:
+            query.update({
+                'switch': 'http',
+            })
+        if video_data.get('data-req-auth') == '1':
+            resource = self._get_mvpd_resource(
+                video_data['data-channel'], title,
+                video_data.get('data-guid'), rating)
+            query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
+            'series': video_data.get('data-show-title'),
+            'episode_number': int_or_none(video_data.get('data-episode')),
+            'season_number': int_or_none(video_data.get('data-season')),
+            'thumbnail': video_data.get('data-large-thumb'),
+            'age_limit': parse_age_limit(rating),
+            'ie_key': 'ThePlatform',
+        }
diff --git a/yt_dlp/extractor/gab.py b/yt_dlp/extractor/gab.py
new file mode 100644
index 000000000..25b5cb066
--- /dev/null
+++ b/yt_dlp/extractor/gab.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    str_to_int,
+)
+
+
+class GabTVIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.gab\.com/channel/[^/]+/view/(?P<id>[a-z0-9-]+)'
+    _TESTS = [{
+        'url': 'https://tv.gab.com/channel/wurzelroot/view/why-was-america-in-afghanistan-61217eacea5665de450d0488',
+        'info_dict': {
+            'id': '61217eacea5665de450d0488',
+            'ext': 'mp4',
+            'title': 'WHY WAS AMERICA IN AFGHANISTAN - AMERICA FIRST AGAINST AMERICAN OLIGARCHY',
+            'description': None,
+            'uploader': 'Wurzelroot',
+            'uploader_id': '608fb0a85738fd1974984f7d',
+            'thumbnail': 'https://tv.gab.com/image/61217eacea5665de450d0488',
+        }
+    }]
+
+    def _real_extract(self, url):
+        # the slug ends with the hexadecimal video id
+        video_id = self._match_id(url).split('-')[-1]
+        webpage = self._download_webpage(url, video_id)
+        channel_id = self._search_regex(r'data-channel-id="(?P<channel_id>[^"]+)', webpage, 'channel_id')
+        channel_name = self._search_regex(r'data-channel-name="(?P<channel_name>[^"]+)', webpage, 'channel_name')
+        title = self._search_regex(r'data-episode-title="(?P<title>[^"]+)', webpage, 'title')
+        view_key = self._search_regex(r'data-view-key="(?P<view_key>[^"]+)', webpage, 'view_key')
+        description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None
+        available_resolutions = re.findall(
+            r'<a\ data-episode-id="%s"\ data-resolution="(?P<resolution>[^"]+)' % video_id, webpage)
+
+        formats = []
+        for resolution in available_resolutions:
+            frmt = {
+                'url': f'https://tv.gab.com/media/{video_id}?viewKey={view_key}&r={resolution}',
+                'format_id': resolution,
+                'vcodec': 'h264',
+                'acodec': 'aac',
+                'ext': 'mp4'
+            }
+            if 'audio-' in resolution:
+                frmt['abr'] = str_to_int(resolution.replace('audio-', ''))
+                frmt['height'] = 144
+                frmt['quality'] = -10
+            else:
+                frmt['height'] = str_to_int(resolution.replace('p', ''))
+            formats.append(frmt)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'uploader': channel_name,
+            'uploader_id': channel_id,
+            'thumbnail': f'https://tv.gab.com/image/{video_id}',
+        }
diff --git a/yt_dlp/extractor/gaia.py b/yt_dlp/extractor/gaia.py
new file mode 100644
index 000000000..7821fb783
--- /dev/null
+++ b/yt_dlp/extractor/gaia.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    strip_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class GaiaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gaia\.com/video/(?P<id>[^/?]+).*?\bfullplayer=(?P<type>feature|preview)'
+    _TESTS = [{
+        'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=feature',
+        'info_dict': {
+            'id': '89356',
+            'ext': 'mp4',
+            'title': 'Connecting with Universal Consciousness',
+            'description': 'md5:844e209ad31b7d31345f5ed689e3df6f',
+            'upload_date': '20151116',
+            'timestamp': 1447707266,
+            'duration': 936,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=preview',
+        'info_dict': {
+            'id': '89351',
+            'ext': 'mp4',
+            'title': 'Connecting with Universal Consciousness',
+            'description': 'md5:844e209ad31b7d31345f5ed689e3df6f',
+            'upload_date': '20151116',
+            'timestamp': 1447707266,
+            'duration': 53,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+    _NETRC_MACHINE = 'gaia'
+    _jwt = None
+
+    def _real_initialize(self):
+        auth = self._get_cookies('https://www.gaia.com/').get('auth')
+        if auth:
+            auth = self._parse_json(
+                compat_urllib_parse_unquote(auth.value),
+                None, fatal=False)
+        if not auth:
+            username, password = self._get_login_info()
+            if username is None:
+                return
+            auth = self._download_json(
+                'https://auth.gaia.com/v1/login',
+                None, data=urlencode_postdata({
+                    'username': username,
+                    'password': password
+                }))
+            if auth.get('success') is False:
+                raise ExtractorError(', '.join(auth['messages']), expected=True)
+        if auth:
+            self._jwt = auth.get('jwt')
+
+    def _real_extract(self, url):
+        display_id, vtype = self._match_valid_url(url).groups()
+        node_id = self._download_json(
+            'https://brooklyn.gaia.com/pathinfo', display_id, query={
+                'path': 'video/' + display_id,
+            })['id']
+        node = self._download_json(
+            'https://brooklyn.gaia.com/node/%d' % node_id, node_id)
+        vdata = node[vtype]
+        media_id = compat_str(vdata['nid'])
+        title = node['title']
+
+        headers = None
+        if self._jwt:
+            headers = 
{'Authorization': 'Bearer ' + self._jwt} + media = self._download_json( + 'https://brooklyn.gaia.com/media/' + media_id, + media_id, headers=headers) + formats = self._extract_m3u8_formats( + media['mediaUrls']['bcHLS'], media_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + text_tracks = media.get('textTracks', {}) + for key in ('captions', 'subtitles'): + for lang, sub_url in text_tracks.get(key, {}).items(): + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + fivestar = node.get('fivestar', {}) + fields = node.get('fields', {}) + + def get_field_value(key, value_key='value'): + return try_get(fields, lambda x: x[key][0][value_key]) + + return { + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(get_field_value('body') or get_field_value('teaser')), + 'timestamp': int_or_none(node.get('created')), + 'subtitles': subtitles, + 'duration': int_or_none(vdata.get('duration')), + 'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])), + 'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])), + 'comment_count': int_or_none(node.get('comment_count')), + 'series': try_get(node, lambda x: x['series']['title'], compat_str), + 'season_number': int_or_none(get_field_value('season')), + 'season_id': str_or_none(get_field_value('series_nid', 'nid')), + 'episode_number': int_or_none(get_field_value('episode')), + } diff --git a/youtube_dl/extractor/gameinformer.py b/yt_dlp/extractor/gameinformer.py index f1b96c172..f1b96c172 100644 --- a/youtube_dl/extractor/gameinformer.py +++ b/yt_dlp/extractor/gameinformer.py diff --git a/youtube_dl/extractor/gamespot.py b/yt_dlp/extractor/gamespot.py index 7a1beae3c..7a1beae3c 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/yt_dlp/extractor/gamespot.py diff --git a/yt_dlp/extractor/gamestar.py b/yt_dlp/extractor/gamestar.py new file mode 100644 index 000000000..e882fa671 --- /dev/null +++ b/yt_dlp/extractor/gamestar.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, +) + + +class GameStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?game(?P<site>pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', + 'md5': 'ee782f1f8050448c95c5cacd63bc851c', + 'info_dict': { + 'id': '76110', + 'ext': 'mp4', + 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406542380, + 'upload_date': '20140728', + 'duration': 17, + } + }, { + 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }, { + 'url': 'http://www.gamestar.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + site = mobj.group('site') + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + # TODO: there are multiple ld+json objects in the webpage, + # while _search_json_ld finds only the first one + json_ld = 
self._parse_json(self._search_regex(
+            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>',
+            webpage, 'JSON-LD', group='json_ld'), video_id)
+        info_dict = self._json_ld(json_ld, video_id)
+        info_dict['title'] = remove_end(
+            info_dict['title'], ' - Game%s' % site.title())
+
+        view_count = int_or_none(json_ld.get('interactionCount'))
+        comment_count = int_or_none(self._html_search_regex(
+            r'<span>Kommentare</span>\s*<span[^>]+class=["\']count[^>]+>\s*\(\s*([0-9]+)',
+            webpage, 'comment count', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id,
+            'ext': 'mp4',
+            'view_count': view_count,
+            'comment_count': comment_count
+        })
+
+        return info_dict
diff --git a/yt_dlp/extractor/gaskrank.py b/yt_dlp/extractor/gaskrank.py
new file mode 100644
index 000000000..03acd2a73
--- /dev/null
+++ b/yt_dlp/extractor/gaskrank.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class GaskrankIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.htm'
+    _TESTS = [{
+        'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm',
+        'md5': '1ae88dbac97887d85ebd1157a95fc4f9',
+        'info_dict': {
+            'id': '201601/26955',
+            'ext': 'mp4',
+            'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'categories': ['motorrad-fun'],
+            'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden',
+            'uploader_id': 'Bikefun',
+            'upload_date': '20170110',
+            'uploader_url': None,
+        }
+    }, {
+        'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm',
+        'md5': 'c33ee32c711bc6c8224bfcbe62b23095',
+        'info_dict': {
+            'id': '201106/15920',
+            'ext': 'mp4',
+            'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'categories': ['racing'],
+            'display_id': 'isle-of-man-tt-2011-michael-du-15920',
+            'uploader_id': 'IOM',
+            'upload_date': '20170523',
+            'uploader_url': 'www.iomtt.com',
+        }
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, fatal=True)
+
+        categories = [self._match_valid_url(url).group('categories')]
+
+        mobj = re.search(
+            r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])',
+            webpage)
+        # default to None so the fields below stay unset instead of raising
+        # NameError when the "Video von: ... | vom: ..." line is missing
+        uploader_id = upload_date = None
+        if mobj is not None:
+            uploader_id = mobj.groupdict().get('uploader_id')
+            upload_date = unified_strdate(mobj.groupdict().get('upload_date'))
+
+        uploader_url = self._search_regex(
+            r'Homepage:\s*<[^>]*>(?P<uploader_url>[^<]*)',
+            webpage, 'uploader_url', default=None)
+        tags = re.findall(
+            r'/tv/tags/[^/]+/"\s*>(?P<tag>[^<]*?)<',
+            webpage)
+
+        view_count = self._search_regex(
+            r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P<view_count>[0-9\.]*)',
+            webpage, 'view_count', default=None)
+        if view_count:
+            view_count = int_or_none(view_count.replace('.', ''))
+
+        average_rating = self._search_regex(
+            r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P<average_rating>[0-9,]+)',
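+            # (editorial note) the site renders ratings with a German decimal
+            # comma, e.g. "8,5"; the replace(',', '.') below normalizes the
+            # string before float_or_none converts it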
+ webpage, 'average_rating') + if average_rating: + average_rating = float_or_none(average_rating.replace(',', '.')) + + video_id = self._search_regex( + r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', + webpage, 'video id', default=display_id) + + entry = self._parse_html5_media_entries(url, webpage, video_id)[0] + entry.update({ + 'id': video_id, + 'title': title, + 'categories': categories, + 'display_id': display_id, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'uploader_url': uploader_url, + 'tags': tags, + 'view_count': view_count, + 'average_rating': average_rating, + }) + self._sort_formats(entry['formats']) + + return entry diff --git a/yt_dlp/extractor/gazeta.py b/yt_dlp/extractor/gazeta.py new file mode 100644 index 000000000..367187080 --- /dev/null +++ b/yt_dlp/extractor/gazeta.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class GazetaIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' + _TESTS = [{ + 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', + 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', + 'info_dict': { + 'id': '205566', + 'ext': 'mp4', + 'title': '«70–80 процентов гражданских в Донецке на грани голода»', + 'description': 'md5:38617526050bd17b234728e7f9620a71', + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'skip': 'video not found', + }, { + 'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', + 'only_matching': True, + }, { + 'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml', + 'md5': '37f19f78355eb2f4256ee1688359f24c', + 'info_dict': { + 'id': '252048', + 'ext': 'mp4', + 'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"', + }, + 'add_ie': ['EaglePlatform'], + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + display_id = mobj.group('id') + embed_url = '%s?p=embed' % mobj.group('url') + embed_page = self._download_webpage( + embed_url, display_id, 'Downloading embed page') + + video_id = self._search_regex( + r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') + + return self.url_result( + 'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py new file mode 100644 index 000000000..c3ad6b4ce --- /dev/null +++ b/yt_dlp/extractor/gdcvault.py @@ -0,0 +1,220 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + HEADRequest, + remove_start, + sanitized_Request, + smuggle_url, + urlencode_postdata, +) + + +class GDCVaultIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?' 
+ _NETRC_MACHINE = 'gdcvault' + _TESTS = [ + { + 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', + 'md5': '7ce8388f544c88b7ac11c7ab1b593704', + 'info_dict': { + 'id': '201311826596_AWNY', + 'display_id': 'Doki-Doki-Universe-Sweet-Simple', + 'ext': 'mp4', + 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' + } + }, + { + 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', + 'info_dict': { + 'id': '201203272_1330951438328RSXR', + 'display_id': 'Embracing-the-Dark-Art-of', + 'ext': 'flv', + 'title': 'Embracing the Dark Art of Mathematical Modeling in AI' + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + } + }, + { + 'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or', + 'md5': 'a5eb77996ef82118afbbe8e48731b98e', + 'info_dict': { + 'id': '1015301', + 'display_id': 'Thexder-Meets-Windows-95-or', + 'ext': 'flv', + 'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment', + }, + 'skip': 'Requires login', + }, + { + 'url': 'http://gdcvault.com/play/1020791/', + 'only_matching': True, + }, + { + # Hard-coded hostname + 'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface', + 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', + 'info_dict': { + 'id': '840376_BQRC', + 'ext': 'mp4', + 'display_id': 'Tenacious-Design-and-The-Interface', + 'title': 'Tenacious Design and The Interface of \'Destiny\'', + }, + }, + { + # Multiple audios + 'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC', + 'info_dict': { + 'id': '12396_1299111843500GMPX', + 'ext': 'mp4', + 'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man', + }, + # 'params': { + # 'skip_download': True, # Requires rtmpdump + # 'format': 'jp', # The japanese audio + # } + }, + { + # gdc-player.html + 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', + 'info_dict': { + 'id': '9350_1238021887562UHXB', + 'display_id': 'An-American-engine-in-Tokyo', + 'ext': 'mp4', + 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', + }, + }, + { + # Kaltura Embed + 'url': 'https://www.gdcvault.com/play/1026180/Mastering-the-Apex-of-Scaling', + 'info_dict': { + 'id': '0_h1fg8j3p', + 'ext': 'mp4', + 'title': 'Mastering the Apex of Scaling Game Servers (Presented by Multiplay)', + 'timestamp': 1554401811, + 'upload_date': '20190404', + 'uploader_id': 'joe@blazestreaming.com', + }, + 'params': { + 'format': 'mp4-408', + }, + }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, + ] + + def _login(self, webpage_url, display_id): + username, password = self._get_login_info() + if username is None or password is None: + self.report_warning('It looks like ' + webpage_url + ' requires a login. 
Try specifying a username and password and try again.') + return None + + mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url) + login_url = mobj.group('root_url') + 'api/login.php' + logout_url = mobj.group('root_url') + 'logout' + + login_form = { + 'email': username, + 'password': password, + } + + request = sanitized_Request(login_url, urlencode_postdata(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + self._download_webpage(request, display_id, 'Logging in') + start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') + self._download_webpage(logout_url, display_id, 'Logging out') + + return start_page + + def _real_extract(self, url): + video_id, name = self._match_valid_url(url).groups() + display_id = name or video_id + + webpage_url = 'http://www.gdcvault.com/play/' + video_id + start_page = self._download_webpage(webpage_url, display_id) + + direct_url = self._search_regex( + r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', + start_page, 'url', default=None) + if direct_url: + title = self._html_search_regex( + r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>', + start_page, 'title') + video_url = 'http://www.gdcvault.com' + direct_url + # resolve the url so that we can detect the correct extension + video_url = self._request_webpage( + HEADRequest(video_url), video_id).geturl() + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + } + + embed_url = KalturaIE._extract_url(start_page) + if embed_url: + embed_url = smuggle_url(embed_url, {'source_url': url}) + ie_key = 'Kaltura' + else: + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' + + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root', default=None) + if xml_root is None: + # Probably need to authenticate + login_res = self._login(webpage_url, display_id) + if login_res is None: + self.report_warning('Could not login.') + else: + start_page = login_res + # Grab the url from the authenticated page + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root') + + xml_name = self._html_search_regex( + r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>', + start_page, 'xml filename', default=None) + if not xml_name: + info = self._parse_html5_media_entries(url, start_page, video_id)[0] + info.update({ + 'title': remove_start(self._search_regex( + r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page, + 'title', default=None) or self._og_search_title( + start_page, default=None), 'GDC Vault - '), + 'id': video_id, + 'display_id': display_id, + }) + return info + embed_url = '%s/xml/%s' % (xml_root, xml_name) + ie_key = 'DigitallySpeaking' + + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': display_id, + 'url': embed_url, + 'ie_key': ie_key, + } diff --git a/yt_dlp/extractor/gedidigital.py b/yt_dlp/extractor/gedidigital.py new file mode 100644 index 000000000..ec386c218 --- /dev/null +++ b/yt_dlp/extractor/gedidigital.py @@ -0,0 +1,210 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + base_url, + determine_ext, + int_or_none, + url_basename, + urljoin, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\. 
+ (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + |huffingtonpost + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121683', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.huffingtonpost.it/embed/politica/cotticelli-non-so-cosa-mi-sia-successo-sto-cercando-di-capire-se-ho-avuto-un-malore/29312/29276?responsive=true&el=video971040871621586700', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 
'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + @staticmethod + def _sanitize_urls(urls): + # add protocol if missing + for i, e in enumerate(urls): + if e.startswith('//'): + urls[i] = 'https:%s' % e + # clean iframes urls + for i, e in enumerate(urls): + urls[i] = urljoin(base_url(e), url_basename(e)) + return urls + + @staticmethod + def _extract_urls(webpage): + entries = [ + mobj.group('eurl') + for mobj in re.finditer(r'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)] + return GediDigitalIE._sanitize_urls(entries) + + @staticmethod + def _extract_url(webpage): + urls = GediDigitalIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _clean_formats(formats): + format_urls = set() + clean_formats = [] + for f in formats: + if f['url'] not in format_urls: + if f.get('audio_ext') != 'none' and not f.get('acodec'): + continue + format_urls.add(f['url']) + clean_formats.append(f) + formats[:] = clean_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + url = self._match_valid_url(url).group('url') + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?P<type>format|param)',\s*'(?P<name>[^']+)',\s*'(?P<val>[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'acodec': ext, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._clean_formats(formats) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py new file mode 100644 index 000000000..5918c8c56 --- /dev/null +++ b/yt_dlp/extractor/generic.py @@ -0,0 +1,3783 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import os +import re +import sys + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import ( + compat_etree_fromstring, + compat_str, + compat_urllib_parse_unquote, + 
compat_urlparse, + compat_xml_parse_error, +) +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + HEADRequest, + int_or_none, + is_html, + js_to_json, + KNOWN_EXTENSIONS, + merge_dicts, + mimetype2ext, + orderedSet, + parse_duration, + sanitized_Request, + smuggle_url, + unescapeHTML, + unified_timestamp, + unsmuggle_url, + UnsupportedError, + url_or_none, + xpath_attr, + xpath_text, + xpath_with_ns, +) +from .commonprotocols import RtmpIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nbc import NBCSportsVPlayerIE +from .ooyala import OoyalaIE +from .rutv import RUTVIE +from .tvc import TVCIE +from .sportbox import SportBoxIE +from .myvi import MyviIE +from .condenast import CondeNastIE +from .udn import UDNEmbedIE +from .senateisvp import SenateISVPIE +from .svt import SVTIE +from .pornhub import PornHubIE +from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE +from .drtuber import DrTuberIE +from .redtube import RedTubeIE +from .tube8 import Tube8IE +from .mofosex import MofosexEmbedIE +from .spankwire import SpankwireIE +from .youporn import YouPornIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) +from .dailymotion import DailymotionIE +from .dailymail import DailyMailIE +from .onionstudios import OnionStudiosIE +from .viewlift import ViewLiftEmbedIE +from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE +from .videomore import VideomoreIE +from .webcaster import WebcasterFeedIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE +from .digiteka import DigitekaIE +from .arkena import ArkenaIE +from .instagram import InstagramIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE +from .soundcloud import SoundcloudEmbedIE +from .tunein import TuneInBaseIE +from .vbox7 import Vbox7IE +from .dbtv import DBTVIE +from .piksel import PikselIE +from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE +from .arte import ArteTVEmbedIE +from .videopress import VideoPressIE +from .rutube import RutubeIE +from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE +from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE +from .channel9 import Channel9IE +from .vshare import VShareIE +from .mediasite import MediasiteIE +from .springboardplatform import SpringboardPlatformIE +from .yapfiles import YapFilesIE +from .vice import ViceIE +from .xfileshare import XFileShareIE +from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE +from .teachable import TeachableIE +from .indavideo import IndavideoEmbedIE +from .apa import APAIE +from .foxnews import FoxNewsIE +from .viqeo import ViqeoIE +from .expressen import ExpressenIE +from .zype import ZypeIE +from .odnoklassniki import OdnoklassnikiIE +from .vk import VKIE +from .kinja import KinjaEmbedIE +from .gedidigital import GediDigitalIE +from .rcs import RCSEmbedsIE +from .bitchute import BitChuteIE +from .rumble import RumbleEmbedIE +from .arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE +from .simplecast import SimplecastIE +from .wimtv import WimTVIE + + +class GenericIE(InfoExtractor): + IE_DESC = 
'Generic downloader that works on some sites' + _VALID_URL = r'.*' + IE_NAME = 'generic' + _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to media delivered compressed (until Accept-Encoding is *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ], + 'skip': 'URL invalid', + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented', + r'400.*Bad Request', + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'title': 'MSNBC Rachel Maddow (video)', + 'description': 're:.*her unique approach to storytelling.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'mov', + 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', + 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'description': 're:.*her unique approach to storytelling.*', + 'upload_date': '20201204', + }, + }], + }, + # RSS feed with item with description and thumbnails + { + 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'info_dict': { + 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'title': 're:.*100% Hydrogen.*', + 'description': 're:.*In this episode.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'm4a', + 'id': 'c1c879525ce2cb640b344507e682c36d', + 'title': 're:Hydrogen!', + 'description': 're:.*In this episode we are going.*', + 'timestamp': 1567977776, + 'upload_date': '20190908', + 'duration': 459, + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + 'season_number': 1, + 'age_limit': 0, + }, + }], + 'params': { + 'skip_download': True, + }, + }, + # RSS feed with enclosures and unsupported link URLs + { + 'url': 'http://www.hellointernet.fm/podcast?format=rss', + 'info_dict': { + 'id': 'http://www.hellointernet.fm/podcast?format=rss', + 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', + 'title': 'Hello Internet', 
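+                # (editorial note) in these test dicts, string values prefixed
+                # with 're:' are matched as regular expressions, 'md5:...'
+                # values compare against the MD5 of the actual field, and
+                # 'mincount:N' asserts a collection has at least N entries,
+                # per the test harness conventions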
+ }, + 'playlist_mincount': 100, + }, + # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng + { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', + 'info_dict': { + 'id': 'smil', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', + 'formats': 'mincount:16', + 'subtitles': 'mincount:1', + }, + 'params': { + 'force_generic_extractor': True, + 'skip_download': True, + }, + }, + # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html + { + 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', + 'info_dict': { + 'id': 'hds', + 'ext': 'flv', + 'title': 'hds', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from https://www.restudy.dk/video/play/id/1637 + { + 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', + 'info_dict': { + 'id': 'video_1637', + 'ext': 'flv', + 'title': 'video_1637', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm + { + 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', + 'info_dict': { + 'id': 'smil-service', + 'ext': 'flv', + 'title': 'smil-service', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 + { + 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html + { + 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', + 'info_dict': { + 'id': 'mZlp2ctYIUEB', + 'ext': 'mp4', + 'title': 'Tikibad ontruimd wegens brand', + 'description': 'md5:05ca046ff47b931f9b04855015e163a4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 33, + }, + 'params': { + 'skip_download': True, + }, + }, + # MPD from http://dash-mse-test.appspot.com/media.html + { + 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', + 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', + 'info_dict': { + 'id': 'car-20120827-manifest', + 'ext': 'mp4', + 'title': 'car-20120827-manifest', + 'formats': 'mincount:9', + 'upload_date': '20130904', + }, + 'params': { + 'format': 'bestvideo', + }, + }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': 'video gone', + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': 'video gone', + }, + # google redirect + { + 'url': 
'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': r're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, + { + # redirect in Refresh HTTP header + 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', + 'upload_date': '20150917', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + }, + 'params': { + 'skip_download': False, + }, + }, + { + 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', + 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', + 'info_dict': { + 'id': '13601338388002', + 'ext': 'mp4', + 'uploader': 'www.hodiho.fr', + 'title': 'R\u00e9gis plante sa Jeep', + } + }, + # bandcamp page with custom domain + { + 'add_ie': ['Bandcamp'], + 'url': 'http://bronyrock.com/track/the-pony-mash', + 'info_dict': { + 'id': '3235767654', + 'ext': 'mp3', + 'title': 'The Pony Mash', + 'uploader': 'M_Pallante', + }, + 'skip': 'There is a limit of 200 free downloads / month for the test song', + }, + { + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests + 'add_ie': ['BrightcoveLegacy'], + 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', + 'info_dict': { + 'id': '2765128793001', + 'ext': 'mp4', + 'title': 'Le cours de bourse : l’analyse technique', + 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', + 'uploader': 'BFM BUSINESS', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'add_id': ['BrightcoveLegacy'], + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # https://github.com/ytdl-org/youtube-dl/issues/2253 + 'url': 'http://bcove.me/i6nfkrc3', + 'md5': '0ba9446db037002366bab3b3eb30c88c', + 'info_dict': { + 'id': '3101154703001', + 'ext': 'mp4', + 'title': 'Still no power', + 'uploader': 'thestar.com', + 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. 
To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', + }, + 'add_ie': ['BrightcoveLegacy'], + 'skip': 'video gone', + }, + { + 'url': 'http://www.championat.com/video/football/v/87/87499.html', + 'md5': 'fb973ecf6e4a78a67453647444222983', + 'info_dict': { + 'id': '3414141473001', + 'ext': 'mp4', + 'title': 'Видео. Удаление Дзагоева (ЦСКА)', + 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', + 'uploader': 'Championat', + }, + }, + { + # https://github.com/ytdl-org/youtube-dl/issues/3541 + 'add_ie': ['BrightcoveLegacy'], + 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', + 'info_dict': { + 'id': '3866516442001', + 'ext': 'mp4', + 'title': 'Leer mij vrouwen kennen: Aflevering 1', + 'description': 'Leer mij vrouwen kennen: Aflevering 1', + 'uploader': 'SBS Broadcasting', + }, + 'skip': 'Restricted to Netherlands', + 'params': { + 'skip_download': True, # m3u8 download + }, + }, + { + # Brightcove video in <iframe> + 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', + 'md5': '36d74ef5e37c8b4a2ce92880d208b968', + 'info_dict': { + 'id': '5360463607001', + 'ext': 'mp4', + 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', + 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', + 'uploader': 'United Nations', + 'uploader_id': '1362235914001', + 'timestamp': 1489593889, + 'upload_date': '20170315', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + { + # Brightcove with alternative playerID key + 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', + 'info_dict': { + 'id': 'nmeth.2062_SV1', + 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2228375078001', + 'ext': 'mp4', + 'title': 'nmeth.2062-sv1', + 'description': 'nmeth.2062-sv1', + 'timestamp': 1363357591, + 'upload_date': '20130315', + 'uploader': 'Nature Publishing Group', + 'uploader_id': '1964492299001', + }, + }], + }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + 'skip': 'video rotates...weekly?', + }, + { + # Brightcove:new type [2]. + 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', + 'md5': '2b35148fcf48da41c9fb4591650784f3', + 'info_dict': { + 'id': '5348741021001', + 'ext': 'mp4', + 'upload_date': '20170306', + 'uploader_id': '4191638492001', + 'timestamp': 1488769918, + 'title': 'VIDEO: St. 
Thomas More earns first trip to basketball semis', + + }, + }, + { + # Alternative brightcove <video> attributes + 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', + 'info_dict': { + 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", + }, + 'playlist': [{ + 'md5': '732d22ba3d33f2f3fc253c39f8f36523', + 'info_dict': { + 'id': '5311302538001', + 'ext': 'mp4', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", + 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", + 'timestamp': 1486321708, + 'upload_date': '20170205', + 'uploader_id': '800000640001', + }, + 'only_matching': True, + }], + }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, + # ooyala video + { + 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + 'md5': '166dd577b433b4d4ebfee10b0824d8ff', + 'info_dict': { + 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', + 'ext': 'mp4', + 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238.231, + }, + 'add_ie': ['Ooyala'], + }, + { + # ooyala video embedded with http://player.ooyala.com/iframe.js + 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', + 'info_dict': { + 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', + 'ext': 'mp4', + 'title': '"Steve Jobs: Man in the Machine" trailer', + 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 'duration': 135.427, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'movie expired', + }, + # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js + { + 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', + 'info_dict': { + 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', + 'ext': 'mp4', + 'title': 'Steampunk Fest Comes to Honesdale', + 'duration': 43.276, + }, + 'params': { + 'skip_download': True, + } + }, + # embed.ly video + { + 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', + 'info_dict': { + 'id': '9ODmcdjQcHQ', + 'ext': 'mp4', + 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second', + 'upload_date': '20140225', + 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff', + 'uploader': 'Tested', + 'uploader_id': 'testedcom', + }, + # No need to test YoutubeIE here + 'params': { + 'skip_download': True, + }, + }, + # funnyordie embed + { + 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', + 'info_dict': { + 'id': '18e820ec3f', + 'ext': 'mp4', + 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', + 'description': 'Episode 
18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', + }, + # HEAD requests lead to endless 301, while GET is OK + 'expected_warnings': ['301'], + }, + # RUTV embed + { + 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', + 'info_dict': { + 'id': '776940', + 'ext': 'mp4', + 'title': 'Охотское море стало целиком российским', + 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + # TVC embed + { + 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', + 'info_dict': { + 'id': '55304', + 'ext': 'mp4', + 'title': 'Дошкольное воспитание', + }, + }, + # SportBox embed + { + 'url': 'http://www.vestifinance.ru/articles/25753', + 'info_dict': { + 'id': '25753', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', + }, + 'playlist': [{ + 'info_dict': { + 'id': '370908', + 'title': 'Госзаказ. День 3', + 'ext': 'mp4', + } + }, { + 'info_dict': { + 'id': '370905', + 'title': 'Госзаказ. День 2', + 'ext': 'mp4', + } + }, { + 'info_dict': { + 'id': '370902', + 'title': 'Госзаказ. День 1', + 'ext': 'mp4', + } + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + # Myvi.ru embed + { + 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', + 'info_dict': { + 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', + 'ext': 'mp4', + 'title': 'Ужастики, русский трейлер (2015)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 153, + } + }, + # XHamster embed + { + 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', + 'info_dict': { + 'id': 'showthread', + 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', + }, + 'playlist_mincount': 7, + # This forum does not allow <iframe> syntaxes anymore + # Now HTML tags are displayed as-is + 'skip': 'No videos on this page', + }, + # Embedded TED video + { + 'url': 'http://en.support.wordpress.com/videos/ted-talks/', + 'md5': '65fdff94098e4a607385a60c5177c638', + 'info_dict': { + 'id': '1969', + 'ext': 'mp4', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', + } + }, + # nowvideo embed hidden behind percent encoding + { + 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', + 'md5': '2baf4ddd70f697d94b1c18cf796d5107', + 'info_dict': { + 'id': '06e53103ca9aa', + 'ext': 'flv', + 'title': 'Macross Episode 001 Watch Macross Episode 001 onl', + 'description': 'No description', + }, + }, + # arte embed + { + 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html', + 'md5': '7653032cbb25bf6c80d80f217055fa43', + 'info_dict': { + 'id': '048195-004_PLUS7-F', + 'ext': 'flv', + 'title': 'X:enius', + 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168', + 'upload_date': '20140320', + }, + 'params': { + 'skip_download': 'Requires rtmpdump' + }, + 'skip': 'video gone', + }, + # francetv embed + { + 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', + 'info_dict': { + 'id': 'EV_30231', + 'ext': 'mp4', + 'title': 'Alcaline, le concert avec Calogero', + 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', + 'upload_date': '20150226', + 
'timestamp': 1424989860, + 'duration': 5400, + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'expected_warnings': [ + 'Forbidden' + ] + }, + # Condé Nast embed + { + 'url': 'http://www.wired.com/2014/04/honda-asimo/', + 'md5': 'ba0dfe966fa007657bd1443ee672db0f', + 'info_dict': { + 'id': '53501be369702d3275860000', + 'ext': 'mp4', + 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', + } + }, + # Dailymotion embed + { + 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/', + 'md5': '441aeeb82eb72c422c7f14ec533999cd', + 'info_dict': { + 'id': 'k2mm4bCdJ6CQ2i7c8o2', + 'ext': 'mp4', + 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', + 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', + 'uploader': 'Spi0n', + 'uploader_id': 'xgditw', + 'upload_date': '20140425', + 'timestamp': 1398441542, + }, + 'add_ie': ['Dailymotion'], + }, + # DailyMail embed + { + 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', + 'info_dict': { + 'id': '1495629', + 'ext': 'mp4', + 'title': 'Care worker punches elderly dementia patient in head 11 times', + 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', + }, + 'add_ie': ['DailyMail'], + 'params': { + 'skip_download': True, + }, + }, + # YouTube embed + { + 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', + 'info_dict': { + 'id': 'FXRb4ykk4S0', + 'ext': 'mp4', + 'title': 'The NBL Auction 2014', + 'uploader': 'BADMINTON England', + 'uploader_id': 'BADMINTONEvents', + 'upload_date': '20140603', + 'description': 'md5:9ef128a69f1e262a700ed83edb163a73', + }, + 'add_ie': ['Youtube'], + 'params': { + 'skip_download': True, + } + }, + # MTVServices embed + { + 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', + 'md5': 'ca1aef97695ef2c1d6973256a57e5252', + 'info_dict': { + 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1', + 'ext': 'mp4', + 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', + 'description': 'Two valets share their love for movie star Liam Neesons.', + 'timestamp': 1349922600, + 'upload_date': '20121011', + }, + }, + # YouTube embed via <data-embed-url=""> + { + 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', + 'info_dict': { + 'id': '4vAffPZIT44', + 'ext': 'mp4', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', + 'uploader': 'Gameloft', + 'uploader_id': 'gameloft', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', + }, + 'params': { + 'skip_download': True, + } + }, + # YouTube <object> embed + { + 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', + 'md5': '516718101ec834f74318df76259fb3cc', + 'info_dict': { + 'id': 'msN87y-iEx0', + 'ext': 'webm', + 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', + 'upload_date': '20080526', + 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', + 'uploader': 'Christopher Sykes', + 'uploader_id': 'ChristopherJSykes', + }, + 'add_ie': ['Youtube'], + }, + # Camtasia studio + { + 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', + 'playlist': [{ + 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', + 'info_dict': { + 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', + 'ext': 'flv', + 'duration': 2235.90, + } + }, { + 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', + 'info_dict': { + 'id': 
'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', + 'ext': 'flv', + 'duration': 2235.93, + } + }], + 'info_dict': { + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', + } + }, + # Flowplayer + { + 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', + 'md5': '9d65602bf31c6e20014319c7d07fba27', + 'info_dict': { + 'id': '5123ea6d5e5a7', + 'ext': 'mp4', + 'age_limit': 18, + 'uploader': 'www.handjobhub.com', + 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', + } + }, + # Multiple brightcove videos + # https://github.com/ytdl-org/youtube-dl/issues/2283 + { + 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', + 'info_dict': { + 'id': 'always-never', + 'title': 'Always / Never - The New Yorker', + }, + 'playlist_count': 3, + 'params': { + 'extract_flat': False, + 'skip_download': True, + } + }, + # MLB embed + { + 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', + 'md5': '96f09a37e44da40dd083e12d9a683327', + 'info_dict': { + 'id': '33322633', + 'ext': 'mp4', + 'title': 'Ump changes call to ball', + 'description': 'md5:71c11215384298a172a6dcb4c2e20685', + 'duration': 48, + 'timestamp': 1401537900, + 'upload_date': '20140531', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + # Wistia embed + { + 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', + 'info_dict': { + 'id': '6e2wtrbdaf', + 'ext': 'mov', + 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', + 'description': 'a Paywall Videos video from Remilon', + 'duration': 644.072, + 'uploader': 'study.com', + 'timestamp': 1459678540, + 'upload_date': '20160403', + 'filesize': 24687186, + }, + }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1', + 'description': 'a Martin Fowler video from ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + 'timestamp': 1401832161, + 'upload_date': '20140603', + }, + }, + # Wistia standard embed (async) + { + 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', + 'info_dict': { + 'id': '807fafadvk', + 'ext': 'mp4', + 'title': 'Drip Brennan Dunn Workshop', + 'description': 'a JV Webinars video from getdrip-1', + 'duration': 4986.95, + 'timestamp': 1463607249, + 'upload_date': '20160518', + }, + 'params': { + 'skip_download': True, + } + }, + # Soundcloud embed + { + 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', + 'info_dict': { + 'id': '174391317', + 'ext': 'mp3', + 'description': 'md5:ff867d6b555488ad3c52572bb33d432c', + 'uploader': 'Sophos Security', + 'title': 'Chet Chat 171 - Oct 29, 2014', + 'upload_date': '20141029', + } + }, + # Soundcloud multiple embeds + { + 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, + # TuneIn station embed + { + 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/', + 'info_dict': { + 'id': '204146', + 'ext': 'mp3', + 
'title': 'CNRV', + 'location': 'Paris, France', + 'is_live': True, + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, + # Livestream embed + { + 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', + 'info_dict': { + 'id': '67864563', + 'ext': 'flv', + 'upload_date': '20141112', + 'title': 'Rosetta #CometLanding webcast HL 10', + } + }, + # Another Livestream embed, without 'new.' in URL + { + 'url': 'https://www.freespeech.org/', + 'info_dict': { + 'id': '123537347', + 'ext': 'mp4', + 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, + # LazyYT + { + 'url': 'https://skiplagged.com/', + 'info_dict': { + 'id': 'skiplagged', + 'title': 'Skiplagged: The smart way to find cheap flights', + }, + 'playlist_mincount': 1, + 'add_ie': ['Youtube'], + }, + # Cinchcast embed + { + 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', + 'info_dict': { + 'id': '7141703', + 'ext': 'mp3', + 'upload_date': '20141126', + 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', + } + }, + # Cinerama player + { + 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm', + 'info_dict': { + 'id': '730m_DandD_1901_512k', + 'ext': 'mp4', + 'uploader': 'www.abc.net.au', + 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', + } + }, + # embedded viddler video + { + 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597', + 'info_dict': { + 'id': '4d03aad9', + 'ext': 'mp4', + 'uploader': 'deadspin', + 'title': 'WALL-TO-GORTAT', + 'timestamp': 1422285291, + 'upload_date': '20150126', + }, + 'add_ie': ['Viddler'], + }, + # Libsyn embed + { + 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', + 'info_dict': { + 'id': '3377616', + 'ext': 'mp3', + 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", + 'description': 'md5:601cb790edd05908957dae8aaa866465', + 'upload_date': '20150220', + }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', + }, + # jwplayer YouTube + { + 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', + 'info_dict': { + 'id': 'Mrj4DVp2zeA', + 'ext': 'mp4', + 'upload_date': '20150212', + 'uploader': 'The National Archives UK', + 'description': 'md5:8078af856dca76edc42910b61273dbbf', + 'uploader_id': 'NationalArchives08', + 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', + }, + }, + # jwplayer rtmp + { + 'url': 'http://www.suffolk.edu/sjc/live.php', + 'info_dict': { + 'id': 'live', + 'ext': 'flv', + 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', + 'uploader': 'www.suffolk.edu', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', + }, + # Complex jwplayer + { + 'url': 'http://www.indiedb.com/games/king-machine/videos', + 'info_dict': { + 'id': 'videos', + 'ext': 'mp4', + 'title': 'king machine trailer 1', + 'description': 'Browse King Machine videos & audio for sweet media. 
Your eyes will thank you.', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + # JWPlayer config passed as variable + 'url': 'http://www.txxx.com/videos/3326530/ariele/', + 'info_dict': { + 'id': '3326530_hq', + 'ext': 'mp4', + 'title': 'ARIELE | Tube Cup', + 'uploader': 'www.txxx.com', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, + { + # JWPlatform iframe + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', + 'info_dict': { + 'id': 'AG26UQXM', + 'ext': 'mp4', + 'upload_date': '20160719', + 'timestamp': 468923808, + 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, + { + # Video.js embed, multiple formats + 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', + 'info_dict': { + 'id': 'yygqldloqIk', + 'ext': 'mp4', + 'title': 'SolidWorks. Урок 6 Настройка чертежа', + 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', + 'upload_date': '20130314', + 'uploader': 'PROстое3D', + 'uploader_id': 'PROstoe3D', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Video.js embed, single format + 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', + 'info_dict': { + 'id': 'watch', + 'ext': 'mp4', + 'title': 'Step 1 - Good Foundation', + 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', + }, + 'params': { + 'skip_download': True, + }, + }, + # rtl.nl embed + { + 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'aanslagen-kopenhagen', + 'title': 'Aanslagen Kopenhagen', + } + }, + # Zapiks embed + { + 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', + 'info_dict': { + 'id': '118046', + 'ext': 'mp4', + 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', + } + }, + # Kaltura embed (different embed code) + { + 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', + 'info_dict': { + 'id': '1_a52wc67y', + 'ext': 'flv', + 'upload_date': '20150127', + 'uploader_id': 'PremierMedia', + 'timestamp': int, + 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? 
Conference 2014', + }, + }, + # Kaltura embed with single quotes + { + 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', + 'info_dict': { + 'id': '0_izeg5utt', + 'ext': 'mp4', + 'title': '35871', + 'timestamp': 1355743100, + 'upload_date': '20121217', + 'uploader_id': 'cplapp@learn360.com', + }, + 'add_ie': ['Kaltura'], + }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 'Could not send HEAD request' + ], + 'params': { + 'skip_download': True, + } + }, + { + # Kaltura embedded, some fileExt broken (#11480) + 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', + 'info_dict': { + 'id': '1_sgtvehim', + 'ext': 'mp4', + 'title': 'Our "Standard Models" of particle physics and cosmology', + 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', + 'timestamp': 1321158993, + 'upload_date': '20111113', + 'uploader_id': 'kps1', + }, + 'add_ie': ['Kaltura'], + }, + { + # Kaltura iframe embed + 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/', + 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44', + 'info_dict': { + 'id': '0_f2cfbpwy', + 'ext': 'mp4', + 'title': 'I. M. Pei: A Centennial Celebration', + 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c', + 'upload_date': '20170403', + 'uploader_id': 'batchUser', + 'timestamp': 1491232186, + }, + 'add_ie': ['Kaltura'], + }, + { + # Kaltura iframe embed, more sophisticated + 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', + 'info_dict': { + 'id': '1_9gzouybz', + 'ext': 'mp4', + 'title': 'lecture-05sep2017', + 'description': 'md5:40f347d91fd4ba047e511c5321064b49', + 'upload_date': '20170913', + 'uploader_id': 'eps2', + 'timestamp': 1505340777, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, + { + # meta twitter:player + 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', + 'info_dict': { + 'id': '0_01b42zps', + 'ext': 'mp4', + 'title': 'Main Twerk (Video)', + 'upload_date': '20171208', + 'uploader_id': 'sebastian.salinas@thechive.com', + 'timestamp': 1512713057, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, + # referrer protected EaglePlatform embed + { + 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', + 'info_dict': { + 'id': '582306', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3382, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + # ClipYou (EaglePlatform) embed (custom URL) + { + 'url': 'http://muz-tv.ru/play/7129/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used + 'info_dict': { + 'id': '12820', + 'ext': 'mp4', + 'title': "'O Sole Mio", + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 216, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is unavailable.', + }, + # Pladform embed + { + 'url': 'http://muz-tv.ru/kinozal/view/7400/', + 'info_dict': { + 'id': '100183293', + 'ext': 'mp4', + 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', + 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'thumbnail': 
r're:^https?://.*\.jpg$', + 'duration': 694, + 'age_limit': 0, + }, + 'skip': 'HTTP Error 404: Not Found', + }, + # Playwire embed + { + 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html', + 'info_dict': { + 'id': '3519514', + 'ext': 'mp4', + 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 45.115, + }, + }, + # 5min embed + { + 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', + 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', + 'info_dict': { + 'id': '518726732', + 'ext': 'mp4', + 'title': 'Facebook Creates "On This Day" | Crunch Report', + 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', + 'timestamp': 1427237531, + 'uploader': 'Crunch Report', + 'upload_date': '20150324', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + # Crooks and Liars embed + { + 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', + 'info_dict': { + 'id': '8RUoRhRi', + 'ext': 'mp4', + 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!", + 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', + 'timestamp': 1428207000, + 'upload_date': '20150405', + 'uploader': 'Heather', + }, + }, + # Crooks and Liars external embed + { + 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/', + 'info_dict': { + 'id': 'MTE3MjUtMzQ2MzA', + 'ext': 'mp4', + 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5', + 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec', + 'timestamp': 1265032391, + 'upload_date': '20100201', + 'uploader': 'Heather', + }, + }, + # NBC Sports vplayer embed + { + 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', + 'info_dict': { + 'id': 'ln7x1qSThw4k', + 'ext': 'flv', + 'title': "PFT Live: New leader in the 'new-look' defense", + 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20140107', + 'timestamp': 1389118457, + }, + 'skip': 'Invalid Page URL', + }, + # NBC News embed + { + 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', + 'md5': '1aa589c675898ae6d37a17913cf68d66', + 'info_dict': { + 'id': 'x_dtl_oa_LettermanliftPR_160608', + 'ext': 'mp4', + 'title': 'David Letterman: A Preview', + 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. 
Airs Sunday June 12 at 7/6c.', + 'upload_date': '20160609', + 'timestamp': 1465431544, + 'uploader': 'NBCU-NEWS', + }, + }, + # UDN embed + { + 'url': 'https://video.udn.com/news/300346', + 'md5': 'fd2060e988c326991037b9aff9df21a6', + 'info_dict': { + 'id': '300346', + 'ext': 'mp4', + 'title': '中一中男師變性 全校師生力挺', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON Expecting value'], + }, + # Brightcove URL in single quotes + { + 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', + 'md5': '4ae374f1f8b91c889c4b9203c8c752af', + 'info_dict': { + 'id': '4255764656001', + 'ext': 'mp4', + 'title': 'SN Presents: Russell Martin, World Citizen', + 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', + 'uploader': 'Rogers Sportsnet', + 'uploader_id': '1704050871', + 'upload_date': '20150525', + 'timestamp': 1432570283, + }, + }, + # Kinja embed + { + 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', + 'info_dict': { + 'id': '106351', + 'ext': 'mp4', + 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', + 'description': 'Migrated from OnionStudios', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'uploader': 'clickhole', + 'upload_date': '20150527', + 'timestamp': 1432744860, + } + }, + # SnagFilms embed + { + 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, + # AdobeTVVideo embed + { + 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, + }, + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + { + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. 
', + 'uploader': 'GlobeCast Australia - GlobeStream', + 'uploader_id': '2733773828001', + 'upload_date': '20160304', + 'timestamp': 1457083087, + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, + { + # Brightcove embed with whitespace around attribute names + 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', + 'info_dict': { + 'id': '3167554373001', + 'ext': 'mp4', + 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", + 'description': 'md5:57bacb0e0f29349de4972bfda3191713', + 'uploader_id': '1079349493', + 'upload_date': '20140207', + 'timestamp': 1391810548, + }, + 'params': { + 'skip_download': True, + }, + }, + # Another form of arte.tv embed + { + 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', + 'md5': '850bfe45417ddf221288c88a0cffe2e2', + 'info_dict': { + 'id': '030273-562_PLUS7-F', + 'ext': 'mp4', + 'title': 'ARTE Reportage - Nulle part, en France', + 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', + 'upload_date': '20160409', + }, + }, + # Duplicated embedded video URLs + { + 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', + 'info_dict': { + 'id': '149298443_480_16c25b74_2', + 'ext': 'mp4', + 'title': 'vs. Blue Orange Spring Game', + 'uploader': 'www.hudl.com', + }, + }, + # twitter:player:stream embed + { + 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', + 'info_dict': { + 'id': 'master', + 'ext': 'mp4', + 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', + 'uploader': 'www.rtl.be', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, + # twitter:player embed + { + 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', + 'md5': 'a3e0df96369831de324f0778e126653c', + 'info_dict': { + 'id': '4909620399001', + 'ext': 'mp4', + 'title': 'What Do Black Holes Sound Like?', + 'description': 'what do black holes sound like', + 'upload_date': '20160524', + 'uploader_id': '29913724001', + 'timestamp': 1464107587, + 'uploader': 'TheAtlantic', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + # Facebook <iframe> embed + { + 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', + 'md5': 'fbcde74f534176ecb015849146dd3aee', + 'info_dict': { + 'id': '599637780109885', + 'ext': 'mp4', + 'title': 'Facebook video #599637780109885', + }, + }, + # Facebook <iframe> embed, plugin video + { + 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', + 'info_dict': { + 'id': '1754168231264132', + 'ext': 'mp4', + 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', + 'uploader': 'Tariq Ramadan (official)', + 'timestamp': 1496758379, + 'upload_date': '20170606', + }, + 'params': { + 'skip_download': True, + }, + }, + # Facebook API embed + { + 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', + 'md5': 'a47372ee61b39a7b90287094d447d94e', + 'info_dict': { + 'id': '10153467542406923', + 'ext': 'mp4', + 'title': 'Facebook video #10153467542406923', + }, + }, + # Wordpress "YouTube Video Importer" plugin + { + 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', + 'md5': 'd16797741b560b485194eddda8121b48', + 'info_dict': { + 'id': 'HNTXWDXV9Is', + 'ext': 'mp4', + 'title': 'Blue Devils Drumline Stanford lot 2016', + 'upload_date': 
'20160627', + 'uploader_id': 'GENOCIDE8GENERAL10', + 'uploader': 'cylus cyrus', + }, + }, + { + # video stored on custom kaltura server + 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', + 'md5': '537617d06e64dfed891fa1593c4b30cc', + 'info_dict': { + 'id': '0_1iotm5bh', + 'ext': 'mp4', + 'title': 'Elecciones británicas: 5 lecciones para Rajoy', + 'description': 'md5:435a89d68b9760b92ce67ed227055f16', + 'uploader_id': 'videos.expansion@el-mundo.net', + 'upload_date': '20150429', + 'timestamp': 1430303472, + }, + 'add_ie': ['Kaltura'], + }, + { + # multiple kaltura embeds, nsfw + 'url': 'https://www.quartier-rouge.be/prive/femmes/kamila-avec-video-jaime-sadomie.html', + 'info_dict': { + 'id': 'kamila-avec-video-jaime-sadomie', + 'title': "Kamila avec vídeo “J'aime sadomie”", + }, + 'playlist_count': 8, + }, + { + # Non-standard Vimeo embed + 'url': 'https://openclassrooms.com/courses/understanding-the-web', + 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', + 'info_dict': { + 'id': '148867247', + 'ext': 'mp4', + 'title': 'Understanding the web - Teaser', + 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', + 'upload_date': '20151214', + 'uploader': 'OpenClassrooms', + 'uploader_id': 'openclassrooms', + }, + 'add_ie': ['Vimeo'], + }, + { + # generic vimeo embed that requires original URL passed as Referer + 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/', + 'only_matching': True, + }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, + { + 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', + 'info_dict': { + 'id': '1c7141f46c', + 'ext': 'mp4', + 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [Vbox7IE.ie_key()], + }, + { + # DBTV embeds + 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', + 'info_dict': { + 'id': '43254897', + 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', + }, + 'playlist_mincount': 3, + }, + { + # Videa embeds + 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html', + 'info_dict': { + 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style', + 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. 
Style - DVD Talk Forum', + }, + 'playlist_mincount': 2, + }, + { + # 20 minuten embed + 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TwentyMinutenIE.ie_key()], + }, + { + # VideoPress embed + 'url': 'https://en.support.wordpress.com/videopress/', + 'info_dict': { + 'id': 'OcobLTqC', + 'ext': 'm4v', + 'title': 'IMG_5786', + 'timestamp': 1435711927, + 'upload_date': '20150701', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [VideoPressIE.ie_key()], + }, + { + # Rutube embed + 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', + 'info_dict': { + 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', + 'ext': 'flv', + 'title': 'Магаззино: Казань 2', + 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', + 'uploader': 'Магаззино', + 'upload_date': '20170228', + 'uploader_id': '996642', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [RutubeIE.ie_key()], + }, + { + # ThePlatform embedded with whitespaces in URLs + 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', + 'only_matching': True, + }, + { + # Senate ISVP iframe https + 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', + 'md5': 'fb8c70b0b515e5037981a2492099aab8', + 'info_dict': { + 'id': 'govtaff020316', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + }, + 'add_ie': [SenateISVPIE.ie_key()], + }, + { + # Limelight embeds (1 channel embed + 4 media embeds) + 'url': 'http://www.sedona.com/FacilitatorTraining2017', + 'info_dict': { + 'id': 'FacilitatorTraining2017', + 'title': 'Facilitator Training 2017', + }, + 'playlist_mincount': 5, + }, + { + # Limelight embed (LimelightPlayerUtil.embed) + 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', + 'info_dict': { + 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', + 'ext': 'mp4', + 'title': '07448641', + 'timestamp': 1499890639, + 'upload_date': '20170712', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['LimelightMedia'], + }, + { + 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', + 'info_dict': { + 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', + 'title': 'Standoff with Walnut Creek murder suspect ends', + 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', + }, + 'playlist_mincount': 4, + }, + { + # WashingtonPost embed + 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', + 'info_dict': { + 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', + 'ext': 'mp4', + 'title': "No one has seen the drama series based on Trump's life \u2014 until now", + 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. 
But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', + 'timestamp': 1455216756, + 'uploader': 'The Washington Post', + 'upload_date': '20160211', + }, + 'add_ie': [WashingtonPostIE.ie_key()], + }, + { + # Mediaset embed + 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', + 'info_dict': { + 'id': '720642', + 'ext': 'mp4', + 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [MediasetIE.ie_key()], + }, + { + # JOJ.sk embeds + 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'info_dict': { + 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'title': 'Slovenskom sa prehnala vlna silných búrok', + }, + 'playlist_mincount': 5, + 'add_ie': [JojIE.ie_key()], + }, + { + # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) + 'url': 'https://tvrain.ru/amp/418921/', + 'md5': 'cc00413936695987e8de148b67d14f1d', + 'info_dict': { + 'id': '418921', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + }, + }, + { + # vzaar embed + 'url': 'http://help.vzaar.com/article/165-embedding-video', + 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', + 'info_dict': { + 'id': '8707641', + 'ext': 'mp4', + 'title': 'Building A Business Online: Principal Chairs Q & A', + }, + }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + }, + { + # vshare embed + 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', + 'md5': '17b39f55b5497ae8b59f5fbce8e35886', + 'info_dict': { + 'id': '0f64ce6', + 'title': 'vl14062007715967', + 'ext': 'mp4', + } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', + 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } + }, + { + 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', + 'info_dict': { + 'id': '1731611', + 'ext': 'mp4', + 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', + 'description': 'md5:eb5f23826a027ba95277d105f248b825', + 'timestamp': 1516100691, + 'upload_date': '20180116', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [SpringboardPlatformIE.ie_key()], + }, + { + 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', + 'info_dict': { + 'id': 'vMDE4NzI1Mjgt690b', + 'ext': 'mp4', + 'title': 'Котята', + }, + 'add_ie': [YapFilesIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # CloudflareStream embed + 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'add_ie': [CloudflareStreamIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # PeerTube embed + 'url': 'https://joinpeertube.org/fr/home/', + 'info_dict': { + 'id': 
'home', + 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', + }, + 'playlist_count': 2, + }, + { + # Indavideo embed + 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', + 'info_dict': { + 'id': '1693903', + 'ext': 'mp4', + 'title': 'Így kell otthon hamburgert sütni', + 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', + 'timestamp': 1426330212, + 'upload_date': '20150314', + 'uploader': 'StreetKitchen', + 'uploader_id': '546363', + }, + 'add_ie': [IndavideoEmbedIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + # APA embed via JWPlatform embed + 'url': 'http://www.vol.at/blue-man-group/5593454', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://share-videos.se/auto/video/83645793?uid=13', + 'md5': 'b68d276de422ab07ee1d49388103f457', + 'info_dict': { + 'id': '83645793', + 'title': 'Lock up and get excited', + 'ext': 'mp4' + }, + 'skip': 'TODO: fix nested playlists processing in tests', + }, + { + # Viqeo embeds + 'url': 'https://viqeo.tv/', + 'info_dict': { + 'id': 'viqeo', + 'title': 'All-new video platform', + }, + 'playlist_count': 6, + }, + { + # Squarespace video embed, 2019-08-28 + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + }, + 'params': { + 'skip_download': True, + }, + }, + # { + # # Zype embed + # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + # 'info_dict': { + # 'id': '5b400b834b32992a310622b9', + # 'ext': 'mp4', + # 'title': 'Smoky Barbecue Favorites', + # 'thumbnail': r're:^https?://.*\.jpe?g', + # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + # 'upload_date': '20170909', + # 'timestamp': 1504915200, + # }, + # 'add_ie': [ZypeIE.ie_key()], + # 'params': { + # 'skip_download': True, + # }, + # }, + { + # videojs embed + 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', + 'info_dict': { + 'id': 'shell', + 'ext': 'mp4', + 'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано', + 'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download MPD manifest'], + }, + { + # DailyMotion embed with DM.player + 'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804', + 'info_dict': { + 'id': 'k6aKkGHd9FJs4mtJN39', + 'ext': 'mp4', + 'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final', + 'description': 'This video is private.', + 'uploader_id': 'x1jf30l', + 'uploader': 'beIN SPORTS USA', + 'upload_date': '20190528', + 'timestamp': 1559062971, + }, + 'params': { + 'skip_download': True, + }, + }, + # { + # # TODO: find another test + # # http://schema.org/VideoObject + # 'url': 'https://flipagram.com/f/nyvTSJMKId', + # 'md5': '888dcf08b7ea671381f00fab74692755', + # 'info_dict': { + # 'id': 'nyvTSJMKId', + # 'ext': 'mp4', + # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One 
Direction', + # 'description': '#love for cats.', + # 'timestamp': 1461244995, + # 'upload_date': '20160421', + # }, + # 'params': { + # 'force_generic_extractor': True, + # }, + # }, + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, + { + # Simplecast player embed + 'url': 'https://www.bio.org/podcast', + 'info_dict': { + 'id': 'podcast', + 'title': 'I AM BIO Podcast | BIO', + }, + 'playlist_mincount': 52, + }, + { + # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) + 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', + 'only_matching': True, + }, { + # WimTv embed player + 'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/', + 'info_dict': { + 'id': 'wearefmi-pt-2-2021', + 'title': '#WEAREFMI – PT.2 – 2021 – MsMotorTV', + }, + 'playlist_count': 1, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July', + 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/embed/105/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July / Embed Player', + 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player + 'url': 'https://thisvid.com/videos/french-boy-pantsed/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'display_id': 'french-boy-pantsed', + 'ext': 'mp4', + 'title': 'French Boy Pantsed - ThisVid.com', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + } + }, { + # KVS Player + 'url': 'https://thisvid.com/embed/2400174/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'display_id': 'french-boy-pantsed', + 'ext': 'mp4', + 'title': 'French Boy Pantsed - ThisVid.com', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + } + }, { + # KVS Player + 'url': 'https://youix.com/video/leningrad-zoj/', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 
                'display_id': 'leningrad-zoj',
+                'ext': 'mp4',
+                'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
+                'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
+            }
+        }, {
+            # KVS Player
+            'url': 'https://youix.com/embed/18485',
+            'md5': '94f96ba95706dc3880812b27b7d8a2b8',
+            'info_dict': {
+                'id': '18485',
+                'display_id': 'leningrad-zoj',
+                'ext': 'mp4',
+                'title': 'Ленинград - ЗОЖ',
+                'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
+            }
+        }, {
+            # KVS Player
+            'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
+            'md5': '94166bdb26b4cb1fb9214319a629fc51',
+            'info_dict': {
+                'id': '21217',
+                'display_id': '40-nochey-40-nights-2016',
+                'ext': 'mp4',
+                'title': '40 ночей (2016) - BogMedia.org',
+                'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
+            }
+        },
+    ]
+
+    def report_following_redirect(self, new_url):
+        """Report following a redirect to a new URL."""
+        self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
+
+    def _extract_rss(self, url, video_id, doc):
+        playlist_title = doc.find('./channel/title').text
+        playlist_desc_el = doc.find('./channel/description')
+        playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+
+        NS_MAP = {
+            'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+        }
+
+        entries = []
+        for it in doc.findall('./channel/item'):
+            next_url = None
+            enclosure_nodes = it.findall('./enclosure')
+            for e in enclosure_nodes:
+                next_url = e.attrib.get('url')
+                if next_url:
+                    break
+
+            if not next_url:
+                next_url = xpath_text(it, 'link', fatal=False)
+
+            if not next_url:
+                continue
+
+            def itunes(key):
+                return xpath_text(
+                    it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+                    default=None)
+
+            duration = itunes('duration')
+            explicit = (itunes('explicit') or '').lower()
+            if explicit in ('true', 'yes'):
+                age_limit = 18
+            elif explicit in ('false', 'no'):
+                age_limit = 0
+            else:
+                age_limit = None
+
+            entries.append({
+                '_type': 'url_transparent',
+                'url': next_url,
+                'title': it.find('title').text,
+                'description': xpath_text(it, 'description', default=None),
+                'timestamp': unified_timestamp(
+                    xpath_text(it, 'pubDate', default=None)),
+                'duration': int_or_none(duration) or parse_duration(duration),
+                'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
+                'episode': itunes('title'),
+                'episode_number': int_or_none(itunes('episode')),
+                'season_number': int_or_none(itunes('season')),
+                'age_limit': age_limit,
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': playlist_title,
+            'description': playlist_desc,
+            'entries': entries,
+        }
+
+    def _extract_camtasia(self, url, video_id, webpage):
+        """ Returns None if no camtasia video can be found. """
+
+        camtasia_cfg = self._search_regex(
+            r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
+            webpage, 'camtasia configuration file', default=None)
+        if camtasia_cfg is None:
+            return None
+
+        title = self._html_search_meta('DC.title', webpage, fatal=True)
+
+        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
+        camtasia_cfg = self._download_xml(
+            camtasia_url, video_id,
+            note='Downloading camtasia configuration',
+            errnote='Failed to download camtasia configuration')
+        fileset_node = camtasia_cfg.find('./playlist/array/fileset')
+
+        entries = []
+        for n in fileset_node.getchildren():
+            url_n = n.find('./uri')
+            if url_n is None:
+                continue
+
+            entries.append({
+                'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
+                'title': '%s - %s' % (title, n.tag),
+                'url': compat_urlparse.urljoin(url, url_n.text),
+                'duration': float_or_none(n.find('./duration').text),
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': title,
+        }
+
+    def _kvs_getrealurl(self, video_url, license_code):
+        if not video_url.startswith('function/0/'):
+            return video_url  # not obfuscated
+
+        url_path, _, url_query = video_url.partition('?')
+        urlparts = url_path.split('/')[2:]
+        license = self._kvs_getlicensetoken(license_code)
+        newmagic = urlparts[5][:32]
+
+        for o in range(len(newmagic) - 1, -1, -1):
+            new = ''
+            l = (o + sum([int(n) for n in license[o:]])) % 32
+
+            for i in range(0, len(newmagic)):
+                if i == o:
+                    new += newmagic[l]
+                elif i == l:
+                    new += newmagic[o]
+                else:
+                    new += newmagic[i]
+            newmagic = new
+
+        urlparts[5] = newmagic + urlparts[5][32:]
+        return '/'.join(urlparts) + '?' + url_query
+
+    def _kvs_getlicensetoken(self, license):
+        modlicense = license.replace('$', '').replace('0', '1')
+        center = int(len(modlicense) / 2)
+        fronthalf = int(modlicense[:center + 1])
+        backhalf = int(modlicense[center:])
+
+        modlicense = str(4 * abs(fronthalf - backhalf))
+        retval = ''
+        for o in range(0, center + 1):
+            for i in range(1, 5):
+                retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
+        return retval
+
+    def _real_extract(self, url):
+        if url.startswith('//'):
+            return self.url_result(self.http_scheme() + url)
+
+        parsed_url = compat_urlparse.urlparse(url)
+        if not parsed_url.scheme:
+            default_search = self.get_param('default_search')
+            if default_search is None:
+                default_search = 'fixup_error'
+
+            if default_search in ('auto', 'auto_warning', 'fixup_error'):
+                if re.match(r'^[^\s/]+\.[^\s/]+/', url):
+                    self.report_warning('The url doesn\'t specify the protocol, trying with http')
+                    return self.url_result('http://' + url)
+                elif default_search != 'fixup_error':
+                    if default_search == 'auto_warning':
+                        if re.match(r'^(?:url|URL)$', url):
+                            raise ExtractorError(
+                                'Invalid URL: %r . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
+                                expected=True)
+                        else:
+                            self.report_warning(
+                                'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
+                    return self.url_result('ytsearch:' + url)
+
+            if default_search in ('error', 'fixup_error'):
+                raise ExtractorError(
+                    '%r is not a valid URL. '
+                    'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:%s" ) to search YouTube'
+                    % (url, url), expected=True)
+            else:
+                if ':' not in default_search:
+                    default_search += ':'
+                return self.url_result(default_search + url)
+
+        url, smuggled_data = unsmuggle_url(url)
+        force_videoid = None
+        is_intentional = smuggled_data and smuggled_data.get('to_generic')
+        if smuggled_data and 'force_videoid' in smuggled_data:
+            force_videoid = smuggled_data['force_videoid']
+            video_id = force_videoid
+        else:
+            video_id = self._generic_id(url)
+
+        self.to_screen('%s: Requesting header' % video_id)
+
+        head_req = HEADRequest(url)
+        head_response = self._request_webpage(
+            head_req, video_id,
+            note=False, errnote='Could not send HEAD request to %s' % url,
+            fatal=False)
+
+        if head_response is not False:
+            # Check for redirect
+            new_url = head_response.geturl()
+            if url != new_url:
+                self.report_following_redirect(new_url)
+                if force_videoid:
+                    new_url = smuggle_url(
+                        new_url, {'force_videoid': force_videoid})
+                return self.url_result(new_url)
+
+        full_response = None
+        if head_response is False:
+            request = sanitized_Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
+            head_response = full_response
+
+        info_dict = {
+            'id': video_id,
+            'title': self._generic_title(url),
+            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
+        }
+
+        # Check for direct link to a video
+        content_type = head_response.headers.get('Content-Type', '').lower()
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
+        if m:
+            format_id = compat_str(m.group('format_id'))
+            subtitles = {}
+            if format_id.endswith('mpegurl'):
+                formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+            elif format_id == 'f4m':
+                formats = self._extract_f4m_formats(url, video_id)
+            else:
+                formats = [{
+                    'format_id': format_id,
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }]
+                info_dict['direct'] = True
+            self._sort_formats(formats)
+            info_dict['formats'] = formats
+            info_dict['subtitles'] = subtitles
+            return info_dict
+
+        if not self.get_param('test', False) and not is_intentional:
+            force = self.get_param('force_generic_extractor', False)
+            self.report_warning(
+                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
+
+        if not full_response:
+            request = sanitized_Request(url)
+            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
+            # making it impossible to download only a chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). With yt-dlp's default Accept-Encoding
+            # this would always result in downloading the whole file, which is not desirable.
+            # Therefore, for the extraction pass we have to override Accept-Encoding to any in order
+            # to accept raw bytes and be able to download only a chunk.
+            # It may be better to solve this by checking Content-Type for application/octet-stream
+            # after the HEAD request finishes, but it is not clear whether we can rely on this.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
+
+        first_bytes = full_response.read(512)
+
+        # Is it an M3U playlist?
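+        # (A rough sketch of this and the following content checks; the
+        # first_bytes values are hypothetical examples, not captured traffic:
+        #     b'#EXTM3U\n#EXT-X-STREAM-INF:...'  -> extracted as an HLS playlist here
+        #     b'\x00\x00\x00\x18ftypmp42...'     -> not markup, returned as a direct link
+        #     b'<rss version="2.0">...'          -> probed as XML (RSS/SMIL/XSPF/MPD/f4m)
+        #     b'<!DOCTYPE html>...'              -> read as a webpage and scanned for embeds)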
+        if first_bytes.startswith(b'#EXTM3U'):
+            info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+            self._sort_formats(info_dict['formats'])
+            return info_dict
+
+        # Maybe it's a direct link to a video?
+        # Be careful not to download the whole thing!
+        if not is_html(first_bytes):
+            self.report_warning(
+                'URL could be a direct video link, returning it as such.')
+            info_dict.update({
+                'direct': True,
+                'url': url,
+            })
+            return info_dict
+
+        webpage = self._webpage_read_content(
+            full_response, url, video_id, prefix=first_bytes)
+
+        if '<title>DPG Media Privacy Gate</title>' in webpage:
+            webpage = self._download_webpage(url, video_id)
+
+        self.report_extraction(video_id)
+
+        # Is it an RSS feed, a SMIL file, an XSPF playlist or an MPD manifest?
+        try:
+            try:
+                doc = compat_etree_fromstring(webpage)
+            except compat_xml_parse_error:
+                doc = compat_etree_fromstring(webpage.encode('utf-8'))
+            if doc.tag == 'rss':
+                return self._extract_rss(url, video_id, doc)
+            elif doc.tag == 'SmoothStreamingMedia':
+                info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
+                self._sort_formats(info_dict['formats'])
+                return info_dict
+            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+                smil = self._parse_smil(doc, url, video_id)
+                self._sort_formats(smil['formats'])
+                return smil
+            elif doc.tag == '{http://xspf.org/ns/0/}playlist':
+                return self.playlist_result(
+                    self._parse_xspf(
+                        doc, video_id, xspf_url=url,
+                        xspf_base_url=full_response.geturl()),
+                    video_id)
+            elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
+                info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
+                    doc,
+                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_url=url)
+                self._sort_formats(info_dict['formats'])
+                return info_dict
+            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+                self._sort_formats(info_dict['formats'])
+                return info_dict
+        except compat_xml_parse_error:
+            pass
+
+        # Is it a Camtasia project?
+        camtasia_res = self._extract_camtasia(url, video_id, webpage)
+        if camtasia_res is not None:
+            return camtasia_res
+
+        # Sometimes the embedded video player is hidden behind percent encoding
+        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
+        # Unescaping the whole page makes it possible to handle those cases in a generic way
+        # FIXME: unescaping the whole page may break URLs, commenting out for now.
+        # There should probably be a second run of the generic extractor on the unescaped webpage.
+        # webpage = compat_urllib_parse_unquote(webpage)
+
+        # Unescape squarespace embeds to be detected by generic extractor,
+        # see https://github.com/ytdl-org/youtube-dl/issues/21294
+        webpage = re.sub(
+            r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
+            lambda x: unescapeHTML(x.group(0)), webpage)
+
+        # it's tempting to parse this further, but you would
+        # have to take into account all the variations like
+        #   Video Title - Site Name
+        #   Site Name | Video Title
+        #   Video Title - Tagline | Site Name
+        # and so on and so forth; it's just not practical
+        video_title = self._og_search_title(
+            webpage, default=None) or self._html_search_regex(
+            r'(?s)<title>(.*?)</title>', webpage, 'video title',
+            default='video')
+
+        # Try to detect age limit automatically
+        age_limit = self._rta_search(webpage)
+        # And then there are the jokers who advertise that they use RTA,
+        # but actually don't.
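+        # (The markers below therefore match the visible badge markup itself;
+        # any hit forces age_limit to 18 even when _rta_search() found no
+        # machine-readable rating meta tag. For example, a page containing
+        # only this hypothetical snippet would still be flagged as adult content:
+        #     Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>)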
+ AGE_LIMIT_MARKERS = [ + r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + ] + if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): + age_limit = 18 + + # video uploader is domain name + video_uploader = self._search_regex( + r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') + + video_description = self._og_search_description(webpage, default=None) + video_thumbnail = self._og_search_thumbnail(webpage, default=None) + + info_dict.update({ + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit, + }) + + # Look for Brightcove Legacy Studio embeds + bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) + if bc_urls: + entries = [{ + '_type': 'url', + 'url': smuggle_url(bc_url, {'Referer': url}), + 'ie_key': 'BrightcoveLegacy' + } for bc_url in bc_urls] + + return { + '_type': 'playlist', + 'title': video_title, + 'id': video_id, + 'entries': entries, + } + + # Look for Brightcove New Studio embeds + bc_urls = BrightcoveNewIE._extract_urls(self, webpage) + if bc_urls: + return self.playlist_from_matches( + bc_urls, video_id, video_title, + getter=lambda x: smuggle_url(x, {'referrer': url}), + ie='BrightcoveNew') + + # Look for Nexx embeds + nexx_urls = NexxIE._extract_urls(webpage) + if nexx_urls: + return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) + + # Look for Nexx iFrame embeds + nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) + if nexx_embed_urls: + return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) + + # Look for ThePlatform embeds + tp_urls = ThePlatformIE._extract_urls(webpage) + if tp_urls: + return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') + + arc_urls = ArcPublishingIE._extract_urls(webpage) + if arc_urls: + return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + + mychannels_urls = MedialaanIE._extract_urls(webpage) + if mychannels_urls: + return self.playlist_from_matches( + mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) + + # Look for embedded rtl.nl player + matches = re.findall( + r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', + webpage) + if matches: + return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') + + vimeo_urls = VimeoIE._extract_urls(url, webpage) + if vimeo_urls: + return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) + + vhx_url = VHXEmbedIE._extract_url(webpage) + if vhx_url: + return self.url_result(vhx_url, VHXEmbedIE.ie_key()) + + # Invidious Instances + # https://github.com/yt-dlp/yt-dlp/issues/195 + # https://github.com/iv-org/invidious/pull/1730 + youtube_url = self._search_regex( + r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', + webpage, 'youtube link', default=None) + if youtube_url: + return self.url_result(youtube_url, YoutubeIE.ie_key()) + + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: + return self.playlist_from_matches( + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) + + matches = DailymotionIE._extract_urls(webpage) + if matches: + return self.playlist_from_matches(matches, video_id, video_title) + + # Look for embedded Dailymotion playlist player (#3822) + m = re.search( + 
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) + if m: + playlists = re.findall( + r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) + if playlists: + return self.playlist_from_matches( + playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) + + # Look for DailyMail embeds + dailymail_urls = DailyMailIE._extract_urls(webpage) + if dailymail_urls: + return self.playlist_from_matches( + dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) + + # Look for Teachable embeds, must be before Wistia + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + + # Look for embedded Wistia player + wistia_urls = WistiaIE._extract_urls(webpage) + if wistia_urls: + playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) + for entry in playlist['entries']: + entry.update({ + '_type': 'url_transparent', + 'uploader': video_uploader, + }) + return playlist + + # Look for SVT player + svt_url = SVTIE._extract_url(webpage) + if svt_url: + return self.url_result(svt_url, 'SVT') + + # Look for Bandcamp pages with custom domain + mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) + if mobj is not None: + burl = unescapeHTML(mobj.group(1)) + # Don't set the extractor because it can be a track url or an album + return self.url_result(burl) + + # Look for embedded Vevo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for embedded Viddler player + mobj = re.search( + r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for NYTimes player + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for Libsyn player + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for Ooyala videos + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) + or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) + or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) + or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) + or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) + if mobj is not None: + embed_token = self._search_regex( + r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', + webpage, 'ooyala embed token', default=None) + return OoyalaIE._build_url_result(smuggle_url( + mobj.group('ec'), { + 'domain': url, + 'embed_token': embed_token, + })) + + # Look for multiple Ooyala embeds on SBN network websites + mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) + if mobj is not None: + embeds = self._parse_json(mobj.group(1), video_id, fatal=False) + if embeds: + return self.playlist_from_matches( + embeds, video_id, video_title, + getter=lambda v: 
OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') + + # Look for Aparat videos + mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) + if mobj is not None: + return self.url_result(mobj.group(1), 'Aparat') + + # Look for MPORA videos + mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) + if mobj is not None: + return self.url_result(mobj.group(1), 'Mpora') + + # Look for embedded Facebook player + facebook_urls = FacebookIE._extract_urls(webpage) + if facebook_urls: + return self.playlist_from_matches(facebook_urls, video_id, video_title) + + # Look for embedded VK player + mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'VK') + + # Look for embedded Odnoklassniki player + odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + + # Look for sibnet embedded player + sibnet_urls = VKIE._extract_sibnet_urls(webpage) + if sibnet_urls: + return self.playlist_from_matches(sibnet_urls, video_id, video_title) + + # Look for embedded ivi player + mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Ivi') + + # Look for embedded Huffington Post player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'HuffPost') + + # Look for embed.ly + mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) + if mobj is not None: + return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) + + # Look for funnyordie embed + matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) + if matches: + return self.playlist_from_matches( + matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') + + # Look for Simplecast embeds + simplecast_urls = SimplecastIE._extract_urls(webpage) + if simplecast_urls: + return self.playlist_from_matches( + simplecast_urls, video_id, video_title) + + # Look for BBC iPlayer embed + matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) + if matches: + return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') + + # Look for embedded RUTV player + rutv_url = RUTVIE._extract_url(webpage) + if rutv_url: + return self.url_result(rutv_url, 'RUTV') + + # Look for embedded TVC player + tvc_url = TVCIE._extract_url(webpage) + if tvc_url: + return self.url_result(tvc_url, 'TVC') + + # Look for embedded SportBox player + sportbox_urls = SportBoxIE._extract_urls(webpage) + if sportbox_urls: + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) + + # Look for embedded XHamster player + xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) + if xhamster_urls: + return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') + + # Look for embedded TNAFlixNetwork player + tnaflix_urls = 
TNAFlixNetworkEmbedIE._extract_urls(webpage) + if tnaflix_urls: + return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) + + # Look for embedded PornHub player + pornhub_urls = PornHubIE._extract_urls(webpage) + if pornhub_urls: + return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) + + # Look for embedded DrTuber player + drtuber_urls = DrTuberIE._extract_urls(webpage) + if drtuber_urls: + return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) + + # Look for embedded RedTube player + redtube_urls = RedTubeIE._extract_urls(webpage) + if redtube_urls: + return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) + + # Look for embedded Tube8 player + tube8_urls = Tube8IE._extract_urls(webpage) + if tube8_urls: + return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + + # Look for embedded Mofosex player + mofosex_urls = MofosexEmbedIE._extract_urls(webpage) + if mofosex_urls: + return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) + + # Look for embedded Spankwire player + spankwire_urls = SpankwireIE._extract_urls(webpage) + if spankwire_urls: + return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) + + # Look for embedded YouPorn player + youporn_urls = YouPornIE._extract_urls(webpage) + if youporn_urls: + return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) + + # Look for embedded Tvigle player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Tvigle') + + # Look for embedded TED player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'TED') + + # Look for embedded Ustream videos + ustream_url = UstreamIE._extract_url(webpage) + if ustream_url: + return self.url_result(ustream_url, UstreamIE.ie_key()) + + # Look for embedded arte.tv player + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) + + # Look for embedded francetv player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for embedded Myvi.ru player + myvi_url = MyviIE._extract_url(webpage) + if myvi_url: + return self.url_result(myvi_url) + + # Look for embedded soundcloud player + soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) + if soundcloud_urls: + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) + + # Look for tunein player + tunein_urls = TuneInBaseIE._extract_urls(webpage) + if tunein_urls: + return self.playlist_from_matches(tunein_urls, video_id, video_title) + + # Look for embedded mtvservices player + mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) + if mtvservices_url: + return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') + + # Look for embedded yahoo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', + webpage) + if mobj is not None: + return 
self.url_result(mobj.group('url'), 'Yahoo') + + # Look for embedded sbs.com.au player + mobj = re.search( + r'''(?x) + (?: + <meta\s+property="og:video"\s+content=| + <iframe[^>]+?src= + ) + (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'SBS') + + # Look for embedded Cinchcast player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Cinchcast') + + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', + webpage) + if not mobj: + mobj = re.search( + r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'MLB') + + mobj = re.search( + r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, + webpage) + if mobj is not None: + return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') + + mobj = re.search( + r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Livestream') + + # Look for Zapiks embed + mobj = re.search( + r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Zapiks') + + # Look for Kaltura embeds + kaltura_urls = KalturaIE._extract_urls(webpage) + if kaltura_urls: + return self.playlist_from_matches( + kaltura_urls, video_id, video_title, + getter=lambda x: smuggle_url(x, {'source_url': url}), + ie=KalturaIE.ie_key()) + + # Look for EaglePlatform embeds + eagleplatform_url = EaglePlatformIE._extract_url(webpage) + if eagleplatform_url: + return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) + + # Look for ClipYou (uses EaglePlatform) embeds + mobj = re.search( + r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) + if mobj is not None: + return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') + + # Look for Pladform embeds + pladform_url = PladformIE._extract_url(webpage) + if pladform_url: + return self.url_result(pladform_url) + + # Look for Videomore embeds + videomore_url = VideomoreIE._extract_url(webpage) + if videomore_url: + return self.url_result(videomore_url) + + # Look for Webcaster embeds + webcaster_url = WebcasterFeedIE._extract_url(self, webpage) + if webcaster_url: + return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) + + # Look for Playwire embeds + mobj = re.search( + r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for 5min embeds + mobj = re.search( + r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) + if mobj is not None: + return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') + + # Look for Crooks and Liars embeds + mobj = re.search( + r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for NBC Sports VPlayer embeds + 
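+ # (assumed contract, matching the other *_extract_url helpers in this
+ # cascade: returns the embed URL when one is present, otherwise None)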
nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + + # Look for NBC News embeds + nbc_news_embed_url = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) + if nbc_news_embed_url: + return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') + + # Look for Google Drive embeds + google_drive_url = GoogleDriveIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + + # Look for UDN embeds + mobj = re.search( + r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) + if mobj is not None: + return self.url_result( + compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') + + # Look for Senate ISVP iframe + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + return self.url_result(senate_isvp_url, 'SenateISVP') + + # Look for Kinja embeds + kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) + if kinja_embed_urls: + return self.playlist_from_matches( + kinja_embed_urls, video_id, video_title) + + # Look for OnionStudios embeds + onionstudios_url = OnionStudiosIE._extract_url(webpage) + if onionstudios_url: + return self.url_result(onionstudios_url) + + # Look for ViewLift embeds + viewlift_url = ViewLiftEmbedIE._extract_url(webpage) + if viewlift_url: + return self.url_result(viewlift_url) + + # Look for JWPlatform embeds + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) + + # Look for Digiteka embeds + digiteka_url = DigitekaIE._extract_url(webpage) + if digiteka_url: + return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + + # Look for Piksel embeds + piksel_url = PikselIE._extract_url(webpage) + if piksel_url: + return self.url_result(piksel_url, PikselIE.ie_key()) + + # Look for Limelight embeds + limelight_urls = LimelightBaseIE._extract_urls(webpage, url) + if limelight_urls: + return self.playlist_result( + limelight_urls, video_id, video_title, video_description) + + # Look for Anvato embeds + anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) + if anvato_urls: + return self.playlist_result( + anvato_urls, video_id, video_title, video_description) + + # Look for AdobeTVVideo embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), + 'AdobeTVVideo') + + # Look for Vine embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + + # Look for VODPlatform embeds + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') + + # Look for Mangomolo embeds + mobj = re.search( + 
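+ # Mangomolo serves two embed flavours, told apart by the named groups
+ # in the pattern below: a VOD player (video?id=...) and a live player
+ # ((index|live)?channelid=...); the branch after the match dispatches
+ # to MangomoloVideo or MangomoloLive accordingly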
r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// + (?: + admin\.mangomolo\.com/analytics/index\.php/customers/embed| + player\.mangomolo\.com/v1 + )/ + (?: + video\?.*?\bid=(?P<video_id>\d+)| + (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) + ).+?)\1''', webpage) + if mobj is not None: + info = { + '_type': 'url_transparent', + 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + } + video_id = mobj.group('video_id') + if video_id: + info.update({ + 'ie_key': 'MangomoloVideo', + 'id': video_id, + }) + else: + info.update({ + 'ie_key': 'MangomoloLive', + 'id': mobj.group('channel_id'), + }) + return info + + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result( + self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) + + # Look for 3Q SDN embeds + threeqsdn_url = ThreeQSDNIE._extract_url(webpage) + if threeqsdn_url: + return { + '_type': 'url_transparent', + 'ie_key': ThreeQSDNIE.ie_key(), + 'url': self._proto_relative_url(threeqsdn_url), + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + } + + # Look for VBOX7 embeds + vbox7_url = Vbox7IE._extract_url(webpage) + if vbox7_url: + return self.url_result(vbox7_url, Vbox7IE.ie_key()) + + # Look for DBTV embeds + dbtv_urls = DBTVIE._extract_urls(webpage) + if dbtv_urls: + return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) + + # Look for Videa embeds + videa_urls = VideaIE._extract_urls(webpage) + if videa_urls: + return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) + + # Look for 20 minuten embeds + twentymin_urls = TwentyMinutenIE._extract_urls(webpage) + if twentymin_urls: + return self.playlist_from_matches( + twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) + + # Look for VideoPress embeds + videopress_urls = VideoPressIE._extract_urls(webpage) + if videopress_urls: + return self.playlist_from_matches( + videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) + + # Look for Rutube embeds + rutube_urls = RutubeIE._extract_urls(webpage) + if rutube_urls: + return self.playlist_from_matches( + rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) + + # Look for WashingtonPost embeds + wapo_urls = WashingtonPostIE._extract_urls(webpage) + if wapo_urls: + return self.playlist_from_matches( + wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) + + # Look for Mediaset embeds + mediaset_urls = MediasetIE._extract_urls(self, webpage) + if mediaset_urls: + return self.playlist_from_matches( + mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + + # Look for JOJ.sk embeds + joj_urls = JojIE._extract_urls(webpage) + if joj_urls: + return self.playlist_from_matches( + joj_urls, video_id, video_title, ie=JojIE.ie_key()) + + # Look for megaphone.fm embeds + mpfn_urls = MegaphoneIE._extract_urls(webpage) + if mpfn_urls: + return self.playlist_from_matches( + mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) + + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + + channel9_urls = Channel9IE._extract_urls(webpage) + if channel9_urls: + 
return self.playlist_from_matches( + channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) + + vshare_urls = VShareIE._extract_urls(webpage) + if vshare_urls: + return self.playlist_from_matches( + vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) + + # Look for Mediasite embeds + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) + + springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) + if springboardplatform_urls: + return self.playlist_from_matches( + springboardplatform_urls, video_id, video_title, + ie=SpringboardPlatformIE.ie_key()) + + yapfiles_urls = YapFilesIE._extract_urls(webpage) + if yapfiles_urls: + return self.playlist_from_matches( + yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) + + vice_urls = ViceIE._extract_urls(webpage) + if vice_urls: + return self.playlist_from_matches( + vice_urls, video_id, video_title, ie=ViceIE.ie_key()) + + xfileshare_urls = XFileShareIE._extract_urls(webpage) + if xfileshare_urls: + return self.playlist_from_matches( + xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) + + cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) + if cloudflarestream_urls: + return self.playlist_from_matches( + cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + + peertube_urls = PeerTubeIE._extract_urls(webpage, url) + if peertube_urls: + return self.playlist_from_matches( + peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) + if indavideo_urls: + return self.playlist_from_matches( + indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + + apa_urls = APAIE._extract_urls(webpage) + if apa_urls: + return self.playlist_from_matches( + apa_urls, video_id, video_title, ie=APAIE.ie_key()) + + foxnews_urls = FoxNewsIE._extract_urls(webpage) + if foxnews_urls: + return self.playlist_from_matches( + foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) + + sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( + r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', + webpage)] + if sharevideos_urls: + return self.playlist_from_matches( + sharevideos_urls, video_id, video_title) + + viqeo_urls = ViqeoIE._extract_urls(webpage) + if viqeo_urls: + return self.playlist_from_matches( + viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) + + expressen_urls = ExpressenIE._extract_urls(webpage) + if expressen_urls: + return self.playlist_from_matches( + expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + + zype_urls = ZypeIE._extract_urls(webpage) + if zype_urls: + return self.playlist_from_matches( + zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + + gedi_urls = GediDigitalIE._extract_urls(webpage) + if gedi_urls: + return self.playlist_from_matches( + gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key()) + + # Look for RCS media group embeds + rcs_urls = RCSEmbedsIE._extract_urls(webpage) + if rcs_urls: + return self.playlist_from_matches( + rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) + + wimtv_urls = WimTVIE._extract_urls(webpage) + if wimtv_urls: + return self.playlist_from_matches( 
+ wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key()) + + bitchute_urls = BitChuteIE._extract_urls(webpage) + if bitchute_urls: + return self.playlist_from_matches( + bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) + + rumble_urls = RumbleEmbedIE._extract_urls(webpage) + if len(rumble_urls) == 1: + return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key()) + if rumble_urls: + return self.playlist_from_matches( + rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) + + # Look for HTML5 media + entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') + if entries: + if len(entries) == 1: + entries[0].update({ + 'id': video_id, + 'title': video_title, + }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: + self._sort_formats(entry['formats']) + return self.playlist_result(entries, video_id, video_title) + + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) + if jwplayer_data: + try: + info = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=url) + return merge_dicts(info, info_dict) + except ExtractorError: + # See https://github.com/ytdl-org/youtube-dl/pull/16735 + pass + + # Video.js embed + mobj = re.search( + r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', + webpage) + if mobj is not None: + sources = self._parse_json( + mobj.group(1), video_id, transform_source=js_to_json, + fatal=False) or [] + if not isinstance(sources, list): + sources = [sources] + formats = [] + subtitles = {} + for source in sources: + src = source.get('src') + if not src or not isinstance(src, compat_str): + continue + src = compat_urlparse.urljoin(url, src) + src_type = source.get('type') + if isinstance(src_type, compat_str): + src_type = src_type.lower() + ext = determine_ext(src).lower() + if src_type == 'video/youtube': + return self.url_result(src, YoutubeIE.ie_key()) + if src_type == 'application/dash+xml' or ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + src, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif src_type == 'application/x-mpegurl' or ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': src, + 'ext': (mimetype2ext(src_type) + or ext if ext in KNOWN_EXTENSIONS else 'mp4'), + 'http_headers': { + 'Referer': full_response.geturl(), + }, + }) + if formats or subtitles: + self._sort_formats(formats) + info_dict['formats'] = formats + info_dict['subtitles'] = subtitles + return info_dict + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) + + def check_video(vurl): + if YoutubeIE.suitable(vurl): + return True + if RtmpIE.suitable(vurl): + return True + vpath = compat_urlparse.urlparse(vurl).path + vext = determine_ext(vpath) + return '.' 
in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') + + def filter_video(urls): + return list(filter(check_video, urls)) + + # Start with something easy: JW Player in SWFObject + found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) + if not found: + # Look for gorilla-vid style embedding + found = filter_video(re.findall(r'''(?sx) + (?: + jw_plugins| + JWPlayerOptions| + jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup + ) + .*? + ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) + if not found: + # Look for generic KVS player + found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) + if found: + if found.group('maj_ver') not in ['4', '5']: + self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver')) + flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) + flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' + r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', + webpage, 'display_id', fatal=False + ) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + formats = [] + for key in ('video_url', 'video_alt_url', 'video_alt_url2'): + if key in flashvars and '/get_file/' in flashvars[key]: + next_format = { + 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), + 'format_id': flashvars.get(key + '_text', key), + 'ext': 'mp4', + } + height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key]) + if height: + next_format['height'] = int(height.group(1)) + else: + next_format['quality'] = 1 + formats.append(next_format) + self._sort_formats(formats) + + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + if not found: + # Broaden the search a little bit + found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) + if not found: + # Broaden the findall a little bit: JWPlayer JS loader + found = filter_video(re.findall( + r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + if not found: + # Flow player + found = filter_video(re.findall(r'''(?xs) + flowplayer\("[^"]+",\s* + \{[^}]+?\}\s*, + \s*\{[^}]+? 
["']?clip["']?\s*:\s*\{\s* + ["']?url["']?\s*:\s*["']([^"']+)["'] + ''', webpage)) + if not found: + # Cinerama player + found = re.findall( + r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) + if not found: + # Try to find twitter cards info + # twitter:player:stream should be checked before twitter:player since + # it is expected to contain a raw stream (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) + found = filter_video(re.findall( + r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) + if not found: + # We look for Open Graph info: + # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) + m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) + # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: + if m_video_type is not None: + found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage)) + if not found: + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' + found = re.search( + r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' + r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, + webpage) + if not found: + # Look also in Refresh HTTP header + refresh_header = head_response.headers.get('Refresh') + if refresh_header: + # In python 2 response HTTP headers are bytestrings + if sys.version_info < (3, 0) and isinstance(refresh_header, str): + refresh_header = refresh_header.decode('iso-8859-1') + found = re.search(REDIRECT_REGEX, refresh_header) + if found: + new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) + if new_url != url: + self.report_following_redirect(new_url) + return { + '_type': 'url', + 'url': new_url, + } + else: + found = None + + if not found: + # twitter:player is a https URL to iframe player that may or may not + # be supported by yt-dlp thus this is checked the very last (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) + embed_url = self._html_search_meta('twitter:player', webpage, default=None) + if embed_url and embed_url != url: + return self.url_result(embed_url) + + if not found: + raise UnsupportedError(url) + + entries = [] + for video_url in orderedSet(found): + video_url = unescapeHTML(video_url) + video_url = video_url.replace('\\/', '/') + video_url = compat_urlparse.urljoin(url, video_url) + video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) + + # Sometimes, jwplayer extraction will result in a YouTube URL + if YoutubeIE.suitable(video_url): + entries.append(self.url_result(video_url, 'Youtube')) + continue + + # here's a fun little line of code for you: + video_id = os.path.splitext(video_id)[0] + + entry_info_dict = { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'age_limit': age_limit, + } + + if RtmpIE.suitable(video_url): + entry_info_dict.update({ + '_type': 'url_transparent', + 'ie_key': RtmpIE.ie_key(), + 'url': video_url, + }) + entries.append(entry_info_dict) + continue + + ext = determine_ext(video_url) + if ext == 'smil': + entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} + elif ext == 'xspf': + return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) + elif ext == 'm3u8': + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 
ext='mp4') + elif ext == 'mpd': + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id) + elif ext == 'f4m': + entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) + elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: + # Just matching .ism/manifest is not enough to be reliably sure + # whether it's actually an ISM manifest or some other streaming + # manifest since there are various streaming URL formats + # possible (see [1]) as well as some other shenanigans like + # .smil/manifest URLs that actually serve an ISM (see [2]) and + # so on. + # Thus the most reasonable way to solve this is to delegate + # to generic extractor in order to look into the contents of + # the manifest itself. + # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats + # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest + entry_info_dict = self.url_result( + smuggle_url(video_url, {'to_generic': True}), + GenericIE.ie_key()) + else: + entry_info_dict['url'] = video_url + + if entry_info_dict.get('formats'): + self._sort_formats(entry_info_dict['formats']) + + entries.append(entry_info_dict) + + if len(entries) == 1: + return entries[0] + else: + for num, e in enumerate(entries, start=1): + # 'url' results don't have a title + if e.get('title') is not None: + e['title'] = '%s (%d)' % (e['title'], num) + return { + '_type': 'playlist', + 'entries': entries, + } diff --git a/yt_dlp/extractor/gettr.py b/yt_dlp/extractor/gettr.py new file mode 100644 index 000000000..aa50b2f35 --- /dev/null +++ b/yt_dlp/extractor/gettr.py @@ -0,0 +1,110 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + dict_get, + float_or_none, + int_or_none, + remove_end, + str_or_none, + try_get, + url_or_none, + urljoin, +) + + +class GettrIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?gettr\.com/post/(?P<id>[a-z0-9]+)' + _MEDIA_BASE_URL = 'https://media.gettr.com/' + + _TESTS = [{ + 'url': 'https://www.gettr.com/post/pcf6uv838f', + 'info_dict': { + 'id': 'pcf6uv838f', + 'title': 'md5:9086a646bbd06c41c4fe8e52b3c93454', + 'description': 'md5:be0577f1e4caadc06de4a002da2bf287', + 'ext': 'mp4', + 'uploader': 'EpochTV', + 'uploader_id': 'epochtv', + 'thumbnail': r're:^https?://.+/out\.jpg', + 'timestamp': 1632782451058, + 'duration': 58.5585, + } + }, { + 'url': 'https://gettr.com/post/p4iahp', + 'info_dict': { + 'id': 'p4iahp', + 'title': 'md5:b03c07883db6fbc1aab88877a6c3b149', + 'description': 'md5:741b7419d991c403196ed2ea7749a39d', + 'ext': 'mp4', + 'uploader': 'Neues Forum Freiheit', + 'uploader_id': 'nf_freiheit', + 'thumbnail': r're:^https?://.+/out\.jpg', + 'timestamp': 1626594455017, + 'duration': 23, + } + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + webpage = self._download_webpage(url, post_id) + + api_data = self._download_json( + 'https://api.gettr.com/u/post/%s?incl="poststats|userinfo"' % post_id, post_id) + + post_data = try_get(api_data, lambda x: x['result']['data']) + user_data = try_get(api_data, lambda x: x['result']['aux']['uinf'][post_data['uid']]) or {} + + if post_data.get('nfound'): + raise ExtractorError(post_data.get('txt'), expected=True) + + title = description = str_or_none( + post_data.get('txt') or self._og_search_description(webpage)) + + uploader = str_or_none( + 
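+ # prefer the nickname from the API payload; otherwise fall back to
+ # the page's og:title, which (assumption) ends in ' on GETTR'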
user_data.get('nickname') + or remove_end(self._og_search_title(webpage), ' on GETTR')) + if uploader: + title = '%s - %s' % (uploader, title) + + if not dict_get(post_data, ['vid', 'ovid']): + raise ExtractorError('There\'s no video in this post.') + + vid = post_data.get('vid') + ovid = post_data.get('ovid') + + formats = self._extract_m3u8_formats( + urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') if vid else [] + + if ovid: + formats.append({ + 'url': urljoin(self._MEDIA_BASE_URL, ovid), + 'format_id': 'ovid', + 'ext': 'mp4', + 'width': int_or_none(post_data.get('vid_wid')), + 'height': int_or_none(post_data.get('vid_hgt')), + 'source_preference': 1, + 'quality': 1, + }) + + self._sort_formats(formats) + + return { + 'id': post_id, + 'title': title, + 'description': description, + 'thumbnail': url_or_none( + urljoin(self._MEDIA_BASE_URL, post_data.get('main')) + or self._og_search_thumbnail(webpage)), + 'timestamp': int_or_none(post_data.get('cdate')), + 'uploader_id': str_or_none( + dict_get(user_data, ['_id', 'username']) + or post_data.get('uid')), + 'uploader': uploader, + 'formats': formats, + 'duration': float_or_none(post_data.get('vid_dur')), + 'tags': post_data.get('htgs'), + } diff --git a/youtube_dl/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py index 18a30fe67..18a30fe67 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/yt_dlp/extractor/gfycat.py diff --git a/yt_dlp/extractor/giantbomb.py b/yt_dlp/extractor/giantbomb.py new file mode 100644 index 000000000..1920923fc --- /dev/null +++ b/yt_dlp/extractor/giantbomb.py @@ -0,0 +1,89 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + qualities, + unescapeHTML, +) + + +class GiantBombIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/(?:videos|shows)/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)' + _TESTS = [{ + 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/', + 'md5': '132f5a803e7e0ab0e274d84bda1e77ae', + 'info_dict': { + 'id': '2300-9782', + 'display_id': 'quick-look-destiny-the-dark-below', + 'ext': 'mp4', + 'title': 'Quick Look: Destiny: The Dark Below', + 'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24', + 'duration': 2399, + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.giantbomb.com/shows/ben-stranding/2970-20212', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + video = json.loads(unescapeHTML(self._search_regex( + r'data-video="([^"]+)"', webpage, 'data-video'))) + + duration = int_or_none(video.get('lengthSeconds')) + + quality = qualities([ + 'f4m_low', 'progressive_low', 'f4m_high', + 'progressive_high', 'f4m_hd', 'progressive_hd']) + + formats = [] + for format_id, video_url in video['videoStreams'].items(): + if format_id == 'f4m_stream': + continue + ext = determine_ext(video_url) + if ext == 'f4m': + f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id) + if f4m_formats: + f4m_formats[0]['quality'] = quality(format_id) + formats.extend(f4m_formats) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, display_id, 
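+ # fatal=False: a broken HLS manifest should not abort extraction,
+ # as f4m or progressive streams may still be available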
ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'quality': quality(format_id), + }) + + if not formats: + youtube_id = video.get('youtubeID') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/giga.py b/yt_dlp/extractor/giga.py index 5a9992a27..5a9992a27 100644 --- a/youtube_dl/extractor/giga.py +++ b/yt_dlp/extractor/giga.py diff --git a/youtube_dl/extractor/gigya.py b/yt_dlp/extractor/gigya.py index 412178492..412178492 100644 --- a/youtube_dl/extractor/gigya.py +++ b/yt_dlp/extractor/gigya.py diff --git a/youtube_dl/extractor/glide.py b/yt_dlp/extractor/glide.py index d94dfbf09..d94dfbf09 100644 --- a/youtube_dl/extractor/glide.py +++ b/yt_dlp/extractor/glide.py diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py new file mode 100644 index 000000000..a3f024157 --- /dev/null +++ b/yt_dlp/extractor/globo.py @@ -0,0 +1,213 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import hashlib +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + ExtractorError, + float_or_none, + orderedSet, + str_or_none, + try_get, +) + + +class GloboIE(InfoExtractor): + _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' + _NETRC_MACHINE = 'globo' + _TESTS = [{ + 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', + 'info_dict': { + 'id': '3607726', + 'ext': 'mp4', + 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', + 'duration': 103.204, + 'uploader': 'G1', + 'uploader_id': '2015', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://globoplay.globo.com/v/4581987/', + 'info_dict': { + 'id': '4581987', + 'ext': 'mp4', + 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', + 'duration': 137.973, + 'uploader': 'Rede Globo', + 'uploader_id': '196', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', + 'only_matching': True, + }, { + 'url': 'http://globosatplay.globo.com/globonews/v/4472924/', + 'only_matching': True, + }, { + 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/', + 'only_matching': True, + }, { + 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', + 'only_matching': True, + }, { + 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', + 'only_matching': True, + }, { + 'url': 'globo:3607726', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://api.globovideos.com/videos/%s/playlist' % video_id, + video_id)['videos'][0] + if not self.get_param('allow_unplayable_formats') and video.get('encrypted') is True: + self.report_drm(video_id) + + title = video['title'] + + formats = [] + security = self._download_json( + 'https://playback.video.globo.com/v1/video-session', 
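+ # the video-session endpoint returns the tokenized 'security hash'
+ # that is split apart and re-signed below (an MD5 over the received
+ # hash, a padded timestamp and the 0xAC10FD salt)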
video_id, 'Downloading security hash for %s' % video_id, + headers={'content-type': 'application/json'}, data=json.dumps({ + "player_type": "desktop", + "video_id": video_id, + "quality": "max", + "content_protection": "widevine", + "vsid": "581b986b-4c40-71f0-5a58-803e579d5fa2", + "tz": "-3.0:00" + }).encode()) + + security_hash = security['source']['token'] + if not security_hash: + message = security.get('message') + if message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, message), expected=True) + + hash_code = security_hash[:2] + padding = '%010d' % random.randint(1, 10000000000) + if hash_code in ('04', '14'): + received_time = security_hash[3:13] + received_md5 = security_hash[24:] + hash_prefix = security_hash[:23] + elif hash_code in ('02', '12', '03', '13'): + received_time = security_hash[2:12] + received_md5 = security_hash[22:] + padding += '1' + hash_prefix = '05' + security_hash[:22] + + padded_sign_time = compat_str(int(received_time) + 86400) + padding + md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() + signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') + signed_hash = hash_prefix + padded_sign_time + signed_md5 + source = security['source']['url_parts'] + resource_url = source['scheme'] + '://' + source['domain'] + source['path'] + signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') + + formats.extend(self._extract_m3u8_formats( + signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + subtitles = {} + for resource in video['resources']: + if resource.get('type') == 'subtitle': + subtitles.setdefault(resource.get('language') or 'por', []).append({ + 'url': resource.get('url'), + }) + subs = try_get(security, lambda x: x['source']['subtitles'], expected_type=dict) or {} + for sub_lang, sub_url in subs.items(): + if sub_url: + subtitles.setdefault(sub_lang or 'por', []).append({ + 'url': sub_url, + }) + subs = try_get(security, lambda x: x['source']['subtitles_webvtt'], expected_type=dict) or {} + for sub_lang, sub_url in subs.items(): + if sub_url: + subtitles.setdefault(sub_lang or 'por', []).append({ + 'url': sub_url, + }) + + duration = float_or_none(video.get('duration'), 1000) + uploader = video.get('channel') + uploader_id = str_or_none(video.get('channel_id')) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + } + + +class GloboArticleIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?' 
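+ # each pattern below captures the 7+ digit numeric video id from the
+ # article markup; _real_extract collects all matches and deduplicates
+ # them with orderedSet before building the playlist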
+ + _VIDEOID_REGEXES = [ + r'\bdata-video-id=["\'](\d{7,})', + r'\bdata-player-videosids=["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\']?(\d{7,})', + r'\bdata-id=["\'](\d{7,})', + r'<div[^>]+\bid=["\'](\d{7,})', + ] + + _TESTS = [{ + 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', + 'info_dict': { + 'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes', + 'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões', + 'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12', + }, + 'playlist_count': 1, + }, { + 'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html', + 'info_dict': { + 'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato', + 'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF", + 'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c', + }, + 'playlist_count': 6, + }, { + 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', + 'only_matching': True, + }, { + 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', + 'only_matching': True, + }, { + 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_ids = [] + for video_regex in self._VIDEOID_REGEXES: + video_ids.extend(re.findall(video_regex, webpage)) + entries = [ + self.url_result('globo:%s' % video_id, GloboIE.ie_key()) + for video_id in orderedSet(video_ids)] + title = self._og_search_title(webpage, fatal=False) + description = self._html_search_meta('description', webpage) + return self.playlist_result(entries, display_id, title, description) diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py new file mode 100644 index 000000000..2ccc6df21 --- /dev/null +++ b/yt_dlp/extractor/go.py @@ -0,0 +1,320 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + determine_ext, + parse_age_limit, + remove_start, + remove_end, + try_get, + urlencode_postdata, + ExtractorError, +) + + +class GoIE(AdobePassIE): + _SITE_INFO = { + 'abc': { + 'brand': '001', + 'requestor_id': 'ABC', + }, + 'freeform': { + 'brand': '002', + 'requestor_id': 'ABCFamily', + }, + 'watchdisneychannel': { + 'brand': '004', + 'resource_id': 'Disney', + }, + 'watchdisneyjunior': { + 'brand': '008', + 'resource_id': 'DisneyJunior', + }, + 'watchdisneyxd': { + 'brand': '009', + 'resource_id': 'DisneyXD', + }, + 'disneynow': { + 'brand': '011', + 'resource_id': 'Disney', + }, + 'fxnow.fxnetworks': { + 'brand': '025', + 'requestor_id': 'dtci', + }, + } + _VALID_URL = r'''(?x) + https?:// + (?P<sub_domain> + (?:%s\.)?go|fxnow\.fxnetworks| + (?:www\.)?(?:abc|freeform|disneynow) + )\.com/ + (?: + (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)| + (?:[^/]+/)*(?P<display_id>[^/?\#]+) + ) + ''' % r'\.|'.join(list(_SITE_INFO.keys())) + _TESTS = [{ + 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', + 'info_dict': { + 
'id': 'VDKA3807643', + 'ext': 'mp4', + 'title': 'The Traitor in the White House', + 'description': 'md5:05b009d2d145a1e85d25111bd37222e8', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'This content is no longer available.', + }, { + 'url': 'http://watchdisneyxd.go.com/doraemon', + 'info_dict': { + 'title': 'Doraemon', + 'id': 'SH55574025', + }, + 'playlist_mincount': 51, + }, { + 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', + 'info_dict': { + 'id': 'VDKA3609139', + 'ext': 'mp4', + 'title': 'This Guilty Blood', + 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', + 'age_limit': 14, + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', + 'info_dict': { + 'id': 'VDKA13435179', + 'ext': 'mp4', + 'title': 'The Bet', + 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', + 'age_limit': 14, + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', + 'info_dict': { + 'id': 'VDKA12782841', + 'ext': 'mp4', + 'title': 'First Look: Better Things - Season 2', + 'description': 'md5:fa73584a95761c605d9d54904e35b407', + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', + 'only_matching': True, + }, { + 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', + 'only_matching': True, + }, { + # brand 004 + 'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915', + 'only_matching': True, + }, { + # brand 008 + 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', + 'only_matching': True, + }, { + 'url': 'https://disneynow.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', + 'only_matching': True, + }, { + 'url': 'https://www.freeform.com/shows/cruel-summer/episode-guide/season-01/01-happy-birthday-jeanette-turner', + 'only_matching': True, + }] + + def _extract_videos(self, brand, video_id='-1', show_id='-1'): + display_id = video_id if video_id != '-1' else show_id + return self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), + display_id)['video'] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + sub_domain = remove_start(remove_end(mobj.group('sub_domain') or '', '.go'), 'www.') + video_id, display_id = mobj.group('id', 'display_id') + site_info = self._SITE_INFO.get(sub_domain, {}) + brand = site_info.get('brand') + if not video_id or not site_info: + webpage = self._download_webpage(url, display_id or video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # 
https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) + if not site_info: + brand = self._search_regex( + (r'data-brand=\s*["\']\s*(\d+)', + r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand', + default='004') + site_info = next( + si for _, si in self._SITE_INFO.items() + if si.get('brand') == brand) + if not video_id: + # Show extraction works for Disney, DisneyJunior and DisneyXD; + # ABC and Freeform have a different layout + show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') + videos = self._extract_videos(brand, show_id=show_id) + show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) + entries = [] + for video in videos: + entries.append(self.url_result( + video['url'], 'Go', video.get('id'), video.get('title'))) + entries.reverse() + return self.playlist_result(entries, show_id, show_title) + video_data = self._extract_videos(brand, video_id)[0] + video_id = video_data['id'] + title = video_data['title'] + + formats = [] + for asset in video_data.get('assets', {}).get('asset', []): + asset_url = asset.get('value') + if not asset_url: + continue + format_id = asset.get('format') + ext = determine_ext(asset_url) + if ext == 'm3u8': + video_type = video_data.get('type') + data = { + 'video_id': video_data['id'], + 'video_type': video_type, + 'brand': brand, + 'device': '001', + } + if video_data.get('accesslevel') == '1': + requestor_id = site_info.get('requestor_id', 'DisneyChannels') + resource = site_info.get('resource_id') or self._get_mvpd_resource( + requestor_id, title, video_id, None) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + data.update({ + 'token': auth, + 'token_type': 'ap', + 'adobe_requestor_id': requestor_id, + }) + else: + self._initialize_geo_bypass({'countries': ['US']}) + entitlement = self._download_json( + 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', + video_id, data=urlencode_postdata(data)) + errors = entitlement.get('errors', {}).get('errors', []) + if errors: + for error in errors: + if error.get('code') == 1002: + self.raise_geo_restricted( + error['message'], countries=['US']) + error_message = ', '.join([error['message'] for error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + asset_url += '?' 
+ entitlement['uplynkData']['sessionKey'] + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) + else: + f = { + 'format_id': format_id, + 'url': asset_url, + 'ext': ext, + } + if re.search(r'(?:/mp4/source/|_source\.mp4)', asset_url): + f.update({ + 'format_id': ('%s-' % format_id if format_id else '') + 'SOURCE', + 'quality': 1, + }) + else: + mobj = re.search(r'/(\d+)x(\d+)/', asset_url) + if mobj: + height = int(mobj.group(2)) + f.update({ + 'format_id': ('%s-' % format_id if format_id else '') + '%dP' % height, + 'width': int(mobj.group(1)), + 'height': height, + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + for cc in video_data.get('closedcaption', {}).get('src', []): + cc_url = cc.get('value') + if not cc_url: + continue + ext = determine_ext(cc_url) + if ext == 'xml': + ext = 'ttml' + subtitles.setdefault(cc.get('lang'), []).append({ + 'url': cc_url, + 'ext': ext, + }) + + thumbnails = [] + for thumbnail in video_data.get('thumbnails', {}).get('thumbnail', []): + thumbnail_url = thumbnail.get('value') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('longdescription') or video_data.get('description'), + 'duration': int_or_none(video_data.get('duration', {}).get('value'), 1000), + 'age_limit': parse_age_limit(video_data.get('tvrating', {}).get('rating')), + 'episode_number': int_or_none(video_data.get('episodenumber')), + 'series': video_data.get('show', {}).get('title'), + 'season_number': int_or_none(video_data.get('season', {}).get('num')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/godtube.py b/yt_dlp/extractor/godtube.py new file mode 100644 index 000000000..96e68b4d2 --- /dev/null +++ b/yt_dlp/extractor/godtube.py @@ -0,0 +1,57 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, +) + + +class GodTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?godtube\.com/watch/\?v=(?P<id>[\da-zA-Z]+)' + _TESTS = [ + { + 'url': 'https://www.godtube.com/watch/?v=0C0CNNNU', + 'md5': '77108c1e4ab58f48031101a1a2119789', + 'info_dict': { + 'id': '0C0CNNNU', + 'ext': 'mp4', + 'title': 'Woman at the well.', + 'duration': 159, + 'timestamp': 1205712000, + 'uploader': 'beverlybmusic', + 'upload_date': '20080317', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + config = self._download_xml( + 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), + video_id, 'Downloading player config XML') + + video_url = config.find('file').text + uploader = config.find('author').text + timestamp = parse_iso8601(config.find('date').text) + duration = parse_duration(config.find('duration').text) + thumbnail = config.find('image').text + + media = self._download_xml( + 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') + + title = media.find('title').text + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader': uploader, + 'duration': duration, + } diff --git a/youtube_dl/extractor/golem.py 
b/yt_dlp/extractor/golem.py index 47a068e74..47a068e74 100644 --- a/youtube_dl/extractor/golem.py +++ b/yt_dlp/extractor/golem.py diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py new file mode 100644 index 000000000..7b5bf280f --- /dev/null +++ b/yt_dlp/extractor/googledrive.py @@ -0,0 +1,280 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + determine_ext, + ExtractorError, + get_element_by_class, + int_or_none, + lowercase_escape, + try_get, + update_url_query, +) + + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:docs|drive)\.google\.com/ + (?: + (?:uc|open)\?.*?id=| + file/d/ + )| + video\.google\.com/get_player\?.*?docid= + ) + (?P<id>[a-zA-Z0-9_-]{28,}) + ''' + _TESTS = [{ + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', + 'md5': '5c602afbbf2c1db91831f5d82f678554', + 'info_dict': { + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', + 'ext': 'mp4', + 'title': 'Big Buck Bunny.mp4', + 'duration': 45, + } + }, { + # video can't be watched anonymously due to view count limit reached, + # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) + 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', + 'only_matching': True, + }, { + # video id is longer than 28 characters + 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, + }] + _FORMATS_EXT = { + '5': 'flv', + '6': 'flv', + '13': '3gp', + '17': '3gp', + '18': 'mp4', + '22': 'mp4', + '34': 'flv', + '35': 'flv', + '36': '3gp', + '37': 'mp4', + '38': 'mp4', + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + '59': 'mp4', + } + _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' + _CAPTIONS_ENTRY_TAG = { + 'subtitles': 'track', + 'automatic_captions': 'target', + } + _caption_formats_ext = [] + _captions_xml = None + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _download_subtitles_xml(self, video_id, subtitles_id, hl): + if self._captions_xml: + return + self._captions_xml = self._download_xml( + self._BASE_URL_CAPTIONS, video_id, query={ + 'id': video_id, + 'vid': subtitles_id, + 'hl': hl, + 'v': video_id, + 'type': 'list', + 'tlangs': '1', + 'fmts': '1', + 'vssids': '1', + }, note='Downloading subtitles XML', + errnote='Unable to download subtitles XML', fatal=False) + if self._captions_xml: + for f in self._captions_xml.findall('format'): + if f.attrib.get('fmt_code') and not f.attrib.get('default'): + self._caption_formats_ext.append(f.attrib['fmt_code']) + + def _get_captions_by_type(self, video_id, subtitles_id, caption_type, + origin_lang_code=None): + if not subtitles_id or not caption_type: + return + captions = {} + for caption_entry in self._captions_xml.findall( + self._CAPTIONS_ENTRY_TAG[caption_type]): + caption_lang_code = caption_entry.attrib.get('lang_code') + if not caption_lang_code: + continue + caption_format_data = [] + for caption_format in 
self._caption_formats_ext: + query = { + 'vid': subtitles_id, + 'v': video_id, + 'fmt': caption_format, + 'lang': (caption_lang_code if origin_lang_code is None + else origin_lang_code), + 'type': 'track', + 'name': '', + 'kind': '', + } + if origin_lang_code is not None: + query.update({'tlang': caption_lang_code}) + caption_format_data.append({ + 'url': update_url_query(self._BASE_URL_CAPTIONS, query), + 'ext': caption_format, + }) + captions[caption_lang_code] = caption_format_data + return captions + + def _get_subtitles(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + return self._get_captions_by_type(video_id, subtitles_id, 'subtitles') + + def _get_automatic_captions(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + track = self._captions_xml.find('track') + if track is None: + return + origin_lang_code = track.attrib.get('lang_code') + if not origin_lang_code: + return + return self._get_captions_by_type( + video_id, subtitles_id, 'automatic_captions', origin_lang_code) + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = compat_parse_qs(self._download_webpage( + 'https://drive.google.com/get_video_info', + video_id, query={'docid': video_id})) + + def get_value(key): + return try_get(video_info, lambda x: x[key][0]) + + reason = get_value('reason') + title = get_value('title') + if not title and reason: + raise ExtractorError(reason, expected=True) + + formats = [] + fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') + fmt_list = (get_value('fmt_list') or '').split(',') + if fmt_stream_map and fmt_list: + resolutions = {} + for fmt in fmt_list: + mobj = re.search( + r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt) + if mobj: + resolutions[mobj.group('format_id')] = ( + int(mobj.group('width')), int(mobj.group('height'))) + + for fmt_stream in fmt_stream_map: + fmt_stream_split = fmt_stream.split('|') + if len(fmt_stream_split) < 2: + continue + format_id, format_url = fmt_stream_split[:2] + f = { + 'url': lowercase_escape(format_url), + 'format_id': format_id, + 'ext': self._FORMATS_EXT[format_id], + } + resolution = resolutions.get(format_id) + if resolution: + f.update({ + 'width': resolution[0], + 'height': resolution[1], + }) + formats.append(f) + + source_url = update_url_query( + 'https://drive.google.com/uc', { + 'id': video_id, + 'export': 'download', + }) + + def request_source_file(source_url, kind): + return self._request_webpage( + source_url, video_id, note='Requesting %s file' % kind, + errnote='Unable to request %s file' % kind, fatal=False) + urlh = request_source_file(source_url, 'source') + if urlh: + def add_source_format(urlh): + formats.append({ + # Use redirect URLs as download URLs in order to calculate + # correct cookies in _calc_cookies. + # Using original URLs may result in redirect loop due to + # google.com's cookies mistakenly used for googleusercontent.com + # redirect URLs (see #23919). 
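+ # urlh.geturl() is the post-redirect URL; a Content-Disposition
+ # response header (checked below) is taken as the signal that Drive
+ # served the file directly rather than a confirmation page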
+ 'url': urlh.geturl(), + 'ext': determine_ext(title, 'mp4').lower(), + 'format_id': 'source', + 'quality': 1, + }) + if urlh.headers.get('Content-Disposition'): + add_source_format(urlh) + else: + confirmation_webpage = self._webpage_read_content( + urlh, url, video_id, note='Downloading confirmation page', + errnote='Unable to confirm download', fatal=False) + if confirmation_webpage: + confirm = self._search_regex( + r'confirm=([^&"\']+)', confirmation_webpage, + 'confirmation code', default=None) + if confirm: + confirmed_source_url = update_url_query(source_url, { + 'confirm': confirm, + }) + urlh = request_source_file(confirmed_source_url, 'confirmed source') + if urlh and urlh.headers.get('Content-Disposition'): + add_source_format(urlh) + else: + self.report_warning( + get_element_by_class('uc-error-subcaption', confirmation_webpage) + or get_element_by_class('uc-error-caption', confirmation_webpage) + or 'unable to extract confirmation code') + + if not formats and reason: + self.raise_no_formats(reason, expected=True) + + self._sort_formats(formats) + + hl = get_value('hl') + subtitles_id = None + ttsurl = get_value('ttsurl') + if ttsurl: + # the video Id for subtitles will be the last value in the ttsurl + # query string + subtitles_id = ttsurl.encode('utf-8').decode( + 'unicode_escape').split('=')[-1] + + self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, + 'duration': int_or_none(get_value('length_seconds')), + 'formats': formats, + 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), + 'automatic_captions': self.extract_automatic_captions( + video_id, subtitles_id, hl), + } diff --git a/yt_dlp/extractor/googlepodcasts.py b/yt_dlp/extractor/googlepodcasts.py new file mode 100644 index 000000000..25631e213 --- /dev/null +++ b/yt_dlp/extractor/googlepodcasts.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + try_get, + urlencode_postdata, +) + + +class GooglePodcastsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' + + def _batch_execute(self, func_id, video_id, params): + return json.loads(self._download_json( + 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', + video_id, data=urlencode_postdata({ + 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), + }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) + + def _extract_episode(self, episode): + return { + 'id': episode[4][3], + 'title': episode[8], + 'url': clean_podcast_url(episode[13]), + 'thumbnail': episode[2], + 'description': episode[9], + 'creator': try_get(episode, lambda x: x[14]), + 'timestamp': int_or_none(episode[11]), + 'duration': int_or_none(episode[12]), + 'series': episode[1], + } + + +class GooglePodcastsIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 
with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = self._match_valid_url(url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) diff --git a/yt_dlp/extractor/googlesearch.py b/yt_dlp/extractor/googlesearch.py new file mode 100644 index 000000000..f605c0c35 --- /dev/null +++ b/yt_dlp/extractor/googlesearch.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +import itertools +import re + +from .common import SearchInfoExtractor + + +class GoogleSearchIE(SearchInfoExtractor): + IE_DESC = 'Google Video search' + _MAX_RESULTS = 1000 + IE_NAME = 'video.google:search' + _SEARCH_KEY = 'gvsearch' + _WORKING = False + _TEST = { + 'url': 'gvsearch15:python language', + 'info_dict': { + 'id': 'python language', + 'title': 'python language', + }, + 'playlist_count': 15, + } + + def _search_results(self, query): + for pagenum in itertools.count(): + webpage = self._download_webpage( + 'http://www.google.com/search', + 'gvsearch:' + query, + note='Downloading result page %s' % (pagenum + 1), + query={ + 'tbm': 'vid', + 'q': query, + 'start': pagenum * 10, + 'hl': 'en', + }) + + for hit_idx, mobj in enumerate(re.finditer( + r'<h3 class="r"><a href="([^"]+)"', webpage)): + if re.search(f'id="vidthumb{hit_idx + 1}"', webpage): + yield self.url_result(mobj.group(1)) + + if not re.search(r'id="pnnext"', webpage): + return diff --git a/yt_dlp/extractor/gopro.py b/yt_dlp/extractor/gopro.py new file mode 100644 index 000000000..10cc1aec1 --- /dev/null +++ b/yt_dlp/extractor/gopro.py @@ -0,0 +1,110 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class GoProIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?gopro\.com/v/(?P<id>[A-Za-z0-9]+)' + + _TESTS = [{ + 'url': 'https://gopro.com/v/ZNVvED8QDzR5V', + 'info_dict': { + 'id': 'ZNVvED8QDzR5V', + 'title': 'My GoPro Adventure - 9/19/21', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + 'timestamp': 1632072947, + 'upload_date': '20210919', + 'uploader_id': 'fireydive30018', + 'duration': 396062, + } + }, { + 'url': 'https://gopro.com/v/KRm6Vgp2peg4e', + 'info_dict': { + 'id': 'KRm6Vgp2peg4e', + 'title': 'じゃがいも カリカリ 
オーブン焼き', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + 'timestamp': 1607231125, + 'upload_date': '20201206', + 'uploader_id': 'dc9bcb8b-47d2-47c6-afbc-4c48f9a3769e', + 'duration': 45187, + 'track': 'The Sky Machine', + } + }, { + 'url': 'https://gopro.com/v/kVrK9wlJvBMwn', + 'info_dict': { + 'id': 'kVrK9wlJvBMwn', + 'title': 'DARKNESS', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + 'timestamp': 1594183735, + 'upload_date': '20200708', + 'uploader_id': '闇夜乃皇帝', + 'duration': 313075, + 'track': 'Battery (Live)', + 'artist': 'Metallica', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + metadata = self._parse_json( + self._html_search_regex(r'window\.__reflectData\s*=\s*([^;]+)', webpage, 'metadata'), video_id) + + video_info = metadata['collectionMedia'][0] + media_data = self._download_json( + 'https://api.gopro.com/media/%s/download' % video_info['id'], video_id) + + formats = [] + for fmt in try_get(media_data, lambda x: x['_embedded']['variations']) or []: + format_url = url_or_none(fmt.get('url')) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'format_id': str_or_none(fmt.get('quality')), + 'format_note': str_or_none(fmt.get('label')), + 'ext': str_or_none(fmt.get('type')), + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + }) + + self._sort_formats(formats) + + title = str_or_none( + try_get(metadata, lambda x: x['collection']['title']) + or self._html_search_meta(['og:title', 'twitter:title'], webpage) + or remove_end(self._html_search_regex( + r'<title[^>]*>([^<]+)</title>', webpage, 'title', fatal=False), ' | GoPro')) + if title: + title = title.replace('\n', ' ') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': url_or_none( + self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'timestamp': unified_timestamp( + try_get(metadata, lambda x: x['collection']['created_at'])), + 'uploader_id': str_or_none( + try_get(metadata, lambda x: x['account']['nickname'])), + 'duration': int_or_none( + video_info.get('source_duration')), + 'artist': str_or_none( + video_info.get('music_track_artist')), + 'track': str_or_none( + video_info.get('music_track_name')), + } diff --git a/youtube_dl/extractor/goshgay.py b/yt_dlp/extractor/goshgay.py index 377981d3e..377981d3e 100644 --- a/youtube_dl/extractor/goshgay.py +++ b/yt_dlp/extractor/goshgay.py diff --git a/yt_dlp/extractor/gotostage.py b/yt_dlp/extractor/gotostage.py new file mode 100644 index 000000000..6aa96106a --- /dev/null +++ b/yt_dlp/extractor/gotostage.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + url_or_none +) + +import json + + +class GoToStageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gotostage\.com/channel/[a-z0-9]+/recording/(?P<id>[a-z0-9]+)/watch' + _TESTS = [{ + 'url': 'https://www.gotostage.com/channel/8901680603948959494/recording/60bb55548d434f21b9ce4f0e225c4895/watch', + 'md5': 'ca72ce990cdcd7a2bd152f7217e319a2', + 'info_dict': { + 'id': '60bb55548d434f21b9ce4f0e225c4895', + 'ext': 'mp4', + 'title': 'What is GoToStage?', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 93.924711 + } + }, { + 'url': 'https://www.gotostage.com/channel/bacc3d3535b34bafacc3f4ef8d4df78a/recording/831e74cd3e0042be96defba627b6f676/watch?source=HOMEPAGE', + 'only_matching': True, + }] + + def 
_real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json( + 'https://api.gotostage.com/contents?ids=%s' % video_id, + video_id, + note='Downloading video metadata', + errnote='Unable to download video metadata')[0] + + registration_data = { + 'product': metadata['product'], + 'resourceType': metadata['contentType'], + 'productReferenceKey': metadata['productRefKey'], + 'firstName': 'foo', + 'lastName': 'bar', + 'email': 'foobar@example.com' + } + + registration_response = self._download_json( + 'https://api-registrations.logmeininc.com/registrations', + video_id, + data=json.dumps(registration_data).encode(), + expected_status=409, + headers={'Content-Type': 'application/json'}, + note='Register user', + errnote='Unable to register user') + + content_response = self._download_json( + 'https://api.gotostage.com/contents/%s/asset' % video_id, + video_id, + headers={'x-registrantkey': registration_response['registrationKey']}, + note='Get download url', + errnote='Unable to get download url') + + return { + 'id': video_id, + 'title': try_get(metadata, lambda x: x['title'], compat_str), + 'url': try_get(content_response, lambda x: x['cdnLocation'], compat_str), + 'ext': 'mp4', + 'thumbnail': url_or_none(try_get(metadata, lambda x: x['thumbnail']['location'])), + 'duration': try_get(metadata, lambda x: x['duration'], float), + 'categories': [try_get(metadata, lambda x: x['category'], compat_str)], + 'is_live': False + } diff --git a/youtube_dl/extractor/gputechconf.py b/yt_dlp/extractor/gputechconf.py index 73dc62c49..73dc62c49 100644 --- a/youtube_dl/extractor/gputechconf.py +++ b/yt_dlp/extractor/gputechconf.py diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py new file mode 100644 index 000000000..a7792a5e0 --- /dev/null +++ b/yt_dlp/extractor/gronkh.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class GronkhIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?gronkh\.tv/stream/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://gronkh.tv/stream/536', + 'info_dict': { + 'id': '536', + 'ext': 'mp4', + 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', + 'view_count': 19491, + 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', + 'upload_date': '20211001' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://api.gronkh.tv/v1/video/info?episode={id}', id) + m3u8_url = self._download_json(f'https://api.gronkh.tv/v1/video/playlist?episode={id}', id)['playlist_url'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + if data_json.get('vtt_url'): + subtitles.setdefault('en', []).append({ + 'url': data_json['vtt_url'], + 'ext': 'vtt', + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'view_count': data_json.get('views'), + 'thumbnail': data_json.get('preview_url'), + 'upload_date': unified_strdate(data_json.get('created_at')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/groupon.py b/yt_dlp/extractor/groupon.py index a6da90931..a6da90931 100644 --- a/youtube_dl/extractor/groupon.py +++ b/yt_dlp/extractor/groupon.py diff --git a/youtube_dl/extractor/hbo.py b/yt_dlp/extractor/hbo.py index 68df748f5..68df748f5 100644 --- a/youtube_dl/extractor/hbo.py +++ 
b/yt_dlp/extractor/hbo.py diff --git a/yt_dlp/extractor/hearthisat.py b/yt_dlp/extractor/hearthisat.py new file mode 100644 index 000000000..a3d6a055f --- /dev/null +++ b/yt_dlp/extractor/hearthisat.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + KNOWN_EXTENSIONS, + str_to_int, +) + + +class HearThisAtIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$' + _PLAYLIST_URL = 'https://hearthis.at/playlist.php' + _TESTS = [{ + 'url': 'https://hearthis.at/moofi/dr-kreep', + 'md5': 'ab6ec33c8fed6556029337c7885eb4e0', + 'info_dict': { + 'id': '150939', + 'ext': 'wav', + 'title': 'Moofi - Dr. Kreep', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1421564134, + 'description': 'md5:1adb0667b01499f9d27e97ddfd53852a', + 'upload_date': '20150118', + 'view_count': int, + 'duration': 71, + 'genre': 'Experimental', + } + }, { + # 'download' link redirects to the original webpage + 'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/', + 'md5': '5980ceb7c461605d30f1f039df160c6e', + 'info_dict': { + 'id': '811296', + 'ext': 'mp3', + 'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!', + 'description': 'md5:ef26815ca8f483272a87b137ff175be2', + 'upload_date': '20160328', + 'timestamp': 1459186146, + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, + 'duration': 4360, + 'genre': 'Dance', + }, + }] + + def _real_extract(self, url): + m = self._match_valid_url(url) + display_id = '{artist:s} - {title:s}'.format(**m.groupdict()) + api_url = url.replace('www.', '').replace('hearthis.at', 'api-v2.hearthis.at') + data_json = self._download_json(api_url, display_id) + track_id = data_json.get('id') + artist_json = data_json.get('user') + title = '{} - {}'.format(artist_json.get('username'), data_json.get('title')) + genre = data_json.get('genre') + description = data_json.get('description') + thumbnail = data_json.get('artwork_url') or data_json.get('thumb') + view_count = str_to_int(data_json.get('playback_count')) + duration = str_to_int(data_json.get('duration')) + timestamp = data_json.get('release_timestamp') + + formats = [] + mp3_url = data_json.get('stream_url') + + if mp3_url: + formats.append({ + 'format_id': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'url': mp3_url, + 'ext': 'mp3', + }) + + if data_json.get('download_url'): + download_url = data_json['download_url'] + ext = determine_ext(data_json['download_filename']) + if ext in KNOWN_EXTENSIONS: + formats.append({ + 'format_id': ext, + 'vcodec': 'none', + 'ext': ext, + 'url': download_url, + 'acodec': ext, + 'quality': 2, # Usually better quality + }) + self._sort_formats(formats) + + return { + 'id': track_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'genre': genre, + } diff --git a/youtube_dl/extractor/heise.py b/yt_dlp/extractor/heise.py index cbe564a3c..cbe564a3c 100644 --- a/youtube_dl/extractor/heise.py +++ b/yt_dlp/extractor/heise.py diff --git a/youtube_dl/extractor/hellporno.py b/yt_dlp/extractor/hellporno.py index fae425103..fae425103 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/yt_dlp/extractor/hellporno.py diff --git a/youtube_dl/extractor/helsinki.py b/yt_dlp/extractor/helsinki.py index 575fb332a..575fb332a 100644 --- 
a/youtube_dl/extractor/helsinki.py +++ b/yt_dlp/extractor/helsinki.py diff --git a/youtube_dl/extractor/hentaistigma.py b/yt_dlp/extractor/hentaistigma.py index 86a93de4d..86a93de4d 100644 --- a/youtube_dl/extractor/hentaistigma.py +++ b/yt_dlp/extractor/hentaistigma.py diff --git a/youtube_dl/extractor/hgtv.py b/yt_dlp/extractor/hgtv.py index a4f332565..a4f332565 100644 --- a/youtube_dl/extractor/hgtv.py +++ b/yt_dlp/extractor/hgtv.py diff --git a/yt_dlp/extractor/hidive.py b/yt_dlp/extractor/hidive.py new file mode 100644 index 000000000..15bd444f9 --- /dev/null +++ b/yt_dlp/extractor/hidive.py @@ -0,0 +1,122 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class HiDiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<id>(?P<title>[^/]+)/(?P<key>[^/?#&]+))'
+    # Using X-Forwarded-For results in a 403 HTTP error for HLS fragments,
+    # so disabling geo bypass completely
+    _GEO_BYPASS = False
+    _NETRC_MACHINE = 'hidive'
+    _LOGIN_URL = 'https://www.hidive.com/account/login'
+
+    _TESTS = [{
+        'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001',
+        'info_dict': {
+            'id': 'the-comic-artist-and-his-assistants/s01e001',
+            'ext': 'mp4',
+            'title': 'the-comic-artist-and-his-assistants/s01e001',
+            'series': 'the-comic-artist-and-his-assistants',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires Authentication',
+    }]
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        webpage = self._download_webpage(self._LOGIN_URL, None)
+        form = self._search_regex(
+            r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>',
+            webpage, 'login form')
+        data = self._hidden_inputs(form)
+        data.update({
+            'Email': email,
+            'Password': password,
+        })
+        self._download_webpage(
+            self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data))
+
+    def _call_api(self, video_id, title, key, data={}, **kwargs):
+        data = {
+            **data,
+            'Title': title,
+            'Key': key,
+            'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783',
+        }
+        return self._download_json(
+            'https://www.hidive.com/play/settings', video_id,
+            data=urlencode_postdata(data), **kwargs) or {}
+
+    def _extract_subtitles_from_rendition(self, rendition, subtitles, parsed_urls):
+        for cc_file in rendition.get('ccFiles', []):
+            cc_url = url_or_none(try_get(cc_file, lambda x: x[2]))
+            # the name is used since we can't distinguish subs with the same language code
+            cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str)
+            if cc_url not in parsed_urls and cc_lang:
+                parsed_urls.add(cc_url)
+                subtitles.setdefault(cc_lang, []).append({'url': cc_url})
+
+    def _get_subtitles(self, url, video_id, title, key, parsed_urls):
+        webpage = self._download_webpage(url, video_id, fatal=False) or ''
+        subtitles = {}
+        for caption in set(re.findall(r'data-captions=\"([^\"]+)\"', webpage)):
+            renditions = self._call_api(
+                video_id, title, key, {'Captions': caption}, fatal=False,
+                note=f'Downloading {caption} subtitle information').get('renditions') or {}
+            for rendition_id, rendition in renditions.items():
+                self._extract_subtitles_from_rendition(rendition, subtitles, parsed_urls)
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id, title, key = self._match_valid_url(url).group('id', 'title', 'key')
+        settings = self._call_api(video_id, title,
key) + + restriction = settings.get('restrictionReason') + if restriction == 'RegionRestricted': + self.raise_geo_restricted() + if restriction and restriction != 'None': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, restriction), expected=True) + + formats, parsed_urls = [], {None} + for rendition_id, rendition in settings['renditions'].items(): + audio, version, extra = rendition_id.split('_') + m3u8_url = url_or_none(try_get(rendition, lambda x: x['bitrates']['hls'])) + if m3u8_url not in parsed_urls: + parsed_urls.add(m3u8_url) + frmt = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=rendition_id, fatal=False) + for f in frmt: + f['language'] = audio + f['format_note'] = f'{version}, {extra}' + formats.extend(frmt) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'subtitles': self.extract_subtitles(url, video_id, title, key, parsed_urls), + 'formats': formats, + 'series': title, + 'season_number': int_or_none( + self._search_regex(r's(\d+)', key, 'season number', default=None)), + 'episode_number': int_or_none( + self._search_regex(r'e(\d+)', key, 'episode number', default=None)), + 'http_headers': {'Referer': url} + } diff --git a/youtube_dl/extractor/historicfilms.py b/yt_dlp/extractor/historicfilms.py index 56343e98f..56343e98f 100644 --- a/youtube_dl/extractor/historicfilms.py +++ b/yt_dlp/extractor/historicfilms.py diff --git a/youtube_dl/extractor/hitbox.py b/yt_dlp/extractor/hitbox.py index 3e5ff2685..3e5ff2685 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/yt_dlp/extractor/hitbox.py diff --git a/youtube_dl/extractor/hitrecord.py b/yt_dlp/extractor/hitrecord.py index fd5dc2935..fd5dc2935 100644 --- a/youtube_dl/extractor/hitrecord.py +++ b/yt_dlp/extractor/hitrecord.py diff --git a/youtube_dl/extractor/hketv.py b/yt_dlp/extractor/hketv.py index 1f3502b90..1f3502b90 100644 --- a/youtube_dl/extractor/hketv.py +++ b/yt_dlp/extractor/hketv.py diff --git a/youtube_dl/extractor/hornbunny.py b/yt_dlp/extractor/hornbunny.py index c458a959d..c458a959d 100644 --- a/youtube_dl/extractor/hornbunny.py +++ b/yt_dlp/extractor/hornbunny.py diff --git a/youtube_dl/extractor/hotnewhiphop.py b/yt_dlp/extractor/hotnewhiphop.py index 4703e1894..4703e1894 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/yt_dlp/extractor/hotnewhiphop.py diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py new file mode 100644 index 000000000..12e6c53d4 --- /dev/null +++ b/yt_dlp/extractor/hotstar.py @@ -0,0 +1,333 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import hmac +import re +import time +import uuid +import json + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, +) + + +class HotStarBaseIE(InfoExtractor): + _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' + + def _call_api_impl(self, path, video_id, query, st=None, cookies=None): + st = int_or_none(st) or int(time.time()) + exp = st + 6000 + auth = 'st=%d~exp=%d~acl=/*' % (st, exp) + auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() + + if cookies and cookies.get('userUP'): + token = cookies.get('userUP').value + else: + token = self._download_json( + 'https://api.hotstar.com/um/v3/users', + video_id, note='Downloading token', + data=json.dumps({"device_ids": 
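+                # anonymous session: registering a random device id yields a user_identity token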
[{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'), + headers={ + 'hotstarauth': auth, + 'x-hs-platform': 'PCTV', # or 'web' + 'Content-Type': 'application/json', + })['user_identity'] + + response = self._download_json( + 'https://api.hotstar.com/' + path, video_id, headers={ + 'hotstarauth': auth, + 'x-hs-appversion': '6.72.2', + 'x-hs-platform': 'web', + 'x-hs-usertoken': token, + }, query=query) + + if response['message'] != "Playback URL's fetched successfully": + raise ExtractorError( + response['message'], expected=True) + return response['data'] + + def _call_api(self, path, video_id, query_name='contentId'): + return self._download_json('https://api.hotstar.com/' + path, video_id=video_id, query={ + query_name: video_id, + 'tas': 10000, + }, headers={ + 'x-country-code': 'IN', + 'x-platform-code': 'PCTV', + }) + + def _call_api_v2(self, path, video_id, st=None, cookies=None): + return self._call_api_impl( + '%s/content/%s' % (path, video_id), video_id, st=st, cookies=cookies, query={ + 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265', + 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()), + 'os-name': 'Windows', + 'os-version': '10', + }) + + +class HotStarIE(HotStarBaseIE): + IE_NAME = 'hotstar' + _VALID_URL = r'''(?x) + (?: + hotstar\:| + https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) + ) + (?: + (?P<type>movies|sports|episode|(?P<tv>tv)) + (?: + \:| + /[^/?#]+/ + (?(tv) + (?:[^/?#]+/){2}| + (?:[^/?#]+/)* + ) + )| + [^/?#]+/ + )? + (?P<id>\d{10}) + ''' + _TESTS = [{ + 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', + 'info_dict': { + 'id': '1000076273', + 'ext': 'mp4', + 'title': 'Can You Not Spread Rumours?', + 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', + 'timestamp': 1447248600, + 'upload_date': '20151111', + 'duration': 381, + }, + }, { + 'url': 'hotstar:1000076273', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'info_dict': { + 'id': '1000057157', + 'ext': 'mp4', + 'title': 'Radha Gopalam', + 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22', + 'timestamp': 1140805800, + 'upload_date': '20060224', + 'duration': 9182, + }, + }, { + 'url': 'hotstar:movies:1000057157', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260065956', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260066104', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', + 'info_dict': { + 'id': '1000234847', + 'ext': 'mp4', + 'title': 'Janhvi Targets Suman', + 'description': 'md5:78a85509348910bd1ca31be898c5796b', + 'timestamp': 1556670600, + 'upload_date': '20190501', + 'duration': 1219, + 'channel': 'StarPlus', + 'channel_id': 3, + 'series': 'Ek Bhram - Sarvagun Sampanna', + 'season': 'Chapter 1', + 'season_number': 1, + 'season_id': 6771, + 'episode': 'Janhvi Targets Suman', + 'episode_number': 8, + }, + }, { + 'url': 'hotstar:episode:1000234847', + 'only_matching': True, + }] + _GEO_BYPASS = False + _TYPE = { + 
'movies': 'movie', + 'sports': 'match', + 'episode': 'episode', + 'tv': 'episode', + None: 'content', + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + video_type = mobj.group('type') + cookies = self._get_cookies(url) + video_type = self._TYPE.get(video_type, video_type) + video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item'] + title = video_data['title'] + + if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): + self.report_drm(video_id) + + headers = {'Referer': 'https://www.hotstar.com/in'} + formats = [] + subs = {} + geo_restricted = False + _, urlh = self._download_webpage_handle('https://www.hotstar.com/in', video_id) + # Required to fix https://github.com/yt-dlp/yt-dlp/issues/396 + st = urlh.headers.get('x-origin-date') + # change to v2 in the future + playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st, cookies=cookies)['playBackSets'] + for playback_set in playback_sets: + if not isinstance(playback_set, dict): + continue + dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') + format_url = url_or_none(playback_set.get('playbackUrl')) + if not format_url: + continue + format_url = re.sub( + r'(?<=//staragvod)(\d)', r'web\1', format_url) + tags = str_or_none(playback_set.get('tagsCombination')) or '' + ext = determine_ext(format_url) + current_formats, current_subs = [], {} + try: + if 'package:hls' in tags or ext == 'm3u8': + current_formats, current_subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id=f'{dr}-hls', headers=headers) + elif 'package:dash' in tags or ext == 'mpd': + current_formats, current_subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) + elif ext == 'f4m': + # produce broken files + pass + else: + current_formats = [{ + 'url': format_url, + 'width': int_or_none(playback_set.get('width')), + 'height': int_or_none(playback_set.get('height')), + }] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + geo_restricted = True + continue + if tags and 'encryption:plain' not in tags: + for f in current_formats: + f['has_drm'] = True + formats.extend(current_formats) + subs = self._merge_subtitles(subs, current_subs) + if not formats and geo_restricted: + self.raise_geo_restricted(countries=['IN'], metadata_available=True) + self._sort_formats(formats) + + for f in formats: + f.setdefault('http_headers', {}).update(headers) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), + 'formats': formats, + 'subtitles': subs, + 'channel': video_data.get('channelName'), + 'channel_id': video_data.get('channelId'), + 'series': video_data.get('showName'), + 'season': video_data.get('seasonName'), + 'season_number': int_or_none(video_data.get('seasonNo')), + 'season_id': video_data.get('seasonId'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episodeNo')), + 'http_headers': { + 'Referer': 'https://www.hotstar.com/in', + } + } + + +class HotStarPlaylistIE(HotStarBaseIE): + IE_NAME = 'hotstar:playlist' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' + _TESTS = [{ + 'url': 
'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', + 'info_dict': { + 'id': '3_2_26', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')['body']['results'] + entries = [ + self.url_result( + 'https://www.hotstar.com/%s' % video['contentId'], + ie=HotStarIE.ie_key(), video_id=video['contentId']) + for video in collection['assets']['items'] + if video.get('contentId')] + + return self.playlist_result(entries, playlist_id) + + +class HotStarSeriesIE(HotStarBaseIE): + IE_NAME = 'hotstar:series' + _VALID_URL = r'(?P<url>(?:https?://)(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', + 'info_dict': { + 'id': '1260000646', + }, + 'playlist_mincount': 690, + }, { + 'url': 'https://www.hotstar.com/tv/dancee-/1260050431', + 'info_dict': { + 'id': '1260050431', + }, + 'playlist_mincount': 43, + }, { + 'url': 'https://www.hotstar.com/in/tv/mahabharat/435/', + 'info_dict': { + 'id': '435', + }, + 'playlist_mincount': 269, + }] + + def _real_extract(self, url): + url, series_id = self._match_valid_url(url).groups() + headers = { + 'x-country-code': 'IN', + 'x-platform-code': 'PCTV', + } + detail_json = self._download_json('https://api.hotstar.com/o/v1/show/detail?contentId=' + series_id, + video_id=series_id, headers=headers) + id = compat_str(try_get(detail_json, lambda x: x['body']['results']['item']['id'], int)) + item_json = self._download_json('https://api.hotstar.com/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid=' + id, + video_id=series_id, headers=headers) + entries = [ + self.url_result( + '%s/ignoreme/%d' % (url, video['contentId']), + ie=HotStarIE.ie_key(), video_id=video['contentId']) + for video in item_json['body']['results']['items'] + if video.get('contentId')] + + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/howcast.py b/yt_dlp/extractor/howcast.py index 7e36b85ad..7e36b85ad 100644 --- a/youtube_dl/extractor/howcast.py +++ b/yt_dlp/extractor/howcast.py diff --git a/youtube_dl/extractor/howstuffworks.py b/yt_dlp/extractor/howstuffworks.py index cf90ab3c9..cf90ab3c9 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/yt_dlp/extractor/howstuffworks.py diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py new file mode 100644 index 000000000..2a994d471 --- /dev/null +++ b/yt_dlp/extractor/hrfensehen.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from ..utils import int_or_none, unified_timestamp, unescapeHTML +from .common import InfoExtractor + + +class HRFernsehenIE(InfoExtractor): + IE_NAME = 'hrfernsehen' + _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' + + _TESTS = [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', + 'md5': '5c4e0ba94677c516a2f65a84110fc536', + 'info_dict': { + 'id': '130546', + 'ext': 'mp4', + 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / ' + 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / ' + 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music', + 'subtitles': {'de': [{ + 'url': 
'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' + }]}, + 'timestamp': 1598470200, + 'upload_date': '20200826', + 'thumbnails': [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', + 'id': '0' + }, { + 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', + 'id': '1' + }], + 'title': 'hessenschau vom 26.08.2020' + } + }, { + 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', + 'only_matching': True + }] + + _GEO_COUNTRIES = ['DE'] + + def extract_airdate(self, loader_data): + airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') + + if airdate_str is None: + return None + + return unified_timestamp(airdate_str) + + def extract_formats(self, loader_data): + stream_formats = [] + for stream_obj in loader_data["videoResolutionLevels"]: + stream_format = { + 'format_id': str(stream_obj['verticalResolution']) + "p", + 'height': stream_obj['verticalResolution'], + 'url': stream_obj['url'], + } + + quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', + stream_obj['url']) + if quality_information: + stream_format['width'] = int_or_none(quality_information.group(1)) + stream_format['height'] = int_or_none(quality_information.group(2)) + stream_format['fps'] = int_or_none(quality_information.group(3)) + stream_format['tbr'] = int_or_none(quality_information.group(4)) + + stream_formats.append(stream_format) + + self._sort_formats(stream_formats) + return stream_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage) + description = self._html_search_meta( + ['description'], webpage) + + loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_data = json.loads(loader_str) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': self.extract_formats(loader_data), + 'timestamp': self.extract_airdate(loader_data) + } + + if "subtitle" in loader_data: + info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} + + thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) + if len(thumbnails) > 0: + info["thumbnails"] = [{"url": t} for t in thumbnails] + + return info diff --git a/yt_dlp/extractor/hrti.py b/yt_dlp/extractor/hrti.py new file mode 100644 index 000000000..dc5b9670c --- /dev/null +++ b/yt_dlp/extractor/hrti.py @@ -0,0 +1,207 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_age_limit, + sanitized_Request, + try_get, +) + + +class HRTiBaseIE(InfoExtractor): + """ + Base Information Extractor for Croatian Radiotelevision + video on demand site https://hrti.hrt.hr + Reverse engineered from the JavaScript app in app.min.js + """ + _NETRC_MACHINE = 'hrti' + + _APP_LANGUAGE = 'hr' + _APP_VERSION = '1.1' + _APP_PUBLICATION_ID = 'all_in_one' + _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' + + def _initialize_api(self): + init_data = { + 'application_publication_id': self._APP_PUBLICATION_ID + } + + uuid = 
self._download_json( + self._API_URL, None, note='Downloading uuid', + errnote='Unable to download uuid', + data=json.dumps(init_data).encode('utf-8'))['uuid'] + + app_data = { + 'uuid': uuid, + 'application_publication_id': self._APP_PUBLICATION_ID, + 'application_version': self._APP_VERSION + } + + req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) + req.get_method = lambda: 'PUT' + + resources = self._download_json( + req, None, note='Downloading session information', + errnote='Unable to download session information') + + self._session_id = resources['session_id'] + + modules = resources['modules'] + + self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( + language=self._APP_LANGUAGE, + application_id=self._APP_PUBLICATION_ID) + + self._login_url = (modules['user']['resources']['login']['uri'] + + '/format/json').format(session_id=self._session_id) + + self._logout_url = modules['user']['resources']['logout']['uri'] + + def _login(self): + username, password = self._get_login_info() + # TODO: figure out authentication with cookies + if username is None or password is None: + self.raise_login_required() + + auth_data = { + 'username': username, + 'password': password, + } + + try: + auth_info = self._download_json( + self._login_url, None, note='Logging in', errnote='Unable to log in', + data=json.dumps(auth_data).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: + auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + else: + raise + + error_message = auth_info.get('error', {}).get('message') + if error_message: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_message), + expected=True) + + self._token = auth_info['secure_streaming_token'] + + def _real_initialize(self): + self._initialize_api() + self._login() + + +class HRTiIE(HRTiBaseIE): + _VALID_URL = r'''(?x) + (?: + hrti:(?P<short_id>[0-9]+)| + https?:// + hrti\.hrt\.hr/(?:\#/)?video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? 
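+            # verbose pattern: accepts both the hrti:ID shorthand and full video page URLs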
+ ) + ''' + _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', + 'info_dict': { + 'id': '2181385', + 'display_id': 'republika-dokumentarna-serija-16-hd', + 'ext': 'mp4', + 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', + 'description': 'md5:48af85f620e8e0e1df4096270568544f', + 'duration': 2922, + 'view_count': int, + 'average_rating': int, + 'episode_number': int, + 'season_number': int, + 'age_limit': 12, + }, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/', + 'only_matching': True, + }, { + 'url': 'hrti:2181385', + 'only_matching': True, + }, { + 'url': 'https://hrti.hrt.hr/video/show/3873068/cuvar-dvorca-dramska-serija-14', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('short_id') or mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + '%s/video_id/%s/format/json' % (self._search_url, video_id), + display_id, 'Downloading video metadata JSON')['video'][0] + + title_info = video['title'] + title = title_info['title_long'] + + movie = video['video_assets']['movie'][0] + m3u8_url = movie['url'].format(TOKEN=self._token) + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + description = clean_html(title_info.get('summary_long')) + age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) + view_count = int_or_none(video.get('views')) + average_rating = int_or_none(video.get('user_rating')) + duration = int_or_none(movie.get('duration')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'formats': formats, + } + + +class HRTiPlaylistIE(HRTiBaseIE): + _VALID_URL = r'https?://hrti\.hrt\.hr/(?:#/)?video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' 
+ _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', + 'info_dict': { + 'id': '212', + 'title': 'ekumena', + }, + 'playlist_mincount': 8, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', + 'only_matching': True, + }, { + 'url': 'https://hrti.hrt.hr/video/list/category/212/ekumena', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + category_id = mobj.group('id') + display_id = mobj.group('display_id') or category_id + + response = self._download_json( + '%s/category_id/%s/format/json' % (self._search_url, category_id), + display_id, 'Downloading video metadata JSON') + + video_ids = try_get( + response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], + list) or [video['id'] for video in response.get('videos', []) if video.get('id')] + + entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] + + return self.playlist_result(entries, category_id, display_id) diff --git a/youtube_dl/extractor/huajiao.py b/yt_dlp/extractor/huajiao.py index 4ca275dda..4ca275dda 100644 --- a/youtube_dl/extractor/huajiao.py +++ b/yt_dlp/extractor/huajiao.py diff --git a/youtube_dl/extractor/huffpost.py b/yt_dlp/extractor/huffpost.py index 97e36f056..97e36f056 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/yt_dlp/extractor/huffpost.py diff --git a/yt_dlp/extractor/hungama.py b/yt_dlp/extractor/hungama.py new file mode 100644 index 000000000..821b16e5d --- /dev/null +++ b/yt_dlp/extractor/hungama.py @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class HungamaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?hungama\.com/ + (?: + (?:video|movie)/[^/]+/| + tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, + } + }, { + 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', + 'only_matching': True, + }, { + 'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info = self._search_json_ld(webpage, video_id) + + m3u8_url = self._download_json( + 'https://www.hungama.com/index.php', video_id, + data=urlencode_postdata({'content_id': video_id}), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'c': 'common', + 'm': 'get_video_mdn_url', + })['stream_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + info.update({ + 'id': video_id, + 'formats': formats, + }) + return info + + +class HungamaSongIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', + 'md5': 'd4a6a05a394ad0453a9bea3ca00e6024', + 'info_dict': 
{ + 'id': '2931166', + 'ext': 'mp3', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': None, + 'release_year': 2000, + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = self._download_json( + 'https://www.hungama.com/audio-player-data/track/%s' % audio_id, + audio_id, query={'_country': 'IN'})[0] + track = data['song_name'] + artist = data.get('singer_name') + formats = [] + media_json = self._download_json(data.get('file') or data['preview_link'], audio_id) + media_url = try_get(media_json, lambda x: x['response']['media_url'], str) + media_type = try_get(media_json, lambda x: x['response']['type'], str) + + if media_url: + formats.append({ + 'url': media_url, + 'ext': media_type, + 'vcodec': 'none', + 'acodec': media_type, + }) + + title = '%s - %s' % (artist, track) if artist else track + thumbnail = data.get('img_src') or data.get('album_image') + + return { + 'id': audio_id, + 'title': title, + 'thumbnail': thumbnail, + 'track': track, + 'artist': artist, + 'album': data.get('album_name') or None, + 'release_year': int_or_none(data.get('date')), + 'formats': formats, + } + + +class HungamaAlbumPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hungama\.com/(?:playlists|album)/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.hungama.com/album/bhuj-the-pride-of-india/69481490/', + 'playlist_mincount': 7, + 'info_dict': { + 'id': '69481490', + }, + }, { + 'url': 'https://www.hungama.com/playlists/hindi-jan-to-june-2021/123063/', + 'playlist_mincount': 50, + 'info_dict': { + 'id': '123063', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + ptrn = r'<meta[^>]+?property=[\"\']?music:song:url[\"\']?[^>]+?content=[\"\']?([^\"\']+)' + items = re.findall(ptrn, webpage) + entries = [self.url_result(item, ie=HungamaSongIE.ie_key()) for item in items] + return self.playlist_result(entries, video_id) diff --git a/youtube_dl/extractor/hypem.py b/yt_dlp/extractor/hypem.py index 9ca28d632..9ca28d632 100644 --- a/youtube_dl/extractor/hypem.py +++ b/yt_dlp/extractor/hypem.py diff --git a/yt_dlp/extractor/ichinanalive.py b/yt_dlp/extractor/ichinanalive.py new file mode 100644 index 000000000..cb39f821c --- /dev/null +++ b/yt_dlp/extractor/ichinanalive.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate +from ..compat import compat_str + + +class IchinanaLiveIE(InfoExtractor): + IE_NAME = '17live' + _VALID_URL = r'https?://(?:www\.)?17\.live/(?:[^/]+/)*(?:live|profile/r)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://17.live/live/3773096', + 'info_dict': { + 'id': '3773096', + 'title': '萠珈☕🤡🍫moka', + 'is_live': True, + 'uploader': '萠珈☕🤡🍫moka', + 'uploader_id': '3773096', + 'like_count': 366, + 'view_count': 18121, + 'timestamp': 1630569012, + }, + 'skip': 'running as of writing, but may be ended as of testing', + }, { + 'note': 'nothing except language differs', + 'url': 'https://17.live/ja/live/3773096', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return not IchinanaLiveClipIE.suitable(url) and super(IchinanaLiveIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'https://17.live/live/%s' % video_id + + enter = self._download_json( + 'https://api-dsa.17app.co/api/v1/lives/%s/enter' % video_id, 
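+            # HTTP 420 is expected here; the body may carry message 'ended' when the live is over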
video_id, + headers={'Referer': url}, fatal=False, expected_status=420, + data=b'\0') + if enter and enter.get('message') == 'ended': + raise ExtractorError('This live has ended.', expected=True) + + view_data = self._download_json( + 'https://api-dsa.17app.co/api/v1/lives/%s' % video_id, video_id, + headers={'Referer': url}) + + uploader = traverse_obj( + view_data, ('userInfo', 'displayName'), ('userInfo', 'openID')) + + video_urls = view_data.get('rtmpUrls') + if not video_urls: + raise ExtractorError('unable to extract live URL information') + formats = [] + for (name, value) in video_urls[0].items(): + if not isinstance(value, compat_str): + continue + if not value.startswith('http'): + continue + quality = -1 + if 'web' in name: + quality -= 1 + if 'High' in name: + quality += 4 + if 'Low' in name: + quality -= 2 + formats.append({ + 'format_id': name, + 'url': value, + 'quality': quality, + 'http_headers': {'Referer': url}, + 'ext': 'flv', + 'vcodec': 'h264', + 'acodec': 'aac', + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': uploader or video_id, + 'formats': formats, + 'is_live': True, + 'uploader': uploader, + 'uploader_id': video_id, + 'like_count': view_data.get('receivedLikeCount'), + 'view_count': view_data.get('viewerCount'), + 'thumbnail': view_data.get('coverPhoto'), + 'description': view_data.get('caption'), + 'timestamp': view_data.get('beginTime'), + } + + +class IchinanaLiveClipIE(InfoExtractor): + IE_NAME = '17live:clip' + _VALID_URL = r'https?://(?:www\.)?17\.live/(?:[^/]+/)*profile/r/(?P<uploader_id>\d+)/clip/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://17.live/profile/r/1789280/clip/1bHQSK8KUieruFXaCH4A4upCzlN', + 'info_dict': { + 'id': '1bHQSK8KUieruFXaCH4A4upCzlN', + 'title': 'マチコ先生🦋Class💋', + 'description': 'マチ戦隊 第一次 バスターコール\n総額200万coin!\n動画制作@うぉーかー🌱Walker🎫', + 'uploader_id': '1789280', + }, + }, { + 'url': 'https://17.live/ja/profile/r/1789280/clip/1bHQSK8KUieruFXaCH4A4upCzlN', + 'only_matching': True, + }] + + def _real_extract(self, url): + uploader_id, video_id = self._match_valid_url(url).groups() + url = 'https://17.live/profile/r/%s/clip/%s' % (uploader_id, video_id) + + view_data = self._download_json( + 'https://api-dsa.17app.co/api/v1/clips/%s' % video_id, video_id, + headers={'Referer': url}) + + uploader = traverse_obj( + view_data, ('userInfo', 'displayName'), ('userInfo', 'name')) + + formats = [] + if view_data.get('videoURL'): + formats.append({ + 'id': 'video', + 'url': view_data['videoURL'], + 'quality': -1, + }) + if view_data.get('transcodeURL'): + formats.append({ + 'id': 'transcode', + 'url': view_data['transcodeURL'], + 'quality': -1, + }) + if view_data.get('srcVideoURL'): + # highest quality + formats.append({ + 'id': 'srcVideo', + 'url': view_data['srcVideoURL'], + 'quality': 1, + }) + + for fmt in formats: + fmt.update({ + 'ext': 'mp4', + 'protocol': 'https', + 'vcodec': 'h264', + 'acodec': 'aac', + 'http_headers': {'Referer': url}, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': uploader or video_id, + 'formats': formats, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': view_data.get('likeCount'), + 'view_count': view_data.get('viewCount'), + 'thumbnail': view_data.get('imageURL'), + 'duration': view_data.get('duration'), + 'description': view_data.get('caption'), + 'upload_date': unified_strdate(str_or_none(view_data.get('createdAt'))), + } diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py new file mode 100644 index 000000000..c826eb3ba 
--- /dev/null +++ b/yt_dlp/extractor/ign.py @@ -0,0 +1,257 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    HEADRequest,
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+    strip_or_none,
+    try_get,
+)
+
+
+class IGNBaseIE(InfoExtractor):
+    def _call_api(self, slug):
+        return self._download_json(
+            'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
+
+
+class IGNIE(IGNBaseIE):
+    """
+    Extractor for some of the IGN sites, like www.ign.com, es.ign.com and de.ign.com.
+    Some videos of it.ign.com are also supported.
+    """
+
+    _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
+    IE_NAME = 'ign.com'
+    _PAGE_TYPE = 'video'
+
+    _TESTS = [{
+        'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+        'md5': 'd2e1586d9987d40fad7867bf96a018ea',
+        'info_dict': {
+            'id': '8f862beef863986b2785559b9e1aa599',
+            'ext': 'mp4',
+            'title': 'The Last of Us Review',
+            'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
+            'timestamp': 1370440800,
+            'upload_date': '20130605',
+            'tags': 'count:9',
+        }
+    }, {
+        'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
+        'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
+        'info_dict': {
+            'id': 'ee10d774b508c9b8ec07e763b9125b91',
+            'ext': 'mp4',
+            'title': 'What\'s New Now: Is GoGo Snooping on Your Data?',
+            'description': 'md5:817a20299de610bd56f13175386da6fa',
+            'timestamp': 1420571160,
+            'upload_date': '20150106',
+            'tags': 'count:4',
+        }
+    }, {
+        'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        video = self._call_api(display_id)
+        video_id = video['videoId']
+        metadata = video['metadata']
+        title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
+
+        formats = []
+        refs = video.get('refs') or {}
+
+        m3u8_url = refs.get('m3uUrl')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        f4m_url = refs.get('f4mUrl')
+        if f4m_url:
+            formats.extend(self._extract_f4m_formats(
+                f4m_url, video_id, f4m_id='hds', fatal=False))
+
+        for asset in (video.get('assets') or []):
+            asset_url = asset.get('url')
+            if not asset_url:
+                continue
+            formats.append({
+                'url': asset_url,
+                'tbr': int_or_none(asset.get('bitrate'), 1000),
+                'fps': int_or_none(asset.get('frame_rate')),
+                'height': int_or_none(asset.get('height')),
+                'width': int_or_none(asset.get('width')),
+            })
+
+        mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
+        if mezzanine_url:
+            formats.append({
+                'ext': determine_ext(mezzanine_url, 'mp4'),
+                'format_id': 'mezzanine',
+                'quality': 1,
+                'url': mezzanine_url,
+            })
+
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for thumbnail in (video.get('thumbnails') or []):
+            thumbnail_url = thumbnail.get('url')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+            })
+
+        tags = []
+        for tag in (video.get('tags') or []):
+            display_name = tag.get('displayName')
+            if not display_name:
+                continue
+            tags.append(display_name)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': strip_or_none(metadata.get('description')),
+            'timestamp': parse_iso8601(metadata.get('publishDate')),
+            'duration':
int_or_none(metadata.get('duration')), + 'display_id': display_id, + 'thumbnails': thumbnails, + 'formats': formats, + 'tags': tags, + } + + +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' + _TESTS = [{ + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', + 'info_dict': { + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', + 'ext': 'mp4', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', + } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) + + +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' + _PAGE_TYPE = 'article' + _TESTS = [{ + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', + 'info_dict': { + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. 
Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, + }, { + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', + 'info_dict': { + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) diff --git a/youtube_dl/extractor/iheart.py b/yt_dlp/extractor/iheart.py index b54c05eeb..b54c05eeb 100644 --- a/youtube_dl/extractor/iheart.py +++ b/yt_dlp/extractor/iheart.py diff --git a/youtube_dl/extractor/imdb.py b/yt_dlp/extractor/imdb.py index a31301985..a31301985 100644 --- a/youtube_dl/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py diff --git a/yt_dlp/extractor/imggaming.py b/yt_dlp/extractor/imggaming.py new file mode 100644 index 000000000..ef20a4b9e --- /dev/null +++ b/yt_dlp/extractor/imggaming.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class ImgGamingBaseIE(InfoExtractor): + _API_BASE = 'https://dce-frontoffice.imggaming.com/api/v2/' + _API_KEY = '857a1e5d-e35e-4fdf-805b-a87b6f8364bf' + _HEADERS = None + _MANIFEST_HEADERS = {'Accept-Encoding': 'identity'} + _REALM = None + _VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?' + + def _real_initialize(self): + self._HEADERS = { + 'Realm': 'dce.' 
+ self._REALM, + 'x-api-key': self._API_KEY, + } + + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + p_headers = self._HEADERS.copy() + p_headers['Content-Type'] = 'application/json' + self._HEADERS['Authorization'] = 'Bearer ' + self._download_json( + self._API_BASE + 'login', + None, 'Logging in', data=json.dumps({ + 'id': email, + 'secret': password, + }).encode(), headers=p_headers)['authorisationToken'] + + def _call_api(self, path, media_id): + return self._download_json( + self._API_BASE + path + media_id, media_id, headers=self._HEADERS) + + def _extract_dve_api_url(self, media_id, media_type): + stream_path = 'stream' + if media_type == 'video': + stream_path += '/vod/' + else: + stream_path += '?eventId=' + try: + return self._call_api( + stream_path, media_id)['playerUrlCallback'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError( + self._parse_json(e.cause.read().decode(), media_id)['messages'][0], + expected=True) + raise + + def _real_extract(self, url): + domain, media_type, media_id, playlist_id = self._match_valid_url(url).groups() + + if playlist_id: + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % media_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + media_type, media_id = 'playlist', playlist_id + + if media_type == 'playlist': + playlist = self._call_api('vod/playlist/', media_id) + entries = [] + for video in try_get(playlist, lambda x: x['videos']['vods']) or []: + video_id = str_or_none(video.get('id')) + if not video_id: + continue + entries.append(self.url_result( + 'https://%s/video/%s' % (domain, video_id), + self.ie_key(), video_id)) + return self.playlist_result( + entries, media_id, playlist.get('title'), + playlist.get('description')) + + dve_api_url = self._extract_dve_api_url(media_id, media_type) + video_data = self._download_json(dve_api_url, media_id) + is_live = media_type == 'live' + if is_live: + title = self._live_title(self._call_api('event/', media_id)['title']) + else: + title = video_data['name'] + + formats = [] + for proto in ('hls', 'dash'): + media_url = video_data.get(proto + 'Url') or try_get(video_data, lambda x: x[proto]['url']) + if not media_url: + continue + if proto == 'hls': + m3u8_formats = self._extract_m3u8_formats( + media_url, media_id, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False, headers=self._MANIFEST_HEADERS) + for f in m3u8_formats: + f.setdefault('http_headers', {}).update(self._MANIFEST_HEADERS) + formats.append(f) + else: + formats.extend(self._extract_mpd_formats( + media_url, media_id, mpd_id='dash', fatal=False, + headers=self._MANIFEST_HEADERS)) + self._sort_formats(formats) + + subtitles = {} + for subtitle in video_data.get('subtitles', []): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('lang', 'en_US'), []).append({ + 'url': subtitle_url, + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnail': video_data.get('thumbnailUrl'), + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'tags': video_data.get('tags'), + 'is_live': is_live, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py new file mode 100644 index 000000000..dfa473752 --- /dev/null +++ 
b/yt_dlp/extractor/imgur.py @@ -0,0 +1,154 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + mimetype2ext, + ExtractorError, +) + + +class ImgurIE(InfoExtractor): + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'https://i.imgur.com/A61SaA1.gifv', + 'info_dict': { + 'id': 'A61SaA1', + 'ext': 'mp4', + 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', + }, + }, { + 'url': 'https://imgur.com/A61SaA1', + 'only_matching': True, + }, { + 'url': 'https://i.imgur.com/crGpqCV.mp4', + 'only_matching': True, + }, { + # no title + 'url': 'https://i.imgur.com/jxBXAMC.gifv', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) + + width = int_or_none(self._og_search_property( + 'video:width', webpage, default=None)) + height = int_or_none(self._og_search_property( + 'video:height', webpage, default=None)) + + video_elements = self._search_regex( + r'(?s)<div class="video-elements">(.*?)</div>', + webpage, 'video elements', default=None) + if not video_elements: + raise ExtractorError( + 'No sources found for video %s. Maybe an image?' % video_id, + expected=True) + + formats = [] + for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements): + formats.append({ + 'format_id': m.group('type').partition('/')[2], + 'url': self._proto_relative_url(m.group('src')), + 'ext': mimetype2ext(m.group('type')), + 'width': width, + 'height': height, + 'http_headers': { + 'User-Agent': 'yt-dlp (like wget)', + }, + }) + + gif_json = self._search_regex( + r'(?s)var\s+videoItem\s*=\s*(\{.*?\})', + webpage, 'GIF code', fatal=False) + if gif_json: + gifd = self._parse_json( + gif_json, video_id, transform_source=js_to_json) + formats.append({ + 'format_id': 'gif', + 'preference': -10, # gifs are worse than videos + 'width': width, + 'height': height, + 'ext': 'gif', + 'acodec': 'none', + 'vcodec': 'gif', + 'container': 'gif', + 'url': self._proto_relative_url(gifd['gifUrl']), + 'filesize': gifd.get('size'), + 'http_headers': { + 'User-Agent': 'yt-dlp (like wget)', + }, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': self._og_search_title(webpage, default=video_id), + } + + +class ImgurGalleryIE(InfoExtractor): + IE_NAME = 'imgur:gallery' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://imgur.com/gallery/Q95ko', + 'info_dict': { + 'id': 'Q95ko', + 'title': 'Adding faces make every GIF better', + }, + 'playlist_count': 25, + }, { + 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'only_matching': True, + }, { + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/r/aww/VQcQPhM', + 'only_matching': True, + }] + + def _real_extract(self, url): + gallery_id = self._match_id(url) + + data = self._download_json( + 'https://imgur.com/gallery/%s.json' % gallery_id, + gallery_id)['data']['image'] + + if data.get('is_album'): + entries = [ + 
self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) + for image in data['album_images']['images'] if image.get('hash')] + return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) + + return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) + + +class ImgurAlbumIE(ImgurGalleryIE): + IE_NAME = 'imgur:album' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://imgur.com/a/j6Orj', + 'info_dict': { + 'id': 'j6Orj', + 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', + }, + 'playlist_count': 12, + }] diff --git a/youtube_dl/extractor/ina.py b/yt_dlp/extractor/ina.py index b3b2683cb..b3b2683cb 100644 --- a/youtube_dl/extractor/ina.py +++ b/yt_dlp/extractor/ina.py diff --git a/youtube_dl/extractor/inc.py b/yt_dlp/extractor/inc.py index d5b258a0f..d5b258a0f 100644 --- a/youtube_dl/extractor/inc.py +++ b/yt_dlp/extractor/inc.py diff --git a/youtube_dl/extractor/indavideo.py b/yt_dlp/extractor/indavideo.py index 4c16243ec..4c16243ec 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/yt_dlp/extractor/indavideo.py diff --git a/youtube_dl/extractor/infoq.py b/yt_dlp/extractor/infoq.py index 0a70a1fb4..0a70a1fb4 100644 --- a/youtube_dl/extractor/infoq.py +++ b/yt_dlp/extractor/infoq.py diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py new file mode 100644 index 000000000..3801c7af9 --- /dev/null +++ b/yt_dlp/extractor/instagram.py @@ -0,0 +1,483 @@ +from __future__ import unicode_literals + +import itertools +import hashlib +import json +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + ExtractorError, + float_or_none, + get_element_by_attribute, + int_or_none, + lowercase_escape, + std_headers, + try_get, + url_or_none, + variadic, +) + + +class InstagramIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', + 'md5': '0d2da106a9d2631273e192b372806516', + 'info_dict': { + 'id': 'aye83DjauH', + 'ext': 'mp4', + 'title': 'Video by naomipq', + 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, + 'timestamp': 1371748545, + 'upload_date': '20130620', + 'uploader_id': 'naomipq', + 'uploader': 'B E A U T Y F O R A S H E S', + 'like_count': int, + 'comment_count': int, + 'comments': list, + }, + }, { + # missing description + 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + 'info_dict': { + 'id': 'BA-pQFBG8HZ', + 'ext': 'mp4', + 'title': 'Video by britneyspears', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, + 'timestamp': 1453760977, + 'upload_date': '20160125', + 'uploader_id': 'britneyspears', + 'uploader': 'Britney Spears', + 'like_count': int, + 'comment_count': int, + 'comments': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # multi video post + 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', + 'playlist': [{ + 'info_dict': { + 'id': 'BQ0dSaohpPW', + 'ext': 'mp4', + 'title': 'Video 1', + }, + }, { + 'info_dict': { + 'id': 'BQ0dTpOhuHT', + 'ext': 'mp4', + 'title': 'Video 2', + }, + }, { + 'info_dict': { + 'id': 'BQ0dT7RBFeF', + 'ext': 'mp4', + 'title': 'Video 3', + }, + }], + 'info_dict': { + 'id': 'BQ0eAlwhDrw', + 'title': 'Post by instagram', + 'description': 
'md5:0f9203fc6a2ce4d228da5754bcf54957', + }, + }, { + # IGTV + 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', + 'info_dict': { + 'id': 'BkfuX9UB-eK', + 'ext': 'mp4', + 'title': 'Fingerboarding Tricks with @cass.fb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 53.83, + 'timestamp': 1530032919, + 'upload_date': '20180626', + 'uploader_id': 'instagram', + 'uploader': 'Instagram', + 'like_count': int, + 'comment_count': int, + 'comments': list, + 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', + } + }, { + 'url': 'https://instagram.com/p/-Cmh1cukG2/', + 'only_matching': True, + }, { + 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', + 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/tv/aye83DjauH/', + 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', + 'only_matching': True, + }] + + @staticmethod + def _extract_embed_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', + webpage) + if mobj: + return mobj.group('url') + + blockquote_el = get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( + r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + url = mobj.group('url') + + webpage, urlh = self._download_webpage_handle(url, video_id) + if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'): + self.raise_login_required('You need to log in to access this content', method='cookies') + + (media, video_url, description, thumbnail, timestamp, uploader, + uploader_id, like_count, comment_count, comments, height, + width) = [None] * 12 + + shared_data = self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + webpage, 'shared data', default='{}'), + video_id, fatal=False) + if shared_data: + media = try_get( + shared_data, + (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], + lambda x: x['entry_data']['PostPage'][0]['media']), + dict) + # _sharedData.entry_data.PostPage is empty when authenticated (see + # https://github.com/ytdl-org/youtube-dl/pull/22880) + if not media: + additional_data = self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, lambda x: x['graphql']['shortcode_media'], + dict) + if media: + video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') + title = media.get('title') + thumbnail = media.get('display_src') or media.get('display_url') + duration = float_or_none(media.get('video_duration')) + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) + uploader = media.get('owner', {}).get('full_name') + uploader_id = media.get('owner', {}).get('username') + + def get_count(keys, kind): + for key in variadic(keys): + count = int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + if 
count is not None: + return count + + like_count = get_count('preview_like', 'like') + comment_count = get_count( + ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') + + comments = [] + for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']): + comment_dict = comment.get('node', {}) + comment_text = comment_dict.get('text') + if comment_text: + comments.append({ + 'author': try_get(comment_dict, lambda x: x['owner']['username']), + 'author_id': try_get(comment_dict, lambda x: x['owner']['id']), + 'id': comment_dict.get('id'), + 'text': comment_text, + 'timestamp': int_or_none(comment_dict.get('created_at')), + }) + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, start=1): + node = try_get(edge, lambda x: x['node'], dict) + if not node: + continue + node_video_url = url_or_none(node.get('video_url')) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': node.get('title') or 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'duration': float_or_none(node.get('video_duration')), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) + + if not video_url: + video_url = self._og_search_video_url(webpage, secure=False) + + formats = [{ + 'url': video_url, + 'width': width, + 'height': height, + }] + + if not uploader_id: + uploader_id = self._search_regex( + r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', + webpage, 'uploader id', fatal=False) + + if not description: + description = self._search_regex( + r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None) + if description is not None: + description = lowercase_escape(description) + + if not thumbnail: + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'formats': formats, + 'ext': 'mp4', + 'title': title or 'Video by %s' % uploader_id, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'uploader': uploader, + 'like_count': like_count, + 'comment_count': comment_count, + 'comments': comments, + 'http_headers': { + 'Referer': 'https://www.instagram.com/', + } + } + + +class InstagramPlaylistIE(InfoExtractor): + # A superclass for handling any kind of query based on GraphQL which + # results in a playlist. + + _gis_tmpl = None # used to cache GIS request type + + def _parse_graphql(self, webpage, item_id): + # Reads a webpage and returns its GraphQL data. + return self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + item_id) + + def _extract_graphql(self, data, url): + # Parses GraphQL queries containing videos and generates a playlist. 
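+        # The X-Instagram-GIS header computed in the request loop below is
+        # nothing more than md5('<gis template>:<variables JSON>'). A minimal
+        # sketch of that signing step, using the hard-coded fallback rhx_gis
+        # from below and an illustrative first-page variables payload:
+        #
+        #   import hashlib, json
+        #   rhx_gis = '3c7ca9dcefcf966d11dacf1f151335e8'
+        #   variables = json.dumps({'first': 12, 'after': ''})
+        #   gis = hashlib.md5(
+        #       ('%s:%s' % (rhx_gis, variables)).encode('utf-8')).hexdigest()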
+ def get_count(suffix): + return int_or_none(try_get( + node, lambda x: x['edge_media_' + suffix]['count'])) + + uploader_id = self._match_id(url) + csrf_token = data['config']['csrf_token'] + rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' + + cursor = '' + for page_num in itertools.count(1): + variables = { + 'first': 12, + 'after': cursor, + } + variables.update(self._query_vars_for(data)) + variables = json.dumps(variables) + + if self._gis_tmpl: + gis_tmpls = [self._gis_tmpl] + else: + gis_tmpls = [ + '%s' % rhx_gis, + '', + '%s:%s' % (rhx_gis, csrf_token), + '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), + ] + + # try all of the ways to generate a GIS query, and not only use the + # first one that works, but cache it for future requests + for gis_tmpl in gis_tmpls: + try: + json_data = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, + 'Downloading JSON page %d' % page_num, headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-Instagram-GIS': hashlib.md5( + ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), + }, query={ + 'query_hash': self._QUERY_HASH, + 'variables': variables, + }) + media = self._parse_timeline_from(json_data) + self._gis_tmpl = gis_tmpl + break + except ExtractorError as e: + # if it's an error caused by a bad query, and there are + # more GIS templates to try, ignore it and keep trying + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if gis_tmpl != gis_tmpls[-1]: + continue + raise + + edges = media.get('edges') + if not edges or not isinstance(edges, list): + break + + for edge in edges: + node = edge.get('node') + if not node or not isinstance(node, dict): + continue + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('shortcode') + if not video_id: + continue + + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) + + description = try_get( + node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('taken_at_timestamp')) + + comment_count = get_count('to_comment') + like_count = get_count('preview_like') + view_count = int_or_none(node.get('video_view_count')) + + info.update({ + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, + }) + + yield info + + page_info = media.get('page_info') + if not page_info or not isinstance(page_info, dict): + break + + has_next_page = page_info.get('has_next_page') + if not has_next_page: + break + + cursor = page_info.get('end_cursor') + if not cursor or not isinstance(cursor, compat_str): + break + + def _real_extract(self, url): + user_or_tag = self._match_id(url) + webpage = self._download_webpage(url, user_or_tag) + data = self._parse_graphql(webpage, user_or_tag) + + self._set_cookie('instagram.com', 'ig_pr', '1') + + return self.playlist_result( + self._extract_graphql(data, url), user_or_tag, user_or_tag) + + +class InstagramUserIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + _TEST = { + 'url': 'https://instagram.com/porsche', + 'info_dict': { + 'id': 'porsche', + 'title': 'porsche', + }, + 'playlist_count': 5, + 'params': { 
+ 'extract_flat': True, + 'skip_download': True, + 'playlistend': 5, + } + } + + _QUERY_HASH = '42323d64886122307be10013ad2dcc44', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['user']['edge_owner_to_timeline_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + } + + +class InstagramTagIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' + IE_DESC = 'Instagram hashtag search' + IE_NAME = 'instagram:tag' + _TEST = { + 'url': 'https://instagram.com/explore/tags/lolcats', + 'info_dict': { + 'id': 'lolcats', + 'title': 'lolcats', + }, + 'playlist_count': 50, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 50, + } + } + + _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['hashtag']['edge_hashtag_to_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'tag_name': + data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] + } diff --git a/youtube_dl/extractor/internazionale.py b/yt_dlp/extractor/internazionale.py index 676e8e269..676e8e269 100644 --- a/youtube_dl/extractor/internazionale.py +++ b/yt_dlp/extractor/internazionale.py diff --git a/yt_dlp/extractor/internetvideoarchive.py b/yt_dlp/extractor/internetvideoarchive.py new file mode 100644 index 000000000..880918cd7 --- /dev/null +++ b/yt_dlp/extractor/internetvideoarchive.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import parse_qs + + +class InternetVideoArchiveIE(InfoExtractor): + _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' + + _TEST = { + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', + 'info_dict': { + 'id': '194487', + 'ext': 'mp4', + 'title': 'Kick-Ass 2', + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + @staticmethod + def _build_json_url(query): + return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' 
+ query + + def _real_extract(self, url): + query = parse_qs(url) + video_id = query['publishedid'][0] + data = self._download_json( + 'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx', + video_id, data=json.dumps({ + 'customerid': query['customerid'][0], + 'publishedid': video_id, + }).encode()) + title = data['Title'] + formats = self._extract_m3u8_formats( + data['VideoUrl'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + file_url = formats[0]['url'] + if '.ism/' in file_url: + replace_url = lambda x: re.sub(r'\.ism/[^?]+', '.ism/' + x, file_url) + formats.extend(self._extract_f4m_formats( + replace_url('.f4m'), video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_ism_formats( + replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': data.get('PosterUrl'), + 'description': data.get('Description'), + } diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py new file mode 100644 index 000000000..28e660972 --- /dev/null +++ b/yt_dlp/extractor/iprima.py @@ -0,0 +1,149 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, +) + + +class IPrimaIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://prima.iprima.cz/particka/92-epizoda', + 'info_dict': { + 'id': 'p51388', + 'ext': 'mp4', + 'title': 'Partička (92)', + 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'https://cnn.iprima.cz/videa/70-epizoda', + 'info_dict': { + 'id': 'p681554', + 'ext': 'mp4', + 'title': 'HLAVNÍ ZPRÁVY 3.5.2020', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'http://play.iprima.cz/particka/particka-92', + 'only_matching': True, + }, { + # geo restricted + 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', + 'only_matching': True, + }, { + # iframe api.play-backend.iprima.cz + 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', + 'only_matching': True, + }, { + # iframe prima.iprima.cz + 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', + 'only_matching': True, + }, { + 'url': 'http://www.iprima.cz/filmy/desne-rande', + 'only_matching': True, + }, { + 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby', + 'only_matching': True, + }, { + 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy', + 'only_matching': True, + }, { + 'url': 'https://cool.iprima.cz/derava-silnice-nevadi', + 'only_matching': True, + }, { + 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', + 'only_matching': True, + }, { + 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1') + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._search_regex( + r'<h1>([^<]+)', webpage, 'title') + + video_id = self._search_regex( + 
(r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)', + r'data-product="([^"]+)">', + r'id=["\']player-(p\d+)"', + r'playerId\s*:\s*["\']player-(p\d+)', + r'\bvideos\s*=\s*["\'](p\d+)'), + webpage, 'real id') + + playerpage = self._download_webpage( + 'http://play.iprima.cz/prehravac/init', + video_id, note='Downloading player', query={ + '_infuse': 1, + '_ts': round(time.time()), + 'productId': video_id, + }, headers={'Referer': url}) + + formats = [] + + def extract_formats(format_url, format_key=None, lang=None): + ext = determine_ext(format_url) + new_formats = [] + if format_key == 'hls' or ext == 'm3u8': + new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif format_key == 'dash' or ext == 'mpd': + return + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + if lang: + for f in new_formats: + if not f.get('language'): + f['language'] = lang + formats.extend(new_formats) + + options = self._parse_json( + self._search_regex( + r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]', + playerpage, 'player options', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if options: + for key, tracks in options.get('tracks', {}).items(): + if not isinstance(tracks, list): + continue + for track in tracks: + src = track.get('src') + if src: + extract_formats(src, key.lower(), track.get('lang')) + + if not formats: + for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): + extract_formats(src) + + if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage: + self.raise_geo_restricted(countries=['CZ'], metadata_available=True) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'formats': formats, + 'description': self._og_search_description(webpage, default=None), + } diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py new file mode 100644 index 000000000..b13b9f4cf --- /dev/null +++ b/yt_dlp/extractor/iqiyi.py @@ -0,0 +1,219 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import itertools +import re +import time + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + clean_html, + get_element_by_id, + get_element_by_attribute, + ExtractorError, + ohdave_rsa_encrypt, + remove_start, +) + + +def md5_text(text): + return hashlib.md5(text.encode('utf-8')).hexdigest() + + +class IqiyiIE(InfoExtractor): + IE_NAME = 'iqiyi' + IE_DESC = '爱奇艺' + + _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' + + _NETRC_MACHINE = 'iqiyi' + + _TESTS = [{ + 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', + # MD5 checksum differs on my machine and Travis CI + 'info_dict': { + 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'ext': 'mp4', + 'title': '美国德州空中惊现奇异云团 酷似UFO', + } + }, { + 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'md5': 'b7dc800a4004b1b57749d9abae0472da', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb', + 'ext': 'mp4', + # This can be either Simplified Chinese or Traditional Chinese + 'title': r're:^(?:名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇|名偵探柯南 國語版:第752集 迫近灰原秘密的黑影 下篇)$', + }, + 'skip': 'Geo-restricted to China', + }, { + 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', + 'only_matching': True, + }, { + 'url': 
'http://www.iqiyi.com/a_19rrhbc6kt.html', + 'only_matching': True, + }, { + 'url': 'http://yule.iqiyi.com/pcb.html', + 'info_dict': { + 'id': '4a0af228fddb55ec96398a364248ed7f', + 'ext': 'mp4', + 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰', + }, + }, { + # VIP-only video. The first 2 parts (6 minutes) are available without login + # MD5 sums omitted as values are different on Travis CI and my machine + 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html', + 'info_dict': { + 'id': 'f3cf468b39dddb30d676f89a91200dc1', + 'ext': 'mp4', + 'title': '泰坦尼克号', + }, + 'skip': 'Geo-restricted to China', + }, { + 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', + 'info_dict': { + 'id': '202918101', + 'title': '灌篮高手 国语版', + }, + 'playlist_count': 101, + }, { + 'url': 'http://www.pps.tv/w_19rrbav0ph.html', + 'only_matching': True, + }] + + _FORMATS_MAP = { + '96': 1, # 216p, 240p + '1': 2, # 336p, 360p + '2': 3, # 480p, 504p + '21': 4, # 504p + '4': 5, # 720p + '17': 5, # 720p + '5': 6, # 1072p, 1080p + '18': 7, # 1080p + } + + def _real_initialize(self): + self._login() + + @staticmethod + def _rsa_fun(data): + # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js + N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd + e = 65537 + + return ohdave_rsa_encrypt(data, e, N) + + def _login(self): + raise ExtractorError("iQiyi's non-free authentication algorithm has made login impossible", expected=True) + + def get_raw_data(self, tvid, video_id): + tm = int(time.time() * 1000) + + key = 'd5fb4bd9d50c4be6948c97edd7254b0e' + sc = md5_text(compat_str(tm) + key + tvid) + params = { + 'tvid': tvid, + 'vid': video_id, + 'src': '76f90cbd92f94a2e925d83e8ccd22cb7', + 'sc': sc, + 't': tm, + } + + return self._download_json( + 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), + query=params, headers=self.geo_verification_headers()) + + def _extract_playlist(self, webpage): + PAGE_SIZE = 50 + + links = re.findall( + r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"', + webpage) + if not links: + return + + album_id = self._search_regex( + r'albumId\s*:\s*(\d+),', webpage, 'album ID') + album_title = self._search_regex( + r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False) + + entries = list(map(self.url_result, links)) + + # Start from 2 because links in the first page are already on webpage + for page_num in itertools.count(2): + pagelist_page = self._download_webpage( + 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE), + album_id, + note='Download playlist page %d' % page_num, + errnote='Failed to download playlist page %d' % page_num) + pagelist = self._parse_json( + remove_start(pagelist_page, 'var tvInfoJs='), album_id) + vlist = pagelist['data']['vlist'] + for item in vlist: + entries.append(self.url_result(item['vurl'])) + if len(vlist) < PAGE_SIZE: + break + + return self.playlist_result(entries, album_id, album_title) + + def _real_extract(self, url): + webpage = self._download_webpage( + url, 'temp_id', note='download video page') + + # There's no simple way to determine whether an URL is a playlist or not + # Sometimes there are playlist links in individual videos, so treat it + # as a single video first + tvid = self._search_regex( + r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None) + if 
tvid is None: + playlist_result = self._extract_playlist(webpage) + if playlist_result: + return playlist_result + raise ExtractorError('Can\'t find any video') + + video_id = self._search_regex( + r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') + + formats = [] + for _ in range(5): + raw_data = self.get_raw_data(tvid, video_id) + + if raw_data['code'] != 'A00000': + if raw_data['code'] == 'A00111': + self.raise_geo_restricted() + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) + + data = raw_data['data'] + + for stream in data['vidl']: + if 'm3utx' not in stream: + continue + vd = compat_str(stream['vd']) + formats.append({ + 'url': stream['m3utx'], + 'format_id': vd, + 'ext': 'mp4', + 'quality': self._FORMATS_MAP.get(vd, -1), + 'protocol': 'm3u8_native', + }) + + if formats: + break + + self._sleep(5, video_id) + + self._sort_formats(formats) + title = (get_element_by_id('widget-videotitle', webpage) + or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) + or self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ir90tv.py b/yt_dlp/extractor/ir90tv.py index d5a3f6fa5..d5a3f6fa5 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/yt_dlp/extractor/ir90tv.py diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py new file mode 100644 index 000000000..d69782b78 --- /dev/null +++ b/yt_dlp/extractor/itv.py @@ -0,0 +1,264 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE + +from ..compat import compat_str +from ..utils import ( + base_url, + clean_html, + determine_ext, + extract_attributes, + ExtractorError, + get_element_by_class, + JSON_LD_RE, + merge_dicts, + parse_duration, + smuggle_url, + try_get, + url_or_none, + url_basename, + urljoin, +) + + +class ITVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' + _GEO_COUNTRIES = ['GB'] + _TESTS = [{ + 'url': 'https://www.itv.com/hub/plebs/2a1873a0002', + 'info_dict': { + 'id': '2a1873a0002', + 'ext': 'mp4', + 'title': 'Plebs - The Orgy', + 'description': 'md5:4d7159af53ebd5b36e8b3ec82a41fdb4', + 'series': 'Plebs', + 'season_number': 1, + 'episode_number': 1, + 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.itv.com/hub/the-jonathan-ross-show/2a1166a0209', + 'info_dict': { + 'id': '2a1166a0209', + 'ext': 'mp4', + 'title': 'The Jonathan Ross Show - Series 17 - Episode 8', + 'description': 'md5:3023dcdd375db1bc9967186cdb3f1399', + 'series': 'The Jonathan Ross Show', + 'episode_number': 8, + 'season_number': 17, + 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # unavailable via data-playlist-url + 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', + 'only_matching': True, + }, { + # InvalidVodcrid + 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', + 'only_matching': True, + }, { + # ContentUnavailable + 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', + 'only_matching': True, + }] + + def _generate_api_headers(self, hmac): + return merge_dicts({ + 'Accept': 
'application/vnd.itv.vod.playlist.v2+json', + 'Content-Type': 'application/json', + 'hmac': hmac.upper(), + }, self.geo_verification_headers()) + + def _call_api(self, video_id, playlist_url, headers, platform_tag, featureset, fatal=True): + return self._download_json( + playlist_url, video_id, data=json.dumps({ + 'user': { + 'itvUserId': '', + 'entitlements': [], + 'token': '' + }, + 'device': { + 'manufacturer': 'Safari', + 'model': '5', + 'os': { + 'name': 'Windows NT', + 'version': '6.1', + 'type': 'desktop' + } + }, + 'client': { + 'version': '4.1', + 'id': 'browser' + }, + 'variantAvailability': { + 'featureset': { + 'min': featureset, + 'max': featureset + }, + 'platformTag': platform_tag + } + }).encode(), headers=headers, fatal=fatal) + + def _get_subtitles(self, video_id, variants, ios_playlist_url, headers, *args, **kwargs): + subtitles = {} + # Prefer last matching featureset + # See: https://github.com/yt-dlp/yt-dlp/issues/986 + platform_tag_subs, featureset_subs = next( + ((platform_tag, featureset) + for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets + if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'), + (None, None)) + + if platform_tag_subs and featureset_subs: + subs_playlist = self._call_api( + video_id, ios_playlist_url, headers, platform_tag_subs, featureset_subs, fatal=False) + subs = try_get(subs_playlist, lambda x: x['Playlist']['Video']['Subtitles'], list) or [] + for sub in subs: + if not isinstance(sub, dict): + continue + href = url_or_none(sub.get('Href')) + if not href: + continue + subtitles.setdefault('en', []).append({'url': href}) + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + params = extract_attributes(self._search_regex( + r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) + variants = self._parse_json( + try_get(params, lambda x: x['data-video-variants'], compat_str) or '{}', + video_id, fatal=False) + # Prefer last matching featureset + # See: https://github.com/yt-dlp/yt-dlp/issues/986 + platform_tag_video, featureset_video = next( + ((platform_tag, featureset) + for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets + if try_get(featureset, lambda x: x[:2]) == ['hls', 'aes']), + (None, None)) + if not platform_tag_video or not featureset_video: + raise ExtractorError('No downloads available', expected=True, video_id=video_id) + + ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] + headers = self._generate_api_headers(params['data-video-hmac']) + ios_playlist = self._call_api( + video_id, ios_playlist_url, headers, platform_tag_video, featureset_video) + + video_data = try_get(ios_playlist, lambda x: x['Playlist']['Video'], dict) or {} + ios_base_url = video_data.get('Base') + formats = [] + for media_file in (video_data.get('MediaFiles') or []): + href = media_file.get('Href') + if not href: + continue + if ios_base_url: + href = ios_base_url + href + ext = determine_ext(href) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': href, + }) + self._sort_formats(formats) + info = self._search_json_ld(webpage, video_id, default={}) + if not info: + json_ld = self._parse_json(self._search_regex( + JSON_LD_RE, webpage, 'JSON-LD', '{}', + group='json_ld'), video_id, fatal=False) + if json_ld and json_ld.get('@type') == 
'BreadcrumbList': + for ile in (json_ld.get('itemListElement:') or []): + item = ile.get('item:') or {} + if item.get('@type') == 'TVEpisode': + item['@context'] = 'http://schema.org' + info = self._json_ld(item, video_id, fatal=False) or {} + break + + thumbnails = [] + thumbnail_url = try_get(params, lambda x: x['data-video-posterframe'], compat_str) + if thumbnail_url: + thumbnails.extend([{ + 'url': thumbnail_url.format(width=1920, height=1080, quality=100, blur=0, bg='false'), + 'width': 1920, + 'height': 1080, + }, { + 'url': urljoin(base_url(thumbnail_url), url_basename(thumbnail_url)), + 'preference': -2 + }]) + + thumbnail_url = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + }) + self._remove_duplicate_formats(thumbnails) + + return merge_dicts({ + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'formats': formats, + 'subtitles': self.extract_subtitles(video_id, variants, ios_playlist_url, headers), + 'duration': parse_duration(video_data.get('Duration')), + 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), + 'thumbnails': thumbnails + }, info) + + +class ITVBTCCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', + 'info_dict': { + 'id': 'btcc-2019-brands-hatch-gp-race-action', + 'title': 'BTCC 2019: Brands Hatch GP race action', + }, + 'playlist_count': 12, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + json_map = try_get(self._parse_json(self._html_search_regex( + '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id), + lambda x: x['props']['pageProps']['article']['body']['content']) or [] + + # Discard empty objects + video_ids = [] + for video in json_map: + if video['data'].get('id'): + video_ids.append(video['data']['id']) + + entries = [ + self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': url, + }), + ie=BrightcoveNewIE.ie_key(), video_id=video_id) + for video_id in video_ids] + + title = self._og_search_title(webpage, fatal=False) + + return self.playlist_result(entries, playlist_id, title) diff --git a/yt_dlp/extractor/ivi.py b/yt_dlp/extractor/ivi.py new file mode 100644 index 000000000..5f8a046e0 --- /dev/null +++ b/yt_dlp/extractor/ivi.py @@ -0,0 +1,268 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + qualities, +) + + +class IviIE(InfoExtractor): + IE_DESC = 'ivi.ru' + IE_NAME = 'ivi' + _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['RU'] + _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' + _LIGHT_URL = 'https://api.ivi.ru/light/' + + _TESTS = [ + # Single movie + { + 'url': 'http://www.ivi.ru/watch/53141', + 'md5': '6ff5be2254e796ed346251d117196cf4', + 
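+        # (Signing note for the 'light' API used in _real_extract below: for
+        # site 353 the request is authenticated with a Blowfish-CMAC over
+        # timestamp + request body, keyed with _LIGHT_KEY above. A minimal
+        # sketch of that step, assuming pycryptodomex is available:
+        #
+        #   from Cryptodome.Cipher import Blowfish
+        #   from Cryptodome.Hash import CMAC
+        #   sign = CMAC.new(_LIGHT_KEY, ts.encode() + content_data,
+        #                   Blowfish).hexdigest()
+        #
+        # where ts is the result of the da.timestamp.get call and content_data
+        # is the JSON-encoded da.content.get payload.)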
'info_dict': { + 'id': '53141', + 'ext': 'mp4', + 'title': 'Иван Васильевич меняет профессию', + 'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', + 'duration': 5498, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'Only works from Russia', + }, + # Serial's series + { + 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549', + 'md5': '221f56b35e3ed815fde2df71032f4b3e', + 'info_dict': { + 'id': '9549', + 'ext': 'mp4', + 'title': 'Двое из ларца - Дело Гольдберга (1 часть)', + 'series': 'Двое из ларца', + 'season': 'Сезон 1', + 'season_number': 1, + 'episode': 'Дело Гольдберга (1 часть)', + 'episode_number': 1, + 'duration': 2655, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'Only works from Russia', + }, + { + # with MP4-HD720 format + 'url': 'http://www.ivi.ru/watch/146500', + 'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e', + 'info_dict': { + 'id': '146500', + 'ext': 'mp4', + 'title': 'Кукла', + 'description': 'md5:ffca9372399976a2d260a407cc74cce6', + 'duration': 5599, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'Only works from Russia', + }, + { + 'url': 'https://www.ivi.tv/watch/33560/', + 'only_matching': True, + }, + ] + + # Sorted by quality + _KNOWN_FORMATS = ( + 'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', + 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = json.dumps({ + 'method': 'da.content.get', + 'params': [ + video_id, { + 'site': 's%d', + 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, + 'contentid': video_id + } + ] + }) + + for site in (353, 183): + content_data = (data % site).encode() + if site == 353: + try: + from Cryptodome.Cipher import Blowfish + from Cryptodome.Hash import CMAC + pycryptodome_found = True + except ImportError: + try: + from Crypto.Cipher import Blowfish + from Crypto.Hash import CMAC + pycryptodome_found = True + except ImportError: + pycryptodome_found = False + continue + + timestamp = (self._download_json( + self._LIGHT_URL, video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode(), fatal=False) or {}).get('result') + if not timestamp: + continue + + query = { + 'ts': timestamp, + 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(), + } + else: + query = {} + + video_json = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading video JSON', data=content_data, query=query) + + error = video_json.get('error') + if error: + origin = error.get('origin') + message = error.get('message') or error.get('user_message') + extractor_msg = 'Unable to download video %s' + if origin == 'NotAllowedForLocation': + self.raise_geo_restricted(message, self._GEO_COUNTRIES) + elif origin == 'NoRedisValidData': + extractor_msg = 'Video %s does not exist' + elif site == 353: + continue + elif not pycryptodome_found: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True) + elif message: + extractor_msg += ': ' + message + raise ExtractorError(extractor_msg % video_id, expected=True) + else: + break + + result = video_json['result'] + title = result['title'] + + quality = qualities(self._KNOWN_FORMATS) + + formats = [] + for f in result.get('files', []): + f_url = f.get('url') + content_format = f.get('content_format') + if not f_url: + continue + if (not self.get_param('allow_unplayable_formats') + and ('-MDRM-' in content_format or '-FPS-' in content_format)): + continue + formats.append({ + 'url': f_url, + 'format_id': content_format, + 'quality': quality(content_format), + 'filesize': int_or_none(f.get('size_in_bytes')), + }) + self._sort_formats(formats) + + compilation = result.get('compilation') + episode = title if compilation else None + + title = '%s - %s' % (compilation, title) if compilation is not None else title + + thumbnails = [{ + 'url': preview['url'], + 'id': preview.get('content_format'), + } for preview in result.get('preview', []) if preview.get('url')] + + webpage = self._download_webpage(url, video_id) + + season = self._search_regex( + r'<li[^>]+class="season active"[^>]*><a[^>]+>([^<]+)', + webpage, 'season', default=None) + season_number = int_or_none(self._search_regex( + r'<li[^>]+class="season active"[^>]*><a[^>]+data-season(?:-index)?="(\d+)"', + webpage, 'season number', default=None)) + + episode_number = int_or_none(self._search_regex( + r'[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)', + webpage, 'episode number', default=None)) + + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + + return { + 'id': video_id, + 'title': title, + 'series': compilation, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'thumbnails': thumbnails, + 'description': description, + 'duration': int_or_none(result.get('duration')), + 'formats': formats, + } + + +class IviCompilationIE(InfoExtractor): + IE_DESC = 'ivi.ru compilations' + IE_NAME = 'ivi:compilation' + _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' + _TESTS = [{ + 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa', + 'info_dict': { + 'id': 'dvoe_iz_lartsa', + 'title': 'Двое из ларца (2006 - 2008)', + }, + 'playlist_mincount': 24, + }, { + 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/season1', + 'info_dict': { + 'id': 'dvoe_iz_lartsa/season1', + 'title': 'Двое из ларца (2006 - 2008) 1 сезон', + }, + 'playlist_mincount': 12, + }] + + def _extract_entries(self, html, compilation_id): + return [ + self.url_result( + 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) + for serie in re.findall( + r'<a\b[^>]+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + compilation_id = mobj.group('compilationid') + season_id = mobj.group('seasonid') + + if season_id is not None: # Season link + season_page = self._download_webpage( + url, compilation_id, 'Downloading season %s web page' % season_id) + playlist_id = '%s/season%s' % (compilation_id, season_id) + playlist_title = self._html_search_meta('title', season_page, 'title') + entries = self._extract_entries(season_page, compilation_id) + else: # Compilation link + compilation_page = self._download_webpage(url, compilation_id, 'Downloading 
compilation web page') + playlist_id = compilation_id + playlist_title = self._html_search_meta('title', compilation_page, 'title') + seasons = re.findall( + r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page) + if not seasons: # No seasons in this compilation + entries = self._extract_entries(compilation_page, compilation_id) + else: + entries = [] + for season_id in seasons: + season_page = self._download_webpage( + 'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), + compilation_id, 'Downloading season %s web page' % season_id) + entries.extend(self._extract_entries(season_page, compilation_id)) + + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/yt_dlp/extractor/ivideon.py b/yt_dlp/extractor/ivideon.py new file mode 100644 index 000000000..01e7b22d4 --- /dev/null +++ b/yt_dlp/extractor/ivideon.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import qualities + + +class IvideonIE(InfoExtractor): + IE_NAME = 'ivideon' + IE_DESC = 'Ivideon TV' + _VALID_URL = r'https?://(?:www\.)?ivideon\.com/tv/(?:[^/]+/)*camera/(?P<id>\d+-[\da-f]+)/(?P<camera_id>\d+)' + _TESTS = [{ + 'url': 'https://www.ivideon.com/tv/camera/100-916ca13b5c4ad9f564266424a026386d/0/', + 'info_dict': { + 'id': '100-916ca13b5c4ad9f564266424a026386d', + 'ext': 'flv', + 'title': 're:^Касса [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Основное предназначение - запись действий кассиров. Плюс общий вид.', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru', + 'only_matching': True, + }, { + 'url': 'https://www.ivideon.com/tv/map/22.917923/-31.816406/16/camera/100-e7bc16c7d4b5bbd633fd5350b66dfa9a/0', + 'only_matching': True, + }] + + _QUALITIES = ('low', 'mid', 'hi') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + server_id, camera_id = mobj.group('id'), mobj.group('camera_id') + camera_name, description = None, None + camera_url = compat_urlparse.urljoin( + url, '/tv/camera/%s/%s/' % (server_id, camera_id)) + + webpage = self._download_webpage(camera_url, server_id, fatal=False) + if webpage: + config_string = self._search_regex( + r'var\s+config\s*=\s*({.+?});', webpage, 'config', default=None) + if config_string: + config = self._parse_json(config_string, server_id, fatal=False) + camera_info = config.get('ivTvAppOptions', {}).get('currentCameraInfo') + if camera_info: + camera_name = camera_info.get('camera_name') + description = camera_info.get('misc', {}).get('description') + if not camera_name: + camera_name = self._html_search_meta( + 'name', webpage, 'camera name', default=None) or self._search_regex( + r'<h1[^>]+class="b-video-title"[^>]*>([^<]+)', webpage, 'camera name', default=None) + + quality = qualities(self._QUALITIES) + + formats = [{ + 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({ + 'server': server_id, + 'camera': camera_id, + 'sessionId': 'demo', + 'q': quality(format_id), + }), + 'format_id': format_id, + 'ext': 'flv', + 'quality': quality(format_id), + } for format_id in self._QUALITIES] + self._sort_formats(formats) + + return { + 'id': server_id, + 'title': self._live_title(camera_name or server_id), + 'description': description, + 'is_live': True, + 'formats': formats, + } diff 
--git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py new file mode 100644 index 000000000..254d98692 --- /dev/null +++ b/yt_dlp/extractor/iwara.py @@ -0,0 +1,123 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + int_or_none, + mimetype2ext, + remove_end, + url_or_none, + unified_strdate, + strip_or_none, +) + + +class IwaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', + # md5 is unstable + 'info_dict': { + 'id': 'amVwUl1EHpAD9RD', + 'ext': 'mp4', + 'title': '【MMD R-18】ガールフレンド carry_me_off', + 'age_limit': 18, + 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png', + 'uploader': 'Reimu丨Action', + 'upload_date': '20150828', + 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f', + }, + }, { + 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', + 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0', + 'info_dict': { + 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', + 'ext': 'mp4', + 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4', + 'age_limit': 18, + }, + 'add_ie': ['GoogleDrive'], + }, { + 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', + # md5 is unstable + 'info_dict': { + 'id': '6liAP9s2Ojc', + 'ext': 'mp4', + 'age_limit': 18, + 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', + 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', + 'upload_date': '20160910', + 'uploader': 'aMMDsork', + 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname + # ecchi is 'sexy' in Japanese + age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 + + video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id) + + if not video_data: + iframe_url = self._html_search_regex( + r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', + webpage, 'iframe URL', group='url') + return { + '_type': 'url_transparent', + 'url': iframe_url, + 'age_limit': age_limit, + } + + title = remove_end(self._html_search_regex( + r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara') + + thumbnail = self._html_search_regex( + r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) + + uploader = self._html_search_regex( + r'class="username">([^<]+)', webpage, 'uploader', fatal=False) + + upload_date = unified_strdate(self._html_search_regex( + r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False)) + + description = strip_or_none(self._search_regex( + r'<p>(.+?(?=</div))', webpage, 'description', fatal=False, + flags=re.DOTALL)) + + formats = [] + for a_format in video_data: + format_uri = url_or_none(a_format.get('uri')) + if not format_uri: + continue + format_id = a_format.get('resolution') + height = int_or_none(self._search_regex( + r'(\d+)p', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(format_uri, 'https:'), + 'format_id': format_id, + 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', + 'height': height, + 'width': int_or_none(height / 9.0 * 16.0 if height else None), + 'quality': 1 if format_id == 'Source' else 0, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 
'title': title, + 'age_limit': age_limit, + 'formats': formats, + 'thumbnail': self._proto_relative_url(thumbnail, 'https:'), + 'uploader': uploader, + 'upload_date': upload_date, + 'description': description, + } diff --git a/youtube_dl/extractor/izlesene.py b/yt_dlp/extractor/izlesene.py index f8fca6c8f..f8fca6c8f 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/yt_dlp/extractor/izlesene.py diff --git a/youtube_dl/extractor/jamendo.py b/yt_dlp/extractor/jamendo.py index 1db7c64af..1db7c64af 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/yt_dlp/extractor/jamendo.py diff --git a/yt_dlp/extractor/jeuxvideo.py b/yt_dlp/extractor/jeuxvideo.py new file mode 100644 index 000000000..77c0f520c --- /dev/null +++ b/yt_dlp/extractor/jeuxvideo.py @@ -0,0 +1,55 @@ +# coding: utf-8 + +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class JeuxVideoIE(InfoExtractor): + _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' + + _TESTS = [{ + 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', + 'md5': '046e491afb32a8aaac1f44dd4ddd54ee', + 'info_dict': { + 'id': '114765', + 'ext': 'mp4', + 'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité', + 'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.', + }, + }, { + 'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + title = mobj.group(1) + webpage = self._download_webpage(url, title) + title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) + config_url = self._html_search_regex( + r'data-src(?:set-video)?="(/contenu/medias/video\.php.*?)"', + webpage, 'config URL') + config_url = 'http://www.jeuxvideo.com' + config_url + + video_id = self._search_regex( + r'id=(\d+)', + config_url, 'video ID') + + config = self._download_json( + config_url, title, 'Downloading JSON config') + + formats = [{ + 'url': source['file'], + 'format_id': source['label'], + 'resolution': source['label'], + } for source in reversed(config['sources'])] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': config.get('image'), + } diff --git a/yt_dlp/extractor/joj.py b/yt_dlp/extractor/joj.py new file mode 100644 index 000000000..637618183 --- /dev/null +++ b/yt_dlp/extractor/joj.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + try_get, +) + + +class JojIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + joj:| + https?://media\.joj\.sk/embed/ + ) + (?P<id>[^/?#^]+) + ''' + _TESTS = [{ + 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', + 'info_dict': { + 'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932', + 'ext': 'mp4', + 'title': 'NOVÉ BÝVANIE', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3118, + } + }, { + 'url': 'https://media.joj.sk/embed/9i1cxv', + 'only_matching': True, + }, { + 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', + 'only_matching': True, + }, { + 'url': 'joj:9i1cxv', + 'only_matching': True, + }] + + @staticmethod + def 
_extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://media.joj.sk/embed/%s' % video_id, video_id) + + title = self._search_regex( + (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)'), webpage, 'title', + default=None, group='title') or self._og_search_title(webpage) + + bitrates = self._parse_json( + self._search_regex( + r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + + formats = [] + for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []: + if isinstance(format_url, compat_str): + height = self._search_regex( + r'(\d+)[pP]\.', format_url, 'height', default=None) + formats.append({ + 'url': format_url, + 'format_id': '%sp' % height if height else None, + # the URL may carry no <height>p token; int_or_none avoids int(None) + 'height': int_or_none(height), + }) + if not formats: + playlist = self._download_xml( + 'https://media.joj.sk/services/Video.php?clip=%s' % video_id, + video_id) + for file_el in playlist.findall('./files/file'): + path = file_el.get('path') + if not path: + continue + format_id = file_el.get('id') or file_el.get('label') + formats.append({ + 'url': 'http://n16.joj.sk/storage/%s' % path.replace( + 'dat/', '', 1), + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', format_id or path, 'height', + default=None)), + }) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/yt_dlp/extractor/jove.py b/yt_dlp/extractor/jove.py new file mode 100644 index 000000000..4b7dfc526 --- /dev/null +++ b/yt_dlp/extractor/jove.py @@ -0,0 +1,79 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + unified_strdate +) + + +class JoveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)' + _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}' + _TESTS = [ + { + 'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current', + 'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b', + 'info_dict': { + 'id': '2744', + 'ext': 'mp4', + 'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation', + 'description': 'md5:015dd4509649c0908bc27f049e0262c6', + 'thumbnail': r're:^https?://.*\.png$', + 'upload_date': '20110523', + } + }, + { + 'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation', + 'md5': '914aeb356f416811d911996434811beb', + 'info_dict': { + 'id': '51796', + 'ext': 'mp4', + 'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment', + 'description': 'md5:35ff029261900583970c4023b70f1dc9', + 'thumbnail': r're:^https?://.*\.png$', + 'upload_date': '20140802', + } + }, + + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + chapters_id = self._html_search_regex( 
r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id') + + chapters_xml = self._download_xml( + self._CHAPTERS_URL.format(video_id=chapters_id), + video_id, note='Downloading chapters XML', + errnote='Failed to download chapters XML') + + video_url = chapters_xml.attrib.get('video') + if not video_url: + raise ExtractorError('Failed to get the video URL') + + title = self._html_search_meta('citation_title', webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>', + webpage, 'description', fatal=False) + publish_date = unified_strdate(self._html_search_meta( + 'citation_publication_date', webpage, 'publish date', fatal=False)) + # the num_comments meta tag is not always present; int_or_none avoids int(None) + comment_count = int_or_none(self._html_search_regex( + r'<meta name="num_comments" content="(\d+) Comments?"', + webpage, 'comment count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'description': description, + 'upload_date': publish_date, + 'comment_count': comment_count, + } diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py new file mode 100644 index 000000000..5aa508bf9 --- /dev/null +++ b/yt_dlp/extractor/jwplatform.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unsmuggle_url + + +class JWPlatformIE(InfoExtractor): + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [{ + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. 
It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + }, { + 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + urls = JWPlatformIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): + # <input value=URL> is used by hyland.com + # if we find an <iframe>, don't look for an <input> + ret = re.findall( + r'<%s[^>]+?%s=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), + webpage) + if ret: + return ret + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + video_id = self._match_id(url) + json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id) + return self._parse_jwplayer_data(json_data, video_id) diff --git a/yt_dlp/extractor/kakao.py b/yt_dlp/extractor/kakao.py new file mode 100644 index 000000000..97c986d8c --- /dev/null +++ b/yt_dlp/extractor/kakao.py @@ -0,0 +1,132 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, +) + + +class KakaoIE(InfoExtractor): + _VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P<id>\d+|[^?#&]+@my)' + _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/playmeta/cliplink/%s/' + _CDN_API = 'https://tv.kakao.com/katz/v1/ft/cliplink/%s/readyNplay?' + + _TESTS = [{ + 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', + 'md5': '702b2fbdeb51ad82f5c904e8c0766340', + 'info_dict': { + 'id': '301965083', + 'ext': 'mp4', + 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', + 'uploader_id': 2671005, + 'uploader': '그랑그랑이', + 'timestamp': 1488160199, + 'upload_date': '20170227', + } + }, { + 'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': '300103180', + 'ext': 'mp4', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'uploader_id': 2653210, + 'uploader': '쇼! 
음악중심', + 'timestamp': 1485684628, + 'upload_date': '20170129', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + api_base = self._API_BASE_TMPL % video_id + cdn_api_base = self._CDN_API % video_id + + query = { + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + 'fields': ','.join([ + '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', + 'description', 'channelId', 'createTime', 'duration', 'playCount', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', + 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) + } + + api_json = self._download_json( + api_base, video_id, 'Downloading video info') + + clip_link = api_json['clipLink'] + clip = clip_link['clip'] + + title = clip.get('title') or clip_link.get('displayTitle') + + formats = [] + for fmt in clip.get('videoOutputList', []): + profile_name = fmt.get('profile') + if not profile_name or profile_name == 'AUDIO': + continue + query.update({ + 'profile': profile_name, + 'fields': '-*,url', + }) + + fmt_url_json = self._download_json( + cdn_api_base, video_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, fatal=False) + fmt_url = traverse_obj(fmt_url_json, ('videoLocation', 'url')) + if not fmt_url: + continue + + formats.append({ + 'url': fmt_url, + 'format_id': profile_name, + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + 'format_note': fmt.get('label'), + 'filesize': int_or_none(fmt.get('filesize')), + 'tbr': int_or_none(fmt.get('kbps')), + }) + self._sort_formats(formats) + + thumbs = [] + for thumb in clip.get('clipChapterThumbnailList') or []: + thumbs.append({ + 'url': thumb.get('thumbnailUrl'), + 'id': compat_str(thumb.get('timeInSec')), + 'preference': -1 if thumb.get('isDefault') else 0 + }) + top_thumbnail = clip.get('thumbnailUrl') + if top_thumbnail: + thumbs.append({ + 'url': top_thumbnail, + 'preference': 10, + }) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(clip.get('description')), + 'uploader': traverse_obj(clip_link, ('channel', 'name')), + 'uploader_id': clip_link.get('channelId'), + 'thumbnails': thumbs, + 'timestamp': unified_timestamp(clip_link.get('createTime')), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + 'formats': formats, + 'tags': clip.get('tagList'), + } diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py new file mode 100644 index 000000000..c8f60ef45 --- /dev/null +++ b/yt_dlp/extractor/kaltura.py @@ -0,0 +1,377 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import base64 + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_parse_qs, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + unsmuggle_url, + smuggle_url, +) + + +class KalturaIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)| + https?:// + (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ + (?: + (?: + # flash player + index\.php/(?:kwidget|extwidget/preview)| + # html5 player + html5/html5lib/[^/]+/mwEmbedFrame\.php + ) + )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))? 
+ ) + ''' + _SERVICE_URL = 'http://cdnapi.kaltura.com' + _SERVICE_BASE = '/api_v3/index.php' + # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php + _CAPTION_TYPES = { + 1: 'srt', + 2: 'ttml', + 3: 'vtt', + } + _TESTS = [ + { + 'url': 'kaltura:269692:1_1jc2y3e4', + 'md5': '3adcbdb3dcc02d647539e53f284ba171', + 'info_dict': { + 'id': '1_1jc2y3e4', + 'ext': 'mp4', + 'title': 'Straight from the Heart', + 'upload_date': '20131219', + 'uploader_id': 'mlundberg@wolfgangsvault.com', + 'description': 'The Allman Brothers Band, 12/16/1981', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + }, + }, + { + 'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4', + 'only_matching': True, + }, + { + 'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3', + 'only_matching': True, + }, + { + 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', + 'only_matching': True, + }, + { + # video with subtitles + 'url': 'kaltura:111032:1_cw786r8q', + 'only_matching': True, + }, + { + # video with ttml subtitles (no fileExt) + 'url': 'kaltura:1926081:0_l5ye1133', + 'info_dict': { + 'id': '0_l5ye1133', + 'ext': 'mp4', + 'title': 'What Can You Do With Python?', + 'upload_date': '20160221', + 'uploader_id': 'stork', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/', + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.kaltura.com/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto', + 'only_matching': True, + }, + { + 'url': 'https://www.kaltura.com:443/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto', + 'only_matching': True, + }, + { + # unavailable source format + 'url': 'kaltura:513551:1_66x4rg7o', + 'only_matching': True, + } + ] + + @staticmethod + def _extract_url(webpage): + urls = KalturaIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site + finditer = ( + list(re.finditer( + r"""(?xs) + kWidget\.(?:thumb)?[Ee]mbed\( + \{.*? + (?P<q1>['"])wid(?P=q1)\s*:\s* + (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*? + (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) + """, webpage)) + or list(re.finditer( + r'''(?xs) + (?P<q1>["']) + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)* + (?P=q1).*? 
+ (?: + (?: + entry_?[Ii]d| + (?P<q2>["'])entry_?[Ii]d(?P=q2) + )\s*:\s*| + \[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* + ) + (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3) + ''', webpage)) + or list(re.finditer( + r'''(?xs) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])\s* + (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + (?:(?!(?P=q1)).)* + [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) + (?:(?!(?P=q1)).)* + (?P=q1) + ''', webpage)) + ) + urls = [] + for mobj in finditer: + embed_info = mobj.groupdict() + for k, v in embed_info.items(): + if v: + embed_info[k] = v.strip() + url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + escaped_pid = re.escape(embed_info['partner_id']) + service_mobj = re.search( + r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), + webpage) + if service_mobj: + url = smuggle_url(url, {'service_url': service_mobj.group('id')}) + urls.append(url) + return urls + + def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): + params = actions[0] + if len(actions) > 1: + for i, a in enumerate(actions[1:], start=1): + for k, v in a.items(): + params['%d:%s' % (i, k)] = v + + data = self._download_json( + (service_url or self._SERVICE_URL) + self._SERVICE_BASE, + video_id, query=params, *args, **kwargs) + + status = data if len(actions) == 1 else data[0] + if status.get('objectType') == 'KalturaAPIException': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, status['message'])) + + return data + + def _get_video_info(self, video_id, partner_id, service_url=None): + actions = [ + { + 'action': 'null', + 'apiVersion': '3.1.5', + 'clientTag': 'kdp:v3.8.5', + 'format': 1, # JSON, 2 = XML, 3 = PHP + 'service': 'multirequest', + }, + { + 'expiry': 86400, + 'service': 'session', + 'action': 'startWidgetSession', + 'widgetId': '_%s' % partner_id, + }, + { + 'action': 'get', + 'entryId': video_id, + 'service': 'baseentry', + 'ks': '{1:result:ks}', + 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + 'responseProfile:type': 1, + }, + { + 'action': 'getbyentryid', + 'entryId': video_id, + 'service': 'flavorAsset', + 'ks': '{1:result:ks}', + }, + { + 'action': 'list', + 'filter:entryIdEqual': video_id, + 'service': 'caption_captionasset', + 'ks': '{1:result:ks}', + }, + ] + return self._kaltura_api_call( + video_id, actions, service_url, note='Downloading video info JSON') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = self._match_valid_url(url) + partner_id, entry_id = mobj.group('partner_id', 'id') + ks = None + captions = None + if partner_id and entry_id: + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) + else: + path, query = mobj.group('path', 'query') + if not path and not query: + raise ExtractorError('Invalid URL', expected=True) + params = {} + if query: + params = compat_parse_qs(query) + if path: + splitted_path = path.split('/') + params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) + if 'wid' in params: + partner_id = params['wid'][0][1:] + elif 'p' in params: + partner_id = params['p'][0] + elif 'partner_id' in params: + partner_id = params['partner_id'][0] + else: + raise ExtractorError('Invalid URL', expected=True) + if 'entry_id' in params: + entry_id = params['entry_id'][0] + _, info, flavor_assets, captions = 
self._get_video_info(entry_id, partner_id) + elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: + reference_id = params['flashvars[referenceId]'][0] + webpage = self._download_webpage(url, reference_id) + entry_data = self._parse_json(self._search_regex( + r'window\.kalturaIframePackageData\s*=\s*({.*});', + webpage, 'kalturaIframePackageData'), + reference_id)['entryResult'] + info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] + entry_id = info['id'] + # Unfortunately, data returned in kalturaIframePackageData lacks + # captions so we will try requesting the complete data using + # regular approach since we now know the entry_id + try: + _, info, flavor_assets, captions = self._get_video_info( + entry_id, partner_id) + except ExtractorError: + # Regular scenario failed but we already have everything + # extracted apart from captions and can process at least + # with this + pass + else: + raise ExtractorError('Invalid URL', expected=True) + ks = params.get('flashvars[ks]', [None])[0] + + source_url = smuggled_data.get('source_url') + if source_url: + referrer = base64.b64encode( + '://'.join(compat_urlparse.urlparse(source_url)[:2]) + .encode('utf-8')).decode('utf-8') + else: + referrer = None + + def sign_url(unsigned_url): + if ks: + unsigned_url += '/ks/%s' % ks + if referrer: + unsigned_url += '?referrer=%s' % referrer + return unsigned_url + + data_url = info['dataUrl'] + if '/flvclipper/' in data_url: + data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) + + formats = [] + for f in flavor_assets: + # Continue if asset is not ready + if f.get('status') != 2: + continue + # Original format that's not available (e.g. kaltura:1926081:0_c03e1b5g) + # skip for now. + if f.get('fileExt') == 'chun': + continue + # DRM-protected video, cannot be decrypted + if not self.get_param('allow_unplayable_formats') and f.get('fileExt') == 'wvm': + continue + if not f.get('fileExt'): + # QT indicates QuickTime; some videos have broken fileExt + if f.get('containerFormat') == 'qt': + f['fileExt'] = 'mov' + else: + f['fileExt'] = 'mp4' + video_url = sign_url( + '%s/flavorId/%s' % (data_url, f['id'])) + format_id = '%(fileExt)s-%(bitrate)s' % f + # Source format may not be available (e.g. kaltura:513551:1_66x4rg7o) + if f.get('isOriginal') is True and not self._is_valid_url( + video_url, entry_id, format_id): + continue + # audio-only has no videoCodecId (e.g. 
kaltura:1926081:0_c03e1b5g + # -f mp4-56) + vcodec = 'none' if 'videoCodecId' not in f and f.get( + 'frameRate') == 0 else f.get('videoCodecId') + formats.append({ + 'format_id': format_id, + 'ext': f.get('fileExt'), + 'tbr': int_or_none(f['bitrate']), + 'fps': int_or_none(f.get('frameRate')), + 'filesize_approx': int_or_none(f.get('size'), invscale=1024), + 'container': f.get('containerFormat'), + 'vcodec': vcodec, + 'height': int_or_none(f.get('height')), + 'width': int_or_none(f.get('width')), + 'url': video_url, + }) + if '/playManifest/' in data_url: + m3u8_url = sign_url(data_url.replace( + 'format/url', 'format/applehttp')) + formats.extend(self._extract_m3u8_formats( + m3u8_url, entry_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + subtitles = {} + if captions: + for caption in captions.get('objects', []): + # Continue if caption is not ready + if caption.get('status') != 2: + continue + if not caption.get('id'): + continue + caption_format = int_or_none(caption.get('format')) + subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ + 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), + 'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml', + }) + + return { + 'id': entry_id, + 'title': info['name'], + 'formats': formats, + 'subtitles': subtitles, + 'description': clean_html(info.get('description')), + 'thumbnail': info.get('thumbnailUrl'), + 'duration': info.get('duration'), + 'timestamp': info.get('createdAt'), + 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None, + 'view_count': info.get('plays'), + } diff --git a/yt_dlp/extractor/kanalplay.py b/yt_dlp/extractor/kanalplay.py new file mode 100644 index 000000000..5e24f7e21 --- /dev/null +++ b/yt_dlp/extractor/kanalplay.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + srt_subtitles_timecode, +) + + +class KanalPlayIE(InfoExtractor): + IE_DESC = 'Kanal 5/9/11 Play' + _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', + 'info_dict': { + 'id': '3270012277', + 'ext': 'flv', + 'title': 'Saknar både dusch och avlopp', + 'description': 'md5:6023a95832a06059832ae93bc3c7efb7', + 'duration': 2636.36, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', + 'only_matching': True, + }, { + 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', + 'only_matching': True, + }] + + def _fix_subtitles(self, subs): + return '\r\n\r\n'.join( + '%s\r\n%s --> %s\r\n%s' + % ( + num, + srt_subtitles_timecode(item['startMillis'] / 1000.0), + srt_subtitles_timecode(item['endMillis'] / 1000.0), + item['text'], + ) for num, item in enumerate(subs, 1)) + + def _get_subtitles(self, channel_id, video_id): + subs = self._download_json( + 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), + video_id, 'Downloading subtitles JSON', fatal=False) + return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + channel_id = 
mobj.group('channel_id') + + video = self._download_json( + 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), + video_id) + + reasons_for_no_streams = video.get('reasonsForNoStreams') + if reasons_for_no_streams: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), + expected=True) + + title = video['title'] + description = video.get('description') + duration = float_or_none(video.get('length'), 1000) + thumbnail = video.get('posterUrl') + + stream_base_url = video['streamBaseUrl'] + + formats = [{ + 'url': stream_base_url, + 'play_path': stream['source'], + 'ext': 'flv', + 'tbr': float_or_none(stream.get('bitrate'), 1000), + 'rtmp_real_time': True, + } for stream in video['streams']] + self._sort_formats(formats) + + subtitles = {} + if video.get('hasSubtitle'): + subtitles = self.extract_subtitles(channel_id, video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/kankan.py b/yt_dlp/extractor/kankan.py index a677ff447..a677ff447 100644 --- a/youtube_dl/extractor/kankan.py +++ b/yt_dlp/extractor/kankan.py diff --git a/youtube_dl/extractor/karaoketv.py b/yt_dlp/extractor/karaoketv.py index bfccf89b0..bfccf89b0 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/yt_dlp/extractor/karaoketv.py diff --git a/youtube_dl/extractor/karrierevideos.py b/yt_dlp/extractor/karrierevideos.py index 7b291e0a0..7b291e0a0 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/yt_dlp/extractor/karrierevideos.py diff --git a/yt_dlp/extractor/keezmovies.py b/yt_dlp/extractor/keezmovies.py new file mode 100644 index 000000000..027f43cf0 --- /dev/null +++ b/yt_dlp/extractor/keezmovies.py @@ -0,0 +1,133 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..aes import aes_decrypt_text +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + str_to_int, + strip_or_none, + url_or_none, +) + + +class KeezMoviesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681', + 'md5': '2ac69cdb882055f71d82db4311732a1a', + 'info_dict': { + 'id': '18070681', + 'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money', + 'ext': 'mp4', + 'title': 'Arab wife want it so bad I see she thirsty and has tiny money.', + 'thumbnail': None, + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'http://www.keezmovies.com/video/18070681', + 'only_matching': True, + }] + + def _extract_info(self, url, fatal=True): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = (mobj.group('display_id') + if 'display_id' in mobj.groupdict() + else None) or mobj.group('id') + + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) + + formats = [] + format_urls = set() + + title = None + thumbnail = None + duration = None + encrypted = False + + def extract_format(format_url, height=None): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = 
int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + # decrypt the format URL being processed; the outer video_url + # is not yet assigned when this closure runs for quality_* keys + format_url = aes_decrypt_text( + format_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(compat_urllib_parse_unquote(video_url)) + + if not formats: + if 'title="This video is no longer available"' in webpage: + self.raise_no_formats( + 'Video %s is no longer available' % video_id, expected=True) + + try: + self._sort_formats(formats) + except ExtractorError: + if fatal: + raise + + if not title: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title') + + return webpage, { + 'id': video_id, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, + 'formats': formats, + } + + def _real_extract(self, url): + webpage, info = self._extract_info(url, fatal=False) + if not info['formats']: + return self.url_result(url, 'Generic') + info['view_count'] = str_to_int(self._search_regex( + r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) + return info diff --git a/youtube_dl/extractor/ketnet.py b/yt_dlp/extractor/ketnet.py index e0599d02f..e0599d02f 100644 --- a/youtube_dl/extractor/ketnet.py +++ b/yt_dlp/extractor/ketnet.py diff --git a/youtube_dl/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 87e520378..87e520378 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py diff --git a/youtube_dl/extractor/kickstarter.py b/yt_dlp/extractor/kickstarter.py index d4da8f484..d4da8f484 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/yt_dlp/extractor/kickstarter.py diff --git a/yt_dlp/extractor/kinja.py b/yt_dlp/extractor/kinja.py new file mode 100644 index 000000000..1be8b4809 --- /dev/null +++ b/yt_dlp/extractor/kinja.py @@ -0,0 +1,221 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + unescapeHTML, + urljoin, +) + + +class KinjaEmbedIE(InfoExtractor): + IE_NAME = 'kinja:embed' + _DOMAIN_REGEX = r'''(?:[^.]+\.)? 
+ (?: + avclub| + clickhole| + deadspin| + gizmodo| + jalopnik| + jezebel| + kinja| + kotaku| + lifehacker| + splinternews| + the(?:inventory|onion|root|takeout) + )\.com''' + _COMMON_REGEX = r'''/ + (?: + ajax/inset| + embed/video + )/iframe\?.*?\bid=''' + _VALID_URL = r'''(?x)https?://%s%s + (?P<type> + fb| + imgur| + instagram| + jwp(?:layer)?-video| + kinjavideo| + mcp| + megaphone| + ooyala| + soundcloud(?:-playlist)?| + tumblr-post| + twitch-stream| + twitter| + ustream-channel| + vimeo| + vine| + youtube-(?:list|video) + )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + _TESTS = [{ + 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE', + 'only_matching': True, + }] + _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform') + _PROVIDER_MAP = { + 'fb': ('facebook.com/video.php?v=', 'Facebook'), + 'imgur': ('imgur.com/', 'Imgur'), + 'instagram': ('instagram.com/p/', 'Instagram'), + 'jwplayer-video': _JWPLATFORM_PROVIDER, + 'jwp-video': _JWPLATFORM_PROVIDER, + 'megaphone': ('player.megaphone.fm/', 'Generic'), + 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), + 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), + 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), + 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), + 'twitch-stream': ('twitch.tv/', 'TwitchStream'), + 'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'), + 'ustream-channel': ('ustream.tv/embed/', 'Ustream'), + 'vimeo': ('vimeo.com/', 'Vimeo'), + 'vine': ('vine.co/v/', 'Vine'), + 'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'), + 'youtube-video': ('youtube.com/embed/', 'Youtube'), + } + + @staticmethod + def _extract_urls(webpage, url): + return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( + r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), + webpage)] + + def _real_extract(self, url): + video_type, video_id = self._match_valid_url(url).groups() 
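+ # A sketch of the dispatch below, based on the test URLs above: the embed
+ # type matched from the URL selects a (url_prefix, extractor) pair from
+ # _PROVIDER_MAP, e.g.
+ #   kinja.com/ajax/inset/iframe?id=vimeo-120153502
+ #     -> 'http://vimeo.com/120153502', handed off to the Vimeo extractor
+ # 'tumblr-post' and 'youtube-list' ids carry two components and go through
+ # the %-template entries instead, while 'kinjavideo' and 'mcp' ids are
+ # resolved through Kinja's own APIs further down.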
+ + provider = self._PROVIDER_MAP.get(video_type) + if provider: + video_id = compat_urllib_parse_unquote(video_id) + if video_type == 'tumblr-post': + video_id, blog = video_id.split('-', 1) + result_url = provider[0] % (blog, video_id) + elif video_type == 'youtube-list': + video_id, playlist_id = video_id.split('/') + result_url = provider[0] % (video_id, playlist_id) + else: + if video_type == 'ooyala': + video_id = video_id.split('/')[0] + result_url = provider[0] + video_id + return self.url_result('http://' + result_url, provider[1]) + + if video_type == 'kinjavideo': + data = self._download_json( + 'https://kinja.com/api/core/video/views/videoById', + video_id, query={'videoId': video_id})['data'] + title = data['title'] + + formats = [] + for k in ('signedPlaylist', 'streaming'): + m3u8_url = data.get(k + 'Url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + thumbnail = None + poster = data.get('poster') or {} + poster_id = poster.get('id') + if poster_id: + thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'formats': formats, + 'tags': data.get('tags'), + 'timestamp': int_or_none(try_get( + data, lambda x: x['postInfo']['publishTimeMillis']), 1000), + 'thumbnail': thumbnail, + 'uploader': data.get('network'), + } + else: + video_data = self._download_json( + 'https://api.vmh.univision.com/metadata/v1/content/' + video_id, + video_id)['videoMetadata'] + iptc = video_data['photoVideoMetadataIPTC'] + title = iptc['title']['en'] + fmg = video_data.get('photoVideoMetadata_fmg') or {} + tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' + data = self._download_json( + tvss_domain + '/api/v3/video-auth/url-signature-tokens', + video_id, query={'mcpids': video_id})['data'][0] + formats = [] + + rendition_url = data.get('renditionUrl') + if rendition_url: + formats = self._extract_m3u8_formats( + rendition_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + + fallback_rendition_url = data.get('fallbackRenditionUrl') + if fallback_rendition_url: + formats.append({ + 'format_id': 'fallback', + 'tbr': int_or_none(self._search_regex( + r'_(\d+)\.mp4', fallback_rendition_url, + 'bitrate', default=None)), + 'url': fallback_rendition_url, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'uploader': fmg.get('network'), + 'duration': int_or_none(iptc.get('fileDuration')), + 'formats': formats, + 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'timestamp': parse_iso8601(iptc.get('dateReleased')), + } diff --git a/youtube_dl/extractor/kinopoisk.py b/yt_dlp/extractor/kinopoisk.py index 9e8d01f53..9e8d01f53 100644 --- a/youtube_dl/extractor/kinopoisk.py +++ b/yt_dlp/extractor/kinopoisk.py diff --git a/youtube_dl/extractor/konserthusetplay.py b/yt_dlp/extractor/konserthusetplay.py index dd42bb2f2..dd42bb2f2 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/yt_dlp/extractor/konserthusetplay.py diff --git a/yt_dlp/extractor/koo.py b/yt_dlp/extractor/koo.py new file mode 100644 index 000000000..1706b28a0 --- /dev/null +++ b/yt_dlp/extractor/koo.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals 
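+# Extraction outline (a sketch of the flow implemented below): the koo id
+# from the URL is looked up via the JSON endpoint
+# https://www.kooapp.com/apiV1/ku/<id>?limit=20&offset=0&showSimilarKoos=true,
+# the item whose id matches is picked out of 'parentContent', and its
+# 'mediaMap' carries the direct MP4 ('videoMp4') and HLS ('videoHls') sources.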
+from .common import InfoExtractor +from ..utils import ( + clean_html, + try_get, +) + + +class KooIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)' + _TESTS = [{ # Test for video in the comments + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde', + 'info_dict': { + 'id': '946c4189-bc2d-4524-b95b-43f641e2adde', + 'ext': 'mp4', + 'title': 'test for video in comment', + 'description': 'md5:daa77dc214add4da8b6ea7d2226776e7', + 'timestamp': 1632215195, + 'uploader_id': 'ytdlpTestAccount', + 'uploader': 'yt-dlpTestAccount', + 'duration': 7000, + 'upload_date': '20210921' + }, + 'params': {'skip_download': True} + }, { # Test for koo with long title + 'url': 'https://www.kooapp.com/koo/laxman_kumarDBFEC/33decbf7-5e1e-4bb8-bfd7-04744a064361', + 'info_dict': { + 'id': '33decbf7-5e1e-4bb8-bfd7-04744a064361', + 'ext': 'mp4', + 'title': 'md5:47a71c2337295330c5a19a8af1bbf450', + 'description': 'md5:06a6a84e9321499486dab541693d8425', + 'timestamp': 1632106884, + 'uploader_id': 'laxman_kumarDBFEC', + 'uploader': 'Laxman Kumar 🇮🇳', + 'duration': 46000, + 'upload_date': '20210920' + }, + 'params': {'skip_download': True} + }, { # Test for audio + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a2a9c88e-ce4b-4d2d-952f-d06361c5b602', + 'info_dict': { + 'id': 'a2a9c88e-ce4b-4d2d-952f-d06361c5b602', + 'ext': 'mp4', + 'title': 'Test for audio', + 'description': 'md5:ecb9a2b6a5d34b736cecb53788cb11e8', + 'timestamp': 1632211634, + 'uploader_id': 'ytdlpTestAccount', + 'uploader': 'yt-dlpTestAccount', + 'duration': 214000, + 'upload_date': '20210921' + }, + 'params': {'skip_download': True} + }, { # Test for video + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1', + 'info_dict': { + 'id': 'a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1', + 'ext': 'mp4', + 'title': 'Test for video', + 'description': 'md5:7afc4eb839074ddeb2beea5dd6fe9500', + 'timestamp': 1632211468, + 'uploader_id': 'ytdlpTestAccount', + 'uploader': 'yt-dlpTestAccount', + 'duration': 14000, + 'upload_date': '20210921' + }, + 'params': {'skip_download': True} + }, { # Test for link + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/01bf5b94-81a5-4d8e-a387-5f732022e15a', + 'skip': 'No video/audio found at the provided url.', + 'info_dict': { + 'id': '01bf5b94-81a5-4d8e-a387-5f732022e15a', + 'title': 'Test for link', + 'ext': 'none', + }, + }, { # Test for images + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/dc05d9cd-a61d-45fd-bb07-e8019d8ca8cb', + 'skip': 'No video/audio found at the provided url.', + 'info_dict': { + 'id': 'dc05d9cd-a61d-45fd-bb07-e8019d8ca8cb', + 'title': 'Test for images', + 'ext': 'none', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://www.kooapp.com/apiV1/ku/{id}?limit=20&offset=0&showSimilarKoos=true', id)['parentContent'] + item_json = next(content['items'][0] for content in data_json + if try_get(content, lambda x: x['items'][0]['id']) == id) + media_json = item_json['mediaMap'] + formats = [] + + mp4_url = media_json.get('videoMp4') + video_m3u8_url = media_json.get('videoHls') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'ext': 'mp4', + }) + if video_m3u8_url: + formats.extend(self._extract_m3u8_formats(video_m3u8_url, id, fatal=False, ext='mp4')) + if not formats: + self.raise_no_formats('No video/audio found at the provided url.', expected=True) + + self._sort_formats(formats) + return { + 'id': id, + 'title': 
clean_html(item_json.get('title')), + 'description': f'{clean_html(item_json.get("title"))}\n\n{clean_html(item_json.get("enTransliteration"))}', + 'timestamp': item_json.get('createdAt'), + 'uploader_id': item_json.get('handle'), + 'uploader': item_json.get('name'), + 'duration': media_json.get('duration'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/krasview.py b/yt_dlp/extractor/krasview.py index d27d052ff..d27d052ff 100644 --- a/youtube_dl/extractor/krasview.py +++ b/yt_dlp/extractor/krasview.py diff --git a/youtube_dl/extractor/ku6.py b/yt_dlp/extractor/ku6.py index a574408e5..a574408e5 100644 --- a/youtube_dl/extractor/ku6.py +++ b/yt_dlp/extractor/ku6.py diff --git a/yt_dlp/extractor/kusi.py b/yt_dlp/extractor/kusi.py new file mode 100644 index 000000000..707fe1821 --- /dev/null +++ b/yt_dlp/extractor/kusi.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus +from ..utils import ( + int_or_none, + float_or_none, + timeconvert, + update_url_query, + xpath_text, +) + + +class KUSIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' + _TESTS = [{ + 'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', + 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', + 'info_dict': { + 'id': '12689020', + 'ext': 'mp4', + 'title': "Turko Files: Refused to Help, It Ain't Right!", + 'duration': 223.586, + 'upload_date': '20160826', + 'timestamp': 1472233118, + 'thumbnail': r're:^https?://.*\.jpg$' + }, + }, { + 'url': 'http://kusi.com/video?clipId=12203019', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + clip_id = mobj.group('clipId') + video_id = clip_id or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + if clip_id is None: + video_id = clip_id = self._html_search_regex( + r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') + + affiliate_id = self._search_regex( + r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') + + # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf + xml_url = update_url_query('http://www.kusi.com/build.asp', { + 'buildtype': 'buildfeaturexmlrequest', + 'featureType': 'Clip', + 'featureid': clip_id, + 'affiliateno': affiliate_id, + 'clientgroupid': '1', + 'rnd': int(round(random.random() * 1000000)), + }) + + doc = self._download_xml(xml_url, video_id) + + video_title = xpath_text(doc, 'HEADLINE', fatal=True) + duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) + description = xpath_text(doc, 'ABSTRACT') + thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') + creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + + quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') + formats = [] + for quality in quality_options: + formats.append({ + 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), + 'height': int_or_none(quality.attrib.get('height')), + 'width': int_or_none(quality.attrib.get('width')), + 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': creation_time, + } diff --git a/yt_dlp/extractor/kuwo.py 
b/yt_dlp/extractor/kuwo.py new file mode 100644 index 000000000..460a4252f --- /dev/null +++ b/yt_dlp/extractor/kuwo.py @@ -0,0 +1,352 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + get_element_by_id, + clean_html, + ExtractorError, + InAdvancePagedList, + remove_start, +) + + +class KuwoBaseIE(InfoExtractor): + _FORMATS = [ + {'format': 'ape', 'ext': 'ape', 'preference': 100}, + {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, + {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, + {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, + {'format': 'wma', 'ext': 'wma', 'preference': 20}, + {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} + ] + + def _get_formats(self, song_id, tolerate_ip_deny=False): + formats = [] + for file_format in self._FORMATS: + query = { + 'format': file_format['ext'], + 'br': file_format.get('br', ''), + 'rid': 'MUSIC_%s' % song_id, + 'type': 'convert_url', + 'response': 'url' + } + + song_url = self._download_webpage( + 'http://antiserver.kuwo.cn/anti.s', + song_id, note='Download %s url info' % file_format['format'], + query=query, headers=self.geo_verification_headers(), + ) + + if song_url == 'IPDeny' and not tolerate_ip_deny: + raise ExtractorError('This song is blocked in this region', expected=True) + + if song_url.startswith('http://') or song_url.startswith('https://'): + formats.append({ + 'url': song_url, + 'format_id': file_format['format'], + 'format': file_format['format'], + 'quality': file_format['preference'], + 'abr': file_format.get('abr'), + }) + + return formats + + +class KuwoIE(KuwoBaseIE): + IE_NAME = 'kuwo:song' + IE_DESC = '酷我音乐' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/yinyue/635632/', + 'info_dict': { + 'id': '635632', + 'ext': 'ape', + 'title': '爱我别走', + 'creator': '张震岳', + 'upload_date': '20080122', + 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' + }, + 'skip': 'this song has been offline because of copyright issues', + }, { + 'url': 'http://www.kuwo.cn/yinyue/6446136/', + 'info_dict': { + 'id': '6446136', + 'ext': 'mp3', + 'title': '心', + 'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c', + 'creator': 'IU', + 'upload_date': '20150518', + }, + 'params': { + 'format': 'mp3-320', + }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', + 'only_matching': True, + }] + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle( + url, song_id, note='Download song detail info', + errnote='Unable to get song detail info') + if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: + raise ExtractorError('this song has been offline because of copyright issues', expected=True) + + song_name = self._html_search_regex( + r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name') + singer_name = remove_start(self._html_search_regex( + r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">', + webpage, 'singer name', fatal=False), '歌手') + lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) + if lrc_content == '暂无': # indicates no lyrics + lrc_content = None + + formats = self._get_formats(song_id) + self._sort_formats(formats) + + album_id = self._html_search_regex( + r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', + 
webpage, 'album id', fatal=False) + + publish_time = None + if album_id is not None: + album_info_page = self._download_webpage( + 'http://www.kuwo.cn/album/%s/' % album_id, song_id, + note='Download album detail info', + errnote='Unable to get album detail info') + + publish_time = self._html_search_regex( + r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page, + 'publish time', fatal=False) + if publish_time: + publish_time = publish_time.replace('-', '') + + return { + 'id': song_id, + 'title': song_name, + 'creator': singer_name, + 'upload_date': publish_time, + 'description': lrc_content, + 'formats': formats, + } + + +class KuwoAlbumIE(InfoExtractor): + IE_NAME = 'kuwo:album' + IE_DESC = '酷我音乐 - 专辑' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/' + _TEST = { + 'url': 'http://www.kuwo.cn/album/502294/', + 'info_dict': { + 'id': '502294', + 'title': 'Made\xa0Series\xa0《M》', + 'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + webpage = self._download_webpage( + url, album_id, note='Download album info', + errnote='Unable to get album info') + + album_name = self._html_search_regex( + r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage, + 'album name') + album_intro = remove_start( + clean_html(get_element_by_id('intro', webpage)), + '%s简介:' % album_name) + + entries = [ + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', + webpage) + ] + return self.playlist_result(entries, album_id, album_name, album_intro) + + +class KuwoChartIE(InfoExtractor): + IE_NAME = 'kuwo:chart' + IE_DESC = '酷我音乐 - 排行榜' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' + _TEST = { + 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', + 'info_dict': { + 'id': '香港中文龙虎榜', + }, + 'playlist_mincount': 7, + } + + def _real_extract(self, url): + chart_id = self._match_id(url) + webpage = self._download_webpage( + url, chart_id, note='Download chart info', + errnote='Unable to get chart info') + + entries = [ + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) + ] + return self.playlist_result(entries, chart_id) + + +class KuwoSingerIE(InfoExtractor): + IE_NAME = 'kuwo:singer' + IE_DESC = '酷我音乐 - 歌手' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', + 'info_dict': { + 'id': 'bruno+mars', + 'title': 'Bruno\xa0Mars', + }, + 'playlist_mincount': 329, + }, { + 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', + 'info_dict': { + 'id': 'Ali', + 'title': 'Ali', + }, + 'playlist_mincount': 95, + 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/ytdl-org/youtube-dl/jobs/78878540 + }] + + PAGE_SIZE = 15 + + def _real_extract(self, url): + singer_id = self._match_id(url) + webpage = self._download_webpage( + url, singer_id, note='Download singer info', + errnote='Unable to get singer info') + + singer_name = self._html_search_regex( + r'<h1>([^<]+)</h1>', webpage, 'singer name') + + artist_id = self._html_search_regex( + r'data-artistid="(\d+)"', webpage, 'artist id') + + page_count = int(self._html_search_regex( + r'data-page="(\d+)"', webpage, 'page count')) + + def page_func(page_num): + webpage = self._download_webpage( + 'http://www.kuwo.cn/artist/contentMusicsAjax', + singer_id, note='Download song list page 
#%d' % (page_num + 1), + errnote='Unable to get song list page #%d' % (page_num + 1), + query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) + + return [ + self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') + for song_url in re.findall( + r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', + webpage) + ] + + entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) + + return self.playlist_result(entries, singer_id, singer_name) + + +class KuwoCategoryIE(InfoExtractor): + IE_NAME = 'kuwo:category' + IE_DESC = '酷我音乐 - 分类' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' + _TEST = { + 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', + 'info_dict': { + 'id': '86375', + 'title': '八十年代精选', + 'description': '这些都是属于八十年代的回忆!', + }, + 'playlist_mincount': 24, + } + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage( + url, category_id, note='Download category info', + errnote='Unable to get category info') + + category_name = self._html_search_regex( + r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name') + + category_desc = remove_start( + get_element_by_id('intro', webpage).strip(), + '%s简介:' % category_name) + if category_desc == '暂无': + category_desc = None + + jsonm = self._parse_json(self._html_search_regex( + r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) + + entries = [ + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo') + for song in jsonm['musiclist'] + ] + return self.playlist_result(entries, category_id, category_name, category_desc) + + +class KuwoMvIE(KuwoBaseIE): + IE_NAME = 'kuwo:mv' + IE_DESC = '酷我音乐 - MV' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/' + _TEST = { + 'url': 'http://www.kuwo.cn/mv/6480076/', + 'info_dict': { + 'id': '6480076', + 'ext': 'mp4', + 'title': 'My HouseMV', + 'creator': '2PM', + }, + # In this video, music URLs (anti.s) are blocked outside China and + # USA, while the MV URL (mvurl) is available globally, so force the MV + # URL for consistent results in different countries + 'params': { + 'format': 'mv', + }, + } + _FORMATS = KuwoBaseIE._FORMATS + [ + {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, + {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, + ] + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage = self._download_webpage( + url, song_id, note='Download mv detail info: %s' % song_id, + errnote='Unable to get mv detail info: %s' % song_id) + + mobj = re.search( + r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"', + webpage) + if mobj: + song_name = mobj.group('song') + singer_name = mobj.group('singer') + else: + raise ExtractorError('Unable to find song or singer names') + + formats = self._get_formats(song_id, tolerate_ip_deny=True) + + mv_url = self._download_webpage( + 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, + song_id, note='Download %s MV URL' % song_id) + formats.append({ + 'url': mv_url, + 'format_id': 'mv', + }) + + self._sort_formats(formats) + + return { + 'id': song_id, + 'title': song_name, + 'creator': singer_name, + 'formats': formats, + } diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py new file mode 100644 index 000000000..363fbd6a5 --- /dev/null +++ b/yt_dlp/extractor/la7.py @@ -0,0 +1,203 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, 
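# Illustrative sketch (editor's addition, not part of the patch): KuwoSingerIE
# above reads the page count from the page itself, so it can use
# yt_dlp.utils.InAdvancePagedList, which fetches only the pages a given
# playlist selection actually needs. page_func here is a hypothetical fetcher.
from yt_dlp.utils import InAdvancePagedList

def page_func(page_num):  # 0-based page index -> list of entries for that page
    return ['entry-%d-%d' % (page_num, i) for i in range(15)]

entries = InAdvancePagedList(page_func, 22, 15)  # pagefunc, pagecount, pagesize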
+ parse_duration, + smuggle_url, + unified_strdate, +) + + +class LA7IE(InfoExtractor): + IE_NAME = 'la7.it' + _VALID_URL = r'''(?x)(https?://)?(?: + (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| + tg\.la7\.it/repliche-tgla7\?id= + )(?P<id>.+)''' + + _TESTS = [{ + # 'src' is a plain URL + 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', + 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', + 'info_dict': { + 'id': '0_42j6wd36', + 'ext': 'mp4', + 'title': 'Inc.Cool8', + 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', + 'thumbnail': 're:^https?://.*', + 'uploader_id': 'kdla7pillole@iltrovatore.it', + 'timestamp': 1443814869, + 'upload_date': '20151002', + }, + }, { + 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + if not url.startswith('http'): + url = '%s//%s' % (self.http_scheme(), url) + + webpage = self._download_webpage(url, video_id) + + player_data = self._search_regex( + [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], + webpage, 'player data') + vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid') + + return { + '_type': 'url_transparent', + 'url': smuggle_url('kaltura:103:%s' % vid, { + 'service_url': 'http://nkdam.iltrovatore.it', + }), + 'id': video_id, + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'ie_key': 'Kaltura', + } + + +class LA7PodcastEpisodeIE(InfoExtractor): + IE_NAME = 'la7.it:pod:episode' + _VALID_URL = r'''(?x)(https?://)? 
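# Illustrative sketch (editor's addition, not part of the patch): LA7IE above
# builds no formats itself; it returns a 'url_transparent' result deferring to
# the Kaltura extractor, smuggling the non-default service_url along so the
# delegate queries the right backend.
from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url('kaltura:103:0_42j6wd36',
                  {'service_url': 'http://nkdam.iltrovatore.it'})
unsmuggle_url(url)
# -> ('kaltura:103:0_42j6wd36', {'service_url': 'http://nkdam.iltrovatore.it'})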
+ (?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)''' + + _TESTS = [{ + 'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497', + 'md5': '7737d4d79b3c1a34b3de3e16297119ed', + 'info_dict': { + 'id': '371497', + 'ext': 'mp3', + 'title': '"La carezza delle memoria" di Carlo Verdone', + 'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52', + 'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg', + 'upload_date': '20210323', + }, + }, { + # embed url + 'url': 'https://www.la7.it/embed/podcast/371497', + 'only_matching': True, + }, { + # date already in the title + 'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130', + 'only_matching': True, + }, { + # title same as show_title + 'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340', + 'only_matching': True, + }] + + def _extract_info(self, webpage, video_id=None, ppn=None): + if not video_id: + video_id = self._search_regex( + r'data-nid=([\'"])(?P<vid>\d+)\1', + webpage, 'video_id', group='vid') + + media_url = self._search_regex( + (r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1', + r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'), + webpage, 'media_url', group='url') + ext = determine_ext(media_url) + formats = [{ + 'url': media_url, + 'format_id': ext, + 'ext': ext, + }] + self._sort_formats(formats) + + title = self._html_search_regex( + (r'<div class="title">(?P<title>.+?)</', + r'<title>(?P<title>[^<]+)</title>', + r'title:\s*([\'"])(?P<title>.+?)\1'), + webpage, 'title', group='title') + + description = ( + self._html_search_regex( + (r'<div class="description">(.+?)</div>', + r'<div class="description-mobile">(.+?)</div>', + r'<div class="box-txt">([^<]+?)</div>', + r'<div class="field-content"><p>(.+?)</p></div>'), + webpage, 'description', default=None) + or self._html_search_meta('description', webpage)) + + thumb = self._html_search_regex( + (r'<div class="podcast-image"><img src="(.+?)"></div>', + r'<div class="container-embed"[^<]+url\((.+?)\);">', + r'<div class="field-content"><img src="(.+?)"'), + webpage, 'thumbnail', fatal=False, default=None) + + duration = parse_duration(self._html_search_regex( + r'<span class="(?:durata|duration)">([\d:]+)</span>', + webpage, 'duration', fatal=False, default=None)) + + date = self._html_search_regex( + r'class="data">\s*(?:<span>)?([\d\.]+)\s*</', + webpage, 'date', default=None) + + date_alt = self._search_regex( + r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None) + ppn = ppn or self._search_regex( + r'ppN:\s*([\'"])(?P<ppn>.+?)\1', + webpage, 'ppn', group='ppn', default=None) + # if the date is not in the title + # and title is the same as the show_title + # add the date to the title + if date and not date_alt and ppn and ppn.lower() == title.lower(): + title += ' del %s' % date + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': float_or_none(duration), + 'formats': formats, + 'thumbnail': thumb, + 'upload_date': unified_strdate(date), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return self._extract_info(webpage, video_id) + + +class LA7PodcastIE(LA7PodcastEpisodeIE): + IE_NAME = 'la7.it:podcast' + _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])' + + _TESTS = [{ + 'url': 'https://www.la7.it/propagandalive/podcast', + 'info_dict': { + 'id': 'propagandalive', + 
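# Illustrative sketch (editor's addition, not part of the patch):
# yt_dlp.utils.parse_duration accepts the '[HH:]MM:SS' strings found in the
# <span class="durata"> element matched above and returns seconds.
from yt_dlp.utils import parse_duration

parse_duration('23:45')    # -> 1425
parse_duration('1:23:45')  # -> 5025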
'title': "Propaganda Live", + }, + 'playlist_count': 10, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + title = ( + self._html_search_regex( + r'<h1.*?>(.+?)</h1>', webpage, 'title', fatal=False, default=None) + or self._og_search_title(webpage)) + ppn = self._search_regex( + r'window\.ppN\s*=\s*([\'"])(?P<ppn>.+?)\1', + webpage, 'ppn', group='ppn', default=None) + + entries = [] + for episode in re.finditer( + r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}', + webpage): + entries.append(self._extract_info(episode.group(1), ppn=ppn)) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/laola1tv.py b/yt_dlp/extractor/laola1tv.py index fa217365a..fa217365a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/yt_dlp/extractor/laola1tv.py diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py new file mode 100644 index 000000000..0f87bf1d7 --- /dev/null +++ b/yt_dlp/extractor/lbry.py @@ -0,0 +1,292 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + parse_qs, + OnDemandPagedList, + try_get, + urljoin, +) + + +class LBRYBaseIE(InfoExtractor): + _BASE_URL_REGEX = r'(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' + _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' + _OPT_CLAIM_ID = '[^:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX + _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + + def _call_api_proxy(self, method, display_id, params, resource): + response = self._download_json( + 'https://api.lbry.tv/api/v1/proxy', + display_id, 'Downloading %s JSON metadata' % resource, + headers={'Content-Type': 'application/json-rpc'}, + data=json.dumps({ + 'method': method, + 'params': params, + }).encode()) + err = response.get('error') + if err: + raise ExtractorError( + f'{self.IE_NAME} said: {err.get("code")} - {err.get("message")}', expected=True) + return response['result'] + + def _resolve_url(self, url, display_id, resource): + return self._call_api_proxy( + 'resolve', display_id, {'urls': url}, resource)[url] + + def _permanent_url(self, url, claim_name, claim_id): + return urljoin( + url.replace('lbry://', 'https://lbry.tv/'), + '/%s:%s' % (claim_name, claim_id)) + + def _parse_stream(self, stream, url): + stream_value = stream.get('value') or {} + stream_type = stream_value.get('stream_type') + source = stream_value.get('source') or {} + media = stream_value.get(stream_type) or {} + signing_channel = stream.get('signing_channel') or {} + channel_name = signing_channel.get('name') + channel_claim_id = signing_channel.get('claim_id') + channel_url = None + if channel_name and channel_claim_id: + channel_url = self._permanent_url(url, channel_name, channel_claim_id) + + info = { + 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), + 'description': stream_value.get('description'), + 'license': stream_value.get('license'), + 'timestamp': int_or_none(stream.get('timestamp')), + 'release_timestamp': int_or_none(stream_value.get('release_time')), + 'tags': stream_value.get('tags'), + 'duration': int_or_none(media.get('duration')), + 'channel': try_get(signing_channel, lambda x: x['value']['title']), + 'channel_id': channel_claim_id, + 'channel_url': channel_url, + 'ext': 
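# Illustrative sketch (editor's addition, not part of the patch): every LBRY
# lookup above goes through one JSON-RPC style proxy endpoint; only 'method'
# and 'params' vary ('resolve' for metadata, 'get' for the streaming URL,
# 'claim_search' for channel listings).
import json
from urllib.request import Request, urlopen

req = Request(
    'https://api.lbry.tv/api/v1/proxy',
    data=json.dumps({'method': 'resolve',
                     'params': {'urls': 'lbry://@lbry#3f'}}).encode(),
    headers={'Content-Type': 'application/json-rpc'})
response = json.load(urlopen(req))  # {'result': {...}} or {'error': {...}}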
determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), + 'filesize': int_or_none(source.get('size')), + } + if stream_type == 'audio': + info['vcodec'] = 'none' + else: + info.update({ + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + return info + + +class LBRYIE(LBRYBaseIE): + IE_NAME = 'lbry' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) + _TESTS = [{ + # Video + 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', + 'info_dict': { + 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', + 'ext': 'mp4', + 'title': 'First day in LBRY? Start HERE!', + 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', + 'timestamp': 1595694354, + 'upload_date': '20200725', + 'release_timestamp': 1595340697, + 'release_date': '20200721', + 'width': 1280, + 'height': 720, + } + }, { + # Audio + 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', + 'md5': 'c94017d3eba9b49ce085a8fad6b98d00', + 'info_dict': { + 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'ext': 'mp3', + 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', + 'description': 'md5:661ac4f1db09f31728931d7b88807a61', + 'timestamp': 1591312601, + 'upload_date': '20200604', + 'release_timestamp': 1591312421, + 'release_date': '20200604', + 'tags': list, + 'duration': 2570, + 'channel': 'The LBRY Foundation', + 'channel_id': '0ed629d2b9c601300cacf7eabe9da0be79010212', + 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', + 'vcodec': 'none', + } + }, { + # HLS + 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', + 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'info_dict': { + 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', + 'ext': 'mp4', + 'title': 'PLANTS I WILL NEVER GROW AGAIN. 
THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', + 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', + 'timestamp': 1618254123, + 'upload_date': '20210412', + 'release_timestamp': 1618254002, + 'release_date': '20210412', + 'tags': list, + 'duration': 554, + 'channel': 'Gardening In Canada', + 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'formats': 'mincount:3', + } + }, { + 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', + 'only_matching': True, + }, { + 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/Episode-1:e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/$/embed/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/Episode-1:e7', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/@LBRYFoundation/Episode-1', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'only_matching': True, + }, { + 'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1', + 'only_matching': True, + }, { + 'url': 'lbry://@lbry#3f/odysee#7', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + if display_id.startswith('$/'): + display_id = display_id.split('/', 2)[-1].replace('/', ':') + else: + display_id = display_id.replace(':', '#') + display_id = compat_urllib_parse_unquote(display_id) + uri = 'lbry://' + display_id + result = self._resolve_url(uri, display_id, 'stream') + result_value = result['value'] + if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES: + raise ExtractorError('Unsupported URL', expected=True) + claim_id = result['claim_id'] + title = result_value['title'] + streaming_url = self._call_api_proxy( + 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + info = self._parse_stream(result, url) + urlh = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info') + if determine_ext(urlh.geturl()) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(info['formats']) + else: + info['url'] = streaming_url + info.update({ + 'id': claim_id, + 'title': title, + }) + return info + + +class LBRYChannelIE(LBRYBaseIE): + IE_NAME = 'lbry:channel' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?&]|$)' % LBRYBaseIE._OPT_CLAIM_ID + _TESTS = [{ + 'url': 'https://lbry.tv/@LBRYFoundation:0', + 'info_dict': { + 'id': '0ed629d2b9c601300cacf7eabe9da0be79010212', + 'title': 'The LBRY Foundation', + 'description': 'Channel for the LBRY Foundation. 
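# Illustrative sketch (editor's addition, not part of the patch): the
# display-id normalisation performed above. Web watch URLs separate claim name
# and claim id with ':' while the lbry:// scheme uses '#'; '$/embed' and
# '$/download' paths keep the ':' form once their prefix is cut off.
from urllib.parse import unquote

display_id = '@LBRYFoundation:0/Episode-1:e'  # path from a lbry.tv watch URL
uri = 'lbry://' + unquote(display_id.replace(':', '#'))
# -> 'lbry://@LBRYFoundation#0/Episode-1#e'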
Follow for updates and news.', + }, + 'playlist_count': 29, + }, { + 'url': 'https://lbry.tv/@LBRYFoundation', + 'only_matching': True, + }, { + 'url': 'lbry://@lbry#3f', + 'only_matching': True, + }] + _PAGE_SIZE = 50 + + def _fetch_page(self, claim_id, url, params, page): + page += 1 + page_params = { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + } + page_params.update(params) + result = self._call_api_proxy( + 'claim_search', claim_id, page_params, 'page %d' % page) + for item in (result.get('items') or []): + stream_claim_name = item.get('name') + stream_claim_id = item.get('claim_id') + if not (stream_claim_name and stream_claim_id): + continue + + info = self._parse_stream(item, url) + info.update({ + '_type': 'url', + 'id': stream_claim_id, + 'title': try_get(item, lambda x: x['value']['title']), + 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), + }) + yield info + + def _real_extract(self, url): + display_id = self._match_id(url).replace(':', '#') + result = self._resolve_url( + 'lbry://' + display_id, display_id, 'channel') + claim_id = result['claim_id'] + qs = parse_qs(url) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages + entries = OnDemandPagedList( + functools.partial(self._fetch_page, claim_id, url, params), + self._PAGE_SIZE) + result_value = result.get('value') or {} + return self.playlist_result( + entries, claim_id, result_value.get('title'), + result_value.get('description')) diff --git a/youtube_dl/extractor/lci.py b/yt_dlp/extractor/lci.py index 920872f5c..920872f5c 100644 --- a/youtube_dl/extractor/lci.py +++ b/yt_dlp/extractor/lci.py diff --git a/youtube_dl/extractor/lcp.py b/yt_dlp/extractor/lcp.py index ade27a99e..ade27a99e 100644 --- a/youtube_dl/extractor/lcp.py +++ b/yt_dlp/extractor/lcp.py diff --git a/youtube_dl/extractor/lecture2go.py b/yt_dlp/extractor/lecture2go.py index 81b5d41be..81b5d41be 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/yt_dlp/extractor/lecture2go.py diff --git a/yt_dlp/extractor/lecturio.py b/yt_dlp/extractor/lecturio.py new file mode 100644 index 000000000..9d2228700 --- /dev/null +++ b/yt_dlp/extractor/lecturio.py @@ -0,0 +1,243 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class LecturioBaseIE(InfoExtractor): + _API_BASE_URL = 'https://app.lecturio.com/api/en/latest/html5/' + _LOGIN_URL = 'https://app.lecturio.com/en/login' + _NETRC_MACHINE = 'lecturio' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Sets some cookies + _, urlh = self._download_webpage_handle( + 
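# Illustrative sketch (editor's addition, not part of the patch): unlike
# KuwoSingerIE earlier, LBRYChannelIE above never learns a total page count
# ('no_totals': True), so it wraps its fetcher in
# yt_dlp.utils.OnDemandPagedList, which keeps requesting pages until one comes
# back short. fetch_page here is a hypothetical stand-in for _fetch_page.
import functools
from yt_dlp.utils import OnDemandPagedList

def fetch_page(channel_id, page_num):  # 0-based page index
    return [] if page_num > 2 else ['%s-claim-%d-%d' % (channel_id, page_num, i)
                                    for i in range(50)]

entries = OnDemandPagedList(functools.partial(fetch_page, 'abc123'), 50)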
self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(url_handle): + return self._LOGIN_URL not in url_handle.geturl() + + # Already logged in + if is_logged(urlh): + return + + login_form = { + 'signin[email]': username, + 'signin[password]': password, + 'signin[remember]': 'on', + } + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + # Logged in successfully + if is_logged(urlh): + return + + errors = self._html_search_regex( + r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'''(?x) + https:// + (?: + app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| + (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag + ) + ''' + _TESTS = [{ + 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': '9a42cf1d8282a6311bf7211bbde26fde', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms — Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, + }, { + 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', + 'only_matching': True, + }] + + _CC_LANGS = { + 'Arabic': 'ar', + 'Bulgarian': 'bg', + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'Persian': 'fa', + 'French': 'fr', + 'Japanese': 'ja', + 'Polish': 'pl', + 'Pashto': 'ps', + 'Russian': 'ru', + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + nt = mobj.group('nt') or mobj.group('nt_de') + lecture_id = mobj.group('id') + display_id = nt or lecture_id + api_path = 'lectures/' + lecture_id if lecture_id else 'lecture/' + nt + '.json' + video = self._download_json( + self._API_BASE_URL + api_path, display_id) + title = video['title'].strip() + if not lecture_id: + pid = video.get('productId') or video.get('uid') + if pid: + spid = pid.split('_') + if spid and len(spid) == 2: + lecture_id = spid[1] + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + f = { + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + } + if label: + mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label) + if mobj: + f.update({ + 'format_id': mobj.group(2), + 'height': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + captions = video.get('captions') or [] + for cc in captions: + cc_url = cc.get('url') + if not cc_url: + continue + cc_label = cc.get('translatedCode') + lang = cc.get('languageCode') or self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', + default=cc_label.split()[0] if cc_label else 'en') + original_lang = self._search_regex( + r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', + default=None) + sub_dict = (automatic_captions + if 
'auto-translated' in cc_label or original_lang + else subtitles) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id or nt, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + 'description': 'md5:13da8500c25880c6016ae1e6d78c386a', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + }, { + 'url': 'https://app.lecturio.com/#/course/c/6434', + 'only_matching': True, + }] + + def _real_extract(self, url): + nt, course_id = self._match_valid_url(url).groups() + display_id = nt or course_id + api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json' + course = self._download_json( + self._API_BASE_URL + api_path, display_id) + entries = [] + for lecture in course.get('lectures', []): + lecture_id = str_or_none(lecture.get('id')) + lecture_url = lecture.get('url') + if lecture_url: + lecture_url = urljoin(url, lecture_url) + else: + lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id) + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + return self.playlist_result( + entries, display_id, course.get('title'), + clean_html(course.get('description'))) + + +class LecturioDeCourseIE(LecturioBaseIE): + _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' + _TEST = { + 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>', + webpage): + lecture_url = urljoin(url, mobj.group('url')) + lecture_id = mobj.group('id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title', default=None) + + return self.playlist_result(entries, display_id, title) diff --git a/yt_dlp/extractor/leeco.py b/yt_dlp/extractor/leeco.py new file mode 100644 index 000000000..d5e11423c --- /dev/null +++ b/yt_dlp/extractor/leeco.py @@ -0,0 +1,368 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import hashlib +import re +import time + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_ord, + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + determine_ext, + encode_data_uri, + ExtractorError, + int_or_none, + orderedSet, + parse_iso8601, + str_or_none, + url_basename, + urshift, +) + + +class LeIE(InfoExtractor): + IE_DESC = '乐视网' + _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html' + _GEO_COUNTRIES = ['CN'] + _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' + + _TESTS = [{ + 'url': 'http://www.le.com/ptv/vplay/22005890.html', + 'md5': 'edadcfe5406976f42f9f266057ee5e40', + 'info_dict': { + 'id': '22005890', + 'ext': 'mp4', + 'title': 
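# Illustrative sketch (editor's addition, not part of the patch): the Lecturio
# login above never parses a success page - it POSTs the form with
# yt_dlp.utils.urlencode_postdata and then checks whether the HTTP layer was
# redirected away from the login URL. Credentials below are hypothetical.
from yt_dlp.utils import urlencode_postdata

LOGIN_URL = 'https://app.lecturio.com/en/login'
data = urlencode_postdata({
    'signin[email]': 'user@example.com',
    'signin[password]': 'hunter2',
    'signin[remember]': 'on',
})  # -> bytes suitable as a POST body
# after the request: logged_in = LOGIN_URL not in urlh.geturl()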
'第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', + 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', + }, + 'params': { + 'hls_prefer_native': True, + }, + }, { + 'url': 'http://www.le.com/ptv/vplay/1415246.html', + 'info_dict': { + 'id': '1415246', + 'ext': 'mp4', + 'title': '美人天下01', + 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d', + }, + 'params': { + 'hls_prefer_native': True, + }, + }, { + 'note': 'This video is available only in Mainland China, thus a proxy is needed', + 'url': 'http://www.le.com/ptv/vplay/1118082.html', + 'md5': '2424c74948a62e5f31988438979c5ad1', + 'info_dict': { + 'id': '1118082', + 'ext': 'mp4', + 'title': '与龙共舞 完整版', + 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', + }, + 'params': { + 'hls_prefer_native': True, + }, + }, { + 'url': 'http://sports.le.com/video/25737697.html', + 'only_matching': True, + }, { + 'url': 'http://www.lesports.com/match/1023203003.html', + 'only_matching': True, + }, { + 'url': 'http://sports.le.com/match/1023203003.html', + 'only_matching': True, + }] + + # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf + def ror(self, param1, param2): + _loc3_ = 0 + while _loc3_ < param2: + param1 = urshift(param1, 1) + ((param1 & 1) << 31) + _loc3_ += 1 + return param1 + + def calc_time_key(self, param1): + _loc2_ = 185025305 + return self.ror(param1, _loc2_ % 17) ^ _loc2_ + + # see M3U8Encryption class in KLetvPlayer.swf + @staticmethod + def decrypt_m3u8(encrypted_data): + if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': + return encrypted_data + encrypted_data = encrypted_data[5:] + + _loc4_ = bytearray(2 * len(encrypted_data)) + for idx, val in enumerate(encrypted_data): + b = compat_ord(val) + _loc4_[2 * idx] = b // 16 + _loc4_[2 * idx + 1] = b % 16 + idx = len(_loc4_) - 11 + _loc4_ = _loc4_[idx:] + _loc4_[:idx] + _loc7_ = bytearray(len(encrypted_data)) + for i in range(len(encrypted_data)): + _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1] + + return bytes(_loc7_) + + def _check_errors(self, play_json): + # Check for errors + playstatus = play_json['msgs']['playstatus'] + if playstatus['status'] == 0: + flag = playstatus['flag'] + if flag == 1: + self.raise_geo_restricted() + else: + raise ExtractorError('Generic error. 
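# Illustrative sketch (editor's addition, not part of the patch): calc_time_key
# above obfuscates the request timestamp with a 32-bit rotate-right plus an XOR
# constant, mirroring logic lifted from the site's Flash player.
# yt_dlp.utils.urshift is the logical '>>>' the rotate relies on; a local
# stand-in is shown to keep the sketch self-contained.
def urshift(val, n):
    return (val % 0x100000000) >> n

def ror(val, n_bits):  # rotate a 32-bit value right by n_bits
    for _ in range(n_bits):
        val = urshift(val, 1) + ((val & 1) << 31)
    return val

def calc_time_key(timestamp):
    key = 185025305
    return ror(timestamp, key % 17) ^ key  # key % 17 == 8, so rotate by 8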
flag = %d' % flag, expected=True) + + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) + + play_json_flash = self._download_json( + 'http://player-pc.le.com/mms/out/video/playJson', + media_id, 'Downloading flash playJson data', query={ + 'id': media_id, + 'platid': 1, + 'splatid': 105, + 'format': 1, + 'source': 1000, + 'tkey': self.calc_time_key(int(time.time())), + 'domain': 'www.le.com', + 'region': 'cn', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_flash) + + def get_flash_urls(media_url, format_id): + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id, + query={ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'tss': 'ios', + }) + + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) + + m3u8_data = self.decrypt_m3u8(req.read()) + + return { + 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), + } + + extracted_formats = [] + formats = [] + playurl = play_json_flash['msgs']['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_flash_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) + self._sort_formats(formats, ('res', 'quality')) + + publish_time = parse_iso8601(self._html_search_regex( + r'发布时间 ([^<>]+) ', page, 'publish time', default=None), + delimiter=' ', timezone=datetime.timedelta(hours=8)) + description = self._html_search_meta('description', page, fatal=False) + + return { + 'id': media_id, + 'formats': formats, + 'title': playurl['title'], + 'thumbnail': playurl['pic'], + 'description': description, + 'timestamp': publish_time, + } + + +class LePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)' + + _TESTS = [{ + 'url': 'http://www.le.com/tv/46177.html', + 'info_dict': { + 'id': '46177', + 'title': '美人天下', + 'description': 'md5:395666ff41b44080396e59570dbac01c' + }, + 'playlist_count': 35 + }, { + 'url': 'http://tv.le.com/izt/wuzetian/index.html', + 'info_dict': { + 'id': 'wuzetian', + 'title': '武媚娘传奇', + 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' + }, + # This playlist contains some extra videos other than the drama itself + 'playlist_mincount': 96 + }, { + 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', + # This series is moved to http://www.le.com/tv/10005297.html + 'only_matching': True, + }, { + 'url': 'http://www.le.com/comic/92063.html', + 'only_matching': True, + }, { + 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + page = self._download_webpage(url, playlist_id) + + # Currently old domain names are still used in playlists + media_ids = orderedSet(re.findall( + 
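# Illustrative sketch (editor's addition, not part of the patch): after
# decrypt_m3u8 above, the plaintext playlist exists only in memory, so
# get_flash_urls hands it to the HLS code as a data: URI instead of a network
# URL.
from yt_dlp.utils import encode_data_uri

encode_data_uri(b'#EXTM3U\n', 'application/vnd.apple.mpegurl')
# -> 'data:application/vnd.apple.mpegurl;base64,I0VYVE0zVQo='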
r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page)) + entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le') + for media_id in media_ids] + + title = self._html_search_meta('keywords', page, + fatal=False).split(',')[0] + description = self._html_search_meta('description', page, fatal=False) + + return self.playlist_result(entries, playlist_id, playlist_title=title, + playlist_description=description) + + +class LetvCloudIE(InfoExtractor): + # Most of *.letv.com is changed to *.le.com on 2016/01/02 + # but yuntv.letv.com is kept, so also keep the extractor name + IE_DESC = '乐视云' + _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' + + _TESTS = [{ + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf', + 'md5': '26450599afd64c513bc77030ad15db44', + 'info_dict': { + 'id': 'p7jnfw5hw9_467623dedf', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_467623dedf', + }, + }, { + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', + 'md5': 'e03d9cc8d9c13191e1caf277e42dbd31', + 'info_dict': { + 'id': 'p7jnfw5hw9_ec93197892', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_ec93197892', + }, + }, { + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', + 'md5': 'cb988699a776b22d4a41b9d43acfb3ac', + 'info_dict': { + 'id': 'p7jnfw5hw9_187060b6fd', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_187060b6fd', + }, + }] + + @staticmethod + def sign_data(obj): + if obj['cf'] == 'flash': + salt = '2f9d6924b33a165a6d8b5d3d42f4f987' + items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu'] + elif obj['cf'] == 'html5': + salt = 'fbeh5player12c43eccf2bec3300344' + items = ['cf', 'ran', 'uu', 'bver', 'vu'] + input_data = ''.join([item + obj[item] for item in items]) + salt + obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() + + def _get_formats(self, cf, uu, vu, media_id): + def get_play_json(cf, timestamp): + data = { + 'cf': cf, + 'ver': '2.2', + 'bver': 'firefox44.0', + 'format': 'json', + 'uu': uu, + 'vu': vu, + 'ran': compat_str(timestamp), + } + self.sign_data(data) + return self._download_json( + 'http://api.letvcloud.com/gpc.php?' 
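# Illustrative sketch (editor's addition, not part of the patch): LetvCloud
# request signing above is a plain MD5 over the key/value pairs concatenated
# in a fixed order plus a per-player salt. The 'ran' value is hypothetical.
import hashlib

def sign_html5(params):
    salt = 'fbeh5player12c43eccf2bec3300344'
    payload = ''.join(k + params[k] for k in ('cf', 'ran', 'uu', 'bver', 'vu')) + salt
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

params = {'cf': 'html5', 'ran': '1600000000', 'uu': 'p7jnfw5hw9',
          'bver': 'firefox44.0', 'vu': '467623dedf'}
params['sign'] = sign_html5(params)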
+ compat_urllib_parse_urlencode(data), + media_id, 'Downloading playJson data for type %s' % cf) + + play_json = get_play_json(cf, time.time()) + # The server time may be different from local time + if play_json.get('code') == 10071: + play_json = get_play_json(cf, play_json['timestamp']) + + if not play_json.get('data'): + if play_json.get('message'): + raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True) + elif play_json.get('code'): + raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) + else: + raise ExtractorError('Letv cloud returned an unknown error') + + def b64decode(s): + return compat_b64decode(s).decode('utf-8') + + formats = [] + for media in play_json['data']['video_info']['media'].values(): + play_url = media['play_url'] + url = b64decode(play_url['main_url']) + decoded_url = b64decode(url_basename(url)) + formats.append({ + 'url': url, + 'ext': determine_ext(decoded_url), + 'format_id': str_or_none(play_url.get('vtype')), + 'format_note': str_or_none(play_url.get('definition')), + 'width': int_or_none(play_url.get('vwidth')), + 'height': int_or_none(play_url.get('vheight')), + }) + + return formats + + def _real_extract(self, url): + uu_mobj = re.search(r'uu=([\w]+)', url) + vu_mobj = re.search(r'vu=([\w]+)', url) + + if not uu_mobj or not vu_mobj: + raise ExtractorError('Invalid URL: %s' % url, expected=True) + + uu = uu_mobj.group(1) + vu = vu_mobj.group(1) + media_id = uu + '_' + vu + + formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': 'Video %s' % media_id, + 'formats': formats, + } diff --git a/yt_dlp/extractor/lego.py b/yt_dlp/extractor/lego.py new file mode 100644 index 000000000..b9d8b167c --- /dev/null +++ b/yt_dlp/extractor/lego.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import uuid + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + qualities, +) + + +class LEGOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P<locale>[a-z]{2}-[a-z]{2})/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P<id>[0-9a-f]{32})' + _TESTS = [{ + 'url': 'http://www.lego.com/en-us/videos/themes/club/blocumentary-kawaguchi-55492d823b1b4d5e985787fa8c2973b1', + 'md5': 'f34468f176cfd76488767fc162c405fa', + 'info_dict': { + 'id': '55492d82-3b1b-4d5e-9857-87fa8c2973b1_en-US', + 'ext': 'mp4', + 'title': 'Blocumentary Great Creations: Akiyuki Kawaguchi', + 'description': 'Blocumentary Great Creations: Akiyuki Kawaguchi', + }, + }, { + # geo-restricted but the contentUrl contain a valid url + 'url': 'http://www.lego.com/nl-nl/videos/themes/nexoknights/episode-20-kingdom-of-heroes-13bdc2299ab24d9685701a915b3d71e7##sp=399', + 'md5': 'c7420221f7ffd03ff056f9db7f8d807c', + 'info_dict': { + 'id': '13bdc229-9ab2-4d96-8570-1a915b3d71e7_nl-NL', + 'ext': 'mp4', + 'title': 'Aflevering 20: Helden van het koninkrijk', + 'description': 'md5:8ee499aac26d7fa8bcb0cedb7f9c3941', + 'age_limit': 5, + }, + }, { + # with subtitle + 'url': 'https://www.lego.com/nl-nl/kids/videos/classic/creative-storytelling-the-little-puppy-aa24f27c7d5242bc86102ebdc0f24cba', + 'info_dict': { + 'id': 'aa24f27c-7d52-42bc-8610-2ebdc0f24cba_nl-NL', + 'ext': 'mp4', + 'title': 'De kleine puppy', + 'description': 'md5:5b725471f849348ac73f2e12cfb4be06', + 'age_limit': 1, + 'subtitles': { + 'nl': [{ + 'ext': 'srt', + 'url': 
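# Illustrative sketch (editor's addition, not part of the patch): LetvCloud
# encodes the media URL twice - the play URL is base64, and its basename
# decodes again to the real filename, which the code above only uses to pick
# the container extension. The CDN URL below is hypothetical.
import base64
from yt_dlp.utils import determine_ext, url_basename

def b64decode(s):
    return base64.b64decode(s).decode('utf-8')

url = b64decode(base64.b64encode(b'http://cdn.example.com/dmlkZW8ubXA0').decode())
ext = determine_ext(b64decode(url_basename(url)))  # 'dmlkZW8ubXA0' -> 'video.mp4' -> 'mp4'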
r're:^https://.+\.srt$', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + }] + _QUALITIES = { + 'Lowest': (64, 180, 320), + 'Low': (64, 270, 480), + 'Medium': (96, 360, 640), + 'High': (128, 540, 960), + 'Highest': (128, 720, 1280), + } + + def _real_extract(self, url): + locale, video_id = self._match_valid_url(url).groups() + countries = [locale.split('-')[1].upper()] + self._initialize_geo_bypass({ + 'countries': countries, + }) + + try: + item = self._download_json( + # https://contentfeed.services.lego.com/api/v2/item/[VIDEO_ID]?culture=[LOCALE]&contentType=Video + 'https://services.slingshot.lego.com/mediaplayer/v2', + video_id, query={ + 'videoId': '%s_%s' % (uuid.UUID(video_id), locale), + }, headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451: + self.raise_geo_restricted(countries=countries) + raise + + video = item['Video'] + video_id = video['Id'] + title = video['Title'] + + q = qualities(['Lowest', 'Low', 'Medium', 'High', 'Highest']) + formats = [] + for video_source in item.get('VideoFormats', []): + video_source_url = video_source.get('Url') + if not video_source_url: + continue + video_source_format = video_source.get('Format') + if video_source_format == 'F4M': + formats.extend(self._extract_f4m_formats( + video_source_url, video_id, + f4m_id=video_source_format, fatal=False)) + elif video_source_format == 'M3U8': + formats.extend(self._extract_m3u8_formats( + video_source_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + else: + video_source_quality = video_source.get('Quality') + format_id = [] + for v in (video_source_format, video_source_quality): + if v: + format_id.append(v) + f = { + 'format_id': '-'.join(format_id), + 'quality': q(video_source_quality), + 'url': video_source_url, + } + quality = self._QUALITIES.get(video_source_quality) + if quality: + f.update({ + 'abr': quality[0], + 'height': quality[1], + 'width': quality[2], + }), + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + sub_file_id = video.get('SubFileId') + if sub_file_id and sub_file_id != '00000000-0000-0000-0000-000000000000': + net_storage_path = video.get('NetstoragePath') + invariant_id = video.get('InvariantId') + video_file_id = video.get('VideoFileId') + video_version = video.get('VideoVersion') + if net_storage_path and invariant_id and video_file_id and video_version: + subtitles.setdefault(locale[:2], []).append({ + 'url': 'https://lc-mediaplayerns-live-s.legocdn.com/public/%s/%s_%s_%s_%s_sub.srt' % (net_storage_path, invariant_id, video_file_id, locale, video_version), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('Description'), + 'thumbnail': video.get('GeneratedCoverImage') or video.get('GeneratedThumbnail'), + 'duration': int_or_none(video.get('Length')), + 'formats': formats, + 'subtitles': subtitles, + 'age_limit': int_or_none(video.get('AgeFrom')), + 'season': video.get('SeasonTitle'), + 'season_number': int_or_none(video.get('Season')) or None, + 'episode_number': int_or_none(video.get('Episode')) or None, + } diff --git a/youtube_dl/extractor/lemonde.py b/yt_dlp/extractor/lemonde.py index 3306892e8..3306892e8 100644 --- a/youtube_dl/extractor/lemonde.py +++ b/yt_dlp/extractor/lemonde.py diff --git a/youtube_dl/extractor/lenta.py b/yt_dlp/extractor/lenta.py index 2ebd4e577..2ebd4e577 100644 --- a/youtube_dl/extractor/lenta.py +++ b/yt_dlp/extractor/lenta.py diff --git 
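# Illustrative sketch (editor's addition, not part of the patch):
# yt_dlp.utils.qualities turns an ordered list of labels into a ranking
# function, which is how the string qualities above become sortable values.
from yt_dlp.utils import qualities

q = qualities(['Lowest', 'Low', 'Medium', 'High', 'Highest'])
q('Medium')   # -> 2
q('Highest')  # -> 4
q('4K')       # -> -1 (unknown labels sort last)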
a/youtube_dl/extractor/libraryofcongress.py b/yt_dlp/extractor/libraryofcongress.py index 03f205144..03f205144 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/yt_dlp/extractor/libraryofcongress.py diff --git a/yt_dlp/extractor/libsyn.py b/yt_dlp/extractor/libsyn.py new file mode 100644 index 000000000..d1fcda4ef --- /dev/null +++ b/yt_dlp/extractor/libsyn.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + parse_duration, + strip_or_none, + unified_strdate, +) + + +class LibsynIE(InfoExtractor): + _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' + + _TESTS = [{ + 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', + 'md5': '2a55e75496c790cdeb058e7e6c087746', + 'info_dict': { + 'id': '6385796', + 'ext': 'mp3', + 'title': "Champion Minded - Developing a Growth Mindset", + # description fetched using another request: + # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796 + # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + 'upload_date': '20180320', + 'thumbnail': 're:^https?://.*', + }, + }, { + 'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/', + 'md5': '6c5cb21acd622d754d3b1a92b582ce42', + 'info_dict': { + 'id': '3727166', + 'ext': 'mp3', + 'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career', + 'upload_date': '20150818', + 'thumbnail': 're:^https?://.*', + } + }] + + def _real_extract(self, url): + url, video_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + + data = self._parse_json(self._search_regex( + r'var\s+playlistItem\s*=\s*({.+?});', + webpage, 'JSON data block'), video_id) + + episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage) + if not episode_title: + episode_title = self._search_regex( + [r'data-title="([^"]+)"', r'<title>(.+?)</title>'], + webpage, 'episode title') + episode_title = episode_title.strip() + + podcast_title = strip_or_none(clean_html(self._search_regex( + r'<h3>([^<]+)</h3>', webpage, 'podcast title', + default=None) or get_element_by_class('podcast-title', webpage))) + + title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title + + formats = [] + for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')): + f_url = data.get(k) + if not f_url: + continue + formats.append({ + 'url': f_url, + 'format_id': format_id, + }) + + description = self._html_search_regex( + r'<p\s+id="info_text_body">(.+?)</p>', webpage, + 'description', default=None) + if description: + # Strip non-breaking and normal spaces + description = description.replace('\u00A0', ' ').strip() + release_date = unified_strdate(self._search_regex( + r'<div class="release_date">Released: ([^<]+)<', + webpage, 'release date', default=None) or data.get('release_date')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': data.get('thumbnail_url'), + 'upload_date': release_date, + 'duration': parse_duration(data.get('duration')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/lifenews.py b/yt_dlp/extractor/lifenews.py new file mode 100644 index
000000000..49a0a5989 --- /dev/null +++ b/yt_dlp/extractor/lifenews.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_iso8601, + remove_end, +) + + +class LifeNewsIE(InfoExtractor): + IE_NAME = 'life' + IE_DESC = 'Life.ru' + _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)' + + _TESTS = [{ + # single video embedded via video/source + 'url': 'https://life.ru/t/новости/98736', + 'md5': '77c95eaefaca216e32a76a343ad89d23', + 'info_dict': { + 'id': '98736', + 'ext': 'mp4', + 'title': 'Мужчина нашел дома архив оборонного завода', + 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', + 'timestamp': 1344154740, + 'upload_date': '20120805', + 'view_count': int, + } + }, { + # single video embedded via iframe + 'url': 'https://life.ru/t/новости/152125', + 'md5': '77d19a6f0886cd76bdbf44b4d971a273', + 'info_dict': { + 'id': '152125', + 'ext': 'mp4', + 'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ', + 'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ', + 'timestamp': 1427961840, + 'upload_date': '20150402', + 'view_count': int, + } + }, { + # two videos embedded via iframe + 'url': 'https://life.ru/t/новости/153461', + 'info_dict': { + 'id': '153461', + 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве', + 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'timestamp': 1430825520, + 'view_count': int, + }, + 'playlist': [{ + 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', + 'info_dict': { + 'id': '153461-video1', + 'ext': 'mp4', + 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)', + 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'timestamp': 1430825520, + 'upload_date': '20150505', + }, + }, { + 'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322', + 'info_dict': { + 'id': '153461-video2', + 'ext': 'mp4', + 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)', + 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'timestamp': 1430825520, + 'upload_date': '20150505', + }, + }], + }, { + 'url': 'https://life.ru/t/новости/213035', + 'only_matching': True, + }, { + 'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461', + 'only_matching': True, + }, { + 'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_urls = re.findall( + r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) + + iframe_links = re.findall( + r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']', + webpage) + + if not video_urls and not iframe_links: + raise ExtractorError('No media links available for %s' % video_id) + + title = remove_end( + self._og_search_title(webpage), + ' - Life.ru') + + description = 
self._og_search_description(webpage) + + view_count = self._html_search_regex( + r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>', + webpage, 'view count', fatal=False, group='value') + + timestamp = parse_iso8601(self._search_regex( + r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1', + webpage, 'upload date', fatal=False, group='value')) + + common_info = { + 'description': description, + 'view_count': int_or_none(view_count), + 'timestamp': timestamp, + } + + def make_entry(video_id, video_url, index=None): + cur_info = dict(common_info) + cur_info.update({ + 'id': video_id if not index else '%s-video%s' % (video_id, index), + 'url': video_url, + 'title': title if not index else '%s (Видео %s)' % (title, index), + }) + return cur_info + + def make_video_entry(video_id, video_url, index=None): + video_url = compat_urlparse.urljoin(url, video_url) + return make_entry(video_id, video_url, index) + + def make_iframe_entry(video_id, video_url, index=None): + video_url = self._proto_relative_url(video_url, 'http:') + cur_info = make_entry(video_id, video_url, index) + cur_info['_type'] = 'url_transparent' + return cur_info + + if len(video_urls) == 1 and not iframe_links: + return make_video_entry(video_id, video_urls[0]) + + if len(iframe_links) == 1 and not video_urls: + return make_iframe_entry(video_id, iframe_links[0]) + + entries = [] + + if video_urls: + for num, video_url in enumerate(video_urls, 1): + entries.append(make_video_entry(video_id, video_url, num)) + + if iframe_links: + for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1): + entries.append(make_iframe_entry(video_id, iframe_link, num)) + + playlist = common_info.copy() + playlist.update(self.playlist_result(entries, video_id, title, description)) + return playlist + + +class LifeEmbedIE(InfoExtractor): + IE_NAME = 'life:embed' + _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P<id>[\da-f]{32})' + + _TESTS = [{ + 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', + 'md5': 'b889715c9e49cb1981281d0e5458fbbe', + 'info_dict': { + 'id': 'e50c2dec2867350528e2574c899b8291', + 'ext': 'mp4', + 'title': 'e50c2dec2867350528e2574c899b8291', + 'thumbnail': r're:http://.*\.jpg', + } + }, { + # with 1080p + 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + thumbnail = None + formats = [] + + def extract_m3u8(manifest_url): + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8')) + + def extract_original(original_url): + formats.append({ + 'url': original_url, + 'format_id': determine_ext(original_url, None), + 'quality': 1, + }) + + playlist = self._parse_json( + self._search_regex( + r'options\s*=\s*({.+?});', webpage, 'options', default='{}'), + video_id).get('playlist', {}) + if playlist: + master = playlist.get('master') + if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': + extract_m3u8(compat_urlparse.urljoin(url, master)) + original = playlist.get('original') + if isinstance(original, compat_str): + extract_original(original) + thumbnail = playlist.get('image') + + # Old rendition fallback + if not formats: + for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): + video_url = compat_urlparse.urljoin(url, video_url) + if determine_ext(video_url) == 'm3u8': + extract_m3u8(video_url) + else: + 
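# Illustrative sketch (editor's addition, not part of the patch):
# embed.life.ru iframe URLs may be protocol-relative ('//embed.life.ru/...');
# make_iframe_entry above pins them to http: before delegating via
# '_type': 'url_transparent'. A local stand-in for
# InfoExtractor._proto_relative_url:
def proto_relative_url(url, scheme='http:'):
    return scheme + url if url.startswith('//') else url

proto_relative_url('//embed.life.ru/embed/e50c2dec2867350528e2574c899b8291')
# -> 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291'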
extract_original(video_url) + + self._sort_formats(formats) + + thumbnail = thumbnail or self._search_regex( + r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py new file mode 100644 index 000000000..369141d67 --- /dev/null +++ b/yt_dlp/extractor/limelight.py @@ -0,0 +1,363 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + smuggle_url, + try_get, + unsmuggle_url, + ExtractorError, +) + + +class LimelightBaseIE(InfoExtractor): + _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' + + @classmethod + def _extract_urls(cls, webpage, source_url): + lm = { + 'Media': 'media', + 'Channel': 'channel', + 'ChannelList': 'channel_list', + } + + def smuggle(url): + return smuggle_url(url, {'source_url': source_url}) + + entries = [] + for kind, video_id in re.findall( + r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle('limelight:%s:%s' % (lm[kind], video_id)), + 'Limelight%s' % kind, video_id)) + for mobj in re.finditer( + # As per [1] class attribute should be exactly equal to + # LimelightEmbeddedPlayerFlash but numerous examples seen + # that don't exactly match it (e.g. [2]). + # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage + # 2. http://www.sedona.com/FacilitatorTraining2017 + r'''(?sx) + <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*? 
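+                    # ((?sx) above enables DOTALL and VERBOSE matching, so this
+                    # whitespace is insignificant and '#' comments are legal
+                    # inside the pattern itself; the tempered dot (?:(?!\1).)*
+                    # only advances while the opening quote captured as \1 has
+                    # not reappeared)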
+ <param[^>]+ + name=(["\'])flashVars\2[^>]+ + value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32}) + ''', webpage): + kind, video_id = mobj.group('kind'), mobj.group('id') + entries.append(cls.url_result( + smuggle('limelight:%s:%s' % (kind, video_id)), + 'Limelight%s' % kind.capitalize(), video_id)) + # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) + for video_id in re.findall( + r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle('limelight:media:%s' % video_id), + LimelightMediaIE.ie_key(), video_id)) + return entries + + def _call_playlist_service(self, item_id, method, fatal=True, referer=None): + headers = {} + if referer: + headers['Referer'] = referer + try: + return self._download_json( + self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), + item_id, 'Downloading PlaylistService %s JSON' % method, + fatal=fatal, headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] + if error == 'CountryDisabled': + self.raise_geo_restricted() + raise ExtractorError(error, expected=True) + raise + + def _extract(self, item_id, pc_method, mobile_method, referer=None): + pc = self._call_playlist_service(item_id, pc_method, referer=referer) + mobile = self._call_playlist_service( + item_id, mobile_method, fatal=False, referer=referer) + return pc, mobile + + def _extract_info(self, pc, mobile, i, referer): + get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {} + pc_item = get_item(pc, 'playlistItems') + mobile_item = get_item(mobile, 'mediaList') + video_id = pc_item.get('mediaId') or mobile_item['mediaId'] + title = pc_item.get('title') or mobile_item['title'] + + formats = [] + urls = [] + for stream in pc_item.get('streams', []): + stream_url = stream.get('url') + if not stream_url or stream_url in urls: + continue + if not self.get_param('allow_unplayable_formats') and stream.get('drmProtected'): + continue + urls.append(stream_url) + ext = determine_ext(stream_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False)) + else: + fmt = { + 'url': stream_url, + 'abr': float_or_none(stream.get('audioBitRate')), + 'fps': float_or_none(stream.get('videoFrameRate')), + 'ext': ext, + } + width = int_or_none(stream.get('videoWidthInPixels')) + height = int_or_none(stream.get('videoHeightInPixels')) + vbr = float_or_none(stream.get('videoBitRate')) + if width or height or vbr: + fmt.update({ + 'width': width, + 'height': height, + 'vbr': vbr, + }) + else: + fmt['vcodec'] = 'none' + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url) + if rtmp: + format_id = 'rtmp' + if stream.get('videoBitRate'): + format_id += '-%d' % int_or_none(stream['videoBitRate']) + http_format_id = format_id.replace('rtmp', 'http') + + CDN_HOSTS = ( + ('delvenetworks.com', 'cpl.delvenetworks.com'), + ('video.llnw.net', 's2.content.video.llnw.net'), + ) + for cdn_host, http_host in CDN_HOSTS: + if cdn_host not in rtmp.group('host').lower(): + continue + http_url = 'http://%s/%s' % (http_host, rtmp.group('playpath')[4:]) + urls.append(http_url) + if self._is_valid_url(http_url, video_id, 
http_format_id):
+                        http_fmt = fmt.copy()
+                        http_fmt.update({
+                            'url': http_url,
+                            'format_id': http_format_id,
+                        })
+                        formats.append(http_fmt)
+                        break
+
+                fmt.update({
+                    'url': rtmp.group('url'),
+                    'play_path': rtmp.group('playpath'),
+                    'app': rtmp.group('app'),
+                    'ext': 'flv',
+                    'format_id': format_id,
+                })
+            formats.append(fmt)
+
+        for mobile_url in mobile_item.get('mobileUrls', []):
+            media_url = mobile_url.get('mobileUrl')
+            format_id = mobile_url.get('targetMediaPlatform')
+            if not media_url or media_url in urls:
+                continue
+            if (format_id in ('Widevine', 'SmoothStreaming')
+                    and not self.get_param('allow_unplayable_formats', False)):
+                continue
+            urls.append(media_url)
+            ext = determine_ext(media_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    media_url, video_id, f4m_id=format_id, fatal=False))
+            else:
+                formats.append({
+                    'url': media_url,
+                    'format_id': format_id,
+                    'quality': -10,
+                    'ext': ext,
+                })
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for flag in mobile_item.get('flags') or []:
+            if flag == 'ClosedCaptions':
+                closed_captions = self._call_playlist_service(
+                    video_id, 'getClosedCaptionsDetailsByMediaId',
+                    False, referer) or []
+                for cc in closed_captions:
+                    cc_url = cc.get('webvttFileUrl')
+                    if not cc_url:
+                        continue
+                    lang = cc.get('languageCode') or self._search_regex(
+                        r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
+                    subtitles.setdefault(lang, []).append({
+                        'url': cc_url,
+                    })
+                break
+
+        get_meta = lambda x: pc_item.get(x) or mobile_item.get(x)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': get_meta('description'),
+            'formats': formats,
+            'duration': float_or_none(get_meta('durationInMilliseconds'), 1000),
+            'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'),
+            'subtitles': subtitles,
+        }
+
+
+class LimelightMediaIE(LimelightBaseIE):
+    IE_NAME = 'limelight'
+    _VALID_URL = r'''(?x)
+        (?:
+            limelight:media:|
+            https?://
+                (?:
+                    link\.videoplatform\.limelight\.com/media/|
+                    assets\.delvenetworks\.com/player/loader\.swf
+                )
+                \?.*?\bmediaId=
+        )
+        (?P<id>[a-z0-9]{32})
+    '''
+    _TESTS = [{
+        'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
+        'info_dict': {
+            'id': '3ffd040b522b4485b6d84effc750cd86',
+            'ext': 'mp4',
+            'title': 'HaP and the HB Prince Trailer',
+            'description': 'md5:8005b944181778e313d95c1237ddb640',
+            'thumbnail': r're:^https?://.*\.jpeg$',
+            'duration': 144.23,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # video with subtitles
+        'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
+        'md5': '2fa3bad9ac321e23860ca23bc2c69e3d',
+        'info_dict': {
+            'id': 'a3e00274d4564ec4a9b29b9466432335',
+            'ext': 'mp4',
+            'title': '3Play Media Overview Video',
+            'thumbnail': r're:^https?://.*\.jpeg$',
+            'duration': 78.101,
+            # TODO: extract all languages that were accessible via API
+            # 'subtitles': 'mincount:9',
+            'subtitles': 'mincount:1',
+        },
+    }, {
+        'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
+        'only_matching': True,
+    }]
+    _PLAYLIST_SERVICE_PATH = 'media'
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        video_id = self._match_id(url)
+        source_url = smuggled_data.get('source_url')
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
+
+        pc, mobile = 
self._extract( + video_id, 'getPlaylistByMediaId', + 'getMobilePlaylistByMediaId', source_url) + + return self._extract_info(pc, mobile, 0, source_url) + + +class LimelightChannelIE(LimelightBaseIE): + IE_NAME = 'limelight:channel' + _VALID_URL = r'''(?x) + (?: + limelight:channel:| + https?:// + (?: + link\.videoplatform\.limelight\.com/media/| + assets\.delvenetworks\.com/player/loader\.swf + ) + \?.*?\bchannelId= + ) + (?P<id>[a-z0-9]{32}) + ''' + _TESTS = [{ + 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', + 'info_dict': { + 'id': 'ab6a524c379342f9b23642917020c082', + 'title': 'Javascript Sample Code', + 'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html', + }, + 'playlist_mincount': 3, + }, { + 'url': 'http://assets.delvenetworks.com/player/loader.swf?channelId=ab6a524c379342f9b23642917020c082', + 'only_matching': True, + }] + _PLAYLIST_SERVICE_PATH = 'channel' + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + channel_id = self._match_id(url) + source_url = smuggled_data.get('source_url') + + pc, mobile = self._extract( + channel_id, 'getPlaylistByChannelId', + 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', + source_url) + + entries = [ + self._extract_info(pc, mobile, i, source_url) + for i in range(len(pc['playlistItems']))] + + return self.playlist_result( + entries, channel_id, pc.get('title'), mobile.get('description')) + + +class LimelightChannelListIE(LimelightBaseIE): + IE_NAME = 'limelight:channel_list' + _VALID_URL = r'''(?x) + (?: + limelight:channel_list:| + https?:// + (?: + link\.videoplatform\.limelight\.com/media/| + assets\.delvenetworks\.com/player/loader\.swf + ) + \?.*?\bchannelListId= + ) + (?P<id>[a-z0-9]{32}) + ''' + _TESTS = [{ + 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', + 'info_dict': { + 'id': '301b117890c4465c8179ede21fd92e2b', + 'title': 'Website - Hero Player', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://assets.delvenetworks.com/player/loader.swf?channelListId=301b117890c4465c8179ede21fd92e2b', + 'only_matching': True, + }] + _PLAYLIST_SERVICE_PATH = 'channel_list' + + def _real_extract(self, url): + channel_list_id = self._match_id(url) + + channel_list = self._call_playlist_service( + channel_list_id, 'getMobileChannelListById') + + entries = [ + self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') + for channel in channel_list['channelList']] + + return self.playlist_result( + entries, channel_list_id, channel_list['title']) diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py new file mode 100644 index 000000000..d4bcae6c1 --- /dev/null +++ b/yt_dlp/extractor/line.py @@ -0,0 +1,228 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + str_or_none, +) + + +class LineTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)' + + _TESTS = [{ + 'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246', + 'info_dict': { + 'id': '793123_ep1-1', + 'ext': 'mp4', + 'title': 'Goodbye Mr.Black | EP.1-1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 998.509, + 'view_count': int, + }, + }, { + 'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245', + 'only_matching': True, + 
}] + + def _real_extract(self, url): + series_id, segment = self._match_valid_url(url).groups() + video_id = '%s_%s' % (series_id, segment) + + webpage = self._download_webpage(url, video_id) + + player_params = self._parse_json(self._search_regex( + r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'), + video_id, transform_source=js_to_json) + + video_info = self._download_json( + 'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json', + video_id, query={ + 'videoId': player_params['videoId'], + 'key': player_params['key'], + }) + + stream = video_info['streams'][0] + extra_query = '?__gda__=' + stream['key']['value'] + formats = self._extract_m3u8_formats( + stream['source'] + extra_query, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + for a_format in formats: + a_format['url'] += extra_query + + duration = None + for video in video_info.get('videos', {}).get('list', []): + encoding_option = video.get('encodingOption', {}) + abr = video['bitrate']['audio'] + vbr = video['bitrate']['video'] + tbr = abr + vbr + formats.append({ + 'url': video['source'], + 'format_id': 'http-%d' % int(tbr), + 'height': encoding_option.get('height'), + 'width': encoding_option.get('width'), + 'abr': abr, + 'vbr': vbr, + 'filesize': video.get('size'), + }) + if video.get('duration') and duration is None: + duration = video['duration'] + + self._sort_formats(formats) + + if formats and not formats[0].get('width'): + formats[0]['vcodec'] = 'none' + + title = self._og_search_title(webpage) + + # like_count requires an additional API request https://tv.line.me/api/likeit/getCount + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'extra_param_to_segment_url': extra_query[1:], + 'duration': duration, + 'thumbnails': [{'url': thumbnail['source']} + for thumbnail in video_info.get('thumbnails', {}).get('list', [])], + 'view_count': video_info.get('meta', {}).get('count'), + } + + +class LineLiveBaseIE(InfoExtractor): + _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' + + def _parse_broadcast_item(self, item): + broadcast_id = compat_str(item['id']) + title = item['title'] + is_live = item.get('isBroadcastingNow') + + thumbnails = [] + for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + }) + + channel = item.get('channel') or {} + channel_id = str_or_none(channel.get('id')) + + return { + 'id': broadcast_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('createdAt')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, + 'duration': int_or_none(item.get('archiveDuration')), + 'view_count': int_or_none(item.get('viewerCount')), + 'comment_count': int_or_none(item.get('chatCount')), + 'is_live': is_live, + } + + +class LineLiveIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', + 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', + 'info_dict': { + 'id': '16331360', + 'title': '振りコピ講座😙😙😙', + 'ext': 'mp4', + 'timestamp': 1617095132, + 'upload_date': '20210330', + 'channel': '白川ゆめか', + 'channel_id': '4867368', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + }, { + # 
archiveStatus == 'DELETED' + 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id, broadcast_id = self._match_valid_url(url).groups() + broadcast = self._download_json( + self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), + broadcast_id) + item = broadcast['item'] + info = self._parse_broadcast_item(item) + protocol = 'm3u8' if info['is_live'] else 'm3u8_native' + formats = [] + for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): + if not v: + continue + if k == 'abr': + formats.extend(self._extract_m3u8_formats( + v, broadcast_id, 'mp4', protocol, + m3u8_id='hls', fatal=False)) + continue + f = { + 'ext': 'mp4', + 'format_id': 'hls-' + k, + 'protocol': protocol, + 'url': v, + } + if not k.isdigit(): + f['vcodec'] = 'none' + formats.append(f) + if not formats: + archive_status = item.get('archiveStatus') + if archive_status != 'ARCHIVED': + self.raise_no_formats('this video has been ' + archive_status.lower(), expected=True) + self._sort_formats(formats) + info['formats'] = formats + return info + + +class LineLiveChannelIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P<id>\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' + _TEST = { + 'url': 'https://live.line.me/channels/5893542', + 'info_dict': { + 'id': '5893542', + 'title': 'いくらちゃん', + 'description': 'md5:c3a4af801f43b2fac0b02294976580be', + }, + 'playlist_mincount': 29 + } + + def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): + while True: + for row in (archived_broadcasts.get('rows') or []): + share_url = str_or_none(row.get('shareURL')) + if not share_url: + continue + info = self._parse_broadcast_item(row) + info.update({ + '_type': 'url', + 'url': share_url, + 'ie_key': LineLiveIE.ie_key(), + }) + yield info + if not archived_broadcasts.get('hasNextPage'): + return + archived_broadcasts = self._download_json( + self._API_BASE_URL + channel_id + '/archived_broadcasts', + channel_id, query={ + 'lastId': info['id'], + }) + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) + return self.playlist_result( + self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), + channel_id, channel.get('title'), channel.get('information')) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py new file mode 100644 index 000000000..3ce906e2f --- /dev/null +++ b/yt_dlp/extractor/linkedin.py @@ -0,0 +1,208 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from itertools import zip_longest +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + srt_subtitles_timecode, + try_get, + urlencode_postdata, + urljoin, +) + + +class LinkedInLearningBaseIE(InfoExtractor): + _NETRC_MACHINE = 'linkedin' + _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' + + def _call_api(self, course_slug, fields, video_slug=None, resolution=None): + query = { + 'courseSlug': course_slug, + 'fields': fields, + 'q': 'slugs', + } + sub = '' + if video_slug: + query.update({ + 'videoSlug': video_slug, + 'resolution': '_%s' % resolution, + }) + sub = ' %dp' % resolution + api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + return self._download_json( + api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + 'Csrf-Token': 
self._get_cookies(api_url)['JSESSIONID'].value, + }, query=query)['elements'][0] + + def _get_urn_id(self, video_data): + urn = video_data.get('urn') + if urn: + mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) + if mobj: + return mobj.group(1) + + def _get_video_id(self, video_data, course_slug, video_slug): + return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + action_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url')) + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + +class LinkedInLearningIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true', + 'md5': 'a1d74422ff0d5e66a792deb996693167', + 'info_dict': { + 'id': '90426', + 'ext': 'mp4', + 'title': 'Welcome', + 'timestamp': 1430396150.82, + 'upload_date': '20150430', + }, + } + + def json2srt(self, transcript_lines, duration=None): + srt_data = '' + for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])): + start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption'] + end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1 + srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time), + srt_subtitles_timecode(end_time), + caption) + return srt_data + + def _real_extract(self, url): + course_slug, video_slug = self._match_valid_url(url).groups() + + video_data = None + formats = [] + for width, height in ((640, 360), (960, 540), (1280, 720)): + video_data = self._call_api( + course_slug, 'selectedVideo', video_slug, height)['selectedVideo'] + + video_url_data = video_data.get('url') or {} + progressive_url = video_url_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'format_id': 'progressive-%dp' % height, + 'url': progressive_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + 'source_preference': 1, + }) + + title = video_data['title'] + + audio_url = video_data.get('audio', {}).get('progressiveUrl') + if audio_url: + formats.append({ + 'abr': 64, + 'ext': 'm4a', + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + streaming_url = video_url_data.get('streamingUrl') + if streaming_url: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_slug, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + # It seems like this would be correctly handled by default + # However, unless someone can confirm this, the old + # behaviour is being kept as-is + self._sort_formats(formats, ('res', 'source_preference')) + subtitles = {} + duration = int_or_none(video_data.get('durationInSeconds')) + transcript_lines = 
try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list) + if transcript_lines: + subtitles['en'] = [{ + 'ext': 'srt', + 'data': self.json2srt(transcript_lines, duration) + }] + + return { + 'id': self._get_video_id(video_data, course_slug, video_slug), + 'title': title, + 'formats': formats, + 'thumbnail': video_data.get('defaultThumbnail'), + 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), + 'duration': duration, + 'subtitles': subtitles, + } + + +class LinkedInLearningCourseIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning:course' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals', + 'info_dict': { + 'id': 'programming-foundations-fundamentals', + 'title': 'Programming Foundations: Fundamentals', + 'description': 'md5:76e580b017694eb89dc8e8923fff5c86', + }, + 'playlist_mincount': 61, + } + + @classmethod + def suitable(cls, url): + return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_slug = self._match_id(url) + course_data = self._call_api(course_slug, 'chapters,description,title') + + entries = [] + for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1): + chapter_title = chapter.get('title') + chapter_id = self._get_urn_id(chapter) + for video in chapter.get('videos', []): + video_slug = video.get('slug') + if not video_slug: + continue + entries.append({ + '_type': 'url_transparent', + 'id': self._get_video_id(video, course_slug, video_slug), + 'title': video.get('title'), + 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + 'ie_key': LinkedInLearningIE.ie_key(), + }) + + return self.playlist_result( + entries, course_slug, + course_data.get('title'), + course_data.get('description')) diff --git a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py new file mode 100644 index 000000000..2053970d1 --- /dev/null +++ b/yt_dlp/extractor/linuxacademy.py @@ -0,0 +1,252 @@ +from __future__ import unicode_literals + +import json +import random + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_HTTPError, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + js_to_json, + parse_duration, + try_get, + unified_timestamp, + urlencode_postdata, + urljoin, +) + + +class LinuxAcademyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?linuxacademy\.com/cp/ + (?: + courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| + modules/view/id/(?P<course_id>\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', + 'info_dict': { + 'id': '7971-2', + 'ext': 'mp4', + 'title': 'What Is Data Science', + 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', + 'timestamp': int, # The timestamp and upload date changes + 'upload_date': r're:\d+', + 'duration': 304, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires Linux Academy account credentials', + }, { + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', + 'only_matching': True, + }, { + 'url': 'https://linuxacademy.com/cp/modules/view/id/154', + 'info_dict': { + 'id': '154', + 'title': 'AWS Certified Cloud Practitioner', + 'description': 
'md5:a68a299ca9bb98d41cca5abc4d4ce22c', + 'duration': 28835, + }, + 'playlist_count': 41, + 'skip': 'Requires Linux Academy account credentials', + }, { + 'url': 'https://linuxacademy.com/cp/modules/view/id/39', + 'info_dict': { + 'id': '39', + 'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)', + 'description': 'md5:0f1d3369e90c3fb14a79813b863c902f', + 'duration': 89280, + }, + 'playlist_count': 73, + 'skip': 'Requires Linux Academy account credentials', + }] + + _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' + _ORIGIN_URL = 'https://linuxacademy.com' + _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' + _NETRC_MACHINE = 'linuxacademy' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def random_string(): + return ''.join([ + random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') + for _ in range(32)]) + + webpage, urlh = self._download_webpage_handle( + self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ + 'client_id': self._CLIENT_ID, + 'response_type': 'token id_token', + 'response_mode': 'web_message', + 'redirect_uri': self._ORIGIN_URL, + 'scope': 'openid email user_impersonation profile', + 'audience': self._ORIGIN_URL, + 'state': random_string(), + 'nonce': random_string(), + }) + + login_data = self._parse_json( + self._search_regex( + r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'login info', group='value'), None, + transform_source=lambda x: compat_b64decode(x).decode('utf-8') + )['extraParams'] + + login_data.update({ + 'client_id': self._CLIENT_ID, + 'redirect_uri': self._ORIGIN_URL, + 'tenant': 'lacausers', + 'connection': 'Username-Password-ACG-Proxy', + 'username': username, + 'password': password, + 'sso': 'true', + }) + + login_state_url = urlh.geturl() + + try: + login_page = self._download_webpage( + 'https://login.linuxacademy.com/usernamepassword/login', None, + 'Downloading login page', data=json.dumps(login_data).encode(), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read(), None) + message = error.get('description') or error['code'] + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + raise + + callback_page, urlh = self._download_webpage_handle( + 'https://login.linuxacademy.com/login/callback', None, + 'Downloading callback page', + data=urlencode_postdata(self._hidden_inputs(login_page)), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + + access_token = self._search_regex( + r'access_token=([^=&]+)', urlh.geturl(), + 'access token', default=None) + if not access_token: + access_token = self._parse_json( + self._search_regex( + r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, + 'authorization response'), None, + transform_source=js_to_json)['response']['access_token'] + + self._download_webpage( + 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' + % access_token, None, 'Downloading token validation page') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') + item_id = course_id if course_id 
else '%s-%s' % (chapter_id, lecture_id) + + webpage = self._download_webpage(url, item_id) + + # course path + if course_id: + module = self._parse_json( + self._search_regex( + r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'), + item_id) + entries = [] + chapter_number = None + chapter = None + chapter_id = None + for item in module['items']: + if not isinstance(item, dict): + continue + + def type_field(key): + return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() + type_fields = (type_field('name'), type_field('slug')) + # Move to next module section + if 'section' in type_fields: + chapter = item.get('course_name') + chapter_id = item.get('course_module') + chapter_number = 1 if not chapter_number else chapter_number + 1 + continue + # Skip non-lessons + if 'lesson' not in type_fields: + continue + lesson_url = urljoin(url, item.get('url')) + if not lesson_url: + continue + title = item.get('title') or item.get('lesson_name') + description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) + entries.append({ + '_type': 'url_transparent', + 'url': lesson_url, + 'ie_key': LinuxAcademyIE.ie_key(), + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), + 'duration': parse_duration(item.get('duration')), + 'chapter': chapter, + 'chapter_id': chapter_id, + 'chapter_number': chapter_number, + }) + return { + '_type': 'playlist', + 'entries': entries, + 'id': course_id, + 'title': module.get('title'), + 'description': module.get('md_desc') or clean_html(module.get('desc')), + 'duration': parse_duration(module.get('duration')), + } + + # single video path + m3u8_url = self._parse_json( + self._search_regex( + r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), + item_id)[0]['file'] + formats = self._extract_m3u8_formats( + m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + info = { + 'id': item_id, + 'formats': formats, + } + lesson = self._parse_json( + self._search_regex( + (r'window\.lesson\s*=\s*({.+?})\s*;', + r'player\.lesson\s*=\s*({.+?})\s*;'), + webpage, 'lesson', default='{}'), item_id, fatal=False) + if lesson: + info.update({ + 'title': lesson.get('lesson_name'), + 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), + 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), + 'duration': parse_duration(lesson.get('duration')), + }) + if not info.get('title'): + info['title'] = self._search_regex( + (r'>Lecture\s*:\s*(?P<value>[^<]+)', + r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + return info diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py new file mode 100644 index 000000000..18d237ef9 --- /dev/null +++ b/yt_dlp/extractor/litv.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + smuggle_url, + unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)' + + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + + _TESTS = [{ + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 
'VOD00041606', + 'title': '花千骨', + }, + 'playlist_count': 50, + }, { + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'md5': '969e343d9244778cb29acec608e53640', + 'info_dict': { + 'id': 'VOD00041610', + 'ext': 'mp4', + 'title': '花千骨第1集', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'episode_number': 1, + }, + 'params': { + 'noplaylist': True, + }, + 'skip': 'Georestricted to Taiwan', + }, { + 'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&', + 'md5': '88322ea132f848d6e3e18b32a832b918', + 'info_dict': { + 'id': 'VOD00044841', + 'ext': 'mp4', + 'title': '芈月傳第1集 霸星芈月降世楚國', + 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', + }, + 'skip': 'Georestricted to Taiwan', + }] + + def _extract_playlist(self, season_list, video_id, program_info, prompt=True): + episode_title = program_info['title'] + content_id = season_list['contentId'] + + if prompt: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + + all_episodes = [ + self.url_result(smuggle_url( + self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']), + {'force_noplaylist': True})) # To prevent infinite recursion + for episode in season_list['episode']] + + return self.playlist_result(all_episodes, content_id, episode_title) + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + + video_id = self._match_id(url) + + noplaylist = self.get_param('noplaylist') + noplaylist_prompt = True + if 'force_noplaylist' in data: + noplaylist = data['force_noplaylist'] + noplaylist_prompt = False + + webpage = self._download_webpage(url, video_id) + + program_info = self._parse_json(self._search_regex( + r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + video_id) + + season_list = list(program_info.get('seasonList', {}).values()) + if season_list: + if not noplaylist: + return self._extract_playlist( + season_list[0], video_id, program_info, + prompt=noplaylist_prompt) + + if noplaylist_prompt: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + # In browsers `getMainUrl` request is always issued. Usually this + # endpoint gives the same result as the data embedded in the webpage. 
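+        # (i.e. the programInfo JSON parsed above is preferred, and the
+        # endpoint is only queried when that data is missing).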
+        # If georestricted, there is no embedded data, so an extra request is
+        # necessary to get the error code
+        if 'assetId' not in program_info:
+            program_info = self._download_json(
+                'https://www.litv.tv/vod/ajax/getProgramInfo', video_id,
+                query={'contentId': video_id},
+                headers={'Accept': 'application/json'})
+        video_data = self._parse_json(self._search_regex(
+            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
+            webpage, 'video data', default='{}'), video_id)
+        if not video_data:
+            payload = {
+                'assetId': program_info['assetId'],
+                'watchDevices': program_info['watchDevices'],
+                'contentType': program_info['contentType'],
+            }
+            video_data = self._download_json(
+                'https://www.litv.tv/vod/getMainUrl', video_id,
+                data=json.dumps(payload).encode('utf-8'),
+                headers={'Content-Type': 'application/json'})
+
+        if not video_data.get('fullpath'):
+            error_msg = video_data.get('errorMessage')
+            if error_msg == 'vod.error.outsideregionerror':
+                self.raise_geo_restricted('This video is available in Taiwan only')
+            if error_msg:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+            raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
+
+        formats = self._extract_m3u8_formats(
+            video_data['fullpath'], video_id, ext='mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        for a_format in formats:
+            # LiTV HLS segments don't like compression
+            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
+
+        title = program_info['title'] + program_info.get('secondaryMark', '')
+        description = program_info.get('description')
+        thumbnail = program_info.get('imageFile')
+        categories = [item['name'] for item in program_info.get('category', [])]
+        episode = int_or_none(program_info.get('episode'))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'episode_number': episode,
+        }
diff --git a/youtube_dl/extractor/livejournal.py b/yt_dlp/extractor/livejournal.py
index 3a9f4553f..3a9f4553f 100644
--- a/youtube_dl/extractor/livejournal.py
+++ b/yt_dlp/extractor/livejournal.py
diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py
new file mode 100644
index 000000000..f591289ec
--- /dev/null
+++ b/yt_dlp/extractor/livestream.py
@@ -0,0 +1,366 @@
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    find_xpath_attr,
+    xpath_attr,
+    xpath_with_ns,
+    xpath_text,
+    orderedSet,
+    update_url_query,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    determine_ext,
+)
+
+
+class LivestreamIE(InfoExtractor):
+    IE_NAME = 'livestream'
+    _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'
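+    # Both numeric ids (accounts/<id>/events/<id>) and name-based slugs are
+    # accepted for the account and event parts, with an optional trailing
+    # /videos/<id>; when no video id is present, _real_extract falls back to
+    # treating the whole event as a playlist (see the tests below for both
+    # forms).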
+ _TESTS = [{ + 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', + 'md5': '53274c76ba7754fb0e8d072716f2292b', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': 'Live from Webster Hall NYC', + 'timestamp': 1350008072, + 'upload_date': '20121012', + 'duration': 5968.0, + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:^http://.*\.jpg$' + } + }, { + 'url': 'http://new.livestream.com/tedx/cityenglish', + 'info_dict': { + 'title': 'TEDCity2.0 (English)', + 'id': '2245590', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://new.livestream.com/chess24/tatasteelchess', + 'info_dict': { + 'title': 'Tata Steel Chess', + 'id': '3705884', + }, + 'playlist_mincount': 60, + }, { + 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', + 'only_matching': True, + }, { + 'url': 'http://livestream.com/bsww/concacafbeachsoccercampeonato2015', + 'only_matching': True, + }] + _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + base_ele = find_xpath_attr( + smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') + base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' + + formats = [] + video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) + + for vn in video_nodes: + tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000) + furl = ( + update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), { + 'v': '3.0.3', + 'fp': 'WIN% 14,0,0,145', + })) + if 'clipBegin' in vn.attrib: + furl += '&ssek=' + vn.attrib['clipBegin'] + formats.append({ + 'url': furl, + 'format_id': 'smil_%d' % tbr, + 'ext': 'flv', + 'tbr': tbr, + 'preference': -1000, # Strictly inferior than all other formats? 
+ }) + return formats + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + + FORMAT_KEYS = ( + ('sd', 'progressive_url'), + ('hd', 'progressive_url_hd'), + ) + + formats = [] + for format_id, key in FORMAT_KEYS: + video_url = video_data.get(key) + if video_url: + ext = determine_ext(video_url) + if ext == 'm3u8': + continue + bitrate = int_or_none(self._search_regex( + r'(\d+)\.%s' % ext, video_url, 'bitrate', default=None)) + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': ext, + }) + + smil_url = video_data.get('smil_url') + if smil_url: + formats.extend(self._extract_smil_formats(smil_url, video_id, fatal=False)) + + m3u8_url = video_data.get('m3u8_url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + f4m_url = video_data.get('f4m_url') + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) + + comments = [{ + 'author_id': comment.get('author_id'), + 'author': comment.get('author', {}).get('full_name'), + 'id': comment.get('id'), + 'text': comment['text'], + 'timestamp': parse_iso8601(comment.get('created_at')), + } for comment in video_data.get('comments', {}).get('data', [])] + + return { + 'id': video_id, + 'formats': formats, + 'title': video_data['caption'], + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail_url'), + 'duration': float_or_none(video_data.get('duration'), 1000), + 'timestamp': parse_iso8601(video_data.get('publish_at')), + 'like_count': video_data.get('likes', {}).get('total'), + 'comment_count': video_data.get('comments', {}).get('total'), + 'view_count': video_data.get('views'), + 'comments': comments, + } + + def _extract_stream_info(self, stream_info): + broadcast_id = compat_str(stream_info['broadcast_id']) + is_live = stream_info.get('is_live') + + formats = [] + smil_url = stream_info.get('play_url') + if smil_url: + formats.extend(self._extract_smil_formats(smil_url, broadcast_id)) + + m3u8_url = stream_info.get('m3u8_url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, broadcast_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + rtsp_url = stream_info.get('rtsp_url') + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + self._sort_formats(formats) + + return { + 'id': broadcast_id, + 'formats': formats, + 'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'], + 'thumbnail': stream_info.get('thumbnail_url'), + 'is_live': is_live, + } + + def _extract_event(self, event_data): + event_id = compat_str(event_data['id']) + account_id = compat_str(event_data['owner_account_id']) + feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' + + stream_info = event_data.get('stream_info') + if stream_info: + return self._extract_stream_info(stream_info) + + last_video = None + entries = [] + for i in itertools.count(1): + if last_video is None: + info_url = feed_root_url + else: + info_url = '{root}?&id={id}&newer=-1&type=video'.format( + root=feed_root_url, id=last_video) + videos_info = self._download_json( + info_url, event_id, 'Downloading page {0}'.format(i))['data'] + videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] + if not videos_info: + break + for v in videos_info: + v_id = compat_str(v['id']) + 
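+                # defer the actual media extraction to LivestreamIE by emitting
+                # each item's canonical per-video URL instead of parsing it here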
entries.append(self.url_result( + 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), + 'Livestream', v_id, v.get('caption'))) + last_video = videos_info[-1]['id'] + return self.playlist_result(entries, event_id, event_data['full_name']) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + event = mobj.group('event_id') or mobj.group('event_name') + account = mobj.group('account_id') or mobj.group('account_name') + api_url = self._API_URL_TEMPLATE % (account, event) + if video_id: + video_data = self._download_json( + api_url + '/videos/%s' % video_id, video_id) + return self._extract_video_info(video_data) + else: + event_data = self._download_json(api_url, video_id) + return self._extract_event(event_data) + + +# The original version of Livestream uses a different system +class LivestreamOriginalIE(InfoExtractor): + IE_NAME = 'livestream:original' + _VALID_URL = r'''(?x)https?://original\.livestream\.com/ + (?P<user>[^/\?#]+)(?:/(?P<type>video|folder) + (?:(?:\?.*?Id=|/)(?P<id>.*?)(&|$))?)? + ''' + _TESTS = [{ + 'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', + 'info_dict': { + 'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', + 'ext': 'mp4', + 'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', + 'duration': 771.301, + 'view_count': int, + }, + }, { + 'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3', + 'info_dict': { + 'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3', + }, + 'playlist_mincount': 4, + }, { + # live stream + 'url': 'http://original.livestream.com/znsbahamas', + 'only_matching': True, + }] + + def _extract_video_info(self, user, video_id): + api_url = 'http://x%sx.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id=%s' % (user, video_id) + info = self._download_xml(api_url, video_id) + + item = info.find('channel').find('item') + title = xpath_text(item, 'title') + media_ns = {'media': 'http://search.yahoo.com/mrss'} + thumbnail_url = xpath_attr( + item, xpath_with_ns('media:thumbnail', media_ns), 'url') + duration = float_or_none(xpath_attr( + item, xpath_with_ns('media:content', media_ns), 'duration')) + ls_ns = {'ls': 'http://api.channel.livestream.com/2.0'} + view_count = int_or_none(xpath_text( + item, xpath_with_ns('ls:viewsCount', ls_ns))) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail_url, + 'duration': duration, + 'view_count': view_count, + } + + def _extract_video_formats(self, video_data, video_id): + formats = [] + + progressive_url = video_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'url': progressive_url, + 'format_id': 'http', + }) + + m3u8_url = video_data.get('httpUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + rtsp_url = video_data.get('rtspUrl') + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + return formats + + def _extract_folder(self, url, folder_id): + webpage = self._download_webpage(url, folder_id) + paths = orderedSet(re.findall( + r'''(?x)(?: + <li\s+class="folder">\s*<a\s+href="| + <a\s+href="(?=https?://livestre\.am/) + )([^"]+)"''', webpage)) + + entries = [{ + '_type': 'url', + 'url': compat_urlparse.urljoin(url, p), + } for p in paths] + + return self.playlist_result(entries, folder_id) + + 
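+    # Folder URLs become playlists of livestre.am shortlinks; video URLs and
+    # bare channel pages both go through the mobile getstream.json endpoint,
+    # which also reports whether the channel is currently live.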
def _real_extract(self, url): + mobj = self._match_valid_url(url) + user = mobj.group('user') + url_type = mobj.group('type') + content_id = mobj.group('id') + if url_type == 'folder': + return self._extract_folder(url, content_id) + else: + # this url is used on mobile devices + stream_url = 'http://x%sx.api.channel.livestream.com/3.0/getstream.json' % user + info = {} + if content_id: + stream_url += '?id=%s' % content_id + info = self._extract_video_info(user, content_id) + else: + content_id = user + webpage = self._download_webpage(url, content_id) + info = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._search_regex(r'channelLogo\.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None), + } + video_data = self._download_json(stream_url, content_id) + is_live = video_data.get('isLive') + info.update({ + 'id': content_id, + 'title': self._live_title(info['title']) if is_live else info['title'], + 'formats': self._extract_video_formats(video_data, content_id), + 'is_live': is_live, + }) + return info + + +# The server doesn't support HEAD request, the generic extractor can't detect +# the redirection +class LivestreamShortenerIE(InfoExtractor): + IE_NAME = 'livestream:shortener' + IE_DESC = False # Do not list + _VALID_URL = r'https?://livestre\.am/(?P<id>.+)' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + id = mobj.group('id') + webpage = self._download_webpage(url, id) + + return self.url_result(self._og_search_url(webpage)) diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnkgo.py new file mode 100644 index 000000000..14675968e --- /dev/null +++ b/yt_dlp/extractor/lnkgo.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + clean_html, + compat_str, + int_or_none, + parse_iso8601, +) + + +class LnkGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?' + _TESTS = [{ + 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', + 'info_dict': { + 'id': '10809', + 'ext': 'mp4', + 'title': "Put'ka: Trys Klausimai", + 'upload_date': '20161216', + 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. 
Pabandykime surasti atsakymus.',
+            'age_limit': 18,
+            'duration': 117,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1481904000,
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }, {
+        'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2',
+        'info_dict': {
+            'id': '10467',
+            'ext': 'mp4',
+            'title': 'Nėrdas: Kompiuterio Valymas',
+            'upload_date': '20150113',
+            'description': 'md5:7352d113a242a808676ff17e69db6a69',
+            'age_limit': 18,
+            'duration': 346,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1421164800,
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }, {
+        'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413',
+        'only_matching': True,
+    }]
+    _AGE_LIMITS = {
+        'N-7': 7,
+        'N-14': 14,
+        'S': 18,
+    }
+    _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s'
+
+    def _real_extract(self, url):
+        display_id, video_id = self._match_valid_url(url).groups()
+
+        video_info = self._download_json(
+            'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'),
+            display_id)['videoConfig']['videoInfo']
+
+        video_id = compat_str(video_info['id'])
+        title = video_info['title']
+        prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4'
+        formats = self._extract_m3u8_formats(
+            self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''),
+            video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+
+        poster_image = video_info.get('posterImage')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None,
+            'duration': int_or_none(video_info.get('duration')),
+            'description': clean_html(video_info.get('htmlDescription')),
+            'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0),
+            'timestamp': parse_iso8601(video_info.get('airDate')),
+            'view_count': int_or_none(video_info.get('viewsCount')),
+        }
diff --git a/yt_dlp/extractor/localnews8.py b/yt_dlp/extractor/localnews8.py
new file mode 100644
index 000000000..c3e9d10fa
--- /dev/null
+++ b/yt_dlp/extractor/localnews8.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+
+
+class LocalNews8IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304',
+        'md5': 'be4d48aea61aa2bde7be2ee47691ad20',
+        'info_dict': {
+            'id': '35183304',
+            'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
+            'ext': 'mp4',
+            'title': 'Rexburg business turns carbon fiber scraps into wedding ring',
+            'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.',
+            'duration': 153,
+            'timestamp': 1441844822,
+            'upload_date': '20150910',
+            'uploader_id': 'api',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        partner_id = self._search_regex(
+            r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1',
+            webpage, 'partner id', group='id')
+        kaltura_id = self._search_regex(
+            r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
+            webpage, 'video id', group='id')
+
+        return {
+            '_type': 
'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), + 'ie_key': 'Kaltura', + 'id': video_id, + 'display_id': display_id, + } diff --git a/yt_dlp/extractor/lovehomeporn.py b/yt_dlp/extractor/lovehomeporn.py new file mode 100644 index 000000000..ca4b5f375 --- /dev/null +++ b/yt_dlp/extractor/lovehomeporn.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + + +from .nuevo import NuevoBaseIE + + +class LoveHomePornIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?lovehomeporn\.com/video/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _TEST = { + 'url': 'http://lovehomeporn.com/video/48483/stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick#menu', + 'info_dict': { + 'id': '48483', + 'display_id': 'stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick', + 'ext': 'mp4', + 'title': 'Stunning busty brunette girlfriend sucking and riding a big dick', + 'age_limit': 18, + 'duration': 238.47, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + info = self._extract_nuevo( + 'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'age_limit': 18 + }) + return info diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py new file mode 100644 index 000000000..4024aef73 --- /dev/null +++ b/yt_dlp/extractor/lrt.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + clean_html, + merge_dicts, +) + + +class LRTIE(InfoExtractor): + IE_NAME = 'lrt.lt' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' + _TESTS = [{ + # m3u8 download + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', + 'info_dict': { + 'id': '2000127261', + 'ext': 'mp4', + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', + }, + }, { + # direct mp3 download + 'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/', + 'md5': '389da8ca3cad0f51d12bed0c844f6a0a', + 'info_dict': { + 'id': '1013074524', + 'ext': 'mp3', + 'title': 'Kita tema 2016-09-05 15:05', + 'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5', + 'duration': 3008, + 'view_count': int, + 'like_count': int, + }, + }] + + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + + def _real_extract(self, url): + path, video_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) + + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in (media.get('tags') or []): + tag_name = tag.get('name') + if not tag_name: + continue + tags.append(tag_name) + + clean_info = { + 
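+            # merge_dicts gives earlier arguments priority (the first non-empty
+            # value wins), so the cleaned description and tag list below take
+            # precedence over the JWPlayer data and JSON-LD fields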
'description': clean_html(media.get('content')), + 'tags': tags, + } + + return merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/yt_dlp/extractor/lynda.py b/yt_dlp/extractor/lynda.py new file mode 100644 index 000000000..58cf17239 --- /dev/null +++ b/yt_dlp/extractor/lynda.py @@ -0,0 +1,341 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata, +) + + +class LyndaBaseIE(InfoExtractor): + _SIGNIN_URL = 'https://www.lynda.com/signin/lynda' + _PASSWORD_URL = 'https://www.lynda.com/signin/password' + _USER_URL = 'https://www.lynda.com/signin/user' + _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' + _NETRC_MACHINE = 'lynda' + + def _real_initialize(self): + self._login() + + @staticmethod + def _check_error(json_string, key_or_keys): + keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys + for key in keys: + error = json_string.get(key) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url): + action_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html, + 'post url', default=fallback_action_url, group='url') + + if not action_url.startswith('http'): + action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url) + + form_data = self._hidden_inputs(form_html) + form_data.update(extra_form_data) + + response = self._download_json( + action_url, None, note, + data=urlencode_postdata(form_data), + headers={ + 'Referer': referrer_url, + 'X-Requested-With': 'XMLHttpRequest', + }, expected_status=(418, 500, )) + + self._check_error(response, ('email', 'password', 'ErrorMessage')) + + return response, action_url + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Step 1: download signin page + signin_page = self._download_webpage( + self._SIGNIN_URL, None, 'Downloading signin page') + + # Already logged in + if any(re.search(p, signin_page) for p in ( + r'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): + return + + # Step 2: submit email + signin_form = self._search_regex( + r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)', + signin_page, 'signin form') + signin_page, signin_url = self._login_step( + signin_form, self._PASSWORD_URL, {'email': username}, + 'Submitting email', self._SIGNIN_URL) + + # Step 3: submit password + password_form = signin_page['body'] + self._login_step( + password_form, self._USER_URL, {'email': username, 'password': password}, + 'Submitting password', signin_url) + + +class LyndaIE(LyndaBaseIE): + IE_NAME = 'lynda' + IE_DESC = 'lynda.com videos' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:lynda\.com|educourse\.ga)/ + (?: + (?:[^/]+/){2,3}(?P<course_id>\d+)| + player/embed + )/ + (?P<id>\d+) + ''' + + _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + # md5 is unstable + 'info_dict': { + 'id': '114408', + 'ext': 'mp4', + 'title': 'Using the exercise files', + 'duration': 68 + } + }, { + 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', + 'only_matching': True, + }, { + 'url': 
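The lynda.com login above is a three-step form dance: fetch the signin page, post the e-mail, then post the password, each step re-submitting the form's hidden inputs through `_login_step`. The `expected_status=(418, 500)` is the subtle part: lynda answers failed steps with those HTTP codes while still returning the JSON error body that `_check_error` inspects. The shape of one step, schematically:

    form_data = self._hidden_inputs(form_html)   # CSRF tokens and friends
    form_data.update({'email': username})        # step-specific fields
    response = self._download_json(
        action_url, None, 'Submitting email',
        data=urlencode_postdata(form_data),
        headers={'Referer': referrer_url, 'X-Requested-With': 'XMLHttpRequest'},
        expected_status=(418, 500))              # error bodies are still JSON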
'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', + 'only_matching': True, + }, { + # Status="NotFound", Message="Transcript not found" + 'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html', + 'only_matching': True, + }] + + def _raise_unavailable(self, video_id): + self.raise_login_required( + 'Video %s is only available for members' % video_id) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + course_id = mobj.group('course_id') + + query = { + 'videoId': video_id, + 'type': 'video', + } + + video = self._download_json( + 'https://www.lynda.com/ajax/player', video_id, + 'Downloading video JSON', fatal=False, query=query) + + # Fallback scenario + if not video: + query['courseId'] = course_id + + play = self._download_json( + 'https://www.lynda.com/ajax/course/%s/%s/play' + % (course_id, video_id), video_id, 'Downloading play JSON') + + if not play: + self._raise_unavailable(video_id) + + formats = [] + for formats_dict in play: + urls = formats_dict.get('urls') + if not isinstance(urls, dict): + continue + cdn = formats_dict.get('name') + for format_id, format_url in urls.items(): + if not format_url: + continue + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id, + 'height': int_or_none(format_id), + }) + self._sort_formats(formats) + + conviva = self._download_json( + 'https://www.lynda.com/ajax/player/conviva', video_id, + 'Downloading conviva JSON', query=query) + + return { + 'id': video_id, + 'title': conviva['VideoTitle'], + 'description': conviva.get('VideoDescription'), + 'release_year': int_or_none(conviva.get('ReleaseYear')), + 'duration': int_or_none(conviva.get('Duration')), + 'creator': conviva.get('Author'), + 'formats': formats, + } + + if 'Status' in video: + raise ExtractorError( + 'lynda returned error: %s' % video['Message'], expected=True) + + if video.get('HasAccess') is False: + self._raise_unavailable(video_id) + + video_id = compat_str(video.get('ID') or video_id) + duration = int_or_none(video.get('DurationInSeconds')) + title = video['Title'] + + formats = [] + + fmts = video.get('Formats') + if fmts: + formats.extend([{ + 'url': f['Url'], + 'ext': f.get('Extension'), + 'width': int_or_none(f.get('Width')), + 'height': int_or_none(f.get('Height')), + 'filesize': int_or_none(f.get('FileSize')), + 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None, + } for f in fmts if f.get('Url')]) + + prioritized_streams = video.get('PrioritizedStreams') + if prioritized_streams: + for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): + formats.extend([{ + 'url': video_url, + 'height': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items()]) + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + subtitles = self.extract_subtitles(video_id) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'subtitles': subtitles, + 'formats': formats + } + + def _fix_subtitles(self, subs): + srt = '' + seq_counter = 0 + for pos in range(0, len(subs) - 1): + seq_current = subs[pos] + m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) + if m_current is None: + 
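`_fix_subtitles` pairs each transcript entry's `[hh:mm:ss.mmm]` timecode with the next entry's timecode as the cue end time (the final entry only acts as a terminator), then emits numbered SRT cues. One iteration produces a block like:

    # '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (1, '00:00:01.52', '00:00:04.03', 'Hi there')
    #
    #   1
    #   00:00:01.52 --> 00:00:04.03
    #   Hi there
    #
    # The decimal separator is kept as-is from the source, per _TIMECODE_REGEX ([\.,]).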
continue + seq_next = subs[pos + 1] + m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) + if m_next is None: + continue + appear_time = m_current.group('timecode') + disappear_time = m_next.group('timecode') + text = seq_current['Caption'].strip() + if text: + seq_counter += 1 + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text) + if srt: + return srt + + def _get_subtitles(self, video_id): + url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id + subs = self._download_webpage( + url, video_id, 'Downloading subtitles JSON', fatal=False) + if not subs or 'Status="NotFound"' in subs: + return {} + subs = self._parse_json(subs, video_id, fatal=False) + if not subs: + return {} + fixed_subs = self._fix_subtitles(subs) + if fixed_subs: + return {'en': [{'ext': 'srt', 'data': fixed_subs}]} + return {} + + +class LyndaCourseIE(LyndaBaseIE): + IE_NAME = 'lynda:course' + IE_DESC = 'lynda.com online courses' + + # Course link equals to welcome/introduction video link of same course + # We will recognize it as course link + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + course_path = mobj.group('coursepath') + course_id = mobj.group('courseid') + + item_template = 'https://www.lynda.com/%s/%%s-4.html' % course_path + + course = self._download_json( + 'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, + course_id, 'Downloading course JSON', fatal=False) + + if not course: + webpage = self._download_webpage(url, course_id) + entries = [ + self.url_result( + item_template % video_id, ie=LyndaIE.ie_key(), + video_id=video_id) + for video_id in re.findall( + r'data-video-id=["\'](\d+)', webpage)] + return self.playlist_result( + entries, course_id, + self._og_search_title(webpage, fatal=False), + self._og_search_description(webpage)) + + if course.get('Status') == 'NotFound': + raise ExtractorError( + 'Course %s does not exist' % course_id, expected=True) + + unaccessible_videos = 0 + entries = [] + + # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided + # by single video API anymore + + for chapter in course['Chapters']: + for video in chapter.get('Videos', []): + if video.get('HasAccess') is False: + unaccessible_videos += 1 + continue + video_id = video.get('ID') + if video_id: + entries.append({ + '_type': 'url_transparent', + 'url': item_template % video_id, + 'ie_key': LyndaIE.ie_key(), + 'chapter': chapter.get('Title'), + 'chapter_number': int_or_none(chapter.get('ChapterIndex')), + 'chapter_id': compat_str(chapter.get('ID')), + }) + + if unaccessible_videos > 0: + self.report_warning( + '%s videos are only available for members (or paid members) and will not be downloaded. 
' + % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT) + + course_title = course.get('Title') + course_description = course.get('Description') + + return self.playlist_result(entries, course_id, course_title, course_description) diff --git a/youtube_dl/extractor/m6.py b/yt_dlp/extractor/m6.py index 9806875e8..9806875e8 100644 --- a/youtube_dl/extractor/m6.py +++ b/yt_dlp/extractor/m6.py diff --git a/yt_dlp/extractor/magentamusik360.py b/yt_dlp/extractor/magentamusik360.py new file mode 100644 index 000000000..5c274902f --- /dev/null +++ b/yt_dlp/extractor/magentamusik360.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MagentaMusik360IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?magenta-musik-360\.de/([a-z0-9-]+-(?P<id>[0-9]+)|festivals/.+)' + _TESTS = [{ + 'url': 'https://www.magenta-musik-360.de/within-temptation-wacken-2019-1-9208205928595185932', + 'md5': '65b6f060b40d90276ec6fb9b992c1216', + 'info_dict': { + 'id': '9208205928595185932', + 'ext': 'm3u8', + 'title': 'WITHIN TEMPTATION', + 'description': 'Robert Westerholt und Sharon Janny den Adel gründeten die Symphonic Metal-Band. Privat sind die Niederländer ein Paar und haben zwei Kinder. Die Single Ice Queen brachte ihnen Platin und Gold und verhalf 2002 zum internationalen Durchbruch. Charakteristisch für die Band war Anfangs der hohe Gesang von Frontfrau Sharon. Stilistisch fing die Band im Gothic Metal an. Mit neuem Sound, schnellen Gitarrenriffs und Gitarrensoli, avancierte Within Temptation zur erfolgreichen Rockband. Auch dieses Jahr wird die Band ihre Fangemeinde wieder mitreißen.', + } + }, { + 'url': 'https://www.magenta-musik-360.de/festivals/wacken-world-wide-2020-body-count-feat-ice-t', + 'md5': '81010d27d7cab3f7da0b0f681b983b7e', + 'info_dict': { + 'id': '9208205928595231363', + 'ext': 'm3u8', + 'title': 'Body Count feat. Ice-T', + 'description': 'Body Count feat. Ice-T konnten bereits im vergangenen Jahr auf dem „Holy Ground“ in Wacken überzeugen. 2020 gehen die Crossover-Metaller aus einem Club in Los Angeles auf Sendung und bringen mit ihrer Mischung aus Metal und Hip-Hop Abwechslung und ordentlich Alarm zum WWW. Bereits seit 1990 stehen die beiden Gründer Ice-T (Gesang) und Ernie C (Gitarre) auf der Bühne. 
Sieben Studioalben hat die Gruppe bis jetzt veröffentlicht, darunter das Debüt „Body Count“ (1992) mit dem kontroversen Track „Cop Killer“.', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + # _match_id casts to string, but since "None" is not a valid video_id for magenta + # there is no risk for confusion + if video_id == "None": + webpage = self._download_webpage(url, video_id) + video_id = self._html_search_regex(r'data-asset-id="([^"]+)"', webpage, 'video_id') + json = self._download_json("https://wcps.t-online.de/cvss/magentamusic/vodplayer/v3/player/58935/%s/Main%%20Movie" % video_id, video_id) + xml_url = json['content']['feature']['representations'][0]['contentPackages'][0]['media']['href'] + metadata = json['content']['feature'].get('metadata') + title = None + description = None + duration = None + thumbnails = [] + if metadata: + title = metadata.get('title') + description = metadata.get('fullDescription') + duration = metadata.get('runtimeInSeconds') + for img_key in ('teaserImageWide', 'smallCoverImage'): + if img_key in metadata: + thumbnails.append({'url': metadata[img_key].get('href')}) + + xml = self._download_xml(xml_url, video_id) + final_url = xml[0][0][0].attrib['src'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'url': final_url, + 'duration': duration, + 'thumbnails': thumbnails + } diff --git a/yt_dlp/extractor/mailru.py b/yt_dlp/extractor/mailru.py new file mode 100644 index 000000000..5d9f80bb3 --- /dev/null +++ b/yt_dlp/extractor/mailru.py @@ -0,0 +1,344 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_duration, + remove_end, + try_get, + urljoin, +) + + +class MailRuIE(InfoExtractor): + IE_NAME = 'mailru' + IE_DESC = 'Видео@Mail.Ru' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m|videoapi)\.)?my\.mail\.ru/+ + (?: + video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)| + (?:videos/embed/)?(?:(?P<idv2prefix>(?:[^/]+/+){2})(?:video/(?:embed/)?)?(?P<idv2suffix>[^/]+/\d+))(?:\.html)?| + (?:video/embed|\+/video/meta)/(?P<metaid>\d+) + ) + ''' + _TESTS = [ + { + 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', + 'md5': 'dea205f03120046894db4ebb6159879a', + 'info_dict': { + 'id': '46301138_76', + 'ext': 'mp4', + 'title': 'Новый Человек-Паук. Высокое напряжение. 
Восстание Электро', + 'timestamp': 1393235077, + 'upload_date': '20140224', + 'uploader': 'sonypicturesrus', + 'uploader_id': 'sonypicturesrus@mail.ru', + 'duration': 184, + }, + 'skip': 'Not accessible from Travis CI server', + }, + { + 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', + 'md5': '00a91a58c3402204dcced523777b475f', + 'info_dict': { + 'id': '46843144_1263', + 'ext': 'mp4', + 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', + 'timestamp': 1397039888, + 'upload_date': '20140409', + 'uploader': 'hitech', + 'uploader_id': 'hitech@corp.mail.ru', + 'duration': 245, + }, + 'skip': 'Not accessible from Travis CI server', + }, + { + # only available via metaUrl API + 'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html', + 'md5': '3b26d2491c6949d031a32b96bd97c096', + 'info_dict': { + 'id': '56664382_502', + 'ext': 'mp4', + 'title': ':8336', + 'timestamp': 1449094163, + 'upload_date': '20151202', + 'uploader': '720pizle@mail.ru', + 'uploader_id': '720pizle@mail.ru', + 'duration': 6001, + }, + 'skip': 'Not accessible from Travis CI server', + }, + { + 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html', + 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/video/embed/7949340477499637815', + 'only_matching': True, + }, + { + 'url': 'http://my.mail.ru/+/video/meta/7949340477499637815', + 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru//list/sinyutin10/video/_myvideo/4.html', + 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html', + 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009', + 'only_matching': True, + }, + { + 'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + meta_id = mobj.group('metaid') + + video_id = None + if meta_id: + meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id + else: + video_id = mobj.group('idv1') + if not video_id: + video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') + webpage = self._download_webpage(url, video_id) + page_config = self._parse_json(self._search_regex([ + r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', + r'(?s)"video":\s*({.+?}),'], + webpage, 'page config', default='{}'), video_id, fatal=False) + if page_config: + meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl') + else: + meta_url = None + + video_data = None + + # fix meta_url if missing the host address + if re.match(r'^\/\+\/', meta_url): + meta_url = urljoin('https://my.mail.ru', meta_url) + + if meta_url: + video_data = self._download_json( + meta_url, video_id or meta_id, 'Downloading video meta JSON', + fatal=not video_id) + + # Fallback old approach + if not video_data: + video_data = self._download_json( + 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, + video_id, 'Downloading video JSON') + + headers = {} + + video_key = self._get_cookies('https://my.mail.ru').get('video_key') + if video_key: + headers['Cookie'] = 'video_key=%s' % video_key.value + + formats = [] + for f in video_data['videos']: + video_url = f.get('url') + if not video_url: + continue + format_id = f.get('key') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None + formats.append({ + 'url': video_url, + 
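The `video_key` cookie fetched above is needed when downloading the media itself, not just the metadata, so it is attached to every format dict through `http_headers` (added just below), which yt-dlp forwards to the downloader on a per-format basis. Note that `_get_cookies()` returns `Morsel` objects, hence the `.value`:

    video_key = self._get_cookies('https://my.mail.ru').get('video_key')
    if video_key:
        headers['Cookie'] = 'video_key=%s' % video_key.value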
'format_id': format_id, + 'height': height, + 'http_headers': headers, + }) + self._sort_formats(formats) + + meta_data = video_data['meta'] + title = remove_end(meta_data['title'], '.mp4') + + author = video_data.get('author') + uploader = author.get('name') + uploader_id = author.get('id') or author.get('email') + view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count')) + + acc_id = meta_data.get('accId') + item_id = meta_data.get('itemId') + content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id + + thumbnail = meta_data.get('poster') + duration = int_or_none(meta_data.get('duration')) + timestamp = int_or_none(meta_data.get('timestamp')) + + return { + 'id': content_id, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } + + +class MailRuMusicSearchBaseIE(InfoExtractor): + def _search(self, query, url, audio_id, limit=100, offset=0): + search = self._download_json( + 'https://my.mail.ru/cgi-bin/my/ajax', audio_id, + 'Downloading songs JSON page %d' % (offset // limit + 1), + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'xemail': '', + 'ajax_call': '1', + 'func_name': 'music.search', + 'mna': '', + 'mnb': '', + 'arg_query': query, + 'arg_extended': '1', + 'arg_search_params': json.dumps({ + 'music': { + 'limit': limit, + 'offset': offset, + }, + }), + 'arg_limit': limit, + 'arg_offset': offset, + }) + return next(e for e in search if isinstance(e, dict)) + + @staticmethod + def _extract_track(t, fatal=True): + audio_url = t['URL'] if fatal else t.get('URL') + if not audio_url: + return + + audio_id = t['File'] if fatal else t.get('File') + if not audio_id: + return + + thumbnail = t.get('AlbumCoverURL') or t.get('FiledAlbumCover') + uploader = t.get('OwnerName') or t.get('OwnerName_Text_HTML') + uploader_id = t.get('UploaderID') + duration = int_or_none(t.get('DurationInSeconds')) or parse_duration( + t.get('Duration') or t.get('DurationStr')) + view_count = int_or_none(t.get('PlayCount') or t.get('PlayCount_hr')) + + track = t.get('Name') or t.get('Name_Text_HTML') + artist = t.get('Author') or t.get('Author_Text_HTML') + + if track: + title = '%s - %s' % (artist, track) if artist else track + else: + title = audio_id + + return { + 'extractor_key': MailRuMusicIE.ie_key(), + 'id': audio_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'vcodec': 'none', + 'abr': int_or_none(t.get('BitRate')), + 'track': track, + 'artist': artist, + 'album': t.get('Album'), + 'url': audio_url, + } + + +class MailRuMusicIE(MailRuMusicSearchBaseIE): + IE_NAME = 'mailru:music' + IE_DESC = 'Музыка@Mail.Ru' + _VALID_URL = r'https?://my\.mail\.ru/+music/+songs/+[^/?#&]+-(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893', + 'md5': '0f8c22ef8c5d665b13ac709e63025610', + 'info_dict': { + 'id': '4e31f7125d0dfaef505d947642366893', + 'ext': 'mp3', + 'title': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017 - М8Л8ТХ', + 'uploader': 'Игорь Мудрый', + 'uploader_id': '1459196328', + 'duration': 280, + 'view_count': int, + 'vcodec': 'none', + 'abr': 320, + 'track': 'L.A.H. 
(Luciferian Aesthetics of Herrschaft) single, 2017', + 'artist': 'М8Л8ТХ', + }, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + title = self._og_search_title(webpage) + music_data = self._search(title, url, audio_id)['MusicData'] + t = next(t for t in music_data if t.get('File') == audio_id) + + info = self._extract_track(t) + info['title'] = title + return info + + +class MailRuMusicSearchIE(MailRuMusicSearchBaseIE): + IE_NAME = 'mailru:music:search' + IE_DESC = 'Музыка@Mail.Ru' + _VALID_URL = r'https?://my\.mail\.ru/+music/+search/+(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://my.mail.ru/music/search/black%20shadow', + 'info_dict': { + 'id': 'black shadow', + }, + 'playlist_mincount': 532, + }] + + def _real_extract(self, url): + query = compat_urllib_parse_unquote(self._match_id(url)) + + entries = [] + + LIMIT = 100 + offset = 0 + + for _ in itertools.count(1): + search = self._search(query, url, query, LIMIT, offset) + + music_data = search.get('MusicData') + if not music_data or not isinstance(music_data, list): + break + + for t in music_data: + track = self._extract_track(t, fatal=False) + if track: + entries.append(track) + + total = try_get( + search, lambda x: x['Results']['music']['Total'], int) + + if total is not None: + if offset > total: + break + + offset += LIMIT + + return self.playlist_result(entries, query) diff --git a/youtube_dl/extractor/malltv.py b/yt_dlp/extractor/malltv.py index fadfd9338..fadfd9338 100644 --- a/youtube_dl/extractor/malltv.py +++ b/yt_dlp/extractor/malltv.py diff --git a/youtube_dl/extractor/mangomolo.py b/yt_dlp/extractor/mangomolo.py index acee370e9..acee370e9 100644 --- a/youtube_dl/extractor/mangomolo.py +++ b/yt_dlp/extractor/mangomolo.py diff --git a/yt_dlp/extractor/manoto.py b/yt_dlp/extractor/manoto.py new file mode 100644 index 000000000..d12aa5f60 --- /dev/null +++ b/yt_dlp/extractor/manoto.py @@ -0,0 +1,138 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + traverse_obj +) + + +_API_URL = 'https://dak1vd5vmi7x6.cloudfront.net/api/v1/publicrole/{}/{}?id={}' + + +class ManotoTVIE(InfoExtractor): + IE_DESC = 'Manoto TV (Episode)' + _VALID_URL = r'https?://(?:www\.)?manototv\.com/episode/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.manototv.com/episode/8475', + 'info_dict': { + 'id': '8475', + 'series': 'خانه های رویایی با برادران اسکات', + 'season_number': 7, + 'episode_number': 25, + 'episode_id': 'My Dream Home S7: Carol & John', + 'duration': 3600, + 'categories': ['سرگرمی'], + 'title': 'کارول و جان', + 'description': 'md5:d0fff1f8ba5c6775d312a00165d1a97e', + 'thumbnail': r're:^https?://.*\.(jpeg|png|jpg)$', + 'ext': 'mp4' + }, + 'params': { + 'skip_download': 'm3u8', + } + }, { + 'url': 'https://www.manototv.com/episode/12576', + 'info_dict': { + 'id': '12576', + 'series': 'فیلم های ایرانی', + 'episode_id': 'Seh Mah Taatili', + 'duration': 5400, + 'view_count': int, + 'categories': ['سرگرمی'], + 'title': 'سه ماه تعطیلی', + 'description': 'سه ماه تعطیلی فیلمی به کارگردانی و نویسندگی شاپور قریب ساختهٔ سال ۱۳۵۶ است.', + 'thumbnail': r're:^https?://.*\.(jpeg|png|jpg)$', + 'ext': 'mp4' + }, + 'params': { + 'skip_download': 'm3u8', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + episode_json = self._download_json(_API_URL.format('showmodule', 'episodedetails', video_id), video_id) + details = episode_json.get('details', 
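Every Manoto endpoint used in this file goes through the module-level `_API_URL` template above, filled with a module name, an endpoint and an id. For the episode lookup in progress here, that expands to:

    _API_URL.format('showmodule', 'episodedetails', '8475')
    # -> 'https://dak1vd5vmi7x6.cloudfront.net/api/v1/publicrole/showmodule/episodedetails?id=8475'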
{}) + formats = self._extract_m3u8_formats(details.get('videoM3u8Url'), video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'series': details.get('showTitle'), + 'season_number': int_or_none(details.get('analyticsSeasonNumber')), + 'episode_number': int_or_none(details.get('episodeNumber')), + 'episode_id': details.get('analyticsEpisodeTitle'), + 'duration': int_or_none(details.get('durationInMinutes'), invscale=60), + 'view_count': details.get('viewCount'), + 'categories': [details.get('videoCategory')], + 'title': details.get('episodeTitle'), + 'description': clean_html(details.get('episodeDescription')), + 'thumbnail': details.get('episodelandscapeImgIxUrl'), + 'formats': formats, + } + + +class ManotoTVShowIE(InfoExtractor): + IE_DESC = 'Manoto TV (Show)' + _VALID_URL = r'https?://(?:www\.)?manototv\.com/show/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.manototv.com/show/2526', + 'playlist_mincount': 68, + 'info_dict': { + 'id': '2526', + 'title': 'فیلم های ایرانی', + 'description': 'مجموعه ای از فیلم های سینمای کلاسیک ایران', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + show_json = self._download_json(_API_URL.format('showmodule', 'details', show_id), show_id) + show_details = show_json.get('details', {}) + title = show_details.get('showTitle') + description = show_details.get('showSynopsis') + + series_json = self._download_json(_API_URL.format('showmodule', 'serieslist', show_id), show_id) + playlist_id = str(traverse_obj(series_json, ('details', 'list', 0, 'id'))) + + playlist_json = self._download_json(_API_URL.format('showmodule', 'episodelist', playlist_id), playlist_id) + playlist = traverse_obj(playlist_json, ('details', 'list')) or [] + + entries = [ + self.url_result( + 'https://www.manototv.com/episode/%s' % item['slideID'], ie=ManotoTVIE.ie_key(), video_id=item['slideID']) + for item in playlist] + return self.playlist_result(entries, show_id, title, description) + + +class ManotoTVLiveIE(InfoExtractor): + IE_DESC = 'Manoto TV (Live)' + _VALID_URL = r'https?://(?:www\.)?manototv\.com/live/' + _TEST = { + 'url': 'https://www.manototv.com/live/', + 'info_dict': { + 'id': 'live', + 'title': 'Manoto TV Live', + 'ext': 'mp4', + 'is_live': True, + }, + 'params': { + 'skip_download': 'm3u8', + } + } + + def _real_extract(self, url): + video_id = 'live' + json = self._download_json(_API_URL.format('livemodule', 'details', ''), video_id) + details = json.get('details', {}) + video_url = details.get('liveUrl') + formats = self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': 'Manoto TV Live', + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index e8d7163e4..e8d7163e4 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py diff --git a/youtube_dl/extractor/maoritv.py b/yt_dlp/extractor/maoritv.py index 0d23fec75..0d23fec75 100644 --- a/youtube_dl/extractor/maoritv.py +++ b/yt_dlp/extractor/maoritv.py diff --git a/youtube_dl/extractor/markiza.py b/yt_dlp/extractor/markiza.py index def960a0c..def960a0c 100644 --- a/youtube_dl/extractor/markiza.py +++ b/yt_dlp/extractor/markiza.py diff --git a/yt_dlp/extractor/massengeschmacktv.py b/yt_dlp/extractor/massengeschmacktv.py new file mode 100644 index 000000000..b381d31b4 --- /dev/null +++ b/yt_dlp/extractor/massengeschmacktv.py @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +import re + 
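massengeschmack.tv inlines its source list as a JavaScript array literal (`MEDIA = [...]`), so the extractor below scrapes it with a regex and funnels it through `js_to_json` before parsing. `js_to_json` turns single quotes, unquoted keys and trailing commas into strict JSON, roughly:

    js_to_json("[{src: '//example.com/v.mp4', type: 'video/mp4'},]")
    # -> '[{"src": "//example.com/v.mp4", "type": "video/mp4"}]'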
+from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + js_to_json, + mimetype2ext, + parse_filesize, +) + + +class MassengeschmackTVIE(InfoExtractor): + IE_NAME = 'massengeschmack.tv' + _VALID_URL = r'https?://(?:www\.)?massengeschmack\.tv/play/(?P<id>[^?&#]+)' + + _TEST = { + 'url': 'https://massengeschmack.tv/play/fktv202', + 'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3', + 'info_dict': { + 'id': 'fktv202', + 'ext': 'mp4', + 'title': 'Fernsehkritik-TV - Folge 202', + }, + } + + def _real_extract(self, url): + episode = self._match_id(url) + + webpage = self._download_webpage(url, episode) + title = clean_html(self._html_search_regex( + '<h3>([^<]+)</h3>', webpage, 'title')) + thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) + sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) + + formats = [] + for source in sources: + furl = source.get('src') + if not furl: + continue + furl = self._proto_relative_url(furl) + ext = determine_ext(furl) or mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + furl, episode, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': furl, + 'format_id': determine_ext(furl), + }) + + for (durl, format_id, width, height, filesize) in re.findall(r'''(?x) + <a[^>]+?href="(?P<url>(?:https:)?//[^"]+)".*? + <strong>(?P<format_id>.+?)</strong>.*? + <small>(?:(?P<width>\d+)x(?P<height>\d+))?\s+?\((?P<filesize>[\d,]+\s*[GM]iB)\)</small> + ''', webpage): + formats.append({ + 'url': durl, + 'format_id': format_id, + 'width': int_or_none(width), + 'height': int_or_none(height), + 'filesize': parse_filesize(filesize), + 'vcodec': 'none' if format_id.startswith('Audio') else None, + }) + + self._sort_formats(formats) + + return { + 'id': episode, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/matchtv.py b/yt_dlp/extractor/matchtv.py index bc9933a81..bc9933a81 100644 --- a/youtube_dl/extractor/matchtv.py +++ b/yt_dlp/extractor/matchtv.py diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py new file mode 100644 index 000000000..0bdd62693 --- /dev/null +++ b/yt_dlp/extractor/mdr.py @@ -0,0 +1,195 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + url_or_none, + xpath_text, +) + + +class MDRIE(InfoExtractor): + IE_DESC = 'MDR.DE and KiKA' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html' + + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + # MDR regularly deletes its videos + 'url': 'http://www.mdr.de/fakt/video189002.html', + 'only_matching': True, + }, { + # audio + 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', + 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', + 'info_dict': { + 'id': '1312272', + 'ext': 'mp3', + 'title': 'Feuilleton vom 30. Oktober 2015', + 'duration': 250, + 'uploader': 'MITTELDEUTSCHER RUNDFUNK', + }, + 'skip': '404 not found', + }, { + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', + 'info_dict': { + 'id': '19636', + 'ext': 'mp4', + 'title': 'Baumhaus vom 30. 
Oktober 2015', + 'duration': 134, + 'uploader': 'KIKA', + }, + 'skip': '404 not found', + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + 'timestamp': 1482541200, + 'upload_date': '20161224', + 'duration': 4628, + 'uploader': 'KIKA', + }, + }, { + # audio with alternative playerURL pattern + 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', + 'info_dict': { + 'id': '100', + 'ext': 'mp4', + 'title': 'Feature: Operation Mindfuck - Robert Anton Wilson', + 'duration': 3239, + 'uploader': 'MITTELDEUTSCHER RUNDFUNK', + }, + }, { + # empty bitrateVideo and bitrateAudio + 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', + 'info_dict': { + 'id': '128372', + 'ext': 'mp4', + 'title': 'Der kleine Wichtel kehrt zurück', + 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', + 'duration': 4876, + 'timestamp': 1607823300, + 'upload_date': '20201213', + 'uploader': 'ZDF', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'only_matching': True, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', + 'only_matching': True, + }, { + 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data_url = self._search_regex( + r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>.+?-avCustom\.xml)\1', + webpage, 'data url', group='url').replace(r'\/', '/') + + doc = self._download_xml( + compat_urlparse.urljoin(url, data_url), video_id) + + title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) + + type_ = xpath_text(doc, './type', default=None) + + formats = [] + processed_urls = [] + for asset in doc.findall('./assets/asset'): + for source in ( + 'download', + 'progressiveDownload', + 'dynamicHttpStreamingRedirector', + 'adaptiveHttpStreamingRedirector'): + url_el = asset.find('./%sUrl' % source) + if url_el is None: + continue + + video_url = url_or_none(url_el.text) + if not video_url or video_url in processed_urls: + continue + + processed_urls.append(video_url) + + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + quality=1, m3u8_id='HLS', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + quality=1, f4m_id='HDS', fatal=False)) + else: + media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + + format_id = [media_type] + if vbr or abr: + format_id.append(compat_str(vbr or abr)) + + f = { + 'url': video_url, + 'format_id': '-'.join(format_id), + 'filesize': filesize, + 'abr': abr, + 'vbr': vbr, + } + + if vbr: + f.update({ + 'width': int_or_none(xpath_text(asset, './frameWidth', 'width')), + 
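The MDR manifest reports bitrates in bits per second, so `int_or_none(..., 1000)` above uses the helper's `scale` divisor to get the kbps that yt-dlp's `vbr`/`abr` fields expect; the inverse knob, `invscale`, multiplies instead (the Manoto extractor earlier uses `invscale=60` to turn minutes into seconds). If I read the helper right:

    int_or_none('128000', 1000)       # -> 128   (scale divides)
    int_or_none('90', invscale=60)    # -> 5400  (invscale multiplies)
    int_or_none(None, 1000)           # -> None  (missing stays missing)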
'height': int_or_none(xpath_text(asset, './frameHeight', 'height')), + }) + + if type_ == 'audio': + f['vcodec'] = 'none' + + formats.append(f) + + self._sort_formats(formats) + + description = xpath_text(doc, './broadcast/broadcastDescription', 'description') + timestamp = parse_iso8601( + xpath_text( + doc, [ + './broadcast/broadcastDate', + './broadcast/broadcastStartDate', + './broadcast/broadcastEndDate'], + 'timestamp', default=None)) + duration = parse_duration(xpath_text(doc, './duration', 'duration')) + uploader = xpath_text(doc, './rights', 'uploader') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'uploader': uploader, + 'formats': formats, + } diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py new file mode 100644 index 000000000..2ece5aac4 --- /dev/null +++ b/yt_dlp/extractor/medaltv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + try_get, +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'info_dict': { + 'id': '2mA60jWAGQCBH', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': '10619174', + } + }, { + 'url': 'https://medal.tv/clips/2um24TWdty0NA', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '2um24TWdty0NA', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': '5156321', + } + }, { + 'url': 'https://medal.tv/clips/37rMeFpryCC-9', + 'only_matching': True, + }, { + 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._parse_json(self._search_regex( + r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', + webpage, 'hydration data', default='{}'), video_id) + + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) + + title = clip['contentTitle'] + + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) + + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 + + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return + width = int(round(aspect_ratio * height)) + container.append({ + 'url': item_url, + id_key: item_id, + 'width': width, + 'height': height + }) + + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if 
height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') + if not formats and error: + if error == 404: + self.raise_no_formats( + 'That clip does not exist.', + expected=True, video_id=video_id) + else: + self.raise_no_formats( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), + } diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py new file mode 100644 index 000000000..b670f0d61 --- /dev/null +++ b/yt_dlp/extractor/mediaite.py @@ -0,0 +1,93 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class MediaiteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mediaite.com(?!/category)(?:/[\w-]+){2}' + _TESTS = [{ + 'url': 'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/', + 'info_dict': { + 'id': 'vPHKITzy', + 'ext': 'm4a', + 'title': 'Bill Burr On NFL And Black Lives Matter', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/vPHKITzy/poster.jpg?width=720', + 'duration': 55, + 'timestamp': 1631630185, + 'upload_date': '20210914', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/tv/joe-scarborough-goes-off-on-tax-breaks-for-super-wealthy-largest-income-redistribution-scam-in-american-history/', + 'info_dict': { + 'id': 'eeFcK4Xm', + 'ext': 'mp4', + 'title': 'Morning Joe-6_16_52 am - 6_21_10 am-2021-09-14.mp4', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/eeFcK4Xm/poster.jpg?width=720', + 'duration': 258, + 'timestamp': 1631618057, + 'upload_date': '20210914', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/politics/watch-rudy-giuliani-impersonates-queen-elizabeth-calls-mark-milley-an-asshle-in-bizarre-9-11-speech/', + 'info_dict': { + 'id': 'EiyiXKcr', + 'ext': 'mp4', + 'title': 'Giuliani 1', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/EiyiXKcr/poster.jpg?width=720', + 'duration': 39, + 'timestamp': 1631536476, + 'upload_date': '20210913', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/podcasts/clarissa-ward-says-she-decided-to-become-a-journalist-on-9-11/', + 'info_dict': { + 'id': 'TxavoRTx', + 'ext': 'mp4', + 'title': 'clarissa-ward-3.mp4', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 
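Since the uploader's numeric id is not known in advance, the author lookup above reaches into `hydrationData['profiles']` with `try_get` and a lambda, so a missing key, an empty dict or a type mismatch all degrade to `{}` instead of raising:

    # try_get(src, getter, expected_type) returns getter(src) if it evaluates
    # without raising and matches expected_type, else None
    author = try_get(
        hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {}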
'https://cdn.jwplayer.com/v2/media/TxavoRTx/poster.jpg?width=720', + 'duration': 83, + 'timestamp': 1631311188, + 'upload_date': '20210910', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/opinion/mainstream-media-ignores-rose-mcgowans-bombshell-allegation-that-newsoms-wife-tried-to-silence-her-on-weinstein/', + 'info_dict': { + 'id': 'sEIWvKR7', + 'ext': 'mp4', + 'title': 'KTTV_09-13-2021_05.34.21', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/sEIWvKR7/poster.jpg?width=720', + 'duration': 52, + 'timestamp': 1631553328, + 'upload_date': '20210913', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/news/watch-cnbcs-jim-cramer-says-nobody-wants-to-die-getting-infected-by-unvaccinated-coworker-even-for-22-an-hour/', + 'info_dict': { + 'id': 'nwpt1elX', + 'ext': 'mp4', + 'title': "CNBC's Jim Cramer Says Nobody Wants to Die Getting Infected by Unvaccinated Coworker 'Even for $22 an Hour'.mp4", + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nwpt1elX/poster.jpg?width=720', + 'duration': 60, + 'timestamp': 1633014214, + 'upload_date': '20210930', + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, None) + id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id') + data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id) + return self._parse_jwplayer_data(data_json) diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py new file mode 100644 index 000000000..b9b6d739f --- /dev/null +++ b/yt_dlp/extractor/mediaklikk.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..utils import ( + unified_strdate +) +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_str +) + + +class MediaKlikkIE(InfoExtractor): + _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)? + (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/ + (?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)? + (?P<id>[^/#?_]+)''' + + _TESTS = [{ + # mediaklikk. date in html. 
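When the URL carries a `/YYYY/MM/DD/` path, the optional named groups in `_VALID_URL` capture it and the upload date is assembled directly from them below; `unified_strdate` normalizes the joined string to yt-dlp's `YYYYMMDD` form and returns None on garbage, which is what triggers the `article_date` HTML fallback when the groups are absent:

    unified_strdate('2021-09-01')       # -> '20210901'
    unified_strdate('None-None-None')   # -> None, so the page's date is tried instead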
+ 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', + 'info_dict': { + 'id': '4754129', + 'title': 'Hazajáró, DÉLNYUGAT-BÁCSKA – A Duna mentén Palánkától Doroszlóig', + 'ext': 'mp4', + 'upload_date': '20210901', + 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg' + } + }, { + # m4sport + 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', + 'info_dict': { + 'id': '4754999', + 'title': 'Gyémánt Liga, Párizs', + 'ext': 'mp4', + 'upload_date': '20210830', + 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg' + } + }, { + # m4sport with *video/ url and no date + 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/', + 'info_dict': { + 'id': '4492099', + 'title': 'Real Madrid - Chelsea 1-1', + 'ext': 'mp4', + 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png' + } + }, { + # hirado + 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', + 'info_dict': { + 'id': '4760120', + 'title': 'Feltételeket szabott a főváros', + 'ext': 'mp4', + 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg' + } + }, { + # petofilive + 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', + 'info_dict': { + 'id': '4571948', + 'title': 'Tha Shudras az Akusztikban', + 'ext': 'mp4', + 'upload_date': '20210607', + 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg' + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + webpage = self._download_webpage(url, display_id) + + player_data_str = self._html_search_regex( + r'mtva_player_manager\.player\(document.getElementById\(.*\),\s?(\{.*\}).*\);', webpage, 'player data') + player_data = self._parse_json(player_data_str, display_id, compat_urllib_parse_unquote) + video_id = compat_str(player_data['contentId']) + title = player_data.get('title') or self._og_search_title(webpage, fatal=False) or \ + self._html_search_regex(r'<h\d+\b[^>]+\bclass="article_title">([^<]+)<', webpage, 'title') + + upload_date = unified_strdate( + '%s-%s-%s' % (mobj.group('year'), mobj.group('month'), mobj.group('day'))) + if not upload_date: + upload_date = unified_strdate(self._html_search_regex( + r'<p+\b[^>]+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None)) + + player_data['video'] = player_data.pop('token') + player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) + playlist_url = self._proto_relative_url(compat_urllib_parse_unquote( + self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/')) + + formats = self._extract_wowza_formats( + playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'formats': formats, + 'upload_date': upload_date, + 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage) + } diff --git a/youtube_dl/extractor/medialaan.py b/yt_dlp/extractor/medialaan.py index 788acf7fb..788acf7fb 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/yt_dlp/extractor/medialaan.py diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py new file mode 100644 index 
000000000..26e7abc49 --- /dev/null +++ b/yt_dlp/extractor/mediaset.py @@ -0,0 +1,214 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .theplatform import ThePlatformBaseIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + update_url_query, +) + + +class MediasetIE(ThePlatformBaseIE): + _TP_TLD = 'eu' + _VALID_URL = r'''(?x) + (?: + mediaset:| + https?:// + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?: + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| + player/index\.html\?.*?\bprogramGuid= + ) + )(?P<id>[0-9A-Z]{16,}) + ''' + _TESTS = [{ + # full episode + 'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102', + 'md5': 'a7e75c6384871f322adb781d3bd72c26', + 'info_dict': { + 'id': 'F310575103000102', + 'ext': 'mp4', + 'title': 'Episodio 1', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2682.0, + 'upload_date': '20210530', + 'series': 'Mr Wrong - Lezioni d\'amore', + 'timestamp': 1622413946, + 'uploader': 'Canale 5', + 'uploader_id': 'C5', + }, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', + 'md5': '1276f966ac423d16ba255ce867de073e', + 'info_dict': { + 'id': 'F309013801000501', + 'ext': 'mp4', + 'title': 'Puntata del 25 maggio', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6565.008, + 'upload_date': '20200903', + 'series': 'Matrix', + 'timestamp': 1599172492, + 'uploader': 'Canale 5', + 'uploader_id': 'C5', + }, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801', + 'md5': 'd1650ac9ff944f185556126a736df148', + 'info_dict': { + 'id': 'F303843101017801', + 'ext': 'mp4', + 'title': 'Episodio 69 - Pezzo di luna', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 263.008, + 'upload_date': '20200902', + 'series': 'Camera Café 5', + 'timestamp': 1599064700, + 'uploader': 'Italia 1', + 'uploader_id': 'I1', + }, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601', + 'md5': '567e9ad375b7a27a0e370650f572a1e3', + 'info_dict': { + 'id': 'F303843107000601', + 'ext': 'mp4', + 'title': 'Episodio 51 - Tu chi sei?', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 367.021, + 'upload_date': '20200902', + 'series': 'Camera Café 5', + 'timestamp': 1599069817, + 'uploader': 'Italia 1', + 'uploader_id': 'I1', + }, + }, { + # clip + 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', + 'only_matching': True, + }, { + # iframe simple + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924', + 'only_matching': True, + }, { + # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', + 'only_matching': True, + }, { + 'url': 'mediaset:FAFU000000665924', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295', + 'only_matching': True, + }, { + 'url': 
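Mediaset serves each asset through ThePlatform in several container flavours, so `_real_extract` below queries the SMIL endpoint once per flavour ('MPEG4', 'MPEG-DASH+none', 'M3U+none'), remembers the first `ExtractorError`, and re-raises it only if no flavour yielded formats. Note that the loop breaks on the first failure instead of trying the remaining flavours. Schematically (`smil_url_for` is illustrative):

    first_e, formats = None, []
    for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
        try:
            tp_formats, tp_subtitles = self._extract_theplatform_smil(
                smil_url_for(f), guid)
        except ExtractorError as e:
            first_e = first_e or e
            break
        formats.extend(tp_formats)
    if first_e and not formats:
        raise first_e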
'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(ie, webpage): + def _qs(url): + return parse_qs(url) + + def _program_guid(qs): + return qs.get('programGuid', [None])[0] + + entries = [] + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', + webpage): + embed_url = mobj.group('url') + embed_qs = _qs(embed_url) + program_guid = _program_guid(embed_qs) + if program_guid: + entries.append(embed_url) + continue + video_id = embed_qs.get('id', [None])[0] + if not video_id: + continue + urlh = ie._request_webpage( + embed_url, video_id, note='Following embed URL redirect') + embed_url = urlh.geturl() + program_guid = _program_guid(_qs(embed_url)) + if program_guid: + entries.append(embed_url) + return entries + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + for video in smil.findall(self._xpath_ns('.//video', namespace)): + video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) + return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + + def _real_extract(self, url): + guid = self._match_id(url) + tp_path = 'PR1GhC/media/guid/2702976343/' + guid + info = self._extract_theplatform_metadata(tp_path, guid) + + formats = [] + subtitles = {} + first_e = None + asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD' + # TODO: fixup ISM+none manifest URLs + for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { + 'mbr': 'true', + 'formats': f, + 'assetTypes': asset_type, + }), guid, 'Downloading %s SMIL data' % (f.split('+')[0])) + except ExtractorError as e: + if not first_e: + first_e = e + break + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if first_e and not formats: + raise first_e + self._sort_formats(formats) + + feed_data = self._download_json( + 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/' + guid, + guid, fatal=False) + if feed_data: + publish_info = feed_data.get('mediasetprogram$publishInfo') or {} + thumbnails = feed_data.get('thumbnails') or {} + thumbnail = None + for key, value in thumbnails.items(): + if key.startswith('image_keyframe_poster-'): + thumbnail = value.get('url') + break + + info.update({ + 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), + 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), + 'series': feed_data.get('mediasetprogram$brandTitle'), + 'uploader': publish_info.get('description'), + 'uploader_id': publish_info.get('channel'), + 'view_count': 
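Overriding `_parse_smil_formats` above lets this subclass normalize CDN URLs before the ThePlatform base class turns them into formats; the single `re.sub` strips the tokenized-host variant down to its plain counterpart. Applied to a made-up URL:

    re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2',
           'https://vod05t-mediaset-it.akamaized.net/a/b.mpd?hdnts=TOKEN')
    # -> 'https://vod05-mediaset-it.akamaized.net/a/b.mpd'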
int_or_none(feed_data.get('mediasetprogram$numberOfViews')), + 'thumbnail': thumbnail, + }) + + info.update({ + 'id': guid, + 'formats': formats, + 'subtitles': subtitles, + }) + return info diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py new file mode 100644 index 000000000..ace86c2fd --- /dev/null +++ b/yt_dlp/extractor/mediasite.py @@ -0,0 +1,417 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + mimetype2ext, + str_or_none, + try_get, + unescapeHTML, + unsmuggle_url, + url_or_none, + urljoin, +) + + +_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' + + +class MediasiteIE(InfoExtractor): + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE + _TESTS = [ + { + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', + 'info_dict': { + 'id': '2db6c271681e4f199af3c60d1f82869b1d', + 'ext': 'mp4', + 'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles', + 'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.', + 'timestamp': 1474268400.0, + 'upload_date': '20160919', + }, + }, + { + 'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb', + 'info_dict': { + 'id': '90bb363295d945d6b548c867d01181361d', + 'ext': 'mp4', + 'upload_date': '20150429', + 'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity', + 'timestamp': 1430311380.0, + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', + 'md5': '481fda1c11f67588c0d9d8fbdced4e39', + 'info_dict': { + 'id': '585a43626e544bdd97aeb71a0ec907a01d', + 'ext': 'mp4', + 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$', + 'duration': 7713.088, + 'timestamp': 1413309600, + 'upload_date': '20141014', + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', + 'md5': 'ef1fdded95bdf19b12c5999949419c92', + 'info_dict': { + 'id': '86a9ea9f53e149079fbdb4202b521ed21d', + 'ext': 'wmv', + 'title': '64ste Vakantiecursus: Afvalwater', + 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', + 'duration': 10853, + 'timestamp': 1326446400, + 'upload_date': '20120113', + }, + }, + { + 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', + 'md5': '9422edc9b9a60151727e4b6d8bef393d', + 'info_dict': { + 'id': '24aace4429fc450fb5b38cdbf424a66e1d', + 'ext': 'mp4', + 'title': 'Xyce Software Training - Section 1', + 'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}', + 'upload_date': '20120409', + 'timestamp': 1333983600, + 'duration': 7794, + } + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', + 'only_matching': True, + }, + { + 'url': 
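`unsmuggle_url(url, {})` at the top of `_real_extract` below unpacks side-channel data that an embedding page's extractor can attach with `smuggle_url`; Mediasite uses it to forward the real `UrlReferrer` into the `GetPlayerOptions` request body. The producer side would look something like this (hypothetical caller, not code from this patch):

    self.url_result(smuggle_url(
        'https://hitsmediaweb.h-its.org/Mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
        {'UrlReferrer': embedding_page_url}))   # embedding_page_url: page hosting the iframe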
'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', + 'only_matching': True, + }, + { + # dashed id + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', + 'only_matching': True, + } + ] + + # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) + _STREAM_TYPES = { + 0: 'video1', # the main video + 2: 'slide', + 3: 'presentation', + 4: 'video2', # screencast? + 5: 'video3', + } + + @staticmethod + def _extract_urls(webpage): + return [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer( + r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, + webpage)] + + def __extract_slides(self, *, stream_id, snum, Stream, duration, images): + slide_base_url = Stream['SlideBaseUrl'] + + fname_template = Stream['SlideImageFileNameTemplate'] + if fname_template != 'slide_{0:D4}.jpg': + self.report_warning('Unusual slide file name template; report a bug if slide downloading fails') + fname_template = re.sub(r'\{0:D([0-9]+)\}', r'{0:0\1}', fname_template) + + fragments = [] + for i, slide in enumerate(Stream['Slides']): + if i == 0: + if slide['Time'] > 0: + default_slide = images.get('DefaultSlide') + if default_slide is None: + default_slide = images.get('DefaultStreamImage') + if default_slide is not None: + default_slide = default_slide['ImageFilename'] + if default_slide is not None: + fragments.append({ + 'path': default_slide, + 'duration': slide['Time'] / 1000, + }) + + next_time = try_get(None, [ + lambda _: Stream['Slides'][i + 1]['Time'], + lambda _: duration, + lambda _: slide['Time'], + ], expected_type=(int, float)) + + fragments.append({ + 'path': fname_template.format(slide.get('Number', i + 1)), + 'duration': (next_time - slide['Time']) / 1000 + }) + + return { + 'format_id': '%s-%u.slides' % (stream_id, snum), + 'ext': 'mhtml', + 'url': slide_base_url, + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'none', + 'format_note': 'Slides', + 'fragments': fragments, + 'fragment_base_url': slide_base_url, + } + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + mobj = self._match_valid_url(url) + resource_id = mobj.group('id') + query = mobj.group('query') + + webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? 
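`__extract_slides` above packages a slide deck as an `mhtml` pseudo-format whose fragments carry per-slide durations: each slide lasts until the next slide's timestamp, and Mediasite reports those timestamps in milliseconds. A minimal sketch of that timing math, with assumed sample data:

    # assumed sample data: three slides in a 15-second presentation
    slides = [{'Time': 0}, {'Time': 4000}, {'Time': 9000}]
    duration = 15000

    fragments = []
    for i, slide in enumerate(slides):
        # each slide runs until the next one starts (or until the end)
        next_time = slides[i + 1]['Time'] if i + 1 < len(slides) else duration
        fragments.append({
            'path': 'slide_{0:04}.jpg'.format(i + 1),
            'duration': (next_time - slide['Time']) / 1000,
        })

    assert [f['duration'] for f in fragments] == [4.0, 5.0, 6.0]
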
+        redirect_url = urlh.geturl()
+
+        # XXX: might have also extracted UrlReferrer and QueryString from the html
+        service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
+            r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id,
+            default='/Mediasite/PlayerService/PlayerService.svc/json'))
+
+        player_options = self._download_json(
+            '%s/GetPlayerOptions' % service_path, resource_id,
+            headers={
+                'Content-type': 'application/json; charset=utf-8',
+                'X-Requested-With': 'XMLHttpRequest',
+            },
+            data=json.dumps({
+                'getPlayerOptionsRequest': {
+                    'ResourceId': resource_id,
+                    'QueryString': query,
+                    'UrlReferrer': data.get('UrlReferrer', ''),
+                    'UseScreenReader': False,
+                }
+            }).encode('utf-8'))['d']
+
+        presentation = player_options['Presentation']
+        if presentation is None:
+            raise ExtractorError(
+                'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'],
+                expected=True)
+
+        title = presentation['Title']
+
+        thumbnails = []
+        formats = []
+        for snum, Stream in enumerate(presentation['Streams']):
+            stream_type = Stream.get('StreamType')
+            if stream_type is None:
+                continue
+
+            video_urls = Stream.get('VideoUrls')
+            if not isinstance(video_urls, list):
+                video_urls = []
+
+            stream_id = self._STREAM_TYPES.get(
+                stream_type, 'type%u' % stream_type)
+
+            stream_formats = []
+            for unum, VideoUrl in enumerate(video_urls):
+                video_url = url_or_none(VideoUrl.get('Location'))
+                if not video_url:
+                    continue
+                # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS
+
+                media_type = VideoUrl.get('MediaType')
+                if media_type == 'SS':
+                    stream_formats.extend(self._extract_ism_formats(
+                        video_url, resource_id,
+                        ism_id='%s-%u.%u' % (stream_id, snum, unum),
+                        fatal=False))
+                elif media_type == 'Dash':
+                    stream_formats.extend(self._extract_mpd_formats(
+                        video_url, resource_id,
+                        mpd_id='%s-%u.%u' % (stream_id, snum, unum),
+                        fatal=False))
+                else:
+                    stream_formats.append({
+                        'format_id': '%s-%u.%u' % (stream_id, snum, unum),
+                        'url': video_url,
+                        'ext': mimetype2ext(VideoUrl.get('MimeType')),
+                    })
+
+            if Stream.get('HasSlideContent', False):
+                images = player_options['PlayerLayoutOptions']['Images']
+                stream_formats.append(self.__extract_slides(
+                    stream_id=stream_id,
+                    snum=snum,
+                    Stream=Stream,
+                    duration=presentation.get('Duration'),
+                    images=images,
+                ))
+
+            # disprefer 'secondary' streams
+            if stream_type != 0:
+                for fmt in stream_formats:
+                    fmt['quality'] = -10
+
+            thumbnail_url = Stream.get('ThumbnailUrl')
+            if thumbnail_url:
+                thumbnails.append({
+                    'id': '%s-%u' % (stream_id, snum),
+                    'url': urljoin(redirect_url, thumbnail_url),
+                    'preference': -1 if stream_type != 0 else 0,
+                })
+            formats.extend(stream_formats)
+
+        self._sort_formats(formats)
+
+        # XXX: Presentation['Presenters']
+        # XXX: Presentation['Transcript']
+
+        return {
+            'id': resource_id,
+            'title': title,
+            'description': presentation.get('Description'),
+            'duration': float_or_none(presentation.get('Duration'), 1000),
+            'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
+
+
+class MediasiteCatalogIE(InfoExtractor):
+    _VALID_URL = r'''(?xi)
+                        (?P<url>https?://[^/]+/Mediasite)
+                        /Catalog/Full/
+                        (?P<catalog_id>{0})
+                        (?:
+                            /(?P<current_folder_id>{0})
+                            /(?P<root_dynamic_folder_id>{0})
+                        )?
+ '''.format(_ID_RE) + _TESTS = [{ + 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', + 'info_dict': { + 'id': '631f9e48530d454381549f955d08c75e21', + 'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically', + }, + 'playlist_count': 6, + 'expected_warnings': ['is not a supported codec'], + }, { + # with CurrentFolderId and RootDynamicFolderId + 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', + 'info_dict': { + 'id': '9518c4a6c5cf4993b21cbd53e828a92521', + 'title': 'IUSM Family and Friends Sessions', + }, + 'playlist_count': 2, + }, { + 'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21', + 'only_matching': True, + }, { + # no AntiForgeryToken + 'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21', + 'only_matching': True, + }, { + 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', + 'only_matching': True, + }, { + # dashed id + 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + mediasite_url = mobj.group('url') + catalog_id = mobj.group('catalog_id') + current_folder_id = mobj.group('current_folder_id') or catalog_id + root_dynamic_folder_id = mobj.group('root_dynamic_folder_id') + + webpage = self._download_webpage(url, catalog_id) + + # AntiForgeryToken is optional (e.g. [1]) + # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21 + anti_forgery_token = self._search_regex( + r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'anti forgery token', default=None, group='value') + if anti_forgery_token: + anti_forgery_header = self._search_regex( + r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'anti forgery header name', + default='X-SOFO-AntiForgeryHeader', group='value') + + data = { + 'IsViewPage': True, + 'IsNewFolder': True, + 'AuthTicket': None, + 'CatalogId': catalog_id, + 'CurrentFolderId': current_folder_id, + 'RootDynamicFolderId': root_dynamic_folder_id, + 'ItemsPerPage': 1000, + 'PageIndex': 0, + 'PermissionMask': 'Execute', + 'CatalogSearchType': 'SearchInFolder', + 'SortBy': 'Date', + 'SortDirection': 'Descending', + 'StartDate': None, + 'EndDate': None, + 'StatusFilterList': None, + 'PreviewKey': None, + 'Tags': [], + } + + headers = { + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + if anti_forgery_token: + headers[anti_forgery_header] = anti_forgery_token + + catalog = self._download_json( + '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, + catalog_id, data=json.dumps(data).encode(), headers=headers) + + entries = [] + for video in catalog['PresentationDetailsList']: + if not isinstance(video, dict): + continue + video_id = str_or_none(video.get('Id')) + if not video_id: + continue + entries.append(self.url_result( + '%s/Play/%s' % (mediasite_url, video_id), + ie=MediasiteIE.ie_key(), video_id=video_id)) + + title = try_get( + catalog, lambda x: x['CurrentFolder']['Name'], compat_str) + + return self.playlist_result(entries, catalog_id, title,) + + +class 
MediasiteNamedCatalogIE(InfoExtractor): + _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + mediasite_url = mobj.group('url') + catalog_name = mobj.group('catalog_name') + + webpage = self._download_webpage(url, catalog_name) + + catalog_id = self._search_regex( + r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') + + return self.url_result( + '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), + ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id) diff --git a/youtube_dl/extractor/medici.py b/yt_dlp/extractor/medici.py index cd910238e..cd910238e 100644 --- a/youtube_dl/extractor/medici.py +++ b/yt_dlp/extractor/medici.py diff --git a/youtube_dl/extractor/megaphone.py b/yt_dlp/extractor/megaphone.py index 5bafa6cf4..5bafa6cf4 100644 --- a/youtube_dl/extractor/megaphone.py +++ b/yt_dlp/extractor/megaphone.py diff --git a/youtube_dl/extractor/meipai.py b/yt_dlp/extractor/meipai.py index 2445b8b39..2445b8b39 100644 --- a/youtube_dl/extractor/meipai.py +++ b/yt_dlp/extractor/meipai.py diff --git a/youtube_dl/extractor/melonvod.py b/yt_dlp/extractor/melonvod.py index bd8cf13ab..bd8cf13ab 100644 --- a/youtube_dl/extractor/melonvod.py +++ b/yt_dlp/extractor/melonvod.py diff --git a/youtube_dl/extractor/meta.py b/yt_dlp/extractor/meta.py index cdb46e163..cdb46e163 100644 --- a/youtube_dl/extractor/meta.py +++ b/yt_dlp/extractor/meta.py diff --git a/yt_dlp/extractor/metacafe.py b/yt_dlp/extractor/metacafe.py new file mode 100644 index 000000000..7b2d4a003 --- /dev/null +++ b/yt_dlp/extractor/metacafe.py @@ -0,0 +1,287 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse, + compat_urllib_parse_unquote, +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + get_element_by_attribute, + mimetype2ext, +) + + +class MetacafeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<id>[^/]+)/(?P<display_id>[^/?#]+)' + _DISCLAIMER = 'http://www.metacafe.com/family_filter/' + _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' + IE_NAME = 'metacafe' + _TESTS = [ + # Youtube video + { + 'add_ie': ['Youtube'], + 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/', + 'info_dict': { + 'id': '_aUehQsCQtM', + 'ext': 'mp4', + 'upload_date': '20090102', + 'title': 'The Electric Company | "Short I" | PBS KIDS GO!', + 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8', + 'uploader': 'PBS', + 'uploader_id': 'PBS' + } + }, + # Normal metacafe video + { + 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', + 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad', + 'info_dict': { + 'id': '11121940', + 'ext': 'mp4', + 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4', + 'uploader': 'ign', + 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', + }, + 'skip': 'Page is temporarily unavailable.', + }, + # metacafe video with family filter + { + 'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/', + 'md5': 'b06082c5079bbdcde677a6291fbdf376', + 'info_dict': { + 'id': '2155630', + 
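Metacafe reuses two-letter ID prefixes to mark externally hosted videos; `_real_extract` below routes `yt-` IDs to the YouTube extractor and `cb-` IDs to ThePlatform. A standalone check of the prefix split, using the ID from the YouTube test above:

    import re

    m = re.match(r'^(\w{2})-(.*)$', 'yt-_aUehQsCQtM')
    assert m.groups() == ('yt', '_aUehQsCQtM')  # prefix 'yt' -> delegate to Youtube
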
'ext': 'mp4', + 'title': 'Adult Art By David Hart 156', + 'uploader': '63346', + 'description': 'md5:9afac8fc885252201ad14563694040fc', + }, + 'params': { + 'skip_download': True, + }, + }, + # AnyClip video + { + 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/', + 'info_dict': { + 'id': 'an-dVVXnuY7Jh77J', + 'ext': 'mp4', + 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', + 'uploader': 'AnyClip', + 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', + }, + }, + # age-restricted video + { + 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', + 'md5': '98dde7c1a35d02178e8ab7560fe8bd09', + 'info_dict': { + 'id': '5186653', + 'ext': 'mp4', + 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', + 'uploader': 'Dwayne Pipe', + 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b', + 'age_limit': 18, + }, + }, + # cbs video + { + 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/', + 'info_dict': { + 'id': '8VD4r_Zws8VP', + 'ext': 'flv', + 'title': 'Open: This is Face the Nation, February 9', + 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476', + 'duration': 96, + 'uploader': 'CBSI-NEW', + 'upload_date': '20140209', + 'timestamp': 1391959800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + # Movieclips.com video + { + 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', + 'info_dict': { + 'id': 'mv-Wy7ZU', + 'ext': 'mp4', + 'title': 'My Week with Marilyn - Do You Love Me?', + 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.', + 'uploader': 'movie_trailers', + 'duration': 176, + }, + 'params': { + 'skip_download': 'requires rtmpdump', + } + } + ] + + def report_disclaimer(self): + self.to_screen('Retrieving disclaimer') + + def _real_extract(self, url): + # Extract id and simplified title from URL + video_id, display_id = self._match_valid_url(url).groups() + + # the video may come from an external site + m_external = re.match(r'^(\w{2})-(.*)$', video_id) + if m_external is not None: + prefix, ext_id = m_external.groups() + # Check if video comes from YouTube + if prefix == 'yt': + return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube') + # CBS videos use theplatform.com + if prefix == 'cb': + return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') + + headers = { + # Disable family filter + 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) + } + + # AnyClip videos require the flashversion cookie so that we get the link + # to the mp4 file + if video_id.startswith('an-'): + headers['Cookie'] += 'flashVersion=0; ' + + # Retrieve video webpage to extract further information + webpage = self._download_webpage(url, video_id, headers=headers) + + error = get_element_by_attribute( + 'class', 'notfound-page-title', webpage) + if error: + raise ExtractorError(error, expected=True) + + video_title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') + + # Extract URL, uploader and title from webpage + self.report_extraction(video_id) + video_url = None + mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage) + if mobj is not None: + mediaURL = 
compat_urllib_parse_unquote(mobj.group(1))
+            video_ext = determine_ext(mediaURL)
+
+            # Extract gdaKey if available
+            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
+            if mobj is None:
+                video_url = mediaURL
+            else:
+                gdaKey = mobj.group(1)
+                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
+        if video_url is None:
+            mobj = re.search(r'<video src="([^"]+)"', webpage)
+            if mobj:
+                video_url = mobj.group(1)
+                video_ext = 'mp4'
+        if video_url is None:
+            flashvars = self._search_regex(
+                r' name="flashvars" value="(.*?)"', webpage, 'flashvars',
+                default=None)
+            if flashvars:
+                vardict = compat_parse_qs(flashvars)
+                if 'mediaData' not in vardict:
+                    raise ExtractorError('Unable to extract media URL')
+                mobj = re.search(
+                    r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
+                if mobj is None:
+                    raise ExtractorError('Unable to extract media URL')
+                mediaURL = mobj.group('mediaURL').replace('\\/', '/')
+                video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
+                video_ext = determine_ext(video_url)
+        if video_url is None:
+            player_url = self._search_regex(
+                r"swfobject\.embedSWF\('([^']+)'",
+                webpage, 'config URL', default=None)
+            if player_url:
+                config_url = self._search_regex(
+                    r'config=(.+)$', player_url, 'config URL')
+                config_doc = self._download_xml(
+                    config_url, video_id,
+                    note='Downloading video config')
+                smil_url = config_doc.find('.//properties').attrib['smil_file']
+                smil_doc = self._download_xml(
+                    smil_url, video_id,
+                    note='Downloading SMIL document')
+                base_url = smil_doc.find('./head/meta').attrib['base']
+                video_url = []
+                for vn in smil_doc.findall('.//video'):
+                    br = int(vn.attrib['system-bitrate'])
+                    play_path = vn.attrib['src']
+                    video_url.append({
+                        'format_id': 'smil-%d' % br,
+                        'url': base_url,
+                        'play_path': play_path,
+                        'page_url': url,
+                        'player_url': player_url,
+                        'ext': play_path.partition(':')[0],
+                    })
+        if video_url is None:
+            flashvars = self._parse_json(self._search_regex(
+                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
+                default=None), video_id, fatal=False)
+            if flashvars:
+                video_url = []
+                for source in flashvars.get('sources'):
+                    source_url = source.get('src')
+                    if not source_url:
+                        continue
+                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                    if ext == 'm3u8':
+                        video_url.extend(self._extract_m3u8_formats(
+                            source_url, video_id, 'mp4',
+                            'm3u8_native', m3u8_id='hls', fatal=False))
+                    else:
+                        video_url.append({
+                            'url': source_url,
+                            'ext': ext,
+                        })
+
+        if video_url is None:
+            raise ExtractorError('Unsupported video type')
+
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'],
+            webpage, 'description', fatal=False)
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'thumbnail', fatal=False)
+        video_uploader = self._html_search_regex(
+            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
+            webpage, 'uploader nickname', fatal=False)
+        duration = int_or_none(
+            self._html_search_meta('video:duration', webpage, default=None))
+        age_limit = (
+            18
+            if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
+            else 0)
+
+        if isinstance(video_url, list):
+            formats = video_url
+        else:
+            formats = [{
+                'url': video_url,
+                'ext': video_ext,
+            }]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'description': description,
+            'uploader': video_uploader,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'age_limit': age_limit,
+            'formats':
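The family-filter bypass above works by sending a `user` cookie whose value is URL-encoded JSON. A sketch of the resulting header value, using the modern `urllib.parse` equivalent of the `compat_urllib_parse` helper:

    import json
    from urllib.parse import quote

    cookie = 'user=%s; ' % quote(json.dumps({'ffilter': False}))
    assert cookie == 'user=%7B%22ffilter%22%3A%20false%7D; '
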
formats, + 'duration': duration, + } diff --git a/yt_dlp/extractor/metacritic.py b/yt_dlp/extractor/metacritic.py new file mode 100644 index 000000000..1424288e7 --- /dev/null +++ b/yt_dlp/extractor/metacritic.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + fix_xml_ampersands, +) + + +class MetacriticIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?metacritic\.com/.+?/trailers/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', + 'info_dict': { + 'id': '3698222', + 'ext': 'mp4', + 'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', + 'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', + 'duration': 221, + }, + 'skip': 'Not providing trailers anymore', + }, { + 'url': 'http://www.metacritic.com/game/playstation-4/tales-from-the-borderlands-a-telltale-game-series/trailers/5740315', + 'info_dict': { + 'id': '5740315', + 'ext': 'mp4', + 'title': 'Tales from the Borderlands - Finale: The Vault of the Traveler', + 'description': 'In the final episode of the season, all hell breaks loose. Jack is now in control of Helios\' systems, and he\'s ready to reclaim his rightful place as king of Hyperion (with or without you).', + 'duration': 114, + }, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + # The xml is not well formatted, there are raw '&' + info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, + video_id, 'Downloading info xml', transform_source=fix_xml_ampersands) + + clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) + formats = [] + for videoFile in clip.findall('httpURI/videoFile'): + rate_str = videoFile.find('rate').text + video_url = videoFile.find('filePath').text + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': rate_str, + 'tbr': int(rate_str), + }) + self._sort_formats(formats) + + description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', + webpage, 'description', flags=re.DOTALL) + + return { + 'id': video_id, + 'title': clip.find('title').text, + 'formats': formats, + 'description': description, + 'duration': int(clip.find('duration').text), + } diff --git a/yt_dlp/extractor/mgoon.py b/yt_dlp/extractor/mgoon.py new file mode 100644 index 000000000..184c311be --- /dev/null +++ b/yt_dlp/extractor/mgoon.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, + unified_strdate, +) + + +class MgoonIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? 
+ (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| + video\.mgoon\.com)/(?P<id>[0-9]+)''' + _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' + _TESTS = [ + { + 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148', + 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d', + 'info_dict': { + 'id': '5582148', + 'uploader_id': 'hi6618', + 'duration': 240.419, + 'upload_date': '20131220', + 'ext': 'mp4', + 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://www.mgoon.com/play/view/5582148', + 'only_matching': True, + }, + { + 'url': 'http://video.mgoon.com/5582148', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + data = self._download_json(self._API_URL.format(video_id), video_id) + + if data.get('errorInfo', {}).get('code') != 'NONE': + raise ExtractorError('%s encountered an error: %s' % ( + self.IE_NAME, data['errorInfo']['message']), expected=True) + + v_info = data['videoInfo'] + title = v_info.get('v_title') + thumbnail = v_info.get('v_thumbnail') + duration = v_info.get('v_duration') + upload_date = unified_strdate(v_info.get('v_reg_date')) + uploader_id = data.get('userInfo', {}).get('u_alias') + if duration: + duration /= 1000.0 + + age_limit = None + if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT': + age_limit = 18 + + formats = [] + get_quality = qualities(['360p', '480p', '720p', '1080p']) + for fmt in data['videoFiles']: + formats.append({ + 'format_id': fmt['label'], + 'quality': get_quality(fmt['label']), + 'url': fmt['url'], + 'ext': fmt['format'], + + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index cab3aa045..cab3aa045 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py diff --git a/youtube_dl/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py index f9e35ac7f..f9e35ac7f 100644 --- a/youtube_dl/extractor/miaopai.py +++ b/yt_dlp/extractor/miaopai.py diff --git a/yt_dlp/extractor/microsoftvirtualacademy.py b/yt_dlp/extractor/microsoftvirtualacademy.py new file mode 100644 index 000000000..46abd2a6d --- /dev/null +++ b/yt_dlp/extractor/microsoftvirtualacademy.py @@ -0,0 +1,195 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_xpath, +) +from ..utils import ( + int_or_none, + parse_duration, + smuggle_url, + unsmuggle_url, + xpath_text, +) + + +class MicrosoftVirtualAcademyBaseIE(InfoExtractor): + def _extract_base_url(self, course_id, display_id): + return self._download_json( + 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, + display_id, 'Downloading course base URL') + + def _extract_chapter_and_title(self, title): + if not title: + return None, None + m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title) + return (int(m.group('chapter')), m.group('title')) if m else (None, title) + + +class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva' + IE_DESC = 'Microsoft Virtual Academy videos' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + + _TESTS = [{ + 
'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', + 'md5': '7826c44fc31678b12ad8db11f6b5abb9', + 'info_dict': { + 'id': 'gfVXISmEB_6804984382', + 'ext': 'mp4', + 'title': 'Course Introduction', + 'formats': 'mincount:3', + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + } + }, { + 'url': 'mva:11788:gfVXISmEB_6804984382', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = self._match_valid_url(url) + course_id = mobj.group('course_id') + video_id = mobj.group('id') + + base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) + + settings = self._download_xml( + '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), + video_id, 'Downloading video settings XML') + + _, title = self._extract_chapter_and_title(xpath_text( + settings, './/Title', 'title', fatal=True)) + + formats = [] + + for sources in settings.findall(compat_xpath('.//MediaSources')): + sources_type = sources.get('videoType') + for source in sources.findall(compat_xpath('./MediaSource')): + video_url = source.text + if not video_url or not video_url.startswith('http'): + continue + if sources_type == 'smoothstreaming': + formats.extend(self._extract_ism_formats( + video_url, video_id, 'mss', fatal=False)) + continue + video_mode = source.get('videoMode') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) + codec = source.get('codec') + acodec, vcodec = [None] * 2 + if codec: + codecs = codec.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs + elif len(codecs) == 1: + vcodec = codecs[0] + formats.append({ + 'url': video_url, + 'format_id': video_mode, + 'height': height, + 'acodec': acodec, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + subtitles = {} + for source in settings.findall(compat_xpath('.//MarkerResourceSource')): + subtitle_url = source.text + if not subtitle_url: + continue + subtitles.setdefault('en', []).append({ + 'url': '%s/%s' % (base_url, subtitle_url), + 'ext': source.get('type'), + }) + + return { + 'id': video_id, + 'title': title, + 'subtitles': subtitles, + 'formats': formats + } + + +class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva:course' + IE_DESC = 'Microsoft Virtual Academy courses' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'info_dict': { + 'id': '11788', + 'title': 'Microsoft Azure Fundamentals: Virtual Machines', + }, + 'playlist_count': 36, + }, { + # with emphasized chapters + 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', + 'info_dict': { + 'id': '16335', + 'title': 'Developing Windows 10 Games with Construct 2', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'only_matching': True, + }, { + 'url': 'mva:course:11788', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MicrosoftVirtualAcademyIE.suitable(url) else super( + MicrosoftVirtualAcademyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = 
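`_extract_chapter_and_title` above splits MVA titles of the form `<chapter> | <title>` into a chapter number and a bare title. A standalone check of that split (the sample strings are assumed):

    import re

    def split_title(title):
        m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title or '')
        return (int(m.group('chapter')), m.group('title')) if m else (None, title)

    assert split_title('3 | Deploying Virtual Machines') == (3, 'Deploying Virtual Machines')
    assert split_title('Course Introduction') == (None, 'Course Introduction')
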
self._match_valid_url(url) + course_id = mobj.group('id') + display_id = mobj.group('display_id') + + base_url = self._extract_base_url(course_id, display_id) + + manifest = self._download_json( + '%s/imsmanifestlite.json' % base_url, + display_id, 'Downloading course manifest JSON')['manifest'] + + organization = manifest['organizations']['organization'][0] + + entries = [] + for chapter in organization['item']: + chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) + chapter_id = chapter.get('@identifier') + for item in chapter.get('item', []): + item_id = item.get('@identifier') + if not item_id: + continue + metadata = item.get('resource', {}).get('metadata') or {} + if metadata.get('learningresourcetype') != 'Video': + continue + _, title = self._extract_chapter_and_title(item.get('title')) + duration = parse_duration(metadata.get('duration')) + description = metadata.get('description') + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url( + 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), + 'title': title, + 'description': description, + 'duration': duration, + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + }) + + title = organization.get('title') or manifest.get('metadata', {}).get('title') + + return self.playlist_result(entries, course_id, title) diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py new file mode 100644 index 000000000..c147cbbf6 --- /dev/null +++ b/yt_dlp/extractor/mildom.py @@ -0,0 +1,258 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +from datetime import datetime +import itertools +import json + +from .common import InfoExtractor +from ..utils import ( + std_headers, + update_url_query, + random_uuidv4, + try_get, +) +from ..compat import ( + compat_str, +) + + +class MildomBaseIE(InfoExtractor): + _GUEST_ID = None + _DISPATCHER_CONFIG = None + + def _call_api(self, url, video_id, query={}, note='Downloading JSON metadata', init=False): + url = update_url_query(url, self._common_queries(query, init=init)) + return self._download_json(url, video_id, note=note)['body'] + + def _common_queries(self, query={}, init=False): + dc = self._fetch_dispatcher_config() + r = { + 'timestamp': self.iso_timestamp(), + '__guest_id': '' if init else self.guest_id(), + '__location': dc['location'], + '__country': dc['country'], + '__cluster': dc['cluster'], + '__platform': 'web', + '__la': self.lang_code(), + '__pcv': 'v2.9.44', + 'sfr': 'pc', + 'accessToken': '', + } + r.update(query) + return r + + def _fetch_dispatcher_config(self): + if not self._DISPATCHER_CONFIG: + tmp = self._download_json( + 'https://disp.mildom.com/serverListV2', 'initialization', + note='Downloading dispatcher_config', data=json.dumps({ + 'protover': 0, + 'data': base64.b64encode(json.dumps({ + 'fr': 'web', + 'sfr': 'pc', + 'devi': 'Windows', + 'la': 'ja', + 'gid': None, + 'loc': '', + 'clu': '', + 'wh': '1919*810', + 'rtm': self.iso_timestamp(), + 'ua': std_headers['User-Agent'], + }).encode('utf8')).decode('utf8').replace('\n', ''), + }).encode('utf8')) + self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization') + return self._DISPATCHER_CONFIG + + @staticmethod + def iso_timestamp(): + 'new Date().toISOString()' + return datetime.utcnow().isoformat()[0:-3] + 'Z' + + def guest_id(self): + 'getGuestId' + if self._GUEST_ID: + return self._GUEST_ID + self._GUEST_ID = try_get( + self, ( + lambda x: x._call_api( + 
'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization', + note='Downloading guest token', init=True)['guest_id'] or None, + lambda x: x._get_cookies('https://www.mildom.com').get('gid').value, + lambda x: x._get_cookies('https://m.mildom.com').get('gid').value, + ), compat_str) or '' + return self._GUEST_ID + + def lang_code(self): + 'getCurrentLangCode' + return 'ja' + + +class MildomIE(MildomBaseIE): + IE_NAME = 'mildom' + IE_DESC = 'Record ongoing live by specific user in Mildom' + _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P<id>\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'https://www.mildom.com/%s' % video_id + + webpage = self._download_webpage(url, video_id) + + enterstudio = self._call_api( + 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, + note='Downloading live metadata', query={'user_id': video_id}) + result_video_id = enterstudio.get('log_id', video_id) + + title = try_get( + enterstudio, ( + lambda x: self._html_search_meta('twitter:description', webpage), + lambda x: x['anchor_intro'], + ), compat_str) + description = try_get( + enterstudio, ( + lambda x: x['intro'], + lambda x: x['live_intro'], + ), compat_str) + uploader = try_get( + enterstudio, ( + lambda x: self._html_search_meta('twitter:title', webpage), + lambda x: x['loginname'], + ), compat_str) + + servers = self._call_api( + 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, + note='Downloading live server list', query={ + 'user_id': video_id, + 'live_server_type': 'hls', + }) + + stream_query = self._common_queries({ + 'streamReqId': random_uuidv4(), + 'is_lhls': '0', + }) + m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query) + formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={ + 'Referer': 'https://www.mildom.com/', + 'Origin': 'https://www.mildom.com', + }, note='Downloading m3u8 information') + + del stream_query['streamReqId'], stream_query['timestamp'] + for fmt in formats: + fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' + + self._sort_formats(formats) + + return { + 'id': result_video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'uploader_id': video_id, + 'formats': formats, + 'is_live': True, + } + + +class MildomVodIE(MildomBaseIE): + IE_NAME = 'mildom:vod' + IE_DESC = 'Download a VOD in Mildom' + _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+)' + + def _real_extract(self, url): + m = self._match_valid_url(url) + user_id, video_id = m.group('user_id'), m.group('id') + url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id) + + webpage = self._download_webpage(url, video_id) + + autoplay = self._call_api( + 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, + note='Downloading playback metadata', query={ + 'v_id': video_id, + })['playback'] + + title = try_get( + autoplay, ( + lambda x: self._html_search_meta('og:description', webpage), + lambda x: x['title'], + ), compat_str) + description = try_get( + autoplay, ( + lambda x: x['video_intro'], + ), compat_str) + uploader = try_get( + autoplay, ( + lambda x: x['author_info']['login_name'], + ), compat_str) + + formats = [{ + 'url': autoplay['audio_url'], + 'format_id': 'audio', + 'protocol': 'm3u8_native', + 'vcodec': 'none', + 'acodec': 'aac', + 'ext': 'm4a' + }] + for fmt in 
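The dispatcher handshake in `_fetch_dispatcher_config` above double-encodes its payload: the client profile is serialized to JSON, base64-encoded, and wrapped in an outer JSON envelope, and the response `data` field is unwrapped the same way in reverse. A minimal sketch with assumed placeholder values (only a subset of the real fields is shown):

    import base64
    import json

    profile = {'fr': 'web', 'sfr': 'pc', 'devi': 'Windows', 'la': 'ja'}  # assumed subset
    body = json.dumps({
        'protover': 0,
        'data': base64.b64encode(json.dumps(profile).encode('utf-8')).decode('utf-8'),
    }).encode('utf-8')

    # decoding the (hypothetical) response mirrors the encoding:
    # config = json.loads(base64.b64decode(response['data']))
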
autoplay['video_link']:
+            formats.append({
+                'format_id': 'video-%s' % fmt['name'],
+                'url': fmt['url'],
+                'protocol': 'm3u8_native',
+                'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'],
+                'height': fmt['level'],
+                'vcodec': 'h264',
+                'acodec': 'aac',
+                'ext': 'mp4'
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'uploader': uploader,
+            'uploader_id': user_id,
+            'formats': formats,
+        }
+
+
+class MildomUserVodIE(MildomBaseIE):
+    IE_NAME = 'mildom:user:vod'
+    IE_DESC = 'Download all VODs from specific user in Mildom'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.mildom.com/profile/10093333',
+        'info_dict': {
+            'id': '10093333',
+            'title': 'Uploads from ねこばたけ',
+        },
+        'playlist_mincount': 351,
+    }]
+
+    def _entries(self, user_id):
+        for page in itertools.count(1):
+            reply = self._call_api(
+                'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
+                user_id, note='Downloading page %d' % page, query={
+                    'user_id': user_id,
+                    'page': page,
+                    'limit': '30',
+                })
+            if not reply:
+                break
+            for x in reply:
+                yield self.url_result('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id']))
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        self.to_screen('This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/%s" instead' % user_id)
+
+        profile = self._call_api(
+            'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id,
+            query={'user_id': user_id}, note='Downloading user profile')['user_info']
+
+        return self.playlist_result(
+            self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname'])
diff --git a/youtube_dl/extractor/minds.py b/yt_dlp/extractor/minds.py
index 8e9f0f825..8e9f0f825 100644
--- a/youtube_dl/extractor/minds.py
+++ b/yt_dlp/extractor/minds.py
diff --git a/youtube_dl/extractor/ministrygrid.py b/yt_dlp/extractor/ministrygrid.py
index 8ad9239c5..8ad9239c5 100644
--- a/youtube_dl/extractor/ministrygrid.py
+++ b/yt_dlp/extractor/ministrygrid.py
diff --git a/yt_dlp/extractor/minoto.py b/yt_dlp/extractor/minoto.py
new file mode 100644
index 000000000..603ce940b
--- /dev/null
+++ b/yt_dlp/extractor/minoto.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_codecs,
+)
+
+
+class MinotoIE(InfoExtractor):
+    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        player_id = mobj.group('player_id') or '1'
+        video_id = mobj.group('id')
+        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+        video_metadata = video_data['video-metadata']
+        formats = []
+        for fmt in video_data['video-files']:
+            fmt_url = fmt.get('url')
+            if not fmt_url:
+                continue
+            container = fmt.get('container')
+            if container == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                fmt_profile = fmt.get('profile') or {}
+                formats.append({
+                    'format_id': fmt_profile.get('name-short'),
+                    'format_note': fmt_profile.get('name'),
+                    'url': fmt_url,
+                    'container': container,
+                    'tbr': int_or_none(fmt.get('bitrate')),
+                    'filesize': int_or_none(fmt.get('filesize')),
+                    'width': int_or_none(fmt.get('width')),
+                    'height': int_or_none(fmt.get('height')),
+                    **parse_codecs(fmt.get('codecs')),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_metadata['title'],
+            'description': video_metadata.get('description'),
+            'thumbnail': video_metadata.get('video-poster', {}).get('url'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/miomio.py b/yt_dlp/extractor/miomio.py
index 40f72d66f..40f72d66f 100644
--- a/youtube_dl/extractor/miomio.py
+++ b/yt_dlp/extractor/miomio.py
diff --git a/yt_dlp/extractor/mirrativ.py b/yt_dlp/extractor/mirrativ.py
new file mode 100644
index 000000000..81aea54f6
--- /dev/null
+++ b/yt_dlp/extractor/mirrativ.py
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    dict_get,
+    traverse_obj,
+    try_get,
+)
+
+
+class MirrativBaseIE(InfoExtractor):
+    def assert_error(self, response):
+        error_message = traverse_obj(response, ('status', 'error'))
+        if error_message:
+            raise ExtractorError('Mirrativ says: %s' % error_message, expected=True)
+
+
+class MirrativIE(MirrativBaseIE):
+    IE_NAME = 'mirrativ'
+    _VALID_URL = r'https?://(?:www\.)?mirrativ\.com/live/(?P<id>[^/?#&]+)'
+    LIVE_API_URL = 'https://www.mirrativ.com/api/live/live?live_id=%s'
+
+    _TESTS = [{
+        'url': 'https://mirrativ.com/live/POxyuG1KmW2982lqlDTuPw',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage('https://www.mirrativ.com/live/%s' % video_id, video_id)
+        live_response = self._download_json(self.LIVE_API_URL % video_id, video_id)
+        self.assert_error(live_response)
+
+        hls_url = dict_get(live_response, ('archive_url_hls', 'streaming_url_hls'))
+        is_live = bool(live_response.get('is_live'))
+        was_live = bool(live_response.get('is_archive'))
+        if not hls_url:
+            raise ExtractorError('Neither archive nor live is available.', expected=True)
+
+        formats = self._extract_m3u8_formats(
+            hls_url, video_id,
+            ext='mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls', live=is_live)
+        rtmp_url = live_response.get('streaming_url_edge')
+        if rtmp_url:
+            keys_to_copy = ('width', 'height', 'vcodec', 'acodec', 'tbr')
+            fmt = {
+                'format_id': 'rtmp',
+                'url': rtmp_url,
+                'protocol': 'rtmp',
+                'ext': 'mp4',
+            }
+            fmt.update({k: traverse_obj(formats, (0, k)) for k in keys_to_copy})
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage, default=None) or self._search_regex(
+            r'<title>\s*(.+?) - Mirrativ\s*</title>', webpage, 'title', default=None) or live_response.get('title')
+        description = live_response.get('description')
+        thumbnail = live_response.get('image_url')
+
+        duration = try_get(live_response, lambda x: x['ended_at'] - x['started_at'])
+        view_count = live_response.get('total_viewer_num')
+        release_timestamp = live_response.get('started_at')
+        timestamp = live_response.get('created_at')
+
+        owner = live_response.get('owner', {})
+        uploader = owner.get('name')
+        uploader_id = owner.get('user_id')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'is_live': is_live,
+            'description': description,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': view_count,
+            'release_timestamp': release_timestamp,
+            'timestamp': timestamp,
+            'was_live': was_live,
+        }
+
+
+class MirrativUserIE(MirrativBaseIE):
+    IE_NAME = 'mirrativ:user'
+    _VALID_URL = r'https?://(?:www\.)?mirrativ\.com/user/(?P<id>\d+)'
+    LIVE_HISTORY_API_URL = 'https://www.mirrativ.com/api/live/live_history?user_id=%s&page=%d'
+    USER_INFO_API_URL = 'https://www.mirrativ.com/api/user/profile?user_id=%s'
+
+    _TESTS = [{
+        # Live archive is available up to 3 days
+        # see: https://helpfeel.com/mirrativ/%E9%8C%B2%E7%94%BB-5e26d3ad7b59ef0017fb49ac (Japanese)
+        'url': 'https://www.mirrativ.com/user/110943130',
+        'note': 'multiple archives available',
+        'only_matching': True,
+    }]
+
+    def _entries(self, user_id):
+        page = 1
+        while page is not None:
+            api_response = self._download_json(
+                self.LIVE_HISTORY_API_URL % (user_id, page), user_id,
+                note='Downloading page %d' % page)
+            self.assert_error(api_response)
+            lives = api_response.get('lives')
+            if not lives:
+                break
+            for live in lives:
+                if not live.get('is_archive') and not live.get('is_live'):
+                    # neither archive nor live is available, so skip it
+                    # or the service will ban your IP address for a while
+                    continue
+                live_id = live.get('live_id')
+                url = 'https://www.mirrativ.com/live/%s' % live_id
+                yield self.url_result(url, video_id=live_id, video_title=live.get('title'))
+            page = api_response.get('next_page')
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        user_info = self._download_json(
+            self.USER_INFO_API_URL % user_id, user_id,
+            note='Downloading user info', fatal=False)
+        self.assert_error(user_info)
+
+        uploader = user_info.get('name')
+        description = user_info.get('description')
+
+        entries = self._entries(user_id)
+        return self.playlist_result(entries, user_id, uploader, description)
diff --git a/yt_dlp/extractor/mit.py b/yt_dlp/extractor/mit.py
new file mode 100644
index 000000000..60e456978
--- /dev/null
+++ b/yt_dlp/extractor/mit.py
@@ -0,0 +1,132 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    get_element_by_id,
+)
+
+
+class TechTVMITIE(InfoExtractor):
+    IE_NAME = 'techtv.mit.edu'
+    _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+        'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
+        'info_dict': {
+            'id': '25418',
+            'ext': 'mp4',
+            'title': 'MIT DNA and Protein Sets',
+            'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        raw_page = self._download_webpage(
+            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
+        clean_page =
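`MirrativUserIE._entries` above pages through the live-history API by following the `next_page` token until it is absent. A compact sketch of that loop with the HTTP call stubbed out (`fetch_page` is a hypothetical stand-in for `_download_json`):

    def iter_lives(fetch_page):
        page = 1
        while page is not None:
            response = fetch_page(page)  # -> {'lives': [...], 'next_page': int or None}
            lives = response.get('lives')
            if not lives:
                break
            for live in lives:
                # entries that are neither archived nor live have no playable stream
                if live.get('is_archive') or live.get('is_live'):
                    yield live
            page = response.get('next_page')
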
re.compile(r'<!--.*?-->', re.S).sub('', raw_page) + + base_url = self._proto_relative_url(self._search_regex( + r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:') + formats_json = self._search_regex( + r'bitrates: (\[.+?\])', raw_page, 'video formats') + formats_mit = json.loads(formats_json) + formats = [ + { + 'format_id': f['label'], + 'url': base_url + f['url'].partition(':')[2], + 'ext': f['url'].partition(':')[0], + 'format': f['label'], + 'width': f['width'], + 'vbr': f['bitrate'], + } + for f in formats_mit + ] + + title = get_element_by_id('edit-title', clean_page) + description = clean_html(get_element_by_id('edit-description', clean_page)) + thumbnail = self._search_regex( + r'playlist:.*?url: \'(.+?)\'', + raw_page, 'thumbnail', flags=re.DOTALL) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + } + + +class OCWMITIE(InfoExtractor): + IE_NAME = 'ocw.mit.edu' + _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _BASE_URL = 'http://ocw.mit.edu/' + + _TESTS = [ + { + 'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/', + 'info_dict': { + 'id': 'EObHWIEKGjA', + 'ext': 'webm', + 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence', + 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.', + 'upload_date': '20121109', + 'uploader_id': 'MIT', + 'uploader': 'MIT OpenCourseWare', + } + }, + { + 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/', + 'info_dict': { + 'id': '7K1sB05pE0A', + 'ext': 'mp4', + 'title': 'Session 1: Introduction to Derivatives', + 'upload_date': '20090818', + 'uploader_id': 'MIT', + 'uploader': 'MIT OpenCourseWare', + 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', + } + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + topic = mobj.group('topic') + + webpage = self._download_webpage(url, topic) + title = self._html_search_meta('WT.cg_s', webpage) + description = self._html_search_meta('Description', webpage) + + # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file) + embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage) + if embed_chapter_media: + metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) + metadata = re.split(r', ?', metadata) + yt = metadata[1] + else: + # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) + embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) + if embed_media: + metadata = re.sub(r'[\'"]', '', embed_media.group(1)) + metadata = re.split(r', ?', metadata) + yt = metadata[1] + else: + raise ExtractorError('Unable to find embedded YouTube video.') + video_id = YoutubeIE.extract_id(yt) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'description': description, + 'url': yt, + 'ie_key': 'Youtube', + } diff --git a/youtube_dl/extractor/mitele.py 
b/yt_dlp/extractor/mitele.py index b5937233b..b5937233b 100644 --- a/youtube_dl/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py new file mode 100644 index 000000000..a0c043d4b --- /dev/null +++ b/yt_dlp/extractor/mixcloud.py @@ -0,0 +1,355 @@ +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_chr, + compat_ord, + compat_str, + compat_urllib_parse_unquote, + compat_zip +) +from ..utils import ( + int_or_none, + parse_iso8601, + strip_or_none, + try_get, +) + + +class MixcloudBaseIE(InfoExtractor): + def _call_api(self, object_type, object_fields, display_id, username, slug=None): + lookup_key = object_type + 'Lookup' + return self._download_json( + 'https://www.mixcloud.com/graphql', display_id, query={ + 'query': '''{ + %s(lookup: {username: "%s"%s}) { + %s + } +}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields) + })['data'][lookup_key] + + +class MixcloudIE(MixcloudBaseIE): + _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' + IE_NAME = 'mixcloud' + + _TESTS = [{ + 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', + 'info_dict': { + 'id': 'dholbach_cryptkeeper', + 'ext': 'm4a', + 'title': 'Cryptkeeper', + 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', + 'uploader': 'Daniel Holbach', + 'uploader_id': 'dholbach', + 'thumbnail': r're:https?://.*\.jpg', + 'view_count': int, + 'timestamp': 1321359578, + 'upload_date': '20111115', + }, + }, { + 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', + 'info_dict': { + 'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat', + 'ext': 'mp3', + 'title': 'Caribou 7 inch Vinyl Mix & Chat', + 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', + 'uploader': 'Gilles Peterson Worldwide', + 'uploader_id': 'gillespeterson', + 'thumbnail': 're:https?://.*', + 'view_count': int, + 'timestamp': 1422987057, + 'upload_date': '20150203', + }, + }, { + 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', + 'only_matching': True, + }] + _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD' + + @staticmethod + def _decrypt_xor_cipher(key, ciphertext): + """Encrypt/Decrypt XOR cipher. 
Both ways are possible because it's XOR."""
+        return ''.join([
+            compat_chr(compat_ord(ch) ^ compat_ord(k))
+            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
+
+    def _real_extract(self, url):
+        username, slug = self._match_valid_url(url).groups()
+        username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
+        track_id = '%s_%s' % (username, slug)
+
+        cloudcast = self._call_api('cloudcast', '''audioLength
+    comments(first: 100) {
+      edges {
+        node {
+          comment
+          created
+          user {
+            displayName
+            username
+          }
+        }
+      }
+      totalCount
+    }
+    description
+    favorites {
+      totalCount
+    }
+    featuringArtistList
+    isExclusive
+    name
+    owner {
+      displayName
+      url
+      username
+    }
+    picture(width: 1024, height: 1024) {
+      url
+    }
+    plays
+    publishDate
+    reposts {
+      totalCount
+    }
+    streamInfo {
+      dashUrl
+      hlsUrl
+      url
+    }
+    tags {
+      tag {
+        name
+      }
+    }''', track_id, username, slug)
+
+        title = cloudcast['name']
+
+        stream_info = cloudcast['streamInfo']
+        formats = []
+
+        for url_key in ('url', 'hlsUrl', 'dashUrl'):
+            format_url = stream_info.get(url_key)
+            if not format_url:
+                continue
+            decrypted = self._decrypt_xor_cipher(
+                self._DECRYPTION_KEY, compat_b64decode(format_url))
+            if url_key == 'hlsUrl':
+                formats.extend(self._extract_m3u8_formats(
+                    decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif url_key == 'dashUrl':
+                formats.extend(self._extract_mpd_formats(
+                    decrypted, track_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'format_id': 'http',
+                    'url': decrypted,
+                    'downloader_options': {
+                        # Mixcloud starts throttling at >~5M
+                        'http_chunk_size': 5242880,
+                    },
+                })
+
+        if not formats and cloudcast.get('isExclusive'):
+            self.raise_login_required(metadata_available=True)
+
+        self._sort_formats(formats)
+
+        comments = []
+        for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []):
+            node = edge.get('node') or {}
+            text = strip_or_none(node.get('comment'))
+            if not text:
+                continue
+            user = node.get('user') or {}
+            comments.append({
+                'author': user.get('displayName'),
+                'author_id': user.get('username'),
+                'text': text,
+                'timestamp': parse_iso8601(node.get('created')),
+            })
+
+        tags = []
+        for t in cloudcast.get('tags'):
+            tag = try_get(t, lambda x: x['tag']['name'], compat_str)
+            if not tag:
+                continue
+            tags.append(tag)
+
+        get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount']))
+
+        owner = cloudcast.get('owner') or {}
+
+        return {
+            'id': track_id,
+            'title': title,
+            'formats': formats,
+            'description': cloudcast.get('description'),
+            'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str),
+            'uploader': owner.get('displayName'),
+            'timestamp': parse_iso8601(cloudcast.get('publishDate')),
+            'uploader_id': owner.get('username'),
+            'uploader_url': owner.get('url'),
+            'duration': int_or_none(cloudcast.get('audioLength')),
+            'view_count': int_or_none(cloudcast.get('plays')),
+            'like_count': get_count('favorites'),
+            'repost_count': get_count('reposts'),
+            'comment_count': get_count('comments'),
+            'comments': comments,
+            'tags': tags,
+            'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None,
+        }
+
+
+class MixcloudPlaylistBaseIE(MixcloudBaseIE):
+    def _get_cloudcast(self, node):
+        return node
+
+    def _get_playlist_title(self, title, slug):
+        return title
+
+    def _real_extract(self, url):
+        username, slug = self._match_valid_url(url).groups()
+        username = compat_urllib_parse_unquote(username)
+        if not slug:
+            slug = 'uploads'
+        else:
+            slug
= compat_urllib_parse_unquote(slug) + playlist_id = '%s_%s' % (username, slug) + + is_playlist_type = self._ROOT_TYPE == 'playlist' + playlist_type = 'items' if is_playlist_type else slug + list_filter = '' + + has_next_page = True + entries = [] + while has_next_page: + playlist = self._call_api( + self._ROOT_TYPE, '''%s + %s + %s(first: 100%s) { + edges { + node { + %s + } + } + pageInfo { + endCursor + hasNextPage + } + }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE), + playlist_id, username, slug if is_playlist_type else None) + + items = playlist.get(playlist_type) or {} + for edge in items.get('edges', []): + cloudcast = self._get_cloudcast(edge.get('node') or {}) + cloudcast_url = cloudcast.get('url') + if not cloudcast_url: + continue + slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) + video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None + entries.append(self.url_result( + cloudcast_url, MixcloudIE.ie_key(), video_id)) + + page_info = items['pageInfo'] + has_next_page = page_info['hasNextPage'] + list_filter = ', after: "%s"' % page_info['endCursor'] + + return self.playlist_result( + entries, playlist_id, + self._get_playlist_title(playlist[self._TITLE_KEY], slug), + playlist.get(self._DESCRIPTION_KEY)) + + +class MixcloudUserIE(MixcloudPlaylistBaseIE): + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$' + IE_NAME = 'mixcloud:user' + + _TESTS = [{ + 'url': 'http://www.mixcloud.com/dholbach/', + 'info_dict': { + 'id': 'dholbach_uploads', + 'title': 'Daniel Holbach (uploads)', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + }, + 'playlist_mincount': 36, + }, { + 'url': 'http://www.mixcloud.com/dholbach/uploads/', + 'info_dict': { + 'id': 'dholbach_uploads', + 'title': 'Daniel Holbach (uploads)', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + }, + 'playlist_mincount': 36, + }, { + 'url': 'http://www.mixcloud.com/dholbach/favorites/', + 'info_dict': { + 'id': 'dholbach_favorites', + 'title': 'Daniel Holbach (favorites)', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + }, + # 'params': { + # 'playlist_items': '1-100', + # }, + 'playlist_mincount': 396, + }, { + 'url': 'http://www.mixcloud.com/dholbach/listens/', + 'info_dict': { + 'id': 'dholbach_listens', + 'title': 'Daniel Holbach (listens)', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + }, + # 'params': { + # 'playlist_items': '1-100', + # }, + 'playlist_mincount': 1623, + 'skip': 'Large list', + }, { + 'url': 'https://www.mixcloud.com/FirstEar/stream/', + 'info_dict': { + 'id': 'FirstEar_stream', + 'title': 'First Ear (stream)', + 'description': 'Curators of good music\r\n\r\nfirstearmusic.com', + }, + 'playlist_mincount': 271, + }] + + _TITLE_KEY = 'displayName' + _DESCRIPTION_KEY = 'biog' + _ROOT_TYPE = 'user' + _NODE_TEMPLATE = '''slug + url + owner { username }''' + + def _get_playlist_title(self, title, slug): + return '%s (%s)' % (title, slug) + + +class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$' + IE_NAME = 'mixcloud:playlist' + + _TESTS = [{ + 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', + 'info_dict': { + 'id': 'maxvibes_jazzcat-on-ness-radio', + 'title': 'Ness Radio sessions', + }, + 'playlist_mincount': 59, + }] + 
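The stream URLs above are decoded with `_decrypt_xor_cipher`, which XORs the base64-decoded payload against a fixed key; because XOR is its own inverse, the same routine both scrambles and unscrambles. A simplified Python 3 sketch of the same idea, operating on `str` rather than the raw bytes the extractor decodes (the sample plaintext is assumed):

    from itertools import cycle

    def xor_cipher(key, text):
        # XOR each character against the key, cycling the key as needed
        return ''.join(chr(ord(c) ^ ord(k)) for c, k in zip(text, cycle(key)))

    key = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'
    scrambled = xor_cipher(key, 'https://example.invalid/stream')  # assumed sample plaintext
    assert xor_cipher(key, scrambled) == 'https://example.invalid/stream'
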
+    _TITLE_KEY = 'name'
+    _DESCRIPTION_KEY = 'description'
+    _ROOT_TYPE = 'playlist'
+    _NODE_TEMPLATE = '''cloudcast {
+      slug
+      url
+      owner { username }
+    }'''
+
+    def _get_cloudcast(self, node):
+        return node.get('cloudcast') or {}
diff --git a/youtube_dl/extractor/mlb.py b/yt_dlp/extractor/mlb.py
index b69301d97..b69301d97 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/yt_dlp/extractor/mlb.py
diff --git a/youtube_dl/extractor/mnet.py b/yt_dlp/extractor/mnet.py
index 0e26ca1b3..0e26ca1b3 100644
--- a/youtube_dl/extractor/mnet.py
+++ b/yt_dlp/extractor/mnet.py
diff --git a/yt_dlp/extractor/moevideo.py b/yt_dlp/extractor/moevideo.py
new file mode 100644
index 000000000..a3f1b3866
--- /dev/null
+++ b/yt_dlp/extractor/moevideo.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+)
+
+
+class MoeVideoIE(InfoExtractor):
+    IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net'
+    _VALID_URL = r'''(?x)
+        https?://(?P<host>(?:www\.)?
+        (?:(?:moevideo|playreplay|videochart)\.net|thesame\.tv))/
+        (?:video|framevideo|embed)/(?P<id>[0-9a-z]+\.[0-9A-Za-z]+)'''
+    _API_URL = 'http://api.letitbit.net/'
+    _API_KEY = 'tVL0gjqo5'
+    _TESTS = [
+        {
+            'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29',
+            'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a',
+            'info_dict': {
+                'id': '00297.0036103fe3d513ef27915216fd29',
+                'ext': 'flv',
+                'title': 'Sink cut out machine',
+                'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'width': 540,
+                'height': 360,
+                'duration': 179,
+                'filesize': 17822500,
+            },
+            'skip': 'Video has been removed',
+        },
+        {
+            'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a',
+            'md5': '74f0a014d5b661f0f0e2361300d1620e',
+            'info_dict': {
+                'id': '77107.7f325710a627383d40540d8e991a',
+                'ext': 'flv',
+                'title': 'Operacion Condor.',
+                'description': 'md5:7e68cb2fcda66833d5081c542491a9a3',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'width': 480,
+                'height': 296,
+                'duration': 6027,
+                'filesize': 588257923,
+            },
+            'skip': 'Video has been removed',
+        },
+    ]
+
+    def _real_extract(self, url):
+        host, video_id = self._match_valid_url(url).groups()
+
+        webpage = self._download_webpage(
+            'http://%s/video/%s' % (host, video_id),
+            video_id, 'Downloading webpage')
+
+        title = self._og_search_title(webpage)
+
+        embed_webpage = self._download_webpage(
+            'http://%s/embed/%s' % (host, video_id),
+            video_id, 'Downloading embed webpage')
+        video = self._parse_json(self._search_regex(
+            r'mvplayer\("#player"\s*,\s*({.+})',
+            embed_webpage, 'mvplayer'), video_id)['video']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': video.get('poster') or self._og_search_thumbnail(webpage),
+            'description': clean_html(self._og_search_description(webpage)),
+            'duration': int_or_none(self._og_search_property('video:duration', webpage)),
+            'url': video['ourUrl'],
+        }
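
The MoeVideo extractor above reads its stream data from an inline mvplayer("#player", {...}) call rather than an API endpoint, which boils down to a regex capture followed by a JSON parse. A self-contained sketch of the pattern (the embed markup is invented for illustration):

    import json
    import re

    embed = '<script>mvplayer("#player", {"video": {"ourUrl": "http://cdn.example/v.flv"}});</script>'
    config = json.loads(re.search(
        r'mvplayer\("#player"\s*,\s*({.+})\)', embed).group(1))
    print(config['video']['ourUrl'])  # -> http://cdn.example/v.flv

In the extractor itself, _parse_json plays the role of json.loads and ties parse errors to the video id for nicer error reporting.

diff --git a/youtube_dl/extractor/mofosex.py b/yt_dlp/extractor/mofosex.py
index 5234cac02..5234cac02 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/yt_dlp/extractor/mofosex.py
diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py
new file mode 100644
index 000000000..0421f3f44
--- /dev/null
+++ b/yt_dlp/extractor/mojvideo.py
@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+)
+
+
+class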
MojvideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mojvideo\.com/video-(?P<display_id>[^/]+)/(?P<id>[a-f0-9]+)' + _TEST = { + 'url': 'http://www.mojvideo.com/video-v-avtu-pred-mano-rdecelaska-alfi-nipic/3d1ed4497707730b2906', + 'md5': 'f7fd662cc8ce2be107b0d4f2c0483ae7', + 'info_dict': { + 'id': '3d1ed4497707730b2906', + 'display_id': 'v-avtu-pred-mano-rdecelaska-alfi-nipic', + 'ext': 'mp4', + 'title': 'V avtu pred mano rdečelaska - Alfi Nipič', + 'thumbnail': r're:^http://.*\.jpg$', + 'duration': 242, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + # XML is malformed + playerapi = self._download_webpage( + 'http://www.mojvideo.com/playerapi.php?v=%s&t=1' % video_id, display_id) + + if '<error>true</error>' in playerapi: + error_desc = self._html_search_regex( + r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) + + title = self._html_search_regex( + r'<title>([^<]+)</title>', playerapi, 'title') + video_url = self._html_search_regex( + r'<file>([^<]+)</file>', playerapi, 'video URL') + thumbnail = self._html_search_regex( + r'<preview>([^<]+)</preview>', playerapi, 'thumbnail', fatal=False) + duration = parse_duration(self._html_search_regex( + r'<duration>([^<]+)</duration>', playerapi, 'duration', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/yt_dlp/extractor/morningstar.py b/yt_dlp/extractor/morningstar.py new file mode 100644 index 000000000..71a22a614 --- /dev/null +++ b/yt_dlp/extractor/morningstar.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class MorningstarIE(InfoExtractor): + IE_DESC = 'morningstar.com' + _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', + 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', + 'info_dict': { + 'id': '615869', + 'ext': 'mp4', + 'title': 'Get Ahead of the Curve on 2013 Taxes', + 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", + 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' + } + }, { + 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( + r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title') + video_url = self._html_search_regex( + r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"', + webpage, 'video URL') + thumbnail = self._html_search_regex( + r'<input type="hidden" id="hidSnapshot" value="([^"]+)"', + webpage, 'thumbnail', fatal=False) + description = self._html_search_regex( + r'<div id="mstarDeck".*?>(.*?)</div>', + webpage, 'description', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'description': description, + } diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py new file mode 100644 index 000000000..111c7c544 --- /dev/null +++ 
b/yt_dlp/extractor/motherless.py @@ -0,0 +1,250 @@ +from __future__ import unicode_literals + +import datetime +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + ExtractorError, + InAdvancePagedList, + orderedSet, + str_to_int, + unified_strdate, +) + + +class MotherlessIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _TESTS = [{ + 'url': 'http://motherless.com/AC3FFE1', + 'md5': '310f62e325a9fafe64f68c0bccb6e75f', + 'info_dict': { + 'id': 'AC3FFE1', + 'ext': 'mp4', + 'title': 'Fucked in the ass while playing PS3', + 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], + 'upload_date': '20100913', + 'uploader_id': 'famouslyfuckedup', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + } + }, { + 'url': 'http://motherless.com/532291B', + 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', + 'info_dict': { + 'id': '532291B', + 'ext': 'mp4', + 'title': 'Amazing girl playing the omegle game, PERFECT!', + 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', + 'game', 'hairy'], + 'upload_date': '20140622', + 'uploader_id': 'Sulivana7x', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'skip': '404', + }, { + 'url': 'http://motherless.com/g/cosplay/633979F', + 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', + 'info_dict': { + 'id': '633979F', + 'ext': 'mp4', + 'title': 'Turtlette', + 'categories': ['superheroine heroine superher'], + 'upload_date': '20140827', + 'uploader_id': 'shade0230', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + } + }, { + # no keywords + 'url': 'http://motherless.com/8B4BBC1', + 'only_matching': True, + }, { + # see https://motherless.com/videos/recent for recent videos with + # uploaded date in "ago" format + 'url': 'https://motherless.com/3C3E2CF', + 'info_dict': { + 'id': '3C3E2CF', + 'ext': 'mp4', + 'title': 'a/ Hot Teens', + 'categories': list, + 'upload_date': '20210104', + 'uploader_id': 'yonbiw', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if any(p in webpage for p in ( + '<title>404 - MOTHERLESS.COM<', + ">The page you're looking for cannot be found.<")): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + if '>The content you are trying to view is for friends only.' 
in webpage:
+            raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
+
+        title = self._html_search_regex(
+            (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
+             r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
+        video_url = (self._html_search_regex(
+            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
+            webpage, 'video URL', default=None, group='url')
+            or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
+        age_limit = self._rta_search(webpage)
+        view_count = str_to_int(self._html_search_regex(
+            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
+            webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._html_search_regex(
+            (r'>([\d,.]+)\s+Favorites<',
+             r'<strong>Favorited</strong>\s+([^<]+)<'),
+            webpage, 'like count', fatal=False))
+
+        upload_date = unified_strdate(self._search_regex(
+            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+            'upload date', default=None))
+        if not upload_date:
+            uploaded_ago = self._search_regex(
+                r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+                default=None)
+            if uploaded_ago:
+                delta = int(uploaded_ago[:-1])
+                _AGO_UNITS = {
+                    'h': 'hours',
+                    'd': 'days',
+                }
+                kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+                upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
+
+        comment_count = webpage.count('class="media-comment-contents"')
+        uploader_id = self._html_search_regex(
+            (r'"media-meta-member">\s+<a href="/m/([^"]+)"',
+             r'<span\b[^>]+\bclass="username">([^<]+)</span>'),
+            webpage, 'uploader_id', fatal=False)
+        categories = self._html_search_meta('keywords', webpage, default=None)
+        if categories:
+            categories = [cat.strip() for cat in categories.split(',')]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'upload_date': upload_date,
+            'uploader_id': uploader_id,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'categories': categories,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'age_limit': age_limit,
+            'url': video_url,
+        }
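
When the page carries no absolute date, the code above falls back to relative stamps such as "12h ago" or "3d ago" and converts them to a YYYYMMDD string. The same conversion in isolation (the function name is made up for this sketch):

    import datetime
    import re

    _AGO_UNITS = {'h': 'hours', 'd': 'days'}

    def upload_date_from_ago(text):
        # Map '12h ago' / '3d ago' onto a YYYYMMDD date, mirroring the
        # fallback in MotherlessIE above.
        m = re.search(r'(\d+)([hd])\s+ago', text, re.IGNORECASE)
        if not m:
            return None
        delta = datetime.timedelta(**{_AGO_UNITS[m.group(2)]: int(m.group(1))})
        return (datetime.datetime.utcnow() - delta).strftime('%Y%m%d')

+
+
+class MotherlessGroupIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+    _TESTS = [{
+        'url': 'http://motherless.com/g/movie_scenes',
+        'info_dict': {
+            'id': 'movie_scenes',
+            'title': 'Movie Scenes',
+            'description': 'Hot and sexy scenes from "regular" movies... '
+                           'Beautiful actresses fully nude... A looot of '
+                           'skin! :)Enjoy!',
+        },
+        'playlist_mincount': 662,
+    }, {
+        'url': 'http://motherless.com/gv/sex_must_be_funny',
+        'info_dict': {
+            'id': 'sex_must_be_funny',
+            'title': 'Sex must be funny',
+            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
+                           'any kind!'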
+        },
+        'playlist_mincount': 0,
+        'expected_warnings': [
+            'This group has no videos.',
+        ]
+    }, {
+        'url': 'https://motherless.com/g/beautiful_cock',
+        'info_dict': {
+            'id': 'beautiful_cock',
+            'title': 'Beautiful Cock',
+            'description': 'Group for lovely cocks yours, mine, a friends anything human',
+        },
+        'playlist_mincount': 2500,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if MotherlessIE.suitable(url)
+                else super(MotherlessGroupIE, cls).suitable(url))
+
+    def _extract_entries(self, webpage, base):
+        entries = []
+        for mobj in re.finditer(
+                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
+                webpage):
+            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
+            if not MotherlessIE.suitable(video_url):
+                continue
+            video_id = MotherlessIE._match_id(video_url)
+            title = mobj.group('title')
+            entries.append(self.url_result(
+                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
+                video_title=title))
+        # Alternative fallback
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(base, '/' + entry_id),
+                    ie=MotherlessIE.ie_key(), video_id=entry_id)
+                for entry_id in orderedSet(re.findall(
+                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
+        return entries
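
The group pages are then handed to InAdvancePagedList below, which takes a per-page generator, the total page count, and the page size, and only fetches the pages a consumer actually slices into. A toy illustration of that contract (assuming yt_dlp.utils.InAdvancePagedList and its getslice helper):

    from yt_dlp.utils import InAdvancePagedList

    def _page(idx):
        # Yield the entries of 0-based page `idx`; invoked lazily per page.
        for n in range(3):
            yield {'id': 'page%d-item%d' % (idx, n)}

    playlist = InAdvancePagedList(_page, 5, 3)  # 5 pages of 3 entries each
    first_four = playlist.getslice(0, 4)        # touches pages 0 and 1 only

+
+    def _real_extract(self, url):
+        group_id = self._match_id(url)
+        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
+        webpage = self._download_webpage(page_url, group_id)
+        title = self._search_regex(
+            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
+        description = self._html_search_meta(
+            'description', webpage, fatal=False)
+        page_count = self._int(self._search_regex(
+            r'(\d+)</(?:a|span)><(?:a|span)[^>]+rel="next">',
+            webpage, 'page_count', default=0), 'page_count')
+        if not page_count:
+            message = self._search_regex(
+                r'class="error-page"[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*',
+                webpage, 'error_msg', default=None) or 'This group has no videos.'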
+ self.report_warning(message, group_id) + PAGE_SIZE = 80 + + def _get_page(idx): + if not page_count: + return + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } diff --git a/youtube_dl/extractor/motorsport.py b/yt_dlp/extractor/motorsport.py index c9d1ab64d..c9d1ab64d 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/yt_dlp/extractor/motorsport.py diff --git a/youtube_dl/extractor/movieclips.py b/yt_dlp/extractor/movieclips.py index 5453da1ac..5453da1ac 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/yt_dlp/extractor/movieclips.py diff --git a/yt_dlp/extractor/moviezine.py b/yt_dlp/extractor/moviezine.py new file mode 100644 index 000000000..730da4bd7 --- /dev/null +++ b/yt_dlp/extractor/moviezine.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class MoviezineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?moviezine\.se/video/(?P<id>[^?#]+)' + + _TEST = { + 'url': 'http://www.moviezine.se/video/205866', + 'info_dict': { + 'id': '205866', + 'ext': 'mp4', + 'title': 'Oculus - Trailer 1', + 'description': 'md5:40cc6790fc81d931850ca9249b40e8a4', + 'thumbnail': r're:http://.*\.jpg', + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player') + + formats = [{ + 'format_id': 'sd', + 'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'), + 'quality': 0, + 'ext': 'mp4', + }] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'), + 'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'), + 'formats': formats, + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dl/extractor/movingimage.py b/yt_dlp/extractor/movingimage.py index 4f62d628a..4f62d628a 100644 --- a/youtube_dl/extractor/movingimage.py +++ b/yt_dlp/extractor/movingimage.py diff --git a/yt_dlp/extractor/msn.py b/yt_dlp/extractor/msn.py new file mode 100644 index 000000000..f34e2102c --- /dev/null +++ b/yt_dlp/extractor/msn.py @@ -0,0 +1,171 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unescapeHTML, +) + + +class MSNIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d', + 'md5': '087548191d273c5c55d05028f8d2cbcd', + 'info_dict': { + 'id': 'BBPxU6d', + 'display_id': '7-ways-to-get-rid-of-chest-congestion', + 'ext': 'mp4', + 'title': 'Seven ways to get rid of chest congestion', + 'description': '7 Ways to Get Rid of Chest Congestion', + 'duration': 88, + 'uploader': 'Health', + 'uploader_id': 'BBPrMqa', + }, + }, { + # Article, multiple Dailymotion Embeds + 'url': 
'https://www.msn.com/en-in/money/sports/hottest-football-wags-greatest-footballers-turned-managers-and-more/ar-BBpc7Nl', + 'info_dict': { + 'id': 'BBpc7Nl', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', + 'only_matching': True, + }, { + # geo restricted + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', + 'only_matching': True, + }, { + # Vidible(AOL) Embed + 'url': 'https://www.msn.com/en-us/money/other/jupiter-is-about-to-come-so-close-you-can-see-its-moons-with-binoculars/vi-AACqsHR', + 'only_matching': True, + }, { + # Dailymotion Embed + 'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L', + 'only_matching': True, + }, { + # YouTube Embed + 'url': 'https://www.msn.com/en-in/money/news/meet-vikram-%E2%80%94-chandrayaan-2s-lander/vi-AAGUr0v', + 'only_matching': True, + }, { + # NBCSports Embed + 'url': 'https://www.msn.com/en-us/money/football_nfl/week-13-preview-redskins-vs-panthers/vi-BBXsCDb', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id, page_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, display_id) + + entries = [] + for _, metadata in re.findall(r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', webpage): + video = self._parse_json(unescapeHTML(metadata), display_id) + + provider_id = video.get('providerId') + player_name = video.get('playerName') + if player_name and provider_id: + entry = None + if player_name == 'AOL': + if provider_id.startswith('http'): + provider_id = self._search_regex( + r'https?://delivery\.vidible\.tv/video/redirect/([0-9a-f]{24})', + provider_id, 'vidible id') + entry = self.url_result( + 'aol-video:' + provider_id, 'Aol', provider_id) + elif player_name == 'Dailymotion': + entry = self.url_result( + 'https://www.dailymotion.com/video/' + provider_id, + 'Dailymotion', provider_id) + elif player_name == 'YouTube': + entry = self.url_result( + provider_id, 'Youtube', provider_id) + elif player_name == 'NBCSports': + entry = self.url_result( + 'http://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/' + provider_id, + 'NBCSportsVPlayer', provider_id) + if entry: + entries.append(entry) + continue + + video_id = video['uuid'] + title = video['title'] + + formats = [] + for file_ in video.get('videoFiles', []): + format_url = file_.get('url') + if not format_url: + continue + if 'format=m3u8-aapl' in format_url: + # m3u8_native should not be used here until + # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif 'format=mpd-time-csf' in format_url: + formats.extend(self._extract_mpd_formats( + format_url, display_id, 'dash', fatal=False)) + elif '.ism' in format_url: + if format_url.endswith('.ism'): + format_url += '/manifest' + formats.extend(self._extract_ism_formats( + format_url, display_id, 'mss', fatal=False)) + else: + format_id = file_.get('formatCode') + 
formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'format_id': format_id, + 'width': int_or_none(file_.get('width')), + 'height': int_or_none(file_.get('height')), + 'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)), + 'quality': 1 if format_id == '1001' else None, + }) + self._sort_formats(formats) + + subtitles = {} + for file_ in video.get('files', []): + format_url = file_.get('url') + format_code = file_.get('formatCode') + if not format_url or not format_code: + continue + if compat_str(format_code) == '3100': + subtitles.setdefault(file_.get('culture', 'en'), []).append({ + 'ext': determine_ext(format_url, 'ttml'), + 'url': format_url, + }) + + entries.append({ + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('headlineImage', {}).get('url'), + 'duration': int_or_none(video.get('durationSecs')), + 'uploader': video.get('sourceFriendly'), + 'uploader_id': video.get('providerId'), + 'creator': video.get('creator'), + 'subtitles': subtitles, + 'formats': formats, + }) + + if not entries: + error = unescapeHTML(self._search_regex( + r'data-error=(["\'])(?P<error>.+?)\1', + webpage, 'error', group='error')) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + return self.playlist_result(entries, page_id) diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py new file mode 100644 index 000000000..e0608845d --- /dev/null +++ b/yt_dlp/extractor/mtv.py @@ -0,0 +1,662 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_xpath, +) +from ..utils import ( + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + float_or_none, + HEADRequest, + int_or_none, + RegexNotFoundError, + sanitized_Request, + strip_or_none, + timeconvert, + try_get, + unescapeHTML, + update_url_query, + url_basename, + xpath_text, +) + + +def _media_xml_tag(tag): + return '{http://search.yahoo.com/mrss/}%s' % tag + + +class MTVServicesInfoExtractor(InfoExtractor): + _MOBILE_TEMPLATE = None + _LANG = None + + @staticmethod + def _id_from_uri(uri): + return uri.split(':')[-1] + + @staticmethod + def _remove_template_parameter(url): + # Remove the templates, like &device={device} + return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) + + def _get_feed_url(self, uri, url=None): + return self._FEED_URL + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + thumb_node = itemdoc.find(search_path) + if thumb_node is None: + return None + return thumb_node.get('url') or thumb_node.text or None + + def _extract_mobile_video_formats(self, mtvn_id): + webpage_url = self._MOBILE_TEMPLATE % mtvn_id + req = sanitized_Request(webpage_url) + # Otherwise we get a webpage that would execute some javascript + req.add_header('User-Agent', 'curl/7') + webpage = self._download_webpage(req, mtvn_id, + 'Downloading mobile page') + metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url')) + req = HEADRequest(metrics_url) + response = self._request_webpage(req, mtvn_id, 'Resolving url') + url = response.geturl() + # Transform the url to get the best quality: + url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1) + return [{'url': url, 'ext': 'mp4'}] + + def _extract_video_formats(self, mdoc, mtvn_id, video_id): + if 
re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None: + if mtvn_id is not None and self._MOBILE_TEMPLATE is not None: + self.to_screen('The normal version is not available from your ' + 'country, trying with the mobile version') + return self._extract_mobile_video_formats(mtvn_id) + raise ExtractorError('This video is not available from your country.', + expected=True) + + formats = [] + for rendition in mdoc.findall('.//rendition'): + if rendition.get('method') == 'hls': + hls_url = rendition.find('./src').text + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + # fms + try: + _, _, ext = rendition.attrib['type'].partition('/') + rtmp_video_url = rendition.find('./src').text + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % self.IE_NAME, + expected=True) + if rtmp_video_url.endswith('siteunavail.png'): + continue + formats.extend([{ + 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, + 'url': rtmp_video_url, + 'format_id': '-'.join(filter(None, [ + 'rtmp' if rtmp_video_url.startswith('rtmp') else None, + rendition.get('bitrate')])), + 'width': int(rendition.get('width')), + 'height': int(rendition.get('height')), + }]) + except (KeyError, TypeError): + raise ExtractorError('Invalid rendition field.') + if formats: + self._sort_formats(formats) + return formats + + def _extract_subtitles(self, mdoc, mtvn_id): + subtitles = {} + for transcript in mdoc.findall('.//transcript'): + if transcript.get('kind') != 'captions': + continue + lang = transcript.get('srclang') + for typographic in transcript.findall('./typographic'): + sub_src = typographic.get('src') + if not sub_src: + continue + ext = typographic.get('format') + if ext == 'cea-608': + ext = 'scc' + subtitles.setdefault(lang, []).append({ + 'url': compat_str(sub_src), + 'ext': ext + }) + return subtitles + + def _get_video_info(self, itemdoc, use_hls=True): + uri = itemdoc.find('guid').text + video_id = self._id_from_uri(uri) + self.report_extraction(video_id) + content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) + mediagen_url = self._remove_template_parameter(content_el.attrib['url']) + mediagen_url = mediagen_url.replace('device={device}', '') + if 'acceptMethods' not in mediagen_url: + mediagen_url += '&' if '?' in mediagen_url else '?' 
+ mediagen_url += 'acceptMethods=' + mediagen_url += 'hls' if use_hls else 'fms' + + mediagen_doc = self._download_xml( + mediagen_url, video_id, 'Downloading video urls', fatal=False) + + if mediagen_doc is False: + return None + + item = mediagen_doc.find('./video/item') + if item is not None and item.get('type') == 'text': + message = '%s returned error: ' % self.IE_NAME + if item.get('code') is not None: + message += '%s - ' % item.get('code') + message += item.text + raise ExtractorError(message, expected=True) + + description = strip_or_none(xpath_text(itemdoc, 'description')) + + timestamp = timeconvert(xpath_text(itemdoc, 'pubDate')) + + title_el = None + if title_el is None: + title_el = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:video_title') + if title_el is None: + title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title')) + if title_el is None: + title_el = itemdoc.find(compat_xpath('.//title')) + if title_el.text is None: + title_el = None + + title = title_el.text + if title is None: + raise ExtractorError('Could not find video title') + title = title.strip() + + series = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:franchise') + season = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:seasonN') + episode = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:episodeN') + series = series.text if series is not None else None + season = season.text if season is not None else None + episode = episode.text if episode is not None else None + if season and episode: + # episode number includes season, so remove it + episode = re.sub(r'^%s' % season, '', episode) + + # This a short id that's used in the webpage urls + mtvn_id = None + mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:id') + if mtvn_id_node is not None: + mtvn_id = mtvn_id_node.text + + formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) + + # Some parts of complete video may be missing (e.g. 
missing Act 3 in + # http://www.southpark.de/alle-episoden/s14e01-sexual-healing) + if not formats: + return None + + self._sort_formats(formats) + + return { + 'title': title, + 'formats': formats, + 'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id), + 'id': video_id, + 'thumbnail': self._get_thumbnail_url(uri, itemdoc), + 'description': description, + 'duration': float_or_none(content_el.attrib.get('duration')), + 'timestamp': timestamp, + 'series': series, + 'season_number': int_or_none(season), + 'episode_number': int_or_none(episode), + } + + def _get_feed_query(self, uri): + data = {'uri': uri} + if self._LANG: + data['lang'] = self._LANG + return data + + def _get_videos_info(self, uri, use_hls=True, url=None): + video_id = self._id_from_uri(uri) + feed_url = self._get_feed_url(uri, url) + info_url = update_url_query(feed_url, self._get_feed_query(uri)) + return self._get_videos_info_from_url(info_url, video_id, use_hls) + + def _get_videos_info_from_url(self, url, video_id, use_hls=True): + idoc = self._download_xml( + url, video_id, + 'Downloading info', transform_source=fix_xml_ampersands) + + title = xpath_text(idoc, './channel/title') + description = xpath_text(idoc, './channel/description') + + entries = [] + for item in idoc.findall('.//item'): + info = self._get_video_info(item, use_hls) + if info: + entries.append(info) + + # TODO: should be multi-video + return self.playlist_result( + entries, playlist_title=title, playlist_description=description) + + def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): + triforce_feed = self._parse_json(self._search_regex( + r'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n', webpage, + 'triforce feed', default='{}'), video_id, fatal=False) + + data_zone = self._search_regex( + r'data-zone=(["\'])(?P<zone>.+?_lc_promo.*?)\1', webpage, + 'data zone', default=data_zone, group='zone') + + feed_url = try_get( + triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'], + compat_str) + if not feed_url: + return + + feed = self._download_json(feed_url, video_id, fatal=False) + if not feed: + return + + return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + + @staticmethod + def _extract_child_with_type(parent, t): + for c in parent['children']: + if c.get('type') == t: + return c + + def _extract_mgid(self, webpage): + try: + # the url can be http://media.mtvnservices.com/fb/{mgid}.swf + # or http://media.mtvnservices.com/{mgid} + og_url = self._og_search_video_url(webpage) + mgid = url_basename(og_url) + if mgid.endswith('.swf'): + mgid = mgid[:-4] + except RegexNotFoundError: + mgid = None + + if mgid is None or ':' not in mgid: + mgid = self._search_regex( + [r'data-mgid="(.*?)"', r'swfobject\.embedSWF\(".*?(mgid:.*?)"'], + webpage, 'mgid', default=None) + + if not mgid: + sm4_embed = self._html_search_meta( + 'sm4:video:embed', webpage, 'sm4 embed', default='') + mgid = self._search_regex( + r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) + + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + ab_testing = self._extract_child_with_type(main_container, 'ABTesting') + video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') + mgid = video_player['props']['media']['video']['config']['uri'] + + if not mgid: + mgid = self._search_regex( + 
r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None) + + return mgid + + def _real_extract(self, url): + title = url_basename(url) + webpage = self._download_webpage(url, title) + mgid = self._extract_mgid(webpage) + videos_info = self._get_videos_info(mgid, url=url) + return videos_info + + +class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): + IE_NAME = 'mtvservices:embedded' + _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' + + _TEST = { + # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ + 'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906', + 'md5': 'cb349b21a7897164cede95bd7bf3fbb9', + 'info_dict': { + 'id': '1043906', + 'ext': 'mp4', + 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', + 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', + 'timestamp': 1400126400, + 'upload_date': '20140515', + }, + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) + if mobj: + return mobj.group('url') + + def _get_feed_url(self, uri, url=None): + video_id = self._id_from_uri(uri) + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + mgid = mobj.group('mgid') + return self._get_videos_info(mgid) + + +class MTVIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv' + _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)' + _FEED_URL = 'http://www.mtv.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer', + 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', + 'info_dict': { + 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', + 'ext': 'mp4', + 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', + 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.', + 'timestamp': 1468846800, + 'upload_date': '20160718', + }, + }, { + 'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101', + 'only_matching': True, + }, { + 'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713', + 'only_matching': True, + }] + + +class MTVJapanIE(MTVServicesInfoExtractor): + IE_NAME = 'mtvjapan' + _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P<id>[0-9a-z]+)' + + _TEST = { + 'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade', + 'info_dict': { + 'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5', + 'ext': 'mp4', + 'title': '【Fresh Info】Cadillac ESCALADE Sport Edition', + }, + 'params': { + 'skip_download': True, + }, + } + _GEO_COUNTRIES = ['JP'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtvjapan.com', + 'mgid': uri, + } + + +class MTVVideoIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv:video' + _VALID_URL = r'''(?x)^https?:// + (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$| + m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))''' + + _FEED_URL = 
'http://www.mtv.com/player/embed/AS3/rss/' + + _TESTS = [ + { + 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', + 'info_dict': { + 'id': '853555', + 'ext': 'mp4', + 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', + 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'timestamp': 1352610000, + 'upload_date': '20121111', + }, + }, + ] + + def _get_thumbnail_url(self, uri, itemdoc): + return 'http://mtv.mtvnimages.com/uri/' + uri + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('videoid') + uri = mobj.groupdict().get('mgid') + if uri is None: + webpage = self._download_webpage(url, video_id) + + # Some videos come from Vevo.com + m_vevo = re.search( + r'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";', webpage) + if m_vevo: + vevo_id = m_vevo.group(1) + self.to_screen('Vevo video detected: %s' % vevo_id) + return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') + return self._get_videos_info(uri) + + +class MTVDEIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv.de' + _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum', + 'info_dict': { + 'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5', + 'ext': 'mp4', + 'title': 'Traum', + 'description': 'Traum', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Blocked at Travis CI', + }, { + # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) + 'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1', + 'info_dict': { + 'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Teen Mom 2', + 'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Blocked at Travis CI', + }, { + 'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3', + 'info_dict': { + 'id': 'local_playlist-4e760566473c4c8c5344', + 'ext': 'mp4', + 'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1', + 'description': 'MTV Movies Supercut', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', + }] + _GEO_COUNTRIES = ['DE'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtv.de', + 'mgid': uri, + } + + +class MTVItaliaIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv.it' + _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:episodi|video|musica)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.mtv.it/episodi/24bqab/mario-una-serie-di-maccio-capatonda-cavoli-amario-episodio-completo-S1-E1', + 'info_dict': { + 'id': '0f0fc78e-45fc-4cce-8f24-971c25477530', + 'ext': 'mp4', + 'title': 'Cavoli amario (episodio completo)', + 'description': 'md5:4962bccea8fed5b7c03b295ae1340660', + 'series': 'Mario - Una Serie Di Maccio Capatonda', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }] + _GEO_COUNTRIES = ['IT'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtv.it', + 'mgid': uri, + } + + +class MTVItaliaProgrammaIE(MTVItaliaIE): + IE_NAME = 
'mtv.it:programma' + _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:programmi|playlist)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + # program page: general + 'url': 'http://www.mtv.it/programmi/s2rppv/mario-una-serie-di-maccio-capatonda', + 'info_dict': { + 'id': 'a6f155bc-8220-4640-aa43-9b95f64ffa3d', + 'title': 'Mario - Una Serie Di Maccio Capatonda', + 'description': 'md5:72fbffe1f77ccf4e90757dd4e3216153', + }, + 'playlist_count': 2, + 'params': { + 'skip_download': True, + }, + }, { + # program page: specific season + 'url': 'http://www.mtv.it/programmi/d9ncjf/mario-una-serie-di-maccio-capatonda-S2', + 'info_dict': { + 'id': '4deeb5d8-f272-490c-bde2-ff8d261c6dd1', + 'title': 'Mario - Una Serie Di Maccio Capatonda - Stagione 2', + }, + 'playlist_count': 34, + 'params': { + 'skip_download': True, + }, + }, { + # playlist page + redirect + 'url': 'http://www.mtv.it/playlist/sexy-videos/ilctal', + 'info_dict': { + 'id': 'dee8f9ee-756d-493b-bf37-16d1d2783359', + 'title': 'Sexy Videos', + }, + 'playlist_mincount': 145, + 'params': { + 'skip_download': True, + }, + }] + _GEO_COUNTRIES = ['IT'] + _FEED_URL = 'http://www.mtv.it/feeds/triforce/manifest/v8' + + def _get_entries(self, title, url): + while True: + pg = self._search_regex(r'/(\d+)$', url, 'entries', '1') + entries = self._download_json(url, title, 'page %s' % pg) + url = try_get( + entries, lambda x: x['result']['nextPageURL'], compat_str) + entries = try_get( + entries, ( + lambda x: x['result']['data']['items'], + lambda x: x['result']['data']['seasons']), + list) + for entry in entries or []: + if entry.get('canonicalURL'): + yield self.url_result(entry['canonicalURL']) + if not url: + break + + def _real_extract(self, url): + query = {'url': url} + info_url = update_url_query(self._FEED_URL, query) + video_id = self._match_id(url) + info = self._download_json(info_url, video_id).get('manifest') + + redirect = try_get( + info, lambda x: x['newLocation']['url'], compat_str) + if redirect: + return self.url_result(redirect) + + title = info.get('title') + video_id = try_get( + info, lambda x: x['reporting']['itemId'], compat_str) + parent_id = try_get( + info, lambda x: x['reporting']['parentId'], compat_str) + + playlist_url = current_url = None + for z in (info.get('zones') or {}).values(): + if z.get('moduleName') in ('INTL_M304', 'INTL_M209'): + info_url = z.get('feed') + if z.get('moduleName') in ('INTL_M308', 'INTL_M317'): + playlist_url = playlist_url or z.get('feed') + if z.get('moduleName') in ('INTL_M300',): + current_url = current_url or z.get('feed') + + if not info_url: + raise ExtractorError('No info found') + + if video_id == parent_id: + video_id = self._search_regex( + r'([^\/]+)/[^\/]+$', info_url, 'video_id') + + info = self._download_json(info_url, video_id, 'Show infos') + info = try_get(info, lambda x: x['result']['data'], dict) + title = title or try_get( + info, ( + lambda x: x['title'], + lambda x: x['headline']), + compat_str) + description = try_get(info, lambda x: x['content'], compat_str) + + if current_url: + season = try_get( + self._download_json(playlist_url, video_id, 'Seasons info'), + lambda x: x['result']['data'], dict) + current = try_get( + season, lambda x: x['currentSeason'], compat_str) + seasons = try_get( + season, lambda x: x['seasons'], list) or [] + + if current in [s.get('eTitle') for s in seasons]: + playlist_url = current_url + + title = re.sub( + r'[-|]\s*(?:mtv\s*italia|programma|playlist)', + '', title, flags=re.IGNORECASE).strip() + + return self.playlist_result( + self._get_entries(title, 
playlist_url), + video_id, title, description) diff --git a/yt_dlp/extractor/muenchentv.py b/yt_dlp/extractor/muenchentv.py new file mode 100644 index 000000000..d256236d1 --- /dev/null +++ b/yt_dlp/extractor/muenchentv.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) + + +class MuenchenTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?muenchen\.tv/livestream' + IE_DESC = 'münchen.tv' + _TEST = { + 'url': 'http://www.muenchen.tv/livestream/', + 'info_dict': { + 'id': '5334', + 'display_id': 'live', + 'ext': 'mp4', + 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + 'thumbnail': r're:^https?://.*\.jpg$' + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + display_id = 'live' + webpage = self._download_webpage(url, display_id) + + title = self._live_title(self._og_search_title(webpage)) + + data_js = self._search_regex( + r'(?s)\nplaylist:\s*(\[.*?}\]),', + webpage, 'playlist configuration') + data_json = js_to_json(data_js) + data = json.loads(data_json)[0] + + video_id = data['mediaid'] + thumbnail = data.get('image') + + formats = [] + for format_num, s in enumerate(data['sources']): + ext = determine_ext(s['file'], None) + label_str = s.get('label') + if label_str is None: + label_str = '_%d' % format_num + + if ext is None: + format_id = label_str + else: + format_id = '%s-%s' % (ext, label_str) + + formats.append({ + 'url': s['file'], + 'tbr': int_or_none(s.get('label')), + 'ext': 'mp4', + 'format_id': format_id, + 'preference': -100 if '.smil' in s['file'] else 0, # Strictly inferior than all other formats? + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'is_live': True, + 'thumbnail': thumbnail, + } diff --git a/yt_dlp/extractor/musescore.py b/yt_dlp/extractor/musescore.py new file mode 100644 index 000000000..dcd26388a --- /dev/null +++ b/yt_dlp/extractor/musescore.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MuseScoreIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P<id>[^#&?]+)' + _TESTS = [{ + 'url': 'https://musescore.com/user/73797/scores/142975', + 'info_dict': { + 'id': '142975', + 'ext': 'mp3', + 'title': 'WA Mozart Marche Turque (Turkish March fingered)', + 'description': 'md5:7ede08230e4eaabd67a4a98bb54d07be', + 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+', + 'uploader': 'PapyPiano', + 'creator': 'Wolfgang Amadeus Mozart', + } + }, { + 'url': 'https://musescore.com/user/36164500/scores/6837638', + 'info_dict': { + 'id': '6837638', + 'ext': 'mp3', + 'title': 'Sweet Child O\' Mine – Guns N\' Roses sweet child', + 'description': 'md5:4dca71191c14abc312a0a4192492eace', + 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+', + 'uploader': 'roxbelviolin', + 'creator': 'Guns N´Roses Arr. 
Roxbel Violin', + } + }, { + 'url': 'https://musescore.com/classicman/fur-elise', + 'info_dict': { + 'id': '33816', + 'ext': 'mp3', + 'title': 'Für Elise – Beethoven', + 'description': 'md5:49515a3556d5ecaf9fa4b2514064ac34', + 'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+', + 'uploader': 'ClassicMan', + 'creator': 'Ludwig van Beethoven (1770–1827)', + } + }, { + 'url': 'https://musescore.com/minh_cuteee/scores/6555384', + 'only_matching': True, + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, None) + url = self._og_search_url(webpage) or url + id = self._match_id(url) + mp3_url = self._download_json(f'https://musescore.com/api/jmuse?id={id}&index=0&type=mp3&v2=1', id, + headers={'authorization': '63794e5461e4cfa046edfbdddfccc1ac16daffd2'})['info']['url'] + formats = [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + }] + + return { + 'id': id, + 'formats': formats, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_meta('musescore:author', webpage, 'uploader'), + 'creator': self._html_search_meta('musescore:composer', webpage, 'composer'), + } diff --git a/youtube_dl/extractor/mwave.py b/yt_dlp/extractor/mwave.py index a67276596..a67276596 100644 --- a/youtube_dl/extractor/mwave.py +++ b/yt_dlp/extractor/mwave.py diff --git a/yt_dlp/extractor/mxplayer.py b/yt_dlp/extractor/mxplayer.py new file mode 100644 index 000000000..5874556e3 --- /dev/null +++ b/yt_dlp/extractor/mxplayer.py @@ -0,0 +1,222 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import try_get + + +class MxplayerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/(?P<type>movie|show/[-\w]+/[-\w]+)/(?P<display_id>[-\w]+)-(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.mxplayer.in/show/watch-my-girlfriend-is-an-alien-hindi-dubbed/season-1/episode-1-online-9d2013d31d5835bb8400e3b3c5e7bb72', + 'info_dict': { + 'id': '9d2013d31d5835bb8400e3b3c5e7bb72', + 'ext': 'mp4', + 'title': 'Episode 1', + 'description': 'md5:62ed43eb9fec5efde5cf3bd1040b7670', + 'season_number': 1, + 'episode_number': 1, + 'duration': 2451, + 'season': 'Season 1', + 'series': 'My Girlfriend Is An Alien (Hindi Dubbed)', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/9d2013d31d5835bb8400e3b3c5e7bb72/en/16x9/320x180/9562f5f8df42cad09c9a9c4e69eb1567_1920x1080.webp', + 'episode': 'Episode 1' + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/movie/watch-knock-knock-hindi-dubbed-movie-online-b9fa28df3bfb8758874735bbd7d2655a?watch=true', + 'info_dict': { + 'id': 'b9fa28df3bfb8758874735bbd7d2655a', + 'ext': 'mp4', + 'title': 'Knock Knock (Hindi Dubbed)', + 'description': 'md5:b195ba93ff1987309cfa58e2839d2a5b', + 'season_number': 0, + 'episode_number': 0, + 'duration': 5970, + 'season': 'Season 0', + 'series': None, + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/b9fa28df3bfb8758874735bbd7d2655a/en/16x9/320x180/test_pic1588676032011.webp', + 'episode': 'Episode 0' + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/show/watch-shaitaan/season-1/the-infamous-taxi-gang-of-meerut-online-45055d5bcff169ad48f2ad7552a83d6c', + 'info_dict': { + 'id': '45055d5bcff169ad48f2ad7552a83d6c', + 'ext': 'mp4', + 'title': 'The infamous taxi gang of Meerut', + 'description': 
'md5:033a0a7e3fd147be4fb7e07a01a3dc28', + 'season_number': 1, + 'episode_number': 1, + 'duration': 2332, + 'season': 'Season 1', + 'series': 'Shaitaan', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/45055d5bcff169ad48f2ad7552a83d6c/en/16x9/320x180/voot_8e7d5f8d8183340869279c732c1e3a43.webp', + 'episode': 'Episode 1' + }, + 'params': { + 'format': 'best', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/show/watch-aashram/chapter-1/duh-swapna-online-d445579792b0135598ba1bc9088a84cb', + 'info_dict': { + 'id': 'd445579792b0135598ba1bc9088a84cb', + 'ext': 'mp4', + 'title': 'Duh Swapna', + 'description': 'md5:35ff39c4bdac403c53be1e16a04192d8', + 'season_number': 1, + 'episode_number': 3, + 'duration': 2568, + 'season': 'Chapter 1', + 'series': 'Aashram', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/d445579792b0135598ba1bc9088a84cb/en/4x3/1600x1200/test_pic1624819307993.webp', + 'episode': 'Episode 3' + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/show/watch-dangerous/season-1/chapter-1-online-5a351b4f9fb69436f6bd6ae3a1a75292', + 'info_dict': { + 'id': '5a351b4f9fb69436f6bd6ae3a1a75292', + 'ext': 'mp4', + 'title': 'Chapter 1', + 'description': 'md5:233886b8598bc91648ac098abe1d288f', + 'season_number': 1, + 'episode_number': 1, + 'duration': 1305, + 'season': 'Season 1', + 'series': 'Dangerous', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/5a351b4f9fb69436f6bd6ae3a1a75292/en/4x3/1600x1200/test_pic1624706302350.webp', + 'episode': 'Episode 1' + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/movie/watch-the-attacks-of-2611-movie-online-0452f0d80226c398d63ce7e3ea40fa2d', + 'info_dict': { + 'id': '0452f0d80226c398d63ce7e3ea40fa2d', + 'ext': 'mp4', + 'title': 'The Attacks of 26/11', + 'description': 'md5:689bacd29e97b3f31eaf519eb14127e5', + 'season_number': 0, + 'episode_number': 0, + 'duration': 6085, + 'season': 'Season 0', + 'series': None, + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/0452f0d80226c398d63ce7e3ea40fa2d/en/16x9/320x180/00c8955dab5e5d340dbde643f9b1f6fd_1920x1080.webp', + 'episode': 'Episode 0' + }, + 'params': { + 'format': 'best', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + type, display_id, video_id = self._match_valid_url(url).groups() + type = 'movie_film' if type == 'movie' else 'tvshow_episode' + API_URL = 'https://androidapi.mxplay.com/v1/detail/' + headers = { + 'X-Av-Code': '23', + 'X-Country': 'IN', + 'X-Platform': 'android', + 'X-App-Version': '1370001318', + 'X-Resolution': '3840x2160', + } + data_json = self._download_json(f'{API_URL}{type}/{video_id}', display_id, headers=headers)['profile'] + + season, series = None, None + for dct in data_json.get('levelInfos', []): + if dct.get('type') == 'tvshow_season': + season = dct.get('name') + elif dct.get('type') == 'tvshow_show': + series = dct.get('name') + thumbnails = [] + for thumb in data_json.get('poster', []): + thumbnails.append({ + 'url': thumb.get('url'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + }) + + formats = [] + subtitles = {} + for dct in data_json.get('playInfo', []): + if dct.get('extension') == 'mpd': + frmt, subs = self._extract_mpd_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) + formats.extend(frmt) + subtitles = self._merge_subtitles(subtitles, subs) + elif dct.get('extension') == 'm3u8': + frmt, subs = self._extract_m3u8_formats_and_subtitles(dct.get('playUrl'), 
display_id, fatal=False) + formats.extend(frmt) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats) + return { + 'id': video_id, + 'display_id': display_id, + 'title': data_json.get('name') or display_id, + 'description': data_json.get('description'), + 'season_number': data_json.get('seasonNum'), + 'episode_number': data_json.get('episodeNum'), + 'duration': data_json.get('duration'), + 'season': season, + 'series': series, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + +class MxplayerShowIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?mxplayer\.in/show/(?P<display_id>[-\w]+)-(?P<id>\w+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://www.mxplayer.in/show/watch-chakravartin-ashoka-samrat-series-online-a8f44e3cc0814b5601d17772cedf5417', + 'playlist_mincount': 440, + 'info_dict': { + 'id': 'a8f44e3cc0814b5601d17772cedf5417', + 'title': 'Watch Chakravartin Ashoka Samrat Series Online', + } + }] + + _API_SHOW_URL = "https://api.mxplay.com/v1/web/detail/tab/tvshowseasons?type=tv_show&id={}&device-density=2&platform=com.mxplay.desktop&content-languages=hi,en" + _API_EPISODES_URL = "https://api.mxplay.com/v1/web/detail/tab/tvshowepisodes?type=season&id={}&device-density=1&platform=com.mxplay.desktop&content-languages=hi,en&{}" + + def _entries(self, show_id): + show_json = self._download_json( + self._API_SHOW_URL.format(show_id), + video_id=show_id, headers={'Referer': 'https://mxplayer.in'}) + page_num = 0 + for season in show_json.get('items') or []: + season_id = try_get(season, lambda x: x['id'], compat_str) + next_url = '' + while next_url is not None: + page_num += 1 + season_json = self._download_json( + self._API_EPISODES_URL.format(season_id, next_url), + video_id=season_id, + headers={'Referer': 'https://mxplayer.in'}, + note='Downloading JSON metadata page %d' % page_num) + for episode in season_json.get('items') or []: + video_url = episode['webUrl'] + yield self.url_result( + 'https://mxplayer.in%s' % video_url, + ie=MxplayerIE.ie_key(), video_id=video_url.split('-')[-1]) + next_url = season_json.get('next') + + def _real_extract(self, url): + display_id, show_id = self._match_valid_url(url).groups() + return self.playlist_result( + self._entries(show_id), playlist_id=show_id, + playlist_title=display_id.replace('-', ' ').title()) diff --git a/yt_dlp/extractor/mychannels.py b/yt_dlp/extractor/mychannels.py new file mode 100644 index 000000000..d820d4eb8 --- /dev/null +++ b/yt_dlp/extractor/mychannels.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class MyChannelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', + 'md5': 'b8993daad4262dd68d89d651c0c52c45', + 'info_dict': { + 'id': 'wUUDZZep6vQD', + 'ext': 'mp4', + 'title': 'Miss Holland joins VOTE LEAVE', + 'description': 'Miss Holland | #13 Not a potato', + 'uploader': 'Miss Holland', + } + } + + def _real_extract(self, url): + id_type, url_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, url_id) + video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') + + def extract_data_val(attr, fatal=False): + return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) + minoto_id = 
extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + + return { + '_type': 'url_transparent', + 'url': 'minoto:%s' % minoto_id, + 'id': url_id, + 'title': extract_data_val('title', True), + 'description': extract_data_val('description'), + 'thumbnail': extract_data_val('image'), + 'uploader': extract_data_val('channel'), + } diff --git a/yt_dlp/extractor/myspace.py b/yt_dlp/extractor/myspace.py new file mode 100644 index 000000000..4227d4248 --- /dev/null +++ b/yt_dlp/extractor/myspace.py @@ -0,0 +1,200 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class MySpaceIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + myspace\.com/[^/]+/ + (?P<mediatype> + video/[^/]+/(?P<video_id>\d+)| + music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) + ) + ''' + + _TESTS = [{ + 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', + 'info_dict': { + 'id': '109594919', + 'ext': 'mp4', + 'title': 'Little Big Town', + 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', + 'uploader': 'Five Minutes to the Stage', + 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', + }, + }, { + # songs + 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', + 'info_dict': { + 'id': '93388656', + 'ext': 'm4a', + 'title': 'Of weakened soul...', + 'uploader': 'Killsorrow', + 'uploader_id': 'killsorrow', + }, + }, { + 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', + 'only_matching': True, + }, { + 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('video_id') or mobj.group('song_id') + is_song = mobj.group('mediatype').startswith('music/song') + webpage = self._download_webpage(url, video_id) + player_url = self._search_regex( + r'videoSwf":"([^"?]*)', webpage, 'player URL', fatal=False) + + def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width=None, height=None): + formats = [] + vcodec = 'none' if is_song else None + if hls_stream_url: + formats.append({ + 'format_id': 'hls', + 'url': hls_stream_url, + 'protocol': 'm3u8_native', + 'ext': 'm4a' if is_song else 'mp4', + 'vcodec': vcodec, + }) + if stream_url and player_url: + rtmp_url, play_path = stream_url.split(';', 1) + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': play_path, + 'player_url': player_url, + 'protocol': 'rtmp', + 'ext': 'flv', + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + if http_stream_url: + formats.append({ + 'format_id': 'http', + 'url': http_stream_url, + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + return formats + + if is_song: + # songs don't store any useful info in the 'context' variable + song_data = self._search_regex( + r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id, + webpage, 'song_data', default=None, group=0) + if song_data is None: + # some songs in an album are not playable + self.report_warning( + '%s: No downloadable song on this page' 
% video_id) + return + + def search_data(name): + return self._search_regex( + r'''data-%s=([\'"])(?P<data>.*?)\1''' % name, + song_data, name, default='', group='data') + formats = formats_from_stream_urls( + search_data('stream-url'), search_data('hls-stream-url'), + search_data('http-stream-url')) + if not formats: + vevo_id = search_data('vevo-id') + youtube_id = search_data('youtube-id') + if vevo_id: + self.to_screen('Vevo video detected: %s' % vevo_id) + return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + elif youtube_id: + self.to_screen('Youtube video detected: %s' % youtube_id) + return self.url_result(youtube_id, ie='Youtube') + else: + raise ExtractorError( + 'Found song but don\'t know how to download it') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'uploader': search_data('artist-name'), + 'uploader_id': search_data('artist-username'), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(search_data('duration')), + 'formats': formats, + } + else: + video = self._parse_json(self._search_regex( + r'context = ({.*?});', webpage, 'context'), + video_id)['video'] + formats = formats_from_stream_urls( + video.get('streamUrl'), video.get('hlsStreamUrl'), + video.get('mp4StreamUrl'), int_or_none(video.get('width')), + int_or_none(video.get('height'))) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': video['title'], + 'description': video.get('description'), + 'thumbnail': video.get('imageUrl'), + 'uploader': video.get('artistName'), + 'uploader_id': video.get('artistUsername'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('dateAdded')), + 'formats': formats, + } + + +class MySpaceAlbumIE(InfoExtractor): + IE_NAME = 'MySpace:album' + _VALID_URL = r'https?://myspace\.com/([^/]+)/music/album/(?P<title>.*-)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://myspace.com/starset2/music/album/transmissions-19455773', + 'info_dict': { + 'title': 'Transmissions', + 'id': '19455773', + }, + 'playlist_count': 14, + 'skip': 'this album is only available in some countries', + }, { + 'url': 'https://myspace.com/killsorrow/music/album/the-demo-18596029', + 'info_dict': { + 'title': 'The Demo', + 'id': '18596029', + }, + 'playlist_count': 5, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + playlist_id = mobj.group('id') + display_id = mobj.group('title') + playlist_id + webpage = self._download_webpage(url, display_id) + tracks_paths = re.findall(r'"music:song" content="(.*?)"', webpage) + if not tracks_paths: + raise ExtractorError( + '%s: No songs found, try using proxy' % display_id, + expected=True) + entries = [ + self.url_result(t_path, ie=MySpaceIE.ie_key()) + for t_path in tracks_paths] + return { + '_type': 'playlist', + 'id': playlist_id, + 'display_id': display_id, + 'title': self._og_search_title(webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/myspass.py b/yt_dlp/extractor/myspass.py index db7ebc94c..db7ebc94c 100644 --- a/youtube_dl/extractor/myspass.py +++ b/yt_dlp/extractor/myspass.py diff --git a/youtube_dl/extractor/myvi.py b/yt_dlp/extractor/myvi.py index 75d286365..75d286365 100644 --- a/youtube_dl/extractor/myvi.py +++ b/yt_dlp/extractor/myvi.py diff --git a/yt_dlp/extractor/myvideoge.py b/yt_dlp/extractor/myvideoge.py new file mode 100644 index 000000000..0a1d7d0cb --- /dev/null +++ b/yt_dlp/extractor/myvideoge.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import 
unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class MyVideoGeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?myvideo\.ge/v/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.myvideo.ge/v/3941048', + 'md5': '8c192a7d2b15454ba4f29dc9c9a52ea9', + 'info_dict': { + 'id': '3941048', + 'ext': 'mp4', + 'title': 'The best prikol', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'md5:d72addd357b0dd914e704781f7f777d8', + 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title') + description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image'], webpage) + uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + + jwplayer_sources = self._parse_json( + self._search_regex( + r"(?s)jwplayer\(\"mvplayer\"\).setup\(.*?sources: (.*?])", webpage, 'jwplayer sources'), + video_id, transform_source=js_to_json) + + def _formats_key(f): + if f['label'] == 'SD': + return -1 + elif f['label'] == 'HD': + return 1 + else: + return 0 + + jwplayer_sources = sorted(jwplayer_sources, key=_formats_key) + + formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'formats': formats, + 'thumbnail': thumbnail + } diff --git a/youtube_dl/extractor/myvidster.py b/yt_dlp/extractor/myvidster.py index 2117d302d..2117d302d 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/yt_dlp/extractor/myvidster.py diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py new file mode 100644 index 000000000..7a09c6779 --- /dev/null +++ b/yt_dlp/extractor/n1.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .youtube import YoutubeIE +from .reddit import RedditRIE +from .common import InfoExtractor +from ..utils import ( + unified_timestamp, + extract_attributes, +) + + +class N1InfoAssetIE(InfoExtractor): + _VALID_URL = r'https?://best-vod\.umn\.cdn\.united\.cloud/stream\?asset=(?P<id>[^&]+)' + _TESTS = [{ + 'url': 'https://best-vod.umn.cdn.united.cloud/stream?asset=ljsottomazilirija3060921-n1info-si-worldwide&stream=hp1400&t=0&player=m3u8v&sp=n1info&u=n1info&p=n1Sh4redSecre7iNf0', + 'md5': '28b08b32aeaff2b8562736ccd5a66fe7', + 'info_dict': { + 'id': 'ljsottomazilirija3060921-n1info-si-worldwide', + 'ext': 'mp4', + 'title': 'ljsottomazilirija3060921-n1info-si-worldwide', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class N1InfoIIE(InfoExtractor): + IE_NAME = 'N1Info:article' + _VALID_URL = r'https?://(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)/(?:[^/]+/){1,2}(?P<id>[^/]+)' + _TESTS = [{ + # Youtube embedded + 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', + 'md5': '01ddb6646d0fd9c4c7d990aa77fe1c5a', + 'info_dict': { + 'id': 'L5Hd4hQVUpk', + 'ext': 'mp4', + 'upload_date': '20210913', + 'title': 'Ozmo i USO21, ep. 
13: Novak Đoković – Danil Medvedev | Ključevi Poraza, Budućnost | SPORT KLUB TENIS', + 'description': 'md5:467f330af1effedd2e290f10dc31bb8e', + 'uploader': 'Sport Klub', + 'uploader_id': 'sportklub', + } + }, { + 'url': 'https://rs.n1info.com/vesti/djilas-los-plan-za-metro-nece-resiti-nijedan-saobracajni-problem/', + 'info_dict': { + 'id': 'bgmetrosot2409zta20210924174316682-n1info-rs-worldwide', + 'ext': 'mp4', + 'title': 'Đilas: Predlog izgradnje metroa besmislen; SNS odbacuje navode', + 'upload_date': '20210924', + 'timestamp': 1632481347, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://n1info.si/novice/slovenija/zadnji-dnevi-na-kopaliscu-ilirija-ilirija-ni-umrla-ubili-so-jo/', + 'info_dict': { + 'id': 'ljsottomazilirija3060921-n1info-si-worldwide', + 'ext': 'mp4', + 'title': 'Zadnji dnevi na kopališču Ilirija: “Ilirija ni umrla, ubili so jo”', + 'timestamp': 1632567630, + 'upload_date': '20210925', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Reddit embedded + 'url': 'https://ba.n1info.com/lifestyle/vucic-bolji-od-tita-ako-izgubi-ja-cu-da-crknem-jugoslavija-je-gotova/', + 'info_dict': { + 'id': '2wmfee9eycp71', + 'ext': 'mp4', + 'title': '"Ako Vučić izgubi izbore, ja ću da crknem, Jugoslavija je gotova"', + 'upload_date': '20210924', + 'timestamp': 1632448649.0, + 'uploader': 'YouLotWhatDontStop', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') + timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) + + videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + entries = [] + for video in videos: + video_data = extract_attributes(video) + entries.append({ + '_type': 'url_transparent', + 'url': video_data.get('data-url'), + 'id': video_data.get('id'), + 'title': title, + 'thumbnail': video_data.get('data-thumbnail'), + 'timestamp': timestamp, + 'ie_key': N1InfoAssetIE.ie_key()}) + + embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) + for embedded_video in embedded_videos: + video_data = extract_attributes(embedded_video) + url = video_data.get('src') + if url.startswith('https://www.youtube.com'): + entries.append(self.url_result(url, ie=YoutubeIE.ie_key())) + elif url.startswith('https://www.redditmedia.com'): + entries.append(self.url_result(url, ie=RedditRIE.ie_key())) + + return { + '_type': 'playlist', + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'entries': entries, + } diff --git a/youtube_dl/extractor/nationalgeographic.py b/yt_dlp/extractor/nationalgeographic.py index ee12e2b47..ee12e2b47 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/yt_dlp/extractor/nationalgeographic.py diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py new file mode 100644 index 000000000..acf53c1ff --- /dev/null +++ b/yt_dlp/extractor/naver.py @@ -0,0 +1,251 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + ExtractorError, + int_or_none, + parse_duration, + try_get, + update_url_query, +) + + +class NaverBaseIE(InfoExtractor): + _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' + + def _extract_video_info(self, 
video_id, vid, key): + video_data = self._download_json( + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, + video_id, query={ + 'key': key, + }) + meta = video_data['meta'] + title = meta['subject'] + formats = [] + get_list = lambda x: try_get(video_data, lambda y: y[x + 's']['list'], list) or [] + + def extract_formats(streams, stream_type, query={}): + for stream in streams: + stream_url = stream.get('source') + if not stream_url: + continue + stream_url = update_url_query(stream_url, query) + encoding_option = stream.get('encodingOption', {}) + bitrate = stream.get('bitrate', {}) + formats.append({ + 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), + 'url': stream_url, + 'width': int_or_none(encoding_option.get('width')), + 'height': int_or_none(encoding_option.get('height')), + 'vbr': int_or_none(bitrate.get('video')), + 'abr': int_or_none(bitrate.get('audio')), + 'filesize': int_or_none(stream.get('size')), + 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, + }) + + extract_formats(get_list('video'), 'H264') + for stream_set in video_data.get('streams', []): + query = {} + for param in stream_set.get('keys', []): + query[param['name']] = param['value'] + stream_type = stream_set.get('type') + videos = stream_set.get('videos') + if videos: + extract_formats(videos, stream_type, query) + elif stream_type == 'HLS': + stream_url = stream_set.get('source') + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + update_url_query(stream_url, query), video_id, + 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) + self._sort_formats(formats) + + replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' + y, x) + + def get_subs(caption_url): + if re.search(self._CAPTION_EXT_RE, caption_url): + return [{ + 'url': replace_ext(caption_url, 'ttml'), + }, { + 'url': replace_ext(caption_url, 'vtt'), + }] + else: + return [{'url': caption_url}] + + automatic_captions = {} + subtitles = {} + for caption in get_list('caption'): + caption_url = caption.get('source') + if not caption_url: + continue + sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles + sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url)) + + user = meta.get('user', {}) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + 'thumbnail': try_get(meta, lambda x: x['cover']['source']), + 'view_count': int_or_none(meta.get('count')), + 'uploader_id': user.get('id'), + 'uploader': user.get('name'), + 'uploader_url': user.get('url'), + } + + +class NaverIE(NaverBaseIE): + _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P<id>\d+)' + _GEO_BYPASS = False + _TESTS = [{ + 'url': 'http://tv.naver.com/v/81652', + 'info_dict': { + 'id': '81652', + 'ext': 'mp4', + 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + 'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + 'timestamp': 1378200754, + 'upload_date': '20130903', + 'uploader': '메가스터디, 합격불변의 법칙', + 'uploader_id': 'megastudy', + }, + }, { + 'url': 'http://tv.naver.com/v/395837', + 'md5': '8a38e35354d26a17f73f4e90094febd3', + 'info_dict': { + 'id': '395837', + 'ext': 'mp4', + 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', + 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', + 'timestamp': 1432030253, + 'upload_date': '20150519', + 'uploader': '4가지쇼 시즌2', + 'uploader_id': 'wrappinguser29', + }, + 'skip': 
'Georestricted', + }, { + 'url': 'http://tvcast.naver.com/v/81652', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + content = self._download_json( + 'https://tv.naver.com/api/json/v/' + video_id, + video_id, headers=self.geo_verification_headers()) + player_info_json = content.get('playerInfoJson') or {} + current_clip = player_info_json.get('currentClip') or {} + + vid = current_clip.get('videoId') + in_key = current_clip.get('inKey') + + if not vid or not in_key: + player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) + if player_auth == 'notCountry': + self.raise_geo_restricted(countries=['KR']) + elif player_auth == 'notLogin': + self.raise_login_required() + raise ExtractorError('couldn\'t extract vid and key') + info = self._extract_video_info(video_id, vid, in_key) + info.update({ + 'description': clean_html(current_clip.get('description')), + 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), + 'duration': parse_duration(current_clip.get('displayPlayTime')), + 'like_count': int_or_none(current_clip.get('recommendPoint')), + 'age_limit': 19 if current_clip.get('adult') else None, + }) + return info + + +class NaverLiveIE(InfoExtractor): + IE_NAME = 'Naver:live' + _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/l/(?P<id>\d+)' + _GEO_BYPASS = False + _TESTS = [{ + 'url': 'https://tv.naver.com/l/52010', + 'info_dict': { + 'id': '52010', + 'ext': 'm3u8', + 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"', + 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3', + 'channel_id': 'NTV-ytnnews24-0', + 'start_time': 1597026780000, + }, + }, { + 'url': 'https://tv.naver.com/l/51549', + 'info_dict': { + 'id': '51549', + 'ext': 'm3u8', + 'title': '연합뉴스TV - 코로나19 뉴스특보', + 'description': 'md5:c655e82091bc21e413f549c0eaccc481', + 'channel_id': 'NTV-yonhapnewstv-0', + 'start_time': 1596406380000, + }, + }, { + 'url': 'https://tv.naver.com/l/54887', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + page = self._download_webpage(url, video_id, 'Downloading Page', 'Unable to download Page') + secure_url = self._search_regex(r'sApiF:\s+(?:"|\')([^"\']+)', page, 'secureurl') + + info = self._extract_video_info(video_id, secure_url) + info.update({ + 'description': self._og_search_description(page) + }) + + return info + + def _extract_video_info(self, video_id, url): + video_data = self._download_json(url, video_id, headers=self.geo_verification_headers()) + meta = video_data.get('meta') + status = meta.get('status') + + if status == 'CLOSED': + raise ExtractorError('Stream is offline.', expected=True) + elif status != 'OPENED': + raise ExtractorError('Unknown status %s' % status) + + title = meta.get('title') + stream_list = video_data.get('streams') + + if stream_list is None: + raise ExtractorError('Could not get stream data.', expected=True) + + formats = [] + for quality in stream_list: + if not quality.get('url'): + continue + + prop = quality.get('property') + if prop.get('abr'): # This abr doesn't mean Average audio bitrate. 
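+ # ('abr' apparently stands for adaptive bitrate here: such entries appear to be master-playlist duplicates of the per-quality streams, hence the skip)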
+ continue + + formats.extend(self._extract_m3u8_formats( + quality.get('url'), video_id, 'm3u8', + m3u8_id=quality.get('qualityId'), live=True + )) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'channel_id': meta.get('channelId'), + 'channel_url': meta.get('channelUrl'), + 'thumbnail': meta.get('imgUrl'), + 'start_time': meta.get('startTime'), + 'categories': [meta.get('categoryId')], + 'is_live': True + } diff --git a/yt_dlp/extractor/nba.py b/yt_dlp/extractor/nba.py new file mode 100644 index 000000000..7390ef8bc --- /dev/null +++ b/yt_dlp/extractor/nba.py @@ -0,0 +1,427 @@ +from __future__ import unicode_literals + +import functools +import re + +from .turner import TurnerBaseIE +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + merge_dicts, + OnDemandPagedList, + parse_duration, + parse_iso8601, + parse_qs, + try_get, + update_url_query, + urljoin, +) + + +class NBACVPBaseIE(TurnerBaseIE): + def _extract_nba_cvp_info(self, path, video_id, fatal=False): + return self._extract_cvp_info( + 'http://secure.nba.com/%s' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, + }, fatal=fatal) + + +class NBAWatchBaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/' + + def _extract_video(self, filter_key, filter_value): + video = self._download_json( + 'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch', + filter_value, query={ + 'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName', + 'q': filter_key + ':' + filter_value, + 'wt': 'json', + })['response']['docs'][0] + + video_id = str(video['pid']) + title = video['name'] + + formats = [] + m3u8_url = (self._download_json( + 'https://watch.nba.com/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + }, fatal=False) or {}).get('path') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + for f in m3u8_formats: + http_f = f.copy() + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': http_f['url'].replace('.m3u8', ''), + }) + formats.append(http_f) + + info = { + 'id': video_id, + 'title': title, + 'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')), + 'description': video.get('description'), + 'duration': int_or_none(video.get('runtime')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + 'tags': video.get('tags'), + } + + seo_name = video.get('seoName') + if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name): + base_path = '' + if seo_name.startswith('teams/'): + base_path += seo_name.split('/')[1] + '/' + base_path += 'video/' + cvp_info = self._extract_nba_cvp_info( + base_path + seo_name + '.xml', video_id, False) + if cvp_info: + formats.extend(cvp_info['formats']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + info['formats'] = formats + return info + + +class NBAWatchEmbedIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:embed' + _VALID_URL = 
NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://watch.nba.com/embed?id=659395', + 'md5': 'b7e3f9946595f4ca0a13903ce5edd120', + 'info_dict': { + 'id': '659395', + 'ext': 'mp4', + 'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'timestamp': 1492228800, + 'upload_date': '20170415', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video('pid', video_id) + + +class NBAWatchIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', + 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', + 'info_dict': { + 'id': '70946', + 'ext': 'mp4', + 'title': 'Thunder vs. Nets', + 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', + 'duration': 181, + 'timestamp': 1354597200, + 'upload_date': '20121204', + }, + }, { + 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', + 'only_matching': True, + }, { + 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', + 'info_dict': { + 'id': '330865', + 'ext': 'mp4', + 'title': 'Hawks vs. Cavaliers Game 1', + 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', + 'duration': 228, + 'timestamp': 1432094400, + 'upload_date': '20150521', + }, + }, { + 'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115', + 'only_matching': True, + }, { + # only CVP mp4 format available + 'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106', + 'only_matching': True, + }, { + 'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + collection_id = parse_qs(url).get('collection', [None])[0] + if collection_id: + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % display_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id) + return self.url_result( + 'https://www.nba.com/watch/list/collection/' + collection_id, + NBAWatchCollectionIE.ie_key(), collection_id) + return self._extract_video('seoName', display_id) + + +class NBAWatchCollectionIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:collection' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://watch.nba.com/list/collection/season-preview-2020', + 'info_dict': { + 'id': 'season-preview-2020', + }, + 'playlist_mincount': 43, + }] + _PAGE_SIZE = 100 + + def _fetch_page(self, collection_id, page): + page += 1 + videos = self._download_json( + 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, + collection_id, 'Downloading page %d JSON metadata' % page, query={ + 'count': self._PAGE_SIZE, + 'page': page, + })['results']['videos'] + for video in videos: + program = 
video.get('program') or {} + seo_name = program.get('seoName') or program.get('slug') + if not seo_name: + continue + yield { + '_type': 'url', + 'id': program.get('id'), + 'title': program.get('title') or video.get('title'), + 'url': 'https://www.nba.com/watch/video/' + seo_name, + 'thumbnail': video.get('image'), + 'description': program.get('description') or video.get('description'), + 'duration': parse_duration(program.get('runtimeHours')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + } + + def _real_extract(self, url): + collection_id = self._match_id(url) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, collection_id), + self._PAGE_SIZE) + return self.playlist_result(entries, collection_id) + + +class NBABaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'''(?x) + https?://(?:www\.)?nba\.com/ + (?P<team> + blazers| + bucks| + bulls| + cavaliers| + celtics| + clippers| + grizzlies| + hawks| + heat| + hornets| + jazz| + kings| + knicks| + lakers| + magic| + mavericks| + nets| + nuggets| + pacers| + pelicans| + pistons| + raptors| + rockets| + sixers| + spurs| + suns| + thunder| + timberwolves| + warriors| + wizards + ) + (?:/play\#)?/''' + _CHANNEL_PATH_REGEX = r'video/channel|series' + + def _embed_url_result(self, team, content_id): + return self.url_result(update_url_query( + 'https://secure.nba.com/assets/amp/include/video/iframe.html', { + 'contentId': content_id, + 'team': team, + }), NBAEmbedIE.ie_key()) + + def _call_api(self, team, content_id, query, resource): + return self._download_json( + 'https://api.nba.net/2/%s/video,imported_video,wsc/' % team, + content_id, 'Download %s JSON metadata' % resource, + query=query, headers={ + 'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b', + })['response']['result'] + + def _extract_video(self, video, team, extract_all=True): + video_id = compat_str(video['nid']) + team = video['brand'] + + info = { + 'id': video_id, + 'title': video.get('title') or video.get('headline') or video['shortHeadline'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('published')), + } + + subtitles = {} + captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({'url': caption_url}) + + formats = [] + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'url': mp4_url, + }) + + if extract_all: + source_url = video.get('videoSource') + if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'): + formats.append({ + 'format_id': 'source', + 'url': source_url, + 'quality': 1, + }) + + m3u8_url = video.get('m3u8') + if m3u8_url: + if '.akamaihd.net/i/' in m3u8_url: + formats.extend(self._extract_akamai_formats( + m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'})) + else: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + content_xml = video.get('contentXml') + if team and content_xml: + cvp_info = self._extract_nba_cvp_info( + team + content_xml, video_id, fatal=False) + if cvp_info: + formats.extend(cvp_info['formats']) + subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + else: + info.update(self._embed_url_result(team, video['videoId'])) + + info.update({ + 'formats': formats, + 'subtitles': subtitles, + }) + + return info + + def _real_extract(self, url): + team, 
display_id = self._match_valid_url(url).groups() + if '/play#/' in url: + display_id = compat_urllib_parse_unquote(display_id) + else: + webpage = self._download_webpage(url, display_id) + display_id = self._search_regex( + self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id') + return self._extract_url_results(team, display_id) + + +class NBAEmbedIE(NBABaseIE): + IE_NAME = 'nba:embed' + _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)' + _TESTS = [{ + 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', + 'only_matching': True, + }, { + 'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = parse_qs(url) + content_id = qs['contentId'][0] + team = qs.get('team', [None])[0] + if not team: + return self.url_result( + 'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key()) + video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0] + return self._extract_video(video, team)
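+ # For example, the second test URL above carries an empty 'team' parameter, so its contentId is delegated to NBAWatchIE as https://watch.nba.com/video/2016/10/29/0021600027boschaplay7; embeds with a team are resolved through api.nba.net via _call_api() instead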
+ + +class NBAIE(NBABaseIE): + IE_NAME = 'nba' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', + 'info_dict': { + 'id': '45039', + 'ext': 'mp4', + 'title': 'AND WE BACK.', + 'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.', + 'duration': 94, + 'timestamp': 1607112000, + 'upload_date': '20201218', + }, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860', + 'only_matching': True, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoID' + + def _extract_url_results(self, team, content_id): + return self._embed_url_result(team, content_id) + + +class NBAChannelIE(NBABaseIE): + IE_NAME = 'nba:channel' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/blazers/video/channel/summer_league', + 'info_dict': { + 'title': 'Summer League', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoSubCategory' + _PAGE_SIZE = 100 + + def _fetch_page(self, team, channel, page): + results = self._call_api(team, channel, { + 'channels': channel, + 'count': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in results: + yield self._extract_video(video, team, False) + + def _extract_url_results(self, team, content_id): + entries = OnDemandPagedList( + functools.partial(self._fetch_page, team, content_id), + self._PAGE_SIZE) + return self.playlist_result(entries, playlist_title=content_id) diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py new file mode 100644 index 000000000..f304f191a --- /dev/null +++ b/yt_dlp/extractor/nbc.py @@ -0,0 +1,592 @@ +from __future__ import unicode_literals + +import base64 +import json +import re + +from .common import InfoExtractor +from .theplatform import ThePlatformIE +from .adobepass import AdobePassIE +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_age_limit, + parse_duration, + RegexNotFoundError, + smuggle_url, + try_get, + unified_timestamp, + update_url_query, +) + + +class NBCIE(ThePlatformIE): + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' + + _TESTS = [ + { + 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237', + 'info_dict': { + 'id': '2848237', + 'ext': 'mp4', + 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', + 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', + 'timestamp': 1424246400, + 'upload_date': '20150218', + 'uploader': 'NBCU-COM', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', + 'info_dict': { + 'id': '2832821', + 'ext': 'mp4', + 'title': 'Star Wars Teaser', + 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', + 'timestamp': 1417852800, + 'upload_date': '20141206', + 'uploader': 'NBCU-COM', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Only works from US', + }, + { + # HLS streams requires the 'hdnea3' cookie + 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', + 'info_dict': { + 'id': '101528f5a9e8127b107e98c5e6ce4638', + 'ext': 'mp4', + 'title': 'Goliath', + 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight 
and politics of the kingdom.', + 'timestamp': 1237100400, + 'upload_date': '20090315', + 'uploader': 'NBCU-COM', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only works from US', + }, + { + 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', + 'only_matching': True, + }, + { + # Percent escaped url + 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + permalink, video_id = self._match_valid_url(url).groups() + permalink = 'http' + compat_urllib_parse_unquote(permalink) + video_data = self._download_json( + 'https://friendship.nbc.co/v2/graphql', video_id, query={ + 'query': '''query bonanzaPage( + $app: NBCUBrands! = nbc + $name: String! + $oneApp: Boolean + $platform: SupportedPlatforms! = web + $type: EntityPageType! = VIDEO + $userId: String! +) { + bonanzaPage( + app: $app + name: $name + oneApp: $oneApp + platform: $platform + type: $type + userId: $userId + ) { + metadata { + ... on VideoPageData { + description + episodeNumber + keywords + locked + mpxAccountId + mpxGuid + rating + resourceId + seasonNumber + secondaryTitle + seriesShortTitle + } + } + } +}''', + 'variables': json.dumps({ + 'name': permalink, + 'oneApp': True, + 'userId': '0', + }), + })['data']['bonanzaPage']['metadata'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + video_id = video_data['mpxGuid'] + tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id) + tpm = self._download_theplatform_metadata(tp_path, video_id) + title = tpm.get('title') or video_data.get('secondaryTitle') + if video_data.get('locked'): + resource = self._get_mvpd_resource( + video_data.get('resourceId') or 'nbcentertainment', + title, video_id, video_data.get('rating')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'nbcentertainment', resource) + theplatform_url = smuggle_url(update_url_query( + 'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id), + query), {'force_smil_url': True}) + + # Empty string or 0 can be valid values for these. 
So the check must be `is None` + description = video_data.get('description') + if description is None: + description = tpm.get('description') + episode_number = int_or_none(video_data.get('episodeNumber')) + if episode_number is None: + episode_number = int_or_none(tpm.get('nbcu$airOrder')) + rating = video_data.get('rating') + if rating is None: + rating = try_get(tpm, lambda x: x['ratings'][0]['rating']) + season_number = int_or_none(video_data.get('seasonNumber')) + if season_number is None: + season_number = int_or_none(tpm.get('nbcu$seasonNumber')) + series = video_data.get('seriesShortTitle') + if series is None: + series = tpm.get('nbcu$seriesShortTitle') + tags = video_data.get('keywords') + if tags is None or len(tags) == 0: + tags = tpm.get('keywords') + + return { + '_type': 'url_transparent', + 'age_limit': parse_age_limit(rating), + 'description': description, + 'episode': title, + 'episode_number': episode_number, + 'id': video_id, + 'ie_key': 'ThePlatform', + 'season_number': season_number, + 'series': series, + 'tags': tags, + 'title': title, + 'url': theplatform_url, + } + + +class NBCSportsVPlayerIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' + _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + + _TESTS = [{ + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'mp4', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'timestamp': 1426270238, + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + } + }, { + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', + 'only_matching': True, + }, { + 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + iframe_m = re.search( + r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) + if iframe_m: + return iframe_m.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + theplatform_url = self._og_search_video_url(webpage).replace( + 'vplayer.nbcsports.com', 'player.theplatform.com') + return self.url_result(theplatform_url, 'ThePlatform') + + +class NBCSportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + + _TESTS = [{ + # iframe src + 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'info_dict': { + 'id': 'PHJSaFWbrTY9', + 'ext': 'mp4', + 'title': 'Tom Izzo, Michigan St. 
has \'so much respect\' for Duke', + 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20150330', + 'timestamp': 1427726529, + } + }, { + # data-mpx-src + 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', + 'only_matching': True, + }, { + # data-src + 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result( + NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') + + +class NBCSportsStreamIE(AdobePassIE): + _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P<id>\d+)' + _TEST = { + 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559', + 'info_dict': { + 'id': '206559', + 'ext': 'mp4', + 'title': 'Amgen Tour of California Women\'s Recap', + 'description': 'md5:66520066b3b5281ada7698d0ea2aa894', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires Adobe Pass Authentication', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + live_source = self._download_json( + 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, + video_id) + video_source = live_source['videoSources'][0] + title = video_source['title'] + source_url = None + for k in ('source', 'msl4source', 'iossource', 'hlsv4'): + sk = k + 'Url' + source_url = video_source.get(sk) or video_source.get(sk + 'Alt') + if source_url: + break + else: + source_url = video_source['ottStreamUrl'] + is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live' + resource = self._get_mvpd_resource('nbcsports', title, video_id, '') + token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource) + tokenized_url = self._download_json( + 'https://token.playmakerservices.com/cdn', + video_id, data=json.dumps({ + 'requestorId': 'nbcsports', + 'pid': video_id, + 'application': 'NBCSports', + 'version': 'v1', + 'platform': 'desktop', + 'cdn': 'akamai', + 'url': video_source['sourceUrl'], + 'token': base64.b64encode(token.encode()).decode(), + 'resourceId': base64.b64encode(resource.encode()).decode(), + }).encode())['tokenizedUrl'] + formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': live_source.get('description'), + 'formats': formats, + 'is_live': is_live, + } + + +class NBCNewsIE(ThePlatformIE): + _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' + + _TESTS = [ + { + 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', + 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', + 'info_dict': { + 'id': '269389891880', + 'ext': 'mp4', + 'title': 'How Twitter Reacted To The Snowden Interview', + 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', + 'timestamp': 1401363060, + 'upload_date': '20140529', + }, + }, + { + 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', + 'md5': 'fdbf39ab73a72df5896b6234ff98518a', + 'info_dict': { + 'id': '529953347624', + 'ext': 'mp4', + 'title': 'FULL EPISODE: Family Business', + 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', + }, + 'skip': 'This page is unavailable.', + }, + { + 'url': 
'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', + 'md5': '8eb831eca25bfa7d25ddd83e85946548', + 'info_dict': { + 'id': '394064451844', + 'ext': 'mp4', + 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', + 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', + 'timestamp': 1423104900, + 'upload_date': '20150205', + }, + }, + { + 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', + 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', + 'info_dict': { + 'id': 'n431456', + 'ext': 'mp4', + 'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'", + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + 'upload_date': '20150922', + 'timestamp': 1442917800, + }, + }, + { + 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', + 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'info_dict': { + 'id': '669831235788', + 'ext': 'mp4', + 'title': 'See the aurora borealis from space in stunning new NASA video', + 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', + 'upload_date': '20160420', + 'timestamp': 1461152093, + }, + }, + { + 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', + 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', + 'info_dict': { + 'id': '314487875924', + 'ext': 'mp4', + 'title': 'The chaotic GOP immigration vote', + 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406937606, + 'upload_date': '20140802', + }, + }, + { + 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', + 'only_matching': True, + }, + { + # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html + 'url': 'http://www.nbcnews.com/widget/video-embed/701714499682', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data = self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'bootstrap json'), video_id)['props']['initialState'] + video_data = try_get(data, lambda x: x['video']['current'], dict) + if not video_data: + video_data = data['article']['content'][0]['primaryMedia']['video'] + title = video_data['headline']['primary'] + + formats = [] + for va in video_data.get('videoAssets', []): + public_url = va.get('publicUrl') + if not public_url: + continue + if '://link.theplatform.com/' in public_url: + public_url = update_url_query(public_url, {'format': 'redirect'}) + format_id = va.get('format') + if format_id == 'M3U': + formats.extend(self._extract_m3u8_formats( + public_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + continue + tbr = int_or_none(va.get('bitrate'), 1000) + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': public_url, + 'width': int_or_none(va.get('width')), + 'height': int_or_none(va.get('height')), + 'tbr': tbr, + 'ext': 'mp4', + }) + self._sort_formats(formats) + + subtitles = {} + closed_captioning = video_data.get('closedCaptioning') + if closed_captioning: + for cc_url in closed_captioning.values(): + if not cc_url: + continue + 
subtitles.setdefault('en', []).append({ + 'url': cc_url, + }) + + return { + 'id': video_id, + 'title': title, + 'description': try_get(video_data, lambda x: x['description']['primary']), + 'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datePublished')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class NBCOlympicsIE(InfoExtractor): + IE_NAME = 'nbcolympics' + _VALID_URL = r'https?://www\.nbcolympics\.com/videos?/(?P<id>[0-9a-z-]+)' + + _TEST = { + # Geo-restricted to US + 'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold', + 'md5': '54fecf846d05429fbaa18af557ee523a', + 'info_dict': { + 'id': 'WjTBzDXx5AUq', + 'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold', + 'ext': 'mp4', + 'title': 'Rose\'s son Leo was in tears after his dad won gold', + 'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.', + 'timestamp': 1471274964, + 'upload_date': '20160815', + 'uploader': 'NBCU-SPORTS', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + try: + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + iframe_url = drupal_settings['vod']['iframe_url'] + theplatform_url = iframe_url.replace( + 'vplayer.nbcolympics.com', 'player.theplatform.com') + except RegexNotFoundError: + theplatform_url = self._search_regex( + r"([\"'])embedUrl\1: *([\"'])(?P<embedUrl>.+)\2", + webpage, 'embedding URL', group="embedUrl") + + return { + '_type': 'url_transparent', + 'url': theplatform_url, + 'ie_key': ThePlatformIE.ie_key(), + 'display_id': display_id, + } + + +class NBCOlympicsStreamIE(AdobePassIE): + IE_NAME = 'nbcolympics:stream' + _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)' + _TESTS = [ + { + 'note': 'Tokenized m3u8 source URL', + 'url': 'https://stream.nbcolympics.com/womens-soccer-group-round-11', + 'info_dict': { + 'id': '2019740', + 'ext': 'mp4', + 'title': r"re:Women's Group Stage - Netherlands vs\. 
Brazil [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$", + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'note': 'Plain m3u8 source URL', + 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', + 'info_dict': { + 'id': '2021729', + 'ext': 'mp4', + 'title': r're:Event Finals: M Floor, W Vault, M Pommel, W Uneven Bars [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid') + + event_config = self._download_json( + f'http://stream.nbcolympics.com/data/event_config_{pid}.json', + pid, 'Downloading event config')['eventConfig'] + + title = event_config['eventTitle'] + is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus')) + if is_live: + title = self._live_title(title) + + source_url = self._download_json( + f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging', + pid, 'Downloading leap config' + )['videoSources'][0]['cdnSources']['primary'][0]['sourceUrl'] + + if event_config.get('cdnToken'): + ap_resource = self._get_mvpd_resource( + event_config.get('resourceId', 'NBCOlympics'), + re.sub(r'[^\w\d ]+', '', event_config['eventTitle']), pid, + event_config.get('ratingId', 'NO VALUE')) + media_token = self._extract_mvpd_auth(url, pid, event_config.get('requestorId', 'NBCOlympics'), ap_resource) + + source_url = self._download_json( + 'https://tokens.playmakerservices.com/', pid, 'Retrieving tokenized URL', + data=json.dumps({ + 'application': 'NBCSports', + 'authentication-type': 'adobe-pass', + 'cdn': 'akamai', + 'pid': pid, + 'platform': 'desktop', + 'requestorId': 'NBCOlympics', + 'resourceId': base64.b64encode(ap_resource.encode()).decode(), + 'token': base64.b64encode(media_token.encode()).decode(), + 'url': source_url, + 'version': 'v1', + }).encode(), + )['akamai'][0]['tokenizedUrl'] + + formats = self._extract_m3u8_formats(source_url, pid, 'mp4', live=is_live) + for f in formats: + # -http_seekable requires ffmpeg 4.3+ but it doesn't seem possible to + # download with ffmpeg without this option + f['_ffmpeg_args'] = ['-seekable', '0', '-http_seekable', '0', '-icy', '0'] + self._sort_formats(formats) + + return { + 'id': pid, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/ndr.py b/yt_dlp/extractor/ndr.py new file mode 100644 index 000000000..f2bae2c1a --- /dev/null +++ b/yt_dlp/extractor/ndr.py @@ -0,0 +1,441 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + qualities, + try_get, + unified_strdate, + urljoin, +) + + +class NDRBaseIE(InfoExtractor): + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = next(group for group in mobj.groups() if group) + id = mobj.group('id') + webpage = self._download_webpage(url, display_id) + return self._extract_embed(webpage, display_id, id) + + +class NDRIE(NDRBaseIE): + IE_NAME = 'ndr' + IE_DESC = 'NDR.de - Norddeutscher Rundfunk' + _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<display_id>[^/?#]+),(?P<id>[\da-z]+)\.html' + _TESTS = [{ + 'url': 
'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'info_dict': { + 'id': 'hafengeburtstag988', + 'ext': 'mp4', + 'title': 'Party, Pötte und Parade', + 'thumbnail': 'https://www.ndr.de/fernsehen/hafengeburtstag990_v-contentxl.jpg', + 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', + 'series': None, + 'channel': 'NDR Fernsehen', + 'upload_date': '20150508', + 'duration': 3498, + }, + }, { + 'url': 'https://www.ndr.de/sport/fussball/Rostocks-Matchwinner-Froede-Ein-Hansa-Debuet-wie-im-Maerchen,hansa10312.html', + 'only_matching': True + }, { + 'url': 'https://www.ndr.de/nachrichten/niedersachsen/kommunalwahl_niedersachsen_2021/Grosse-Parteien-zufrieden-mit-Ergebnissen-der-Kommunalwahl,kommunalwahl1296.html', + 'info_dict': { + 'id': 'kommunalwahl1296', + 'ext': 'mp4', + 'title': 'Die Spitzenrunde: Die Wahl aus Sicht der Landespolitik', + 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot1194912_v-contentxl.jpg', + 'description': 'md5:5c6e2ad744cef499135735a1036d7aa7', + 'series': 'Hallo Niedersachsen', + 'channel': 'NDR Fernsehen', + 'upload_date': '20210913', + 'duration': 438, + }, + }, { + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'sendung1091858', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot983938_v-contentxl.jpg', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', + 'series': 'extra 3', + 'channel': 'NDR Fernsehen', + 'upload_date': '20201111', + 'duration': 1749, + } + }, { + 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'info_dict': { + 'id': 'audio51535', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'thumbnail': 'https://www.ndr.de/mediathek/mediathekbild140_v-podcast.jpg', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'upload_date': '20140729', + 'duration': 884.0, + }, + 'expected_warnings': ['unable to extract json url'], + }] + + def _extract_embed(self, webpage, display_id, id): + formats = [] + base_url = 'https://www.ndr.de' + json_url = self._search_regex(r'<iframe[^>]+src=\"([^\"]+)_theme-ndrde[^\.]*\.html\"', webpage, + 'json url', fatal=False) + if json_url: + data_json = self._download_json(base_url + json_url.replace('ardplayer_image', 'ardjson_image') + '.json', + id, fatal=False) + info_json = data_json.get('_info', {}) + media_json = try_get(data_json, lambda x: x['_mediaArray'][0]['_mediaStreamArray']) + for media in media_json: + if media.get('_quality') == 'auto': + formats.extend(self._extract_m3u8_formats(media['_stream'], id)) + subtitles = {} + sub_url = data_json.get('_subtitleUrl') + if sub_url: + subtitles.setdefault('de', []).append({ + 'url': base_url + sub_url, + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': info_json.get('clipTitle'), + 'thumbnail': base_url + data_json.get('_previewImage'), + 'description': info_json.get('clipDescription'), + 'series': info_json.get('seriesTitle') or None, + 'channel': info_json.get('channelTitle'), + 'upload_date': unified_strdate(info_json.get('clipDate')), + 'duration': data_json.get('_duration'), + 'formats': formats, + 'subtitles': subtitles, + } + else: + json_url = base_url + self._search_regex(r'apiUrl\s?=\s?\'([^\']+)\'', webpage, 'json url').replace( + '_belongsToPodcast-', '') + data_json = self._download_json(json_url, id, fatal=False) + return { + 'id': id, + 'title': 
data_json.get('title'), + 'thumbnail': base_url + data_json.get('poster'), + 'description': data_json.get('summary'), + 'upload_date': unified_strdate(data_json.get('publicationDate')), + 'duration': parse_duration(data_json.get('duration')), + 'formats': [{ + 'url': try_get(data_json, (lambda x: x['audio'][0]['url'], lambda x: x['files'][0]['url'])), + 'vcodec': 'none', + 'ext': 'mp3', + }], + } + + +class NJoyIE(NDRBaseIE): + IE_NAME = 'njoy' + IE_DESC = 'N-JOY' + _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' + _TESTS = [{ + # httpVideo, same content id + 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', + 'md5': 'cb63be60cd6f9dd75218803146d8dc67', + 'info_dict': { + 'id': 'comedycontest2480', + 'display_id': 'Benaissa-beim-NDR-Comedy-Contest', + 'ext': 'mp4', + 'title': 'Benaissa beim NDR Comedy Contest', + 'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39', + 'uploader': 'ndrtv', + 'upload_date': '20141129', + 'duration': 654, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideo, different content id + 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', + 'md5': '417660fffa90e6df2fda19f1b40a64d8', + 'info_dict': { + 'id': 'dockville882', + 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', + 'ext': 'mp4', + 'title': '"Ich hab noch nie" mit Felix Jaehn', + 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'uploader': 'njoy', + 'upload_date': '20150822', + 'duration': 211, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html', + 'only_matching': True, + }] + + def _extract_embed(self, webpage, display_id, id): + video_id = self._search_regex( + r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') + description = self._search_regex( + r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', + webpage, 'description', fatal=False) + return { + '_type': 'url_transparent', + 'ie_key': 'NDREmbedBase', + 'url': 'ndr:%s' % video_id, + 'display_id': display_id, + 'description': description, + } + + +class NDREmbedBaseIE(InfoExtractor): + IE_NAME = 'ndr:embed:base' + _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)' + _TESTS = [{ + 'url': 'ndr:soundcheck3366', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/soundcheck3366-ppjson.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('id_s') + + ppjson = self._download_json( + 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id) + + playlist = ppjson['playlist'] + + formats = [] + quality_key = qualities(('xs', 's', 'm', 'l', 'xl')) + + for format_id, f in playlist.items(): + src = f.get('src') + if not src: + continue + ext = determine_ext(src, None) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) + else: + quality = f.get('quality') + ff = { + 'url': src, + 'format_id': quality or format_id, + 'quality': quality_key(quality), + } + type_ = f.get('type') + if type_ and type_.split('/')[0] == 'audio': + ff['vcodec'] = 'none' + 
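+ # Audio-only entries expose no video codec; when the source URL yields
+ # no usable extension, mp3 is assumed as a fallback.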
ff['ext'] = ext or 'mp3' + formats.append(ff) + self._sort_formats(formats) + + config = playlist['config'] + + live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive'] + title = config['title'] + if live: + title = self._live_title(title) + uploader = ppjson.get('config', {}).get('branding') + upload_date = ppjson.get('config', {}).get('publicationDate') + duration = int_or_none(config.get('duration')) + + thumbnails = [] + poster = try_get(config, lambda x: x['poster'], dict) or {} + for thumbnail_id, thumbnail in poster.items(): + thumbnail_url = urljoin(url, thumbnail.get('src')) + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail.get('quality') or thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail.get('quality')), + }) + + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + + return { + 'id': video_id, + 'title': title, + 'is_live': live, + 'uploader': uploader if uploader != '-' else None, + 'upload_date': upload_date[0:8] if upload_date else None, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + +class NDREmbedIE(NDREmbedBaseIE): + IE_NAME = 'ndr:embed' + _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _TESTS = [{ + 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', + 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', + 'info_dict': { + 'id': 'ndraktuell28488', + 'ext': 'mp4', + 'title': 'Norddeutschland begrüßt Flüchtlinge', + 'is_live': False, + 'uploader': 'ndrtv', + 'upload_date': '20150907', + 'duration': 132, + }, + }, { + 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', + 'md5': '002085c44bae38802d94ae5802a36e78', + 'info_dict': { + 'id': 'soundcheck3366', + 'ext': 'mp4', + 'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen', + 'is_live': False, + 'uploader': 'ndr2', + 'upload_date': '20150912', + 'duration': 3554, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/info/audio51535-player.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'is_live': False, + 'uploader': 'ndrinfo', + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html', + 'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c', + 'info_dict': { + 'id': 'visite11010', + 'ext': 'mp4', + 'title': 'Visite - die ganze Sendung', + 'is_live': False, + 'uploader': 'ndrtv', + 'upload_date': '20150902', + 'duration': 3525, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideoLive + 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', + 'info_dict': { + 'id': 'livestream217', + 'ext': 'flv', + 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + 'upload_date': '20150910', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html', + 'only_matching': True, + 
}, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/doku952-player.html', + 'only_matching': True, + }] + + +class NJoyEmbedIE(NDREmbedBaseIE): + IE_NAME = 'njoy:embed' + _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _TESTS = [{ + # httpVideo + 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', + 'md5': '8483cbfe2320bd4d28a349d62d88bd74', + 'info_dict': { + 'id': 'doku948', + 'ext': 'mp4', + 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', + 'is_live': False, + 'upload_date': '20150807', + 'duration': 1011, + }, + }, { + # httpAudio + 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', + 'md5': 'd989f80f28ac954430f7b8a48197188a', + 'info_dict': { + 'id': 'stefanrichter100', + 'ext': 'mp3', + 'title': 'Interview mit einem Augenzeugen', + 'is_live': False, + 'uploader': 'njoy', + 'upload_date': '20150909', + 'duration': 140, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudioLive, no explicit ext + 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', + 'info_dict': { + 'id': 'webradioweltweit100', + 'ext': 'mp3', + 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + 'uploader': 'njoy', + 'upload_date': '20150810', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html', + 'only_matching': True, + }, { + 'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html', + 'only_matching': True, + }, { + 'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/ndtv.py b/yt_dlp/extractor/ndtv.py index bc3eb9160..bc3eb9160 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/yt_dlp/extractor/ndtv.py diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py new file mode 100644 index 000000000..9698a358e --- /dev/null +++ b/yt_dlp/extractor/nebula.py @@ -0,0 +1,238 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import time + +from urllib.error import HTTPError +from .common import InfoExtractor +from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote +from ..utils import ( + ExtractorError, + parse_iso8601, + try_get, + urljoin, +) + + +class NebulaIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', + 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'info_dict': { + 'id': '5c271b40b13fd613090034fd', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and 
the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'uploader': 'Lindsay Ellis', + }, + 'params': { + 'usenetrc': True, + }, + 'skip': 'All Nebula content requires authentication', + }, + { + 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': '6d4edd14ce65720fa63aba5c583fb328', + 'info_dict': { + 'id': '5e7e78171aaf320001fbd6be', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'The Logistics of D-Day', + 'uploader': 'The Logistics of D-Day', + }, + 'params': { + 'usenetrc': True, + }, + 'skip': 'All Nebula content requires authentication', + }, + { + 'url': 'https://nebula.app/videos/money-episode-1-the-draw', + 'md5': '8c7d272910eea320f6f8e6d3084eecf5', + 'info_dict': { + 'id': '5e779ebdd157bc0001d1c75a', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r'contains:There’s free money on offer… if the players can all work together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'uploader': 'Tom Scott Presents: Money', + }, + 'params': { + 'usenetrc': True, + }, + 'skip': 'All Nebula content requires authentication', + }, + { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'only_matching': True, + }, + ] + _NETRC_MACHINE = 'watchnebula' + + _nebula_token = None + + def _retrieve_nebula_auth(self): + """ + Log in to Nebula, and returns a Nebula API token + """ + + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required() + + self.report_login() + data = json.dumps({'email': username, 'password': password}).encode('utf8') + response = self._download_json( + 'https://api.watchnebula.com/api/v1/auth/login/', + data=data, fatal=False, video_id=None, + headers={ + 'content-type': 'application/json', + # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint + 'cookie': '' + }, + note='Authenticating to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or not response.get('key'): + self.raise_login_required() + + # save nebula token as cookie + self._set_cookie( + 'nebula.app', 'nebula-auth', + compat_urllib_parse_quote( + json.dumps({ + "apiToken": response["key"], + "isLoggingIn": False, + "isLoggingOut": False, + }, separators=(",", ":"))), + expire_time=int(time.time()) + 86400 * 365, + ) + + return response['key'] + + def _retrieve_zype_api_key(self, page_url, display_id): + """ + Retrieves the Zype API key + """ + + # Find the js that has the API key from the webpage and download it + webpage = self._download_webpage(page_url, video_id=display_id) + main_script_relpath = self._search_regex( + r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, + group='script_relpath', name='script relative path', fatal=True) + main_script_abspath = urljoin(page_url, main_script_relpath) + main_script = self._download_webpage(main_script_abspath, video_id=display_id, + note='Retrieving Zype API key') + + api_key = self._search_regex( + 
r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script, + group='api_key', name='API key', fatal=True) + + return api_key + + def _call_zype_api(self, path, params, video_id, api_key, note): + """ + A helper for making calls to the Zype API. + """ + query = {'api_key': api_key, 'per_page': 1} + query.update(params) + return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) + + def _call_nebula_api(self, path, video_id, access_token, note): + """ + A helper for making calls to the Nebula API. + """ + return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ + 'Authorization': 'Token {access_token}'.format(access_token=access_token) + }, note=note) + + def _fetch_zype_access_token(self, video_id): + try: + user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') + except ExtractorError as exc: + # if 401, attempt credential auth and retry + if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401: + self._nebula_token = self._retrieve_nebula_auth() + user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') + else: + raise + + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) + if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + # TODO: Reimplement the same Zype token polling the Nebula frontend implements + # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 + raise ExtractorError( + 'Unable to extract Zype access token from Nebula API authentication endpoint. ' + 'Open an arbitrary video in a browser with this account to generate a token', + expected=True) + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _extract_channel_title(self, video_meta): + # TODO: Implement the API calls giving us the channel list, + # so that we can do the title lookup and then figure out the channel URL + categories = video_meta.get('categories', []) if video_meta else [] + # the channel name is the value of the first category + for category in categories: + if category.get('value'): + return category['value'][0] + + def _real_initialize(self): + # check cookie jar for valid token + nebula_cookies = self._get_cookies('https://nebula.app') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + self.to_screen('Authenticating to Nebula with token from cookie jar') + nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) + self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + + # try to authenticate using credentials if no valid token has been found + if not self._nebula_token: + self._nebula_token = self._retrieve_nebula_auth() + + def _real_extract(self, url): + display_id = self._match_id(url) + api_key = self._retrieve_zype_api_key(url, display_id) + + response = self._call_zype_api('/videos', {'friendly_title': display_id}, + display_id, api_key, note='Retrieving metadata from Zype') + if len(response.get('response') or []) != 1: + raise ExtractorError('Unable to find video on Zype API') + video_meta = response['response'][0] + + video_id = video_meta['_id'] + zype_access_token = self._fetch_zype_access_token(display_id) + + channel_title = self._extract_channel_title(video_meta) + + return { + 'id': video_id, + 'display_id': display_id, + 
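+ # Hand the video off to the Zype extractor via url_transparent: Zype
+ # resolves the actual formats while the metadata gathered above is kept.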
'_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token), + 'title': video_meta.get('title'), + 'description': video_meta.get('description'), + 'timestamp': parse_iso8601(video_meta.get('published_at')), + 'thumbnails': [{ + 'id': tn.get('name'), # this appears to be null + 'url': tn['url'], + 'width': tn.get('width'), + 'height': tn.get('height'), + } for tn in video_meta.get('thumbnails', [])], + 'duration': video_meta.get('duration'), + 'channel': channel_title, + 'uploader': channel_title, # we chose uploader = channel name + # TODO: uploader_url, channel_id, channel_url + } diff --git a/youtube_dl/extractor/nerdcubed.py b/yt_dlp/extractor/nerdcubed.py index 9feccc672..9feccc672 100644 --- a/youtube_dl/extractor/nerdcubed.py +++ b/yt_dlp/extractor/nerdcubed.py diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py new file mode 100644 index 000000000..7652371b3 --- /dev/null +++ b/yt_dlp/extractor/neteasemusic.py @@ -0,0 +1,485 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from hashlib import md5 +from base64 import b64encode +from datetime import datetime +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_str, + compat_itertools_count, +) +from ..utils import ( + sanitized_Request, + float_or_none, +) + + +class NetEaseMusicBaseIE(InfoExtractor): + _FORMATS = ['bMusic', 'mMusic', 'hMusic'] + _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' + _API_BASE = 'http://music.163.com/api/' + + @classmethod + def _encrypt(cls, dfsid): + salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) + string_bytes = bytearray(compat_str(dfsid).encode('ascii')) + salt_len = len(salt_bytes) + for i in range(len(string_bytes)): + string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] + m = md5() + m.update(bytes(string_bytes)) + result = b64encode(m.digest()).decode('ascii') + return result.replace('/', '_').replace('+', '-') + + def extract_formats(self, info): + formats = [] + for song_format in self._FORMATS: + details = info.get(song_format) + if not details: + continue + song_file_path = '/%s/%s.%s' % ( + self._encrypt(details['dfsId']), details['dfsId'], details['extension']) + + # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc. are reverse proxy-like endpoints + # from NetEase's CDN provider that can be used if m5.music.126.net does not + # work, especially for users outside of Mainland China + # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880 + for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net', + 'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'): + song_url = host + song_file_path + if self._is_valid_url(song_url, info['id'], 'song'): + formats.append({ + 'url': song_url, + 'ext': details.get('extension'), + 'abr': float_or_none(details.get('bitrate'), scale=1000), + 'format_id': song_format, + 'filesize': details.get('size'), + 'asr': details.get('sr') + }) + break + return formats + + @classmethod + def convert_milliseconds(cls, ms): + return int(round(ms / 1000.0)) + + def query_api(self, endpoint, video_id, note): + req = sanitized_Request('%s%s' % (self._API_BASE, endpoint)) + req.add_header('Referer', self._API_BASE) + return self._download_json(req, video_id, note) + + +class NetEaseMusicIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:song' + IE_DESC = '网易云音乐' + _VALID_URL = 
r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://music.163.com/#/song?id=32102397', + 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', + 'info_dict': { + 'id': '32102397', + 'ext': 'mp3', + 'title': 'Bad Blood (feat. Kendrick Lamar)', + 'creator': 'Taylor Swift / Kendrick Lamar', + 'upload_date': '20150517', + 'timestamp': 1431878400, + 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', + }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'No lyrics translation.', + 'url': 'http://music.163.com/#/song?id=29822014', + 'info_dict': { + 'id': '29822014', + 'ext': 'mp3', + 'title': '听见下雨的声音', + 'creator': '周杰伦', + 'upload_date': '20141225', + 'timestamp': 1419523200, + 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', + }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'No lyrics.', + 'url': 'http://music.163.com/song?id=17241424', + 'info_dict': { + 'id': '17241424', + 'ext': 'mp3', + 'title': 'Opus 28', + 'creator': 'Dustin O\'Halloran', + 'upload_date': '20080211', + 'timestamp': 1202745600, + }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Has translated name.', + 'url': 'http://music.163.com/#/song?id=22735043', + 'info_dict': { + 'id': '22735043', + 'ext': 'mp3', + 'title': '소원을 말해봐 (Genie)', + 'creator': '少女时代', + 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', + 'upload_date': '20100127', + 'timestamp': 1264608000, + 'alt_title': '说出愿望吧(Genie)', + }, + 'skip': 'Blocked outside Mainland China', + }] + + def _process_lyrics(self, lyrics_info): + original = lyrics_info.get('lrc', {}).get('lyric') + translated = lyrics_info.get('tlyric', {}).get('lyric') + + if not translated: + return original + + lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' + original_ts_texts = re.findall(lyrics_expr, original) + translation_ts_dict = dict( + (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) + ) + lyrics = '\n'.join([ + '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) + for time_stamp, text in original_ts_texts + ]) + return lyrics + + def _real_extract(self, url): + song_id = self._match_id(url) + + params = { + 'id': song_id, + 'ids': '[%s]' % song_id + } + info = self.query_api( + 'song/detail?' 
+ compat_urllib_parse_urlencode(params), + song_id, 'Downloading song info')['songs'][0] + + formats = self.extract_formats(info) + self._sort_formats(formats) + + lyrics_info = self.query_api( + 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, + song_id, 'Downloading lyrics data') + lyrics = self._process_lyrics(lyrics_info) + + alt_title = None + if info.get('transNames'): + alt_title = '/'.join(info.get('transNames')) + + return { + 'id': song_id, + 'title': info['name'], + 'alt_title': alt_title, + 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), + 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), + 'thumbnail': info.get('album', {}).get('picUrl'), + 'duration': self.convert_milliseconds(info.get('duration', 0)), + 'description': lyrics, + 'formats': formats, + } + + +class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:album' + IE_DESC = '网易云音乐 - 专辑' + _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://music.163.com/#/album?id=220780', + 'info_dict': { + 'id': '220780', + 'title': 'B\'day', + }, + 'playlist_count': 23, + 'skip': 'Blocked outside Mainland China', + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + info = self.query_api( + 'album/%s?id=%s' % (album_id, album_id), + album_id, 'Downloading album data')['album'] + + name = info['name'] + desc = info.get('description') + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song['id'], + 'NetEaseMusic', song['id']) + for song in info['songs'] + ] + return self.playlist_result(entries, album_id, name, desc) + + +class NetEaseMusicSingerIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:singer' + IE_DESC = '网易云音乐 - 歌手' + _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'note': 'Singer has aliases.', + 'url': 'http://music.163.com/#/artist?id=10559', + 'info_dict': { + 'id': '10559', + 'title': '张惠妹 - aMEI;阿密特', + }, + 'playlist_count': 50, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Singer has translated name.', + 'url': 'http://music.163.com/#/artist?id=124098', + 'info_dict': { + 'id': '124098', + 'title': '李昇基 - 이승기', + }, + 'playlist_count': 50, + 'skip': 'Blocked outside Mainland China', + }] + + def _real_extract(self, url): + singer_id = self._match_id(url) + + info = self.query_api( + 'artist/%s?id=%s' % (singer_id, singer_id), + singer_id, 'Downloading singer data') + + name = info['artist']['name'] + if info['artist']['trans']: + name = '%s - %s' % (name, info['artist']['trans']) + if info['artist']['alias']: + name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) + + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song['id'], + 'NetEaseMusic', song['id']) + for song in info['hotSongs'] + ] + return self.playlist_result(entries, singer_id, name) + + +class NetEaseMusicListIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:playlist' + IE_DESC = '网易云音乐 - 歌单' + _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://music.163.com/#/playlist?id=79177352', + 'info_dict': { + 'id': '79177352', + 'title': 'Billboard 2007 Top 100', + 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' + }, + 'playlist_count': 99, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Toplist/Charts sample', + 'url': 'http://music.163.com/#/discover/toplist?id=3733003', + 'info_dict': { + 'id': '3733003', + 'title': 're:韩国Melon排行榜周榜 
[0-9]{4}-[0-9]{2}-[0-9]{2}', + 'description': 'md5:73ec782a612711cadc7872d9c1e134fc', + }, + 'playlist_count': 50, + 'skip': 'Blocked outside Mainland China', + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + info = self.query_api( + 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, + list_id, 'Downloading playlist data')['result'] + + name = info['name'] + desc = info.get('description') + + if info.get('specialType') == 10: # is a chart/toplist + datestamp = datetime.fromtimestamp( + self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') + name = '%s %s' % (name, datestamp) + + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song['id'], + 'NetEaseMusic', song['id']) + for song in info['tracks'] + ] + return self.playlist_result(entries, list_id, name, desc) + + +class NetEaseMusicMvIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:mv' + IE_DESC = '网易云音乐 - MV' + _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://music.163.com/#/mv?id=415350', + 'info_dict': { + 'id': '415350', + 'ext': 'mp4', + 'title': '이럴거면 그러지말지', + 'description': '白雅言自作曲唱甜蜜爱情', + 'creator': '白雅言', + 'upload_date': '20150520', + }, + 'skip': 'Blocked outside Mainland China', + } + + def _real_extract(self, url): + mv_id = self._match_id(url) + + info = self.query_api( + 'mv/detail?id=%s&type=mp4' % mv_id, + mv_id, 'Downloading mv info')['data'] + + formats = [ + {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} + for brs, mv_url in info['brs'].items() + ] + self._sort_formats(formats) + + return { + 'id': mv_id, + 'title': info['name'], + 'description': info.get('desc') or info.get('briefDesc'), + 'creator': info['artistName'], + 'upload_date': info['publishTime'].replace('-', ''), + 'formats': formats, + 'thumbnail': info.get('cover'), + 'duration': self.convert_milliseconds(info.get('duration', 0)), + } + + +class NetEaseMusicProgramIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:program' + IE_DESC = '网易云音乐 - 电台节目' + _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://music.163.com/#/program?id=10109055', + 'info_dict': { + 'id': '10109055', + 'ext': 'mp3', + 'title': '不丹足球背后的故事', + 'description': '喜马拉雅人的足球梦 ...', + 'creator': '大话西藏', + 'timestamp': 1434179342, + 'upload_date': '20150613', + 'duration': 900, + }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'This program has accompanying songs.', + 'url': 'http://music.163.com/#/program?id=10141022', + 'info_dict': { + 'id': '10141022', + 'title': '25岁,你是自在如风的少年<27°C>', + 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + }, + 'playlist_count': 4, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'This program has accompanying songs.', + 'url': 'http://music.163.com/#/program?id=10141022', + 'info_dict': { + 'id': '10141022', + 'ext': 'mp3', + 'title': '25岁,你是自在如风的少年<27°C>', + 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + 'timestamp': 1434450841, + 'upload_date': '20150616', + }, + 'params': { + 'noplaylist': True + }, + 'skip': 'Blocked outside Mainland China', + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + info = self.query_api( + 'dj/program/detail?id=%s' % program_id, + program_id, 'Downloading program info')['program'] + + name = info['name'] + description = info['description'] + + if not info['songs'] or self.get_param('noplaylist'): + if info['songs']: + self.to_screen( + 'Downloading just the main audio %s because of 
--no-playlist' + % info['mainSong']['id']) + + formats = self.extract_formats(info['mainSong']) + self._sort_formats(formats) + + return { + 'id': program_id, + 'title': name, + 'description': description, + 'creator': info['dj']['brand'], + 'timestamp': self.convert_milliseconds(info['createTime']), + 'thumbnail': info['coverUrl'], + 'duration': self.convert_milliseconds(info.get('duration', 0)), + 'formats': formats, + } + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download the main audio %s' + % (program_id, info['mainSong']['id'])) + + song_ids = [info['mainSong']['id']] + song_ids.extend([song['id'] for song in info['songs']]) + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song_id, + 'NetEaseMusic', song_id) + for song_id in song_ids + ] + return self.playlist_result(entries, program_id, name, description) + + +class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:djradio' + IE_DESC = '网易云音乐 - 电台' + _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://music.163.com/#/djradio?id=42', + 'info_dict': { + 'id': '42', + 'title': '声音蔓延', + 'description': 'md5:766220985cbd16fdd552f64c578a6b15' + }, + 'playlist_mincount': 40, + 'skip': 'Blocked outside Mainland China', + } + _PAGE_SIZE = 1000 + + def _real_extract(self, url): + dj_id = self._match_id(url) + + name = None + desc = None + entries = [] + for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE): + info = self.query_api( + 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' + % (self._PAGE_SIZE, dj_id, offset), + dj_id, 'Downloading dj programs - %d' % offset) + + entries.extend([ + self.url_result( + 'http://music.163.com/#/program?id=%s' % program['id'], + 'NetEaseMusicProgram', program['id']) + for program in info['programs'] + ]) + + if name is None: + radio = info['programs'][0]['radio'] + name = radio['name'] + desc = radio['desc'] + + if not info['more']: + break + + return self.playlist_result(entries, dj_id, name, desc) diff --git a/yt_dlp/extractor/netzkino.py b/yt_dlp/extractor/netzkino.py new file mode 100644 index 000000000..4ad0d8e96 --- /dev/null +++ b/yt_dlp/extractor/netzkino.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + js_to_json, + parse_iso8601, +) + + +class NetzkinoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond', + 'md5': '92a3f8b76f8d7220acce5377ea5d4873', + 'info_dict': { + 'id': 'rakete-zum-mond', + 'ext': 'mp4', + 'title': 'Rakete zum Mond \u2013 Jules Verne', + 'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60', + 'upload_date': '20120813', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1344858571, + 'age_limit': 12, + }, + 'params': { + 'skip_download': 'Download only works from Germany', + } + }, { + 'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2', + 'md5': 'c7728b2dadd04ff6727814847a51ef03', + 'info_dict': { + 'id': 'dr-jekyll-mrs-hyde-2', + 'ext': 'mp4', + 'title': 'Dr. Jekyll & Mrs. 
Hyde 2', + 'description': 'md5:c2e9626ebd02de0a794b95407045d186', + 'upload_date': '20190130', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1548849437, + 'age_limit': 18, + }, + 'params': { + 'skip_download': 'Download only works from Germany', + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id + info = self._download_json(api_url, video_id) + custom_fields = info['custom_fields'] + + production_js = self._download_webpage( + 'http://www.netzkino.de/beta/dist/production.min.js', video_id, + note='Downloading player code') + avo_js = self._search_regex( + r'var urlTemplate=(\{.*?"\})', + production_js, 'URL templates') + templates = self._parse_json( + avo_js, video_id, transform_source=js_to_json) + + suffix = { + 'hds': '.mp4/manifest.f4m', + 'hls': '.mp4/master.m3u8', + 'pmd': '.mp4', + } + film_fn = custom_fields['Streaming'][0] + formats = [{ + 'format_id': key, + 'ext': 'mp4', + 'url': tpl.replace('{}', film_fn) + suffix[key], + } for key, tpl in templates.items()] + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': info['title'], + 'age_limit': int_or_none(custom_fields.get('FSK')[0]), + 'timestamp': parse_iso8601(info.get('date'), delimiter=' '), + 'description': clean_html(info.get('content')), + 'thumbnail': info.get('thumbnail'), + } diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py new file mode 100644 index 000000000..bbbd9e8ee --- /dev/null +++ b/yt_dlp/extractor/newgrounds.py @@ -0,0 +1,283 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_count, + parse_duration, + unified_timestamp, + OnDemandPagedList, + try_get, +) + + +class NewgroundsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>\d+)(?:/format/flash)?' 
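+ # Both path variants resolve to the same numeric id, e.g. (URLs from the
+ # tests below): /audio/listen/549479 -> 549479,
+ # /portal/view/297383/format/flash -> 297383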
+ _TESTS = [{ + 'url': 'https://www.newgrounds.com/audio/listen/549479', + 'md5': 'fe6033d297591288fa1c1f780386f07a', + 'info_dict': { + 'id': '549479', + 'ext': 'mp3', + 'title': 'B7 - BusMode', + 'uploader': 'Burn7', + 'timestamp': 1378878540, + 'upload_date': '20130911', + 'duration': 143, + 'description': 'md5:6d885138814015dfd656c2ddb00dacfc', + }, + }, { + 'url': 'https://www.newgrounds.com/portal/view/1', + 'md5': 'fbfb40e2dc765a7e830cb251d370d981', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': 'Scrotum 1', + 'uploader': 'Brian-Beaton', + 'timestamp': 955064100, + 'upload_date': '20000406', + 'description': 'Scrotum plays "catch."', + 'age_limit': 17, + }, + }, { + # source format unavailable, additional mp4 formats + 'url': 'http://www.newgrounds.com/portal/view/689400', + 'info_dict': { + 'id': '689400', + 'ext': 'mp4', + 'title': 'ZTV News Episode 8', + 'uploader': 'ZONE-SAMA', + 'timestamp': 1487965140, + 'upload_date': '20170224', + 'description': 'ZTV News Episode 8 (February 2017)', + 'age_limit': 17, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.newgrounds.com/portal/view/297383', + 'md5': '2c11f5fd8cb6b433a63c89ba3141436c', + 'info_dict': { + 'id': '297383', + 'ext': 'mp4', + 'title': 'Metal Gear Awesome', + 'uploader': 'Egoraptor', + 'timestamp': 1140663240, + 'upload_date': '20060223', + 'description': 'Metal Gear is awesome is so is this movie.', + 'age_limit': 13, + } + }, { + 'url': 'https://www.newgrounds.com/portal/view/297383/format/flash', + 'md5': '5d05585a9a0caca059f5abfbd3865524', + 'info_dict': { + 'id': '297383', + 'ext': 'swf', + 'title': 'Metal Gear Awesome', + 'description': 'Metal Gear is awesome is so is this movie.', + 'uploader': 'Egoraptor', + 'upload_date': '20060223', + 'timestamp': 1140663240, + 'age_limit': 13, + } + }] + _AGE_LIMIT = { + 'e': 0, + 't': 13, + 'm': 17, + 'a': 18, + } + + def _real_extract(self, url): + media_id = self._match_id(url) + formats = [] + uploader = None + webpage = self._download_webpage(url, media_id) + + title = self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'title') + + media_url_string = self._search_regex( + r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) + + if media_url_string: + media_url = self._parse_json(media_url_string, media_id) + formats = [{ + 'url': media_url, + 'format_id': 'source', + 'quality': 1, + }] + else: + json_video = self._download_json('https://www.newgrounds.com/portal/video/' + media_id, media_id, headers={ + 'Accept': 'application/json', + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest' + }) + + uploader = json_video.get('author') + media_formats = json_video.get('sources', []) + for media_format in media_formats: + media_sources = media_formats[media_format] + for source in media_sources: + formats.append({ + 'format_id': media_format, + 'quality': int_or_none(media_format[:-1]), + 'url': source.get('src') + }) + + if not uploader: + uploader = self._html_search_regex( + (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>', + r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader', + fatal=False) + + age_limit = self._html_search_regex( + r'<h2\s*class=["\']rated-([^"\'])["\'][^>]+>', webpage, 'age_limit', default='e') + age_limit = self._AGE_LIMIT.get(age_limit) + + timestamp = unified_timestamp(self._html_search_regex( + (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', + r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', + default=None)) + duration = 
parse_duration(self._html_search_regex( + r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, + 'duration', default=None)) + + view_count = parse_count(self._html_search_regex( + r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage, + 'view count', default=None)) + + filesize = int_or_none(self._html_search_regex( + r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize', + default=None)) + + video_type_description = self._html_search_regex( + r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'video type', + default=None) + + if len(formats) == 1: + formats[0]['filesize'] = filesize + + if video_type_description == 'Audio File': + formats[0]['vcodec'] = 'none' + self._check_formats(formats, media_id) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': title, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'age_limit': age_limit, + 'view_count': view_count, + } + + +class NewgroundsPlaylistIE(InfoExtractor): + IE_NAME = 'Newgrounds:playlist' + _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.newgrounds.com/collection/cats', + 'info_dict': { + 'id': 'cats', + 'title': 'Cats', + }, + 'playlist_mincount': 45, + }, { + 'url': 'https://www.newgrounds.com/collection/dogs', + 'info_dict': { + 'id': 'dogs', + 'title': 'Dogs', + }, + 'playlist_mincount': 26, + }, { + 'url': 'http://www.newgrounds.com/audio/search/title/cats', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._search_regex( + r'<title>([^>]+)</title>', webpage, 'title', default=None) + + # cut left menu + webpage = self._search_regex( + r'(?s)<div[^>]+\bclass=["\']column wide(.+)', + webpage, 'wide column', default=webpage) + + entries = [] + for a, path, media_id in re.findall( + r'(<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>)', + webpage): + a_class = extract_attributes(a).get('class') + if a_class not in ('item-portalsubmission', 'item-audiosubmission'): + continue + entries.append( + self.url_result( + f'https://www.newgrounds.com/{path}', + ie=NewgroundsIE.ie_key(), video_id=media_id)) + + return self.playlist_result(entries, playlist_id, title) + + +class NewgroundsUserIE(InfoExtractor): + IE_NAME = 'Newgrounds:user' + _VALID_URL = r'https?://(?P<id>[^\.]+)\.newgrounds\.com/(?:movies|audio)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://burn7.newgrounds.com/audio', + 'info_dict': { + 'id': 'burn7', + }, + 'playlist_mincount': 150, + }, { + 'url': 'https://burn7.newgrounds.com/movies', + 'info_dict': { + 'id': 'burn7', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://brian-beaton.newgrounds.com/movies', + 'info_dict': { + 'id': 'brian-beaton', + }, + 'playlist_mincount': 10, + }] + _PAGE_SIZE = 30 + + def _fetch_page(self, channel_id, url, page): + page += 1 + posts_info = self._download_json( + f'{url}/page/{page}', channel_id, + note=f'Downloading page {page}', headers={ + 'Accept': 'application/json, text/javascript, */*; q = 0.01', + 'X-Requested-With': 'XMLHttpRequest', + }) + sequence = posts_info.get('sequence', []) + for year in sequence: + posts = try_get(posts_info, lambda x: x['years'][str(year)]['items']) + for post in posts: + path, media_id = self._search_regex( + 
r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>', + post, 'url', group=(1, 2)) + yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, channel_id, url), self._PAGE_SIZE) + + return self.playlist_result(entries, channel_id) diff --git a/youtube_dl/extractor/newstube.py b/yt_dlp/extractor/newstube.py index dab4aec44..dab4aec44 100644 --- a/youtube_dl/extractor/newstube.py +++ b/yt_dlp/extractor/newstube.py diff --git a/youtube_dl/extractor/nextmedia.py b/yt_dlp/extractor/nextmedia.py index 7bd1290bf..7bd1290bf 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/yt_dlp/extractor/nextmedia.py diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py new file mode 100644 index 000000000..860d636e2 --- /dev/null +++ b/yt_dlp/extractor/nexx.py @@ -0,0 +1,453 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import random +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, + try_get, + urlencode_postdata, +) + + +class NexxIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/| + nexx:(?:(?P<domain_id_s>\d+):)?| + https?://arc\.nexx\.cloud/api/video/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + # movie + 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', + 'md5': '31899fd683de49ad46f4ee67e53e83fe', + 'info_dict': { + 'id': '128907', + 'ext': 'mp4', + 'title': 'Stiftung Warentest', + 'alt_title': 'Wie ein Test abläuft', + 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', + 'creator': 'SPIEGEL TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2509, + 'timestamp': 1384264416, + 'upload_date': '20131112', + }, + }, { + # episode + 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', + 'info_dict': { + 'id': '247858', + 'ext': 'mp4', + 'title': 'Return of the Golden Child (OV)', + 'description': 'md5:5d969537509a92b733de21bae249dc63', + 'release_year': 2017, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1397, + 'timestamp': 1495033267, + 'upload_date': '20170517', + 'episode_number': 2, + 'season_number': 2, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'HTTP Error 404: Not Found', + }, { + # does not work via arc + 'url': 'nexx:741:1269984', + 'md5': 'c714b5b238b2958dc8d5642addba6886', + 'info_dict': { + 'id': '1269984', + 'ext': 'mp4', + 'title': '1 TAG ohne KLO... wortwörtlich! 😑', + 'alt_title': '1 TAG ohne KLO... wortwörtlich! 
😑', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 607, + 'timestamp': 1518614955, + 'upload_date': '20180214', + }, + }, { + # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html + 'url': 'nexx:747:1533779', + 'md5': '6bf6883912b82b7069fb86c2297e9893', + 'info_dict': { + 'id': '1533779', + 'ext': 'mp4', + 'title': 'Aufregung um ausgebrochene Raubtiere', + 'alt_title': 'Eifel-Zoo', + 'description': 'md5:f21375c91c74ad741dcb164c427999d2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 111, + 'timestamp': 1527874460, + 'upload_date': '20180601', + }, + }, { + 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', + 'only_matching': True, + }, { + 'url': 'nexx:748:128907', + 'only_matching': True, + }, { + 'url': 'nexx:128907', + 'only_matching': True, + }, { + 'url': 'https://arc.nexx.cloud/api/video/128907.json', + 'only_matching': True, + }] + + @staticmethod + def _extract_domain_id(webpage): + mobj = re.search( + r'<script\b[^>]+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P<id>\d+)', + webpage) + return mobj.group('id') if mobj else None + + @staticmethod + def _extract_urls(webpage): + # Reference: + # 1. https://nx-s.akamaized.net/files/201510/44.pdf + + entries = [] + + # JavaScript Integration + domain_id = NexxIE._extract_domain_id(webpage) + if domain_id: + for video_id in re.findall( + r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)', + webpage): + entries.append( + 'https://api.nexx.cloud/v3/%s/videos/byid/%s' + % (domain_id, video_id)) + + # TODO: support more embed formats + + return entries + + @staticmethod + def _extract_url(webpage): + return NexxIE._extract_urls(webpage)[0] + + def _handle_error(self, response): + status = int_or_none(try_get( + response, lambda x: x['metadata']['status']) or 200) + if 200 <= status < 300: + return + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']), + expected=True) + + def _call_api(self, domain_id, path, video_id, data=None, headers={}): + headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' + result = self._download_json( + 'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id, + 'Downloading %s JSON' % path, data=urlencode_postdata(data), + headers=headers) + self._handle_error(result) + return result['result'] + + def _extract_free_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'free' + + hash = video['general']['hash'] + + ps = compat_str(stream_data['originalDomain']) + if stream_data['applyFolderHierarchy'] == 1: + s = ('%04d' % int(video_id))[::-1] + ps += '/%s/%s' % (s[0:2], s[2:4]) + ps += '/%s/%s_' % (video_id, hash) + + t = 'http://%s' + ps + fd = stream_data['azureFileDistribution'].split(',') + cdn_provider = stream_data['cdnProvider'] + + def p0(p): + return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' + + formats = [] + if cdn_provider == 'ak': + t += ',' + for i in fd: + p = i.split(':') + t += p[1] + p0(int(p[0])) + ',' + t += '.mp4.csmil/master.%s' + elif cdn_provider == 'ce': + k = t.split('/') + h = k.pop() + http_base = t = '/'.join(k) + http_base = http_base % stream_data['cdnPathHTTP'] + t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream=' + for i in fd: + p = i.split(':') + tbr = int(p[0]) + filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + f = { + 'url': http_base + '/' + filename, + 'format_id': '%s-http-%d' 
% (cdn, tbr), + 'tbr': tbr, + } + width_height = p[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + a = filename + ':%s' % (tbr * 1000) + t += a + ',' + t = t[:-1] + '&audiostream=' + a.split(':')[0] + else: + assert False + + if cdn_provider == 'ce': + formats.extend(self._extract_mpd_formats( + t % (stream_data['cdnPathDASH'], 'mpd'), video_id, + mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_m3u8_formats( + t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + + return formats + + def _extract_azure_formats(self, video, video_id): + stream_data = video['streamdata'] + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + def get_cdn_shield_base(shield_type='', static=False): + for secure in ('', 's'): + cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + if cdn_shield: + return 'http%s://%s' % (secure, cdn_shield) + else: + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' + else: + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + + language = video['general'].get('language_raw') or '' + + azure_stream_base = get_cdn_shield_base() + is_ml = ',' in language + azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_manifest_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_manifest_url % '(format=m3u8-aapl)', + video_id, 'mp4', 'm3u8_native', + m3u8_id='%s-hls' % cdn, fatal=False) + formats.extend(self._extract_mpd_formats( + azure_manifest_url % '(format=mpd-time-csf)', + video_id, mpd_id='%s-dash' % cdn, fatal=False)) + formats.extend(self._extract_ism_formats( + azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + + azure_progressive_base = get_cdn_shield_base('Prog', True) + azure_file_distribution = stream_data.get('azureFileDistribution') + if azure_file_distribution: + fds = azure_file_distribution.split(',') + if fds: + for fd in fds: + ss = fd.split(':') + if len(ss) == 2: + tbr = int_or_none(ss[0]) + if tbr: + f = { + 'url': '%s%s/%s_src_%s_%d.mp4' % ( + azure_progressive_base, azure_locator, video_id, ss[1], tbr), + 'format_id': '%s-http-%d' % (cdn, tbr), + 'tbr': tbr, + } + width_height = ss[1].split('x') + if len(width_height) == 2: + f.update({ + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + }) + formats.append(f) + + return formats + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') + video_id = mobj.group('id') + + video = None + + def find_video(result): + if isinstance(result, dict): + return result + elif isinstance(result, list): + vid = int(video_id) + for v in result: + if try_get(v, lambda x: x['general']['ID'], int) == vid: + return v + return None + + response = self._download_json( + 'https://arc.nexx.cloud/api/video/%s.json' % video_id, + video_id, fatal=False) + if response and isinstance(response, dict): + result = 
response.get('result') + if result: + video = find_video(result) + + # not all videos work via arc, e.g. nexx:741:1269984 + if not video: + # Reverse engineered from JS code (see getDeviceID function) + device_id = '%d:%d:%d%d' % ( + random.randint(1, 4), int(time.time()), + random.randint(1e4, 99999), random.randint(1, 9)) + + result = self._call_api(domain_id, 'session/init', video_id, data={ + 'nxp_devh': device_id, + 'nxp_userh': '', + 'precid': '0', + 'playlicense': '0', + 'screenx': '1920', + 'screeny': '1080', + 'playerversion': '6.0.00', + 'gateway': 'html5', + 'adGateway': '', + 'explicitlanguage': 'en-US', + 'addTextTemplates': '1', + 'addDomainData': '1', + 'addAdModel': '1', + }, headers={ + 'X-Request-Enable-Auth-Fallback': '1', + }) + + cid = result['general']['cid'] + + # As described in [1] X-Request-Token generation algorithm is + # as follows: + # md5( operation + domain_id + domain_secret ) + # where domain_secret is a static value that will be given by nexx.tv + # as per [1]. Here is how this "secret" is generated (reversed + # from _play.api.init function, search for clienttoken). So it's + # actually not static and not that much of a secret. + # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf + secret = result['device']['clienttoken'][int(device_id[0]):] + secret = secret[0:len(secret) - int(device_id[-1])] + + op = 'byid' + + # Reversed from JS code for _play.api.call function (search for + # X-Request-Token) + request_token = hashlib.md5( + ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() + + result = self._call_api( + domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ + 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', + 'addInteractionOptions': '1', + 'addStatusDetails': '1', + 'addStreamDetails': '1', + 'addCaptions': '1', + 'addScenes': '1', + 'addHotSpots': '1', + 'addBumpers': '1', + 'captionFormat': 'data', + }, headers={ + 'X-Request-CID': cid, + 'X-Request-Token': request_token, + }) + video = find_video(result) + + general = video['general'] + title = general['title'] + + cdn = video['streamdata']['cdnType'] + + if cdn == 'azure': + formats = self._extract_azure_formats(video, video_id) + elif cdn == 'free': + formats = self._extract_free_formats(video, video_id) + else: + # TODO: reverse more cdns + assert False + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'alt_title': general.get('subtitle'), + 'description': general.get('description'), + 'release_year': int_or_none(general.get('year')), + 'creator': general.get('studio') or general.get('studio_adref'), + 'thumbnail': try_get( + video, lambda x: x['imagedata']['thumb'], compat_str), + 'duration': parse_duration(general.get('runtime')), + 'timestamp': int_or_none(general.get('uploaded')), + 'episode_number': int_or_none(try_get( + video, lambda x: x['episodedata']['episode'])), + 'season_number': int_or_none(try_get( + video, lambda x: x['episodedata']['season'])), + 'formats': formats, + } + + +class NexxEmbedIE(InfoExtractor): + _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', + 'md5': '16746bfc28c42049492385c989b26c4a', + 'info_dict': { + 'id': '161464', + 'ext': 'mp4', + 'title': 'Nervenkitzel Achterbahn', + 'alt_title': 'Karussellbauer in Deutschland', + 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', + 'creator': 'SPIEGEL TV', + 'thumbnail': 
r're:^https?://.*\.jpg$', + 'duration': 2761, + 'timestamp': 1394021479, + 'upload_date': '20140305', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + # Reference: + # 1. https://nx-s.akamaized.net/files/201510/44.pdf + + # iFrame Embed Integration + return [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1', + webpage)] + + def _real_extract(self, url): + embed_id = self._match_id(url) + + webpage = self._download_webpage(url, embed_id) + + return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key()) diff --git a/yt_dlp/extractor/nfhsnetwork.py b/yt_dlp/extractor/nfhsnetwork.py new file mode 100644 index 000000000..802f6caf0 --- /dev/null +++ b/yt_dlp/extractor/nfhsnetwork.py @@ -0,0 +1,144 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +from ..utils import ( + try_get, + unified_strdate, + unified_timestamp +) + + +class NFHSNetworkIE(InfoExtractor): + IE_NAME = 'NFHSNetwork' + _VALID_URL = r'https?://(?:www\.)?nfhsnetwork\.com/events/[\w-]+/(?P<id>(?:gam|evt|dd|)?[\w\d]{0,10})' + _TESTS = [{ + # Auto-generated two-team sport (pixellot) + 'url': 'https://www.nfhsnetwork.com/events/rockford-high-school-rockford-mi/gamcf7e54cfbc', + 'info_dict': { + 'id': 'gamcf7e54cfbc', + 'ext': 'mp4', + 'title': 'Rockford vs Spring Lake - Girls Varsity Lacrosse 03/27/2021', + 'uploader': 'MHSAA - Michigan: Rockford High School, Rockford, MI', + 'uploader_id': 'cd2622cf76', + 'uploader_url': 'https://www.nfhsnetwork.com/schools/rockford-high-school-rockford-mi', + 'location': 'Rockford, Michigan', + 'timestamp': 1616859000, + 'upload_date': '20210327' + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # Non-sport activity with description + 'url': 'https://www.nfhsnetwork.com/events/limon-high-school-limon-co/evt4a30e3726c', + 'info_dict': { + 'id': 'evt4a30e3726c', + 'ext': 'mp4', + 'title': 'Drama Performance Limon High School vs. 
Limon High School - 12/13/2020', + 'description': 'Join the broadcast of the Limon High School Musical Performance at 2 PM.', + 'uploader': 'CHSAA: Limon High School, Limon, CO', + 'uploader_id': '7d2d121332', + 'uploader_url': 'https://www.nfhsnetwork.com/schools/limon-high-school-limon-co', + 'location': 'Limon, Colorado', + 'timestamp': 1607893200, + 'upload_date': '20201213' + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # Postseason game + 'url': 'https://www.nfhsnetwork.com/events/nfhs-network-special-events/dd8de71d45', + 'info_dict': { + 'id': 'dd8de71d45', + 'ext': 'mp4', + 'title': '2015 UA Holiday Classic Tournament: National Division - 12/26/2015', + 'uploader': 'SoCal Sports Productions', + 'uploader_id': '063dba0150', + 'uploader_url': 'https://www.nfhsnetwork.com/affiliates/socal-sports-productions', + 'location': 'San Diego, California', + 'timestamp': 1451187000, + 'upload_date': '20151226' + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # Video with no broadcasts object + 'url': 'https://www.nfhsnetwork.com/events/wiaa-wi/9aa2f92f82', + 'info_dict': { + 'id': '9aa2f92f82', + 'ext': 'mp4', + 'title': 'Competitive Equity - 01/21/2015', + 'description': 'Committee members discuss points of their research regarding a competitive equity plan', + 'uploader': 'WIAA - Wisconsin: Wisconsin Interscholastic Athletic Association', + 'uploader_id': 'a49f7d1002', + 'uploader_url': 'https://www.nfhsnetwork.com/associations/wiaa-wi', + 'location': 'Stevens Point, Wisconsin', + 'timestamp': 1421856000, + 'upload_date': '20150121' + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._download_json( + 'https://cfunity.nfhsnetwork.com/v2/game_or_event/' + video_id, + video_id) + publisher = data.get('publishers')[0] # always exists + broadcast = (publisher.get('broadcasts') or publisher.get('vods'))[0] # some (older) videos don't have a broadcasts object + uploader = publisher.get('formatted_name') or publisher.get('name') + uploaderID = publisher.get('publisher_key') + pubType = publisher.get('type') + uploaderPrefix = ( + "schools" if pubType == "school" + else "associations" if "association" in pubType + else "affiliates" if (pubType == "publisher" or pubType == "affiliate") + else "schools") + uploaderPage = 'https://www.nfhsnetwork.com/%s/%s' % (uploaderPrefix, publisher.get('slug')) + location = '%s, %s' % (data.get('city'), data.get('state_name')) + description = broadcast.get('description') + isLive = broadcast.get('on_air') or broadcast.get('status') == 'on_air' or False + + timestamp = unified_timestamp(data.get('local_start_time')) + upload_date = unified_strdate(data.get('local_start_time')) + + title = ( + self._og_search_title(webpage) + or self._html_search_regex(r'<h1 class="sr-hidden">(.*?)</h1>', webpage, 'title')) + title = title.split('|')[0].strip() + + video_type = 'broadcasts' if isLive else 'vods' + key = broadcast.get('key') if isLive else try_get(publisher, lambda x: x['vods'][0]['key']) + m3u8_url = self._download_json( + 'https://cfunity.nfhsnetwork.com/v2/%s/%s/url' % (video_type, key), + video_id).get('video_url') + + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=isLive) + self._sort_formats(formats, ['res', 'tbr']) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'timestamp': 
timestamp, + 'uploader': uploader, + 'uploader_id': uploaderID, + 'uploader_url': uploaderPage, + 'location': location, + 'upload_date': upload_date, + 'is_live': isLive + } diff --git a/youtube_dl/extractor/nfl.py b/yt_dlp/extractor/nfl.py index 871923e4c..871923e4c 100644 --- a/youtube_dl/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py new file mode 100644 index 000000000..950a3d0d4 --- /dev/null +++ b/yt_dlp/extractor/nhk.py @@ -0,0 +1,177 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import urljoin + + +class NhkBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' + _TYPE_REGEX = r'/(?P<type>video|audio)/' + + def _call_api(self, m_id, lang, is_video, is_episode, is_clip): + return self._download_json( + self._API_URL_TEMPLATE % ( + 'v' if is_video else 'r', + 'clip' if is_clip else 'esd', + 'episode' if is_episode else 'program', + m_id, lang, '/all' if is_video else ''), + m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] + + def _extract_episode_info(self, url, episode=None): + fetch_episode = episode is None + lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() + if episode_id.isdigit(): + episode_id = episode_id[:4] + '-' + episode_id[4:] + + is_video = m_type == 'video' + if fetch_episode: + episode = self._call_api( + episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] + title = episode.get('sub_title_clean') or episode['sub_title'] + + def get_clean_field(key): + return episode.get(key + '_clean') or episode.get(key) + + series = get_clean_field('title') + + thumbnails = [] + for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: + img_path = episode.get('image' + s) + if not img_path: + continue + thumbnails.append({ + 'id': '%dp' % h, + 'height': h, + 'width': w, + 'url': 'https://www3.nhk.or.jp' + img_path, + }) + + info = { + 'id': episode_id + '-' + lang, + 'title': '%s - %s' % (series, title) if series and title else title, + 'description': get_clean_field('description'), + 'thumbnails': thumbnails, + 'series': series, + 'episode': title, + } + if is_video: + vod_id = episode['vod_id'] + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Piksel', + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + 'id': vod_id, + }) + else: + if fetch_episode: + audio_path = episode['audio']['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in info['formats']: + f['language'] = lang + else: + info.update({ + '_type': 'url_transparent', + 'ie_key': NhkVodIE.ie_key(), + 'url': url, + }) + return info + + +class NhkVodIE(NhkBaseIE): + _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # Content available only for a limited period of time. Visit + # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
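
For reference, a minimal sketch (not part of the patch) of how `NhkBaseIE._call_api` above expands `_API_URL_TEMPLATE`: the flags select video vs. radio ('v'/'r'), clip vs. episode list ('clip'/'esd'), and per-episode vs. per-program lookups. The example values are illustrative only.

```python
# Illustrative only: mirrors the string formatting in NhkBaseIE._call_api.
API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'

def build_nhk_api_url(m_id, lang, is_video, is_episode, is_clip):
    # 'v' (video) vs 'r' (radio); 'clip' vs 'esd'; 'episode' vs 'program';
    # video endpoints take an extra '/all' path component.
    return API_URL_TEMPLATE % (
        'v' if is_video else 'r',
        'clip' if is_clip else 'esd',
        'episode' if is_episode else 'program',
        m_id, lang, '/all' if is_video else '')

# A video episode '9999-011' in English expands to:
# https://api.nhk.or.jp/nhkworld/vodesdlist/v7a/episode/9999-011/en/all/all.json
print(build_nhk_api_url('9999-011', 'en', True, True, False))
```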
+ _TESTS = [{ + # video clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', + 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'info_dict': { + 'id': 'a95j5iza', + 'ext': 'mp4', + 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', + 'timestamp': 1565965194, + 'upload_date': '20190816', + }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self._extract_episode_info(url) + + +class NhkVodProgramIE(NhkBaseIE): + _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _TESTS = [{ + # video program episodes + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 1, + }, { + # video program clips + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', + 'only_matching': True, + }, { + # audio program + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', + 'only_matching': True, + }] + + def _real_extract(self, url): + lang, m_type, program_id, episode_type = self._match_valid_url(url).groups() + + episodes = self._call_api( + program_id, lang, m_type == 'video', False, episode_type == 'clip') + + entries = [] + for episode in episodes: + episode_path = episode.get('url') + if not episode_path: + continue + entries.append(self._extract_episode_info( + urljoin(url, episode_path), episode)) + + program_title = None + if entries: + program_title = entries[0].get('series') + + return self.playlist_result(entries, program_id, program_title) diff --git a/yt_dlp/extractor/nhl.py b/yt_dlp/extractor/nhl.py new file mode 100644 index 000000000..d3a5e17e9 --- /dev/null +++ b/yt_dlp/extractor/nhl.py @@ -0,0 +1,127 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + parse_duration, +) + + +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = self._match_valid_url(url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if 
video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] + + video_id = compat_str(video_data['id']) + title = video_data['title'] + + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) + + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + } + + +class NHLIE(NHLBaseIE): + IE_NAME = 'nhl.com' + _VALID_URL = r'https?://(?:www\.)?(?P<site>nhl|wch2016)\.com/(?:[^/]+/)*c-(?P<id>\d+)' + _CONTENT_DOMAIN = 'nhl.bamcontent.com' + _TESTS = [{ + # type=video + 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', + 'md5': '0f7b9a8f986fb4b4eeeece9a56416eaf', + 'info_dict': { + 'id': '43663503', + 'ext': 'mp4', + 'title': 'Anisimov cleans up mess', + 'description': 'md5:a02354acdfe900e940ce40706939ca63', + 'timestamp': 1461288600, + 'upload_date': '20160422', + }, + }, { + # type=article + 'url': 'https://www.nhl.com/news/dennis-wideman-suspended/c-278258934', + 'md5': '1f39f4ea74c1394dea110699a25b366c', + 'info_dict': { + 'id': '40784403', + 'ext': 'mp4', + 'title': 'Wideman suspended by NHL', + 'description': 'Flames defenseman Dennis Wideman was banned 20 games for violation of Rule 40 (Physical Abuse of Officials)', + 'upload_date': '20160204', + 'timestamp': 1454544904, + }, + }, { + # Some m3u8 URLs are invalid (https://github.com/ytdl-org/youtube-dl/issues/10713) + 'url': 'https://www.nhl.com/predators/video/poile-laviolette-on-subban-trade/t-277437416/c-44315003', + 'md5': '50b2bb47f405121484dda3ccbea25459', + 'info_dict': { + 'id': '44315003', + 'ext': 'mp4', + 'title': 'Poile, Laviolette on Subban trade', + 'description': 'General manager David Poile and head coach Peter Laviolette share their thoughts on acquiring P.K. 
Subban from Montreal (06/29/16)', + 'timestamp': 1467242866, + 'upload_date': '20160629', + }, + }, { + 'url': 'https://www.wch2016.com/video/caneur-best-of-game-2-micd-up/t-281230378/c-44983703', + 'only_matching': True, + }, { + 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', + 'only_matching': True, + }] diff --git a/yt_dlp/extractor/nick.py b/yt_dlp/extractor/nick.py new file mode 100644 index 000000000..ba7da7602 --- /dev/null +++ b/yt_dlp/extractor/nick.py @@ -0,0 +1,248 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .mtv import MTVServicesInfoExtractor +from ..utils import update_url_query + + +class NickIE(MTVServicesInfoExtractor): + IE_NAME = 'nick.com' + _VALID_URL = r'https?://(?P<domain>(?:www\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?P<type>videos/clip|[^/]+/videos|episodes/[^/]+)/(?P<id>[^/?#.]+)' + _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' + _GEO_COUNTRIES = ['US'] + _TESTS = [{ + 'url': 'https://www.nick.com/episodes/sq47rw/spongebob-squarepants-a-place-for-pets-lockdown-for-love-season-13-ep-1', + 'info_dict': { + 'description': 'md5:0650a9eb88955609d5c1d1c79292e234', + 'title': 'A Place for Pets/Lockdown for Love', + }, + 'playlist': [ + { + 'md5': 'cb8a2afeafb7ae154aca5a64815ec9d6', + 'info_dict': { + 'id': '85ee8177-d6ce-48f8-9eee-a65364f8a6df', + 'ext': 'mp4', + 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S1', + 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', + + } + }, + { + 'md5': '839a04f49900a1fcbf517020d94e0737', + 'info_dict': { + 'id': '2e2a9960-8fd4-411d-868b-28eb1beb7fae', + 'ext': 'mp4', + 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S2', + 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', + + } + }, + { + 'md5': 'f1145699f199770e2919ee8646955d46', + 'info_dict': { + 'id': 'dc91c304-6876-40f7-84a6-7aece7baa9d0', + 'ext': 'mp4', + 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S3', + 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', + + } + }, + { + 'md5': 'd463116875aee2585ee58de3b12caebd', + 'info_dict': { + 'id': '5d929486-cf4c-42a1-889a-6e0d183a101a', + 'ext': 'mp4', + 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S4', + 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', + + } + }, + ], + }, { + 'url': 'http://www.nickjr.com/blues-clues-and-you/videos/blues-clues-and-you-original-209-imagination-station/', + 'info_dict': { + 'id': '31631529-2fc5-430b-b2ef-6a74b4609abd', + 'ext': 'mp4', + 'description': 'md5:9d65a66df38e02254852794b2809d1cf', + 'title': 'Blue\'s Imagination Station', + }, + 'skip': 'Not accessible?' 
+ }] + + def _get_feed_query(self, uri): + return { + 'feed': 'nick_arc_player_prime', + 'mgid': uri, + } + + def _real_extract(self, url): + domain, video_type, display_id = self._match_valid_url(url).groups() + if video_type.startswith("episodes"): + return super()._real_extract(url) + video_data = self._download_json( + 'http://%s/data/video.endLevel.json' % domain, + display_id, query={ + 'urlKey': display_id, + }) + return self._get_videos_info(video_data['player'] + video_data['id']) + + +class NickBrIE(MTVServicesInfoExtractor): + IE_NAME = 'nickelodeon:br' + _VALID_URL = r'''(?x) + https?:// + (?: + (?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br| + (?:www\.)?nickjr\.[a-z]{2}| + (?:www\.)?nickelodeonjunior\.fr + ) + /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?\#.]+) + ''' + _TESTS = [{ + 'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', + 'only_matching': True, + }, { + 'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', + 'only_matching': True, + }, { + 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', + 'only_matching': True, + }, { + 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeonjunior.fr/paw-patrol-la-pat-patrouille/videos/episode-401-entier-paw-patrol/', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + uri = self._search_regex( + r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid') + video_id = self._id_from_uri(uri) + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html', + video_id, query={ + 'uri': uri, + 'configtype': 'edge', + }, headers={ + 'Referer': url, + }) + info_url = self._remove_template_parameter(config['feedWithQueryParams']) + if info_url == 'None': + if domain.startswith('www.'): + domain = domain[4:] + content_domain = { + 'mundonick.uol': 'mundonick.com.br', + 'nickjr': 'br.nickelodeonjunior.tv', + }[domain] + query = { + 'mgid': uri, + 'imageEp': content_domain, + 'arcEp': content_domain, + } + if domain == 'nickjr.com.br': + query['ep'] = 'c4b16088' + info_url = update_url_query( + 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed', query) + return self._get_videos_info_from_url(info_url, video_id) + + +class NickDeIE(MTVServicesInfoExtractor): + IE_NAME = 'nick.de' + _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl|ch)|nickelodeon\.(?:nl|be|at|dk|no|se))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', + 'only_matching': True, + }, { + 'url': 'http://www.nick.de/shows/342-icarly', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.nl/shows/474-spongebob/videos/17403-een-kijkje-in-de-keuken-met-sandy-van-binnenuit', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht', + 'only_matching': True, + }, { + 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.no/program/2626-bulderhuset/videoer/90947-femteklasse-veronica-vs-vanzilla', + 'only_matching': True, + }, { + 'url': 
'http://www.nickelodeon.dk/serier/2626-hojs-hus/videoer/761-tissepause', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.se/serier/2626-lugn-i-stormen/videos/998-', + 'only_matching': True, + }, { + 'url': 'http://www.nick.ch/shows/2304-adventure-time-abenteuerzeit-mit-finn-und-jake', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.be/afspeellijst/4530-top-videos/videos/episode/73917-inval-broodschapper-lariekoek-arie', + 'only_matching': True, + }] + + def _get_feed_url(self, uri, url=None): + video_id = self._id_from_uri(uri) + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge&ref=%s' % (uri, url), video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) + + +class NickNightIE(NickDeIE): + IE_NAME = 'nicknight' + _VALID_URL = r'https?://(?:www\.)(?P<host>nicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nicknight.at/shows/977-awkward/videos/85987-nimmer-beste-freunde', + 'only_matching': True, + }, { + 'url': 'http://www.nicknight.at/shows/977-awkward', + 'only_matching': True, + }, { + 'url': 'http://www.nicknight.at/shows/1900-faking-it', + 'only_matching': True, + }] + + def _extract_mrss_url(self, webpage, *args): + return self._search_regex( + r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, + 'mrss url', group='url') + + +class NickRuIE(MTVServicesInfoExtractor): + IE_NAME = 'nickelodeonru' + _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.fr/programmes/bob-l-eponge/videos/le-marathon-de-booh-kini-bottom-mardi-31-octobre/nfn7z0', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.es/videos/nickelodeon-consejos-tortitas/f7w7xy', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.pt/series/spongebob-squarepants/videos/a-bolha-de-tinta-gigante/xutq1b', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.ro/emisiuni/shimmer-si-shine/video/nahal-din-bomboane/uw5u2k', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.hu/musorok/spongyabob-kockanadrag/videok/episodes/buborekfujas-az-elszakadt-nadrag/q57iob#playlist/k6te4y', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.com.tr/programlar/sunger-bob/videolar/kayip-yatak/mgqbjy', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage, url) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py new file mode 100644 index 000000000..76f087057 --- /dev/null +++ b/yt_dlp/extractor/niconico.py @@ -0,0 +1,797 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import itertools +import json +import re + +from .common import InfoExtractor, SearchInfoExtractor +from ..postprocessor.ffmpeg import FFmpegPostProcessor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + ExtractorError, + dict_get, + float_or_none, + 
int_or_none, + OnDemandPagedList, + parse_duration, + parse_iso8601, + PostProcessingError, + remove_start, + str_or_none, + try_get, + unified_timestamp, + urlencode_postdata, + xpath_text, +) + + +class NiconicoIE(InfoExtractor): + IE_NAME = 'niconico' + IE_DESC = 'ニコニコ動画' + + _TESTS = [{ + 'url': 'http://www.nicovideo.jp/watch/sm22312215', + 'md5': 'a5bad06f1347452102953f323c69da34s', + 'info_dict': { + 'id': 'sm22312215', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'thumbnail': r're:https?://.*', + 'uploader': 'takuya0301', + 'uploader_id': '2698420', + 'upload_date': '20131123', + 'timestamp': int, # timestamp is unstable + 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + 'duration': 33, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', + }, { + # File downloaded with and without credentials are different, so omit + # the md5 field + 'url': 'http://www.nicovideo.jp/watch/nm14296458', + 'info_dict': { + 'id': 'nm14296458', + 'ext': 'swf', + 'title': '【鏡音リン】Dance on media【オリジナル】take2!', + 'description': 'md5:689f066d74610b3b22e0f1739add0f58', + 'thumbnail': r're:https?://.*', + 'uploader': 'りょうた', + 'uploader_id': '18822557', + 'upload_date': '20110429', + 'timestamp': 1304065916, + 'duration': 209, + }, + 'skip': 'Requires an account', + }, { + # 'video exists but is marked as "deleted" + # md5 is unstable + 'url': 'http://www.nicovideo.jp/watch/sm10000', + 'info_dict': { + 'id': 'sm10000', + 'ext': 'unknown_video', + 'description': 'deleted', + 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', + 'thumbnail': r're:https?://.*', + 'upload_date': '20071224', + 'timestamp': int, # timestamp field has different value if logged in + 'duration': 304, + 'view_count': int, + }, + 'skip': 'Requires an account', + }, { + 'url': 'http://www.nicovideo.jp/watch/so22543406', + 'info_dict': { + 'id': '1388129933', + 'ext': 'mp4', + 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~', + 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1', + 'thumbnail': r're:https?://.*', + 'timestamp': 1388851200, + 'upload_date': '20140104', + 'uploader': 'アニメロチャンネル', + 'uploader_id': '312', + }, + 'skip': 'The viewing period of the video you were searching for has expired.', + }, { + # video not available via `getflv`; "old" HTML5 video + 'url': 'http://www.nicovideo.jp/watch/sm1151009', + 'md5': '8fa81c364eb619d4085354eab075598a', + 'info_dict': { + 'id': 'sm1151009', + 'ext': 'mp4', + 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', + 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', + 'thumbnail': r're:https?://.*', + 'duration': 184, + 'timestamp': 1190868283, + 'upload_date': '20070927', + 'uploader': 'denden2', + 'uploader_id': '1392194', + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', + }, { + # "New" HTML5 video + # md5 is unstable + 'url': 'http://www.nicovideo.jp/watch/sm31464864', + 'info_dict': { + 'id': 'sm31464864', + 'ext': 'mp4', + 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', + 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', + 'timestamp': 1498514060, + 'upload_date': '20170626', + 'uploader': 'ゲスト', + 'uploader_id': '40826363', + 'thumbnail': r're:https?://.*', + 'duration': 198, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', + }, { + # Video without owner + 'url': 'http://www.nicovideo.jp/watch/sm18238488', + 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', + 'info_dict': { + 'id': 'sm18238488', + 'ext': 'mp4', + 'title': '【実写版】ミュータントタートルズ', + 'description': 
'md5:15df8988e47a86f9e978af2064bf6d8e', + 'timestamp': 1341160408, + 'upload_date': '20120701', + 'uploader': None, + 'uploader_id': None, + 'thumbnail': r're:https?://.*', + 'duration': 5271, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', + }, { + 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', + 'only_matching': True, + }] + + _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' + _NETRC_MACHINE = 'niconico' + + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + # No authentication to be performed + if not username: + return True + + # Log in + login_ok = True + login_form_strs = { + 'mail_tel': username, + 'password': password, + } + urlh = self._request_webpage( + 'https://account.nicovideo.jp/api/v1/login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(login_form_strs)) + if urlh is False: + login_ok = False + else: + parts = compat_urllib_parse_urlparse(urlh.geturl()) + if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': + login_ok = False + if not login_ok: + self.report_warning('unable to log in: bad username or password') + return login_ok + + def _get_heartbeat_info(self, info_dict): + + video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') + + api_data = ( + info_dict.get('_api_data') + or self._parse_json( + self._html_search_regex( + 'data-api-data="([^"]+)"', + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id), + 'API data', default='{}'), + video_id)) + + session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session']) + session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) + + def ping(): + status = try_get( + self._download_json( + 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, + query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, + note='Acquiring permission for downloading video', + headers=self._API_HEADERS), + lambda x: x['meta']['status']) + if status != 200: + self.report_warning('Failed to acquire permission for playing video. 
The video may not download.')
+
+        yesno = lambda x: 'yes' if x else 'no'
+
+        # m3u8 (encryption)
+        if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None:
+            protocol = 'm3u8'
+            encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption']
+            session_api_http_parameters = {
+                'parameters': {
+                    'hls_parameters': {
+                        'encryption': {
+                            encryption: {
+                                'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']),
+                                'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri'])
+                            }
+                        },
+                        'transfer_preset': '',
+                        'use_ssl': yesno(session_api_endpoint['isSsl']),
+                        'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+                        'segment_duration': 6000,
+                    }
+                }
+            }
+        # http
+        else:
+            protocol = 'http'
+            session_api_http_parameters = {
+                'parameters': {
+                    'http_output_download_parameters': {
+                        'use_ssl': yesno(session_api_endpoint['isSsl']),
+                        'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+                    }
+                }
+            }
+
+        session_response = self._download_json(
+            session_api_endpoint['url'], video_id,
+            query={'_format': 'json'},
+            headers={'Content-Type': 'application/json'},
+            note='Downloading JSON metadata for %s' % info_dict['format_id'],
+            data=json.dumps({
+                'session': {
+                    'client_info': {
+                        'player_id': session_api_data.get('playerId'),
+                    },
+                    'content_auth': {
+                        'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]),
+                        'content_key_timeout': session_api_data.get('contentKeyTimeout'),
+                        'service_id': 'nicovideo',
+                        'service_user_id': session_api_data.get('serviceUserId')
+                    },
+                    'content_id': session_api_data.get('contentId'),
+                    'content_src_id_sets': [{
+                        'content_src_ids': [{
+                            'src_id_to_mux': {
+                                'audio_src_ids': [audio_src_id],
+                                'video_src_ids': [video_src_id],
+                            }
+                        }]
+                    }],
+                    'content_type': 'movie',
+                    'content_uri': '',
+                    'keep_method': {
+                        'heartbeat': {
+                            'lifetime': session_api_data.get('heartbeatLifetime')
+                        }
+                    },
+                    'priority': session_api_data.get('priority'),
+                    'protocol': {
+                        'name': 'http',
+                        'parameters': {
+                            'http_parameters': session_api_http_parameters
+                        }
+                    },
+                    'recipe_id': session_api_data.get('recipeId'),
+                    'session_operation_auth': {
+                        'session_operation_auth_by_signature': {
+                            'signature': session_api_data.get('signature'),
+                            'token': session_api_data.get('token'),
+                        }
+                    },
+                    'timing_constraint': 'unlimited'
+                }
+            }).encode())
+
+        info_dict['url'] = session_response['data']['session']['content_uri']
+        info_dict['protocol'] = protocol
+
+        # get heartbeat info
+        heartbeat_info_dict = {
+            'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT',
+            'data': json.dumps(session_response['data']),
+            # interval: convert the heartbeat lifetime from milliseconds to seconds, then take a third of it as a buffer (scale=3000)
+            'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000),
+            'ping': ping
+        }
+
+        return info_dict, heartbeat_info_dict
+
+    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+        def parse_format_id(id_code):
+            mobj = re.match(r'''(?x)
+                    (?:archive_)?
+                    (?:(?P<codec>[^_]+)_)?
+                    (?:(?P<br>[\d]+)kbps_)?
+                    (?:(?P<res>[\d+]+)p_)?
+ ''', '%s_' % id_code) + return mobj.groupdict() if mobj else {} + + protocol = 'niconico_dmc' + format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + vdict = parse_format_id(video_quality['id']) + adict = parse_format_id(audio_quality['id']) + resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} + vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) + + return { + 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), + 'format_id': format_id, + 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), + 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 + 'vcodec': vdict.get('codec'), + 'acodec': adict.get('codec'), + 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), + 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), + 'height': int_or_none(resolution.get('height', vdict.get('res'))), + 'width': int_or_none(resolution.get('width')), + 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 + 'protocol': protocol, + 'http_headers': { + 'Origin': 'https://www.nicovideo.jp', + 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Get video webpage for API data. + webpage, handle = self._download_webpage_handle( + 'http://www.nicovideo.jp/watch/' + video_id, video_id) + if video_id.startswith('so'): + video_id = self._match_id(handle.geturl()) + + api_data = self._parse_json(self._html_search_regex( + 'data-api-data="([^"]+)"', webpage, + 'API data', default='{}'), video_id) + + def get_video_info_web(items): + return dict_get(api_data['video'], items) + + # Get video info + video_info_xml = self._download_xml( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, + video_id, note='Downloading video info page') + + def get_video_info_xml(items): + if not isinstance(items, list): + items = [items] + for item in items: + ret = xpath_text(video_info_xml, './/' + item) + if ret: + return ret + + if get_video_info_xml('error'): + error_code = get_video_info_xml('code') + + if error_code == 'DELETED': + raise ExtractorError('The video has been deleted.', + expected=True) + elif error_code == 'NOT_FOUND': + raise ExtractorError('The video is not found.', + expected=True) + elif error_code == 'COMMUNITY': + self.to_screen('%s: The video is community members only.' % video_id) + else: + raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code)) + + # Start extracting video formats + formats = [] + + # Get HTML5 videos info + quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) + if not quality_info: + raise ExtractorError('The video can\'t be downloaded', expected=True) + + for audio_quality in quality_info.get('audios') or {}: + for video_quality in quality_info.get('videos') or {}: + if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): + continue + formats.append(self._extract_format_for_quality( + api_data, video_id, audio_quality, video_quality)) + + # Get flv/swf info + timestamp = None + video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url']) + if video_real_url: + is_economy = video_real_url.endswith('low') + + if is_economy: + self.report_warning('Site is currently in economy mode! 
You will only have access to lower quality streams')
+
+            # Invoke ffprobe to determine the resolution
+            pp = FFmpegPostProcessor(self._downloader)
+            cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n')
+
+            self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe'))
+
+            try:
+                metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies])
+            except PostProcessingError as err:
+                raise ExtractorError(err.msg, expected=True)
+
+            v_stream = a_stream = {}
+
+            # Some complex swf files don't have a video stream (e.g. nm4809023)
+            for stream in metadata['streams']:
+                if stream['codec_type'] == 'video':
+                    v_stream = stream
+                elif stream['codec_type'] == 'audio':
+                    a_stream = stream
+
+            # Community restricted videos seem to have issues with the thumb API not returning anything at all
+            filesize = int(
+                (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low'))
+                or metadata['format']['size']
+            )
+            extension = (
+                get_video_info_xml('movie_type')
+                or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name']
+            )
+
+            # The 'creation_time' tag on the video stream of re-encoded SMILEVIDEO mp4 files is '1970-01-01T00:00:00.000000Z'.
+            timestamp = (
+                parse_iso8601(get_video_info_web('first_retrieve'))
+                or unified_timestamp(get_video_info_web('postedDateTime'))
+            )
+            metadata_timestamp = (
+                parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time']))
+                or timestamp if extension != 'mp4' else 0
+            )
+
+            # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts
+            smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00')
+
+            is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0
+
+            # If the reported file size is unstable (0 or 1), the old-server file is not the source file.
+            if filesize > 1:
+                formats.append({
+                    'url': video_real_url,
+                    'format_id': 'smile' if not is_economy else 'smile_low',
+                    'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality',
+                    'ext': extension,
+                    'container': extension,
+                    'vcodec': v_stream.get('codec_name'),
+                    'acodec': a_stream.get('codec_name'),
+                    # Some complex swf files don't have total bit rate metadata (e.g.
nm6049209) + 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000), + 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000), + 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000), + 'height': int_or_none(v_stream.get('height')), + 'width': int_or_none(v_stream.get('width')), + 'source_preference': 5 if not is_economy else -2, + 'quality': 5 if is_source and not is_economy else None, + 'filesize': filesize + }) + + self._sort_formats(formats) + + # Start extracting information + title = ( + get_video_info_xml('title') # prefer to get the untranslated original title + or get_video_info_web(['originalTitle', 'title']) + or self._og_search_title(webpage, default=None) + or self._html_search_regex( + r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', + webpage, 'video title')) + + watch_api_data_string = self._html_search_regex( + r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', + webpage, 'watch api data', default=None) + watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} + video_detail = watch_api_data.get('videoDetail', {}) + + thumbnail = ( + self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None) + or dict_get( # choose highest from 720p to 240p + get_video_info_web('thumbnail'), + ['ogp', 'player', 'largeUrl', 'middleUrl', 'url']) + or self._html_search_meta('image', webpage, 'thumbnail', default=None) + or video_detail.get('thumbnail')) + + description = get_video_info_web('description') + + if not timestamp: + match = self._html_search_meta('datePublished', webpage, 'date published', default=None) + if match: + timestamp = parse_iso8601(match.replace('+', ':00+')) + if not timestamp and video_detail.get('postedAt'): + timestamp = parse_iso8601( + video_detail['postedAt'].replace('/', '-'), + delimiter=' ', timezone=datetime.timedelta(hours=9)) + timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt'])) + + view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount'])) + if not view_count: + match = self._html_search_regex( + r'>Views: <strong[^>]*>([^<]+)</strong>', + webpage, 'view count', default=None) + if match: + view_count = int_or_none(match.replace(',', '')) + view_count = ( + view_count + or video_detail.get('viewCount') + or try_get(api_data, lambda x: x['video']['count']['view'])) + + comment_count = ( + int_or_none(get_video_info_web('comment_num')) + or video_detail.get('commentCount') + or try_get(api_data, lambda x: x['video']['count']['comment'])) + + if not comment_count: + match = self._html_search_regex( + r'>Comments: <strong[^>]*>([^<]+)</strong>', + webpage, 'comment count', default=None) + if match: + comment_count = int_or_none(match.replace(',', '')) + + duration = (parse_duration( + get_video_info_web('length') + or self._html_search_meta( + 'video:duration', webpage, 'video duration', default=None)) + or video_detail.get('length') + or get_video_info_web('duration')) + + webpage_url = get_video_info_web('watch_url') or url + + # for channel movie and community movie + channel_id = try_get( + api_data, + (lambda x: x['channel']['globalId'], + lambda x: x['community']['globalId'])) + channel = try_get( + api_data, + (lambda x: x['channel']['name'], + lambda x: x['community']['name'])) + + # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" + # in the JSON, which will cause None to be returned instead of {}. 
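
The distinction noted in the comment above matters because `dict.get(key, default)` only falls back to the default when the key is absent; a key explicitly set to JSON `null` comes back as `None`. A minimal illustration with hypothetical data:

```python
api_data = {'owner': None}          # key present, but its value is JSON null -> None

print(api_data.get('owner', {}))    # None: the key exists, so the default is ignored
print(api_data.get('owner') or {})  # {}: an `or {}` fallback covers both missing and null
```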
+ owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} + uploader_id = str_or_none( + get_video_info_web(['ch_id', 'user_id']) + or owner.get('id') + or channel_id + ) + uploader = ( + get_video_info_web(['ch_name', 'user_nickname']) + or owner.get('nickname') + or channel + ) + + return { + 'id': video_id, + '_api_data': api_data, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'channel': channel, + 'channel_id': channel_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'duration': duration, + 'webpage_url': webpage_url, + } + + +class NiconicoPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.nicovideo.jp/mylist/27411728', + 'info_dict': { + 'id': '27411728', + 'title': 'AKB48のオールナイトニッポン', + 'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08', + 'uploader': 'のっく', + 'uploader_id': '805442', + }, + 'playlist_mincount': 225, + }, { + 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', + 'only_matching': True, + }] + + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + def get_page_data(pagenum, pagesize): + return self._download_json( + 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + query={'page': 1 + pagenum, 'pageSize': pagesize}, + headers=self._API_HEADERS).get('data').get('mylist') + + data = get_page_data(0, 1) + title = data.get('name') + description = data.get('description') + uploader = data.get('owner').get('name') + uploader_id = data.get('owner').get('id') + + def pagefunc(pagenum): + data = get_page_data(pagenum, 25) + return ({ + '_type': 'url', + 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'), + } for item in data.get('items')) + + return { + '_type': 'playlist', + 'id': list_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'entries': OnDemandPagedList(pagefunc, 25), + } + + +NicovideoSearchIE_NAME = 'nicovideo:search' + + +class NicovideoSearchURLIE(InfoExtractor): + IE_NAME = f'{NicovideoSearchIE_NAME}_url' + IE_DESC = 'Nico video search URLs' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' 
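
Both Niconico list extractors page lazily: the mylist extractor above hands page fetches to `OnDemandPagedList`, and the search extractor below advances through pages until one comes back empty. A self-contained sketch of that pattern, with a hypothetical stand-in for the API call:

```python
import itertools

def fetch_page(page_num, page_size=25):
    # Hypothetical stand-in for the nvapi mylist call above, i.e.
    # GET http://nvapi.nicovideo.jp/v2/mylists/<id>?page=<1 + page_num>&pageSize=<page_size>
    total = 60  # pretend the list holds 60 videos
    start = page_num * page_size
    return [{'watchId': 'sm%d' % i} for i in range(start, min(start + page_size, total))]

def entries():
    # Pages are only fetched as the consumer iterates; stop on the first empty page.
    for page_num in itertools.count(0):
        items = fetch_page(page_num)
        if not items:
            return
        for item in items:
            yield 'http://www.nicovideo.jp/watch/' + item['watchId']

print(next(iter(entries())))  # the first entry arrives without touching later pages
```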
+ _TESTS = [{ + 'url': 'http://www.nicovideo.jp/search/sm9', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_count': 31, + }] + + def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'): + query = query or {} + pages = [query['page']] if 'page' in query else itertools.count(1) + for page_num in pages: + query['page'] = str(page_num) + webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num}) + results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.*?)(?=["\'])', webpage) + for item in results: + yield self.url_result(f'http://www.nicovideo.jp/watch/{item}', 'Niconico', item) + if not results: + break + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) + + +class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE): + IE_DESC = 'Nico video searches' + _MAX_RESULTS = float('inf') + IE_NAME = NicovideoSearchIE_NAME + _SEARCH_KEY = 'nicosearch' + _TESTS = [] + + def _search_results(self, query): + return self._entries( + self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query) + + +class NicovideoSearchDateIE(NicovideoSearchIE): + IE_DESC = 'Nico video searches, newest first' + IE_NAME = f'{NicovideoSearchIE_NAME}:date' + _SEARCH_KEY = 'nicosearchdate' + _TESTS = [{ + 'url': 'nicosearchdateall:a', + 'info_dict': { + 'id': 'a', + 'title': 'a' + }, + 'playlist_mincount': 1610, + }] + + _START_DATE = datetime.date(2007, 1, 1) + _RESULTS_PER_PAGE = 32 + _MAX_PAGES = 50 + + def _entries(self, url, item_id, start_date=None, end_date=None): + start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + + # If the last page has a full page of videos, we need to break down the query interval further + last_page_len = len(list(self._get_entries_for_date( + url, item_id, start_date, end_date, self._MAX_PAGES, + note=f'Checking number of videos from {start_date} to {end_date}'))) + if (last_page_len == self._RESULTS_PER_PAGE and start_date != end_date): + midpoint = start_date + ((end_date - start_date) // 2) + yield from self._entries(url, item_id, midpoint, end_date) + yield from self._entries(url, item_id, start_date, midpoint) + else: + self.to_screen(f'{item_id}: Downloading results from {start_date} to {end_date}') + yield from self._get_entries_for_date( + url, item_id, start_date, end_date, note=' Downloading page %(page)s') + + def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_num=None, note=None): + query = { + 'start': str(start_date), + 'end': str(end_date or start_date), + 'sort': 'f', + 'order': 'd', + } + if page_num: + query['page'] = str(page_num) + + yield from NicovideoSearchURLIE._entries(self, url, item_id, query=query, note=note) + + +class NiconicoUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _TEST = { + 'url': 'https://www.nicovideo.jp/user/419948', + 'info_dict': { + 'id': '419948', + }, + 'playlist_mincount': 101, + } + _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s" + _PAGE_SIZE = 100 + + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + + def _entries(self, list_id): + total_count = 1 + count = 
page_num = 0
+        while count < total_count:
+            json_parsed = self._download_json(
+                self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id,
+                headers=self._API_HEADERS,
+                note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else ''))
+            if not page_num:
+                total_count = int_or_none(json_parsed['data'].get('totalCount'))
+            for entry in json_parsed['data']['items']:
+                count += 1
+                yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id'])
+            page_num += 1
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
diff --git a/yt_dlp/extractor/ninecninemedia.py b/yt_dlp/extractor/ninecninemedia.py
new file mode 100644
index 000000000..4aaf21a12
--- /dev/null
+++ b/yt_dlp/extractor/ninecninemedia.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+)
+
+
+class NineCNineMediaIE(InfoExtractor):
+    IE_NAME = '9c9media'
+    _GEO_COUNTRIES = ['CA']
+    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'
+    _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/'
+
+    def _real_extract(self, url):
+        destination_code, content_id = self._match_valid_url(url).groups()
+        api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id)
+        content = self._download_json(api_base_url, content_id, query={
+            '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]',
+        })
+        title = content['Name']
+        content_package = content['ContentPackages'][0]
+        package_id = content_package['Id']
+        content_package_url = api_base_url + 'contentpackages/%s/' % package_id
+        content_package = self._download_json(
+            content_package_url, content_id, query={
+                '$include': '[HasClosedCaptions]',
+            })
+
+        if (not self.get_param('allow_unplayable_formats')
+                and try_get(content_package, lambda x: x['Constraints']['Security']['Type'])):
+            self.report_drm(content_id)
+
+        manifest_base_url = content_package_url + 'manifest.'
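
Since `manifest_base_url` ends with the literal `'manifest.'`, each delivery format below is requested simply by appending its extension. A sketch of the resulting URLs, with an assumed destination code and assumed content/package IDs:

```python
# Illustrative only: the destination code and IDs are made up.
api_base_url = 'http://capi.9c9media.com/destinations/ctv_hub/platforms/desktop/contents/1234567/'
manifest_base_url = api_base_url + 'contentpackages/89012/' + 'manifest.'

for ext in ('m3u8', 'f4m', 'mpd', 'vtt', 'srt'):
    print(manifest_base_url + ext)
# manifest.m3u8 (HLS), manifest.f4m (HDS) and manifest.mpd (DASH) feed the format
# extractors; manifest.vtt / manifest.srt serve the closed captions when available.
```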
+ formats = [] + formats.extend(self._extract_m3u8_formats( + manifest_base_url + 'm3u8', content_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_base_url + 'f4m', content_id, + f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', content_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + thumbnails = [] + for image in (content.get('Images') or []): + image_url = image.get('Url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('Width')), + 'height': int_or_none(image.get('Height')), + }) + + tags, categories = [], [] + for source_name, container in (('Tags', tags), ('Genres', categories)): + for e in content.get(source_name, []): + e_name = e.get('Name') + if not e_name: + continue + container.append(e_name) + + season = content.get('Season') or {} + + info = { + 'id': content_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'episode_number': int_or_none(content.get('Episode')), + 'season': season.get('Name'), + 'season_number': int_or_none(season.get('Number')), + 'season_id': season.get('Id'), + 'series': try_get(content, lambda x: x['Media']['Name']), + 'tags': tags, + 'categories': categories, + 'duration': float_or_none(content_package.get('Duration')), + 'formats': formats, + 'thumbnails': thumbnails, + } + + if content_package.get('HasClosedCaptions'): + info['subtitles'] = { + 'en': [{ + 'url': manifest_base_url + 'vtt', + 'ext': 'vtt', + }, { + 'url': manifest_base_url + 'srt', + 'ext': 'srt', + }] + } + + return info diff --git a/youtube_dl/extractor/ninegag.py b/yt_dlp/extractor/ninegag.py index 14390823b..14390823b 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/yt_dlp/extractor/ninegag.py diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py new file mode 100644 index 000000000..6043674ba --- /dev/null +++ b/yt_dlp/extractor/ninenow.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + smuggle_url, + str_or_none, + try_get, + unified_strdate, + unified_timestamp, +) + + +class NineNowIE(InfoExtractor): + IE_NAME = '9now.com.au' + _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)' + _GEO_COUNTRIES = ['AU'] + _TESTS = [{ + # clip + 'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc', + 'md5': '17cf47d63ec9323e562c9957a968b565', + 'info_dict': { + 'id': '16801', + 'ext': 'mp4', + 'title': 'St. 
Kilda\'s Joey Montagna on the potential for a player\'s strike', + 'description': 'Is a boycott of the NAB Cup "on the table"?', + 'uploader_id': '4460760524001', + 'upload_date': '20160713', + 'timestamp': 1468421266, + }, + 'skip': 'Only available in Australia', + }, { + # episode + 'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19', + 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1', + 'only_matching': True, + }, { + # episode of series + 'url': 'https://www.9now.com.au/lego-masters/season-3/episode-3', + 'info_dict': { + 'id': '6249614030001', + 'title': 'Episode 3', + 'ext': 'mp4', + 'season_number': 3, + 'episode_number': 3, + 'description': 'In the first elimination of the competition, teams will have 10 hours to build a world inside a snow globe.', + 'uploader_id': '4460760524001', + 'timestamp': 1619002200, + 'upload_date': '20210421', + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + 'params':{ + 'skip_download': True, + } + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + page_data = self._parse_json(self._search_regex( + r'window\.__data\s*=\s*({.*?});', webpage, + 'page data', default='{}'), display_id, fatal=False) + if not page_data: + page_data = self._parse_json(self._parse_json(self._search_regex( + r'window\.__data\s*=\s*JSON\.parse\s*\(\s*(".+?")\s*\)\s*;', + webpage, 'page data'), display_id), display_id) + + for kind in ('episode', 'clip'): + current_key = page_data.get(kind, {}).get( + 'current%sKey' % kind.capitalize()) + if not current_key: + continue + cache = page_data.get(kind, {}).get('%sCache' % kind, {}) + if not cache: + continue + common_data = { + 'episode': (cache.get(current_key) or list(cache.values())[0])[kind], + 'season': (cache.get(current_key) or list(cache.values())[0]).get('season', None) + } + break + else: + raise ExtractorError('Unable to find video data') + + if not self.get_param('allow_unplayable_formats') and try_get(common_data, lambda x: x['episode']['video']['drm'], bool): + self.report_drm(display_id) + brightcove_id = try_get( + common_data, lambda x: x['episode']['video']['brightcoveId'], compat_str) or 'ref:%s' % common_data['episode']['video']['referenceId'] + video_id = str_or_none(try_get(common_data, lambda x: x['episode']['video']['id'])) or brightcove_id + + title = try_get(common_data, lambda x: x['episode']['name'], compat_str) + season_number = try_get(common_data, lambda x: x['season']['seasonNumber'], int) + episode_number = try_get(common_data, lambda x: x['episode']['episodeNumber'], int) + timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], compat_str)) + release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], compat_str)) + thumbnails_data = try_get(common_data, lambda x: x['episode']['image']['sizes'], dict) or {} + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_id[1:]), + } for thumbnail_id, thumbnail_url in thumbnails_data.items()] + + return { + '_type': 'url_transparent', + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': self._GEO_COUNTRIES}), + 'id': video_id, + 'title': title, + 'description': try_get(common_data, lambda x: x['episode']['description'], 
compat_str), + 'duration': float_or_none(try_get(common_data, lambda x: x['episode']['video']['duration'], float), 1000), + 'thumbnails': thumbnails, + 'ie_key': 'BrightcoveNew', + 'season_number': season_number, + 'episode_number': episode_number, + 'timestamp': timestamp, + 'release_date': release_date, + } diff --git a/youtube_dl/extractor/nintendo.py b/yt_dlp/extractor/nintendo.py index ff8f70ba6..ff8f70ba6 100644 --- a/youtube_dl/extractor/nintendo.py +++ b/yt_dlp/extractor/nintendo.py diff --git a/yt_dlp/extractor/nitter.py b/yt_dlp/extractor/nitter.py new file mode 100644 index 000000000..a0546cda0 --- /dev/null +++ b/yt_dlp/extractor/nitter.py @@ -0,0 +1,228 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + parse_count, + unified_strdate, + unified_timestamp, + remove_end, + determine_ext, +) +import re +import random + + +class NitterIE(InfoExtractor): + # Taken from https://github.com/zedeus/nitter/wiki/Instances + + NON_HTTP_INSTANCES = ( + '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion', + 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion', + 'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion', + 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion', + 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion', + 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion', + '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion', + + 'nitter.i2p', + 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p', + + 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion', + ) + + HTTP_INSTANCES = ( + 'nitter.42l.fr', + 'nitter.pussthecat.org', + 'nitter.nixnet.services', + 'nitter.mastodont.cat', + 'nitter.tedomum.net', + 'nitter.fdn.fr', + 'nitter.1d4.us', + 'nitter.kavin.rocks', + 'tweet.lambda.dance', + 'nitter.cc', + 'nitter.vxempire.xyz', + 'nitter.unixfox.eu', + 'nitter.domain.glass', + 'nitter.himiko.cloud', + 'nitter.eu', + 'nitter.namazso.eu', + 'nitter.mailstation.de', + 'nitter.actionsack.com', + 'nitter.cattube.org', + 'nitter.dark.fail', + 'birdsite.xanny.family', + 'nitter.40two.app', + 'nitter.skrep.in', + + # not in the list anymore + 'nitter.snopyta.org', + ) + + DEAD_INSTANCES = ( + # maintenance + 'nitter.ethibox.fr', + + # official, rate limited + 'nitter.net', + # offline + 'nitter.13ad.de', + 'nitter.weaponizedhumiliation.com', + ) + + INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES + + _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' + _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE} + current_instance = random.choice(HTTP_INSTANCES) + + _TESTS = [ + { + # GIF (wrapped in mp4) + 'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance, + 'info_dict': { + 'id': '1314279897502629888', + 'ext': 'mp4', + 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet', + 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. 
➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Firefox 🔥', + 'uploader_id': 'firefox', + 'uploader_url': 'https://%s/firefox' % current_instance, + 'upload_date': '20201008', + 'timestamp': 1602183720, + }, + }, { # normal video + 'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance, + 'info_dict': { + 'id': '1299715685392756737', + 'ext': 'mp4', + 'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...', + 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Le Doc', + 'uploader_id': 'Le___Doc', + 'uploader_url': 'https://%s/Le___Doc' % current_instance, + 'upload_date': '20200829', + 'timestamp': 1598711341, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + }, { # video embed in a "Streaming Political Ads" box + 'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance, + 'info_dict': { + 'id': '1321147074491092994', + 'ext': 'mp4', + 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds", + 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds", + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mozilla', + 'uploader_id': 'mozilla', + 'uploader_url': 'https://%s/mozilla' % current_instance, + 'upload_date': '20201027', + 'timestamp': 1603820982 + }, + }, { # not the first tweet but main-tweet + 'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance, + 'info_dict': { + 'id': '1379050895539724290', + 'ext': 'mp4', + 'title': 'Dorothy Zbornak - This had me hollering!!', + 'description': 'This had me hollering!!', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Dorothy Zbornak', + 'uploader_id': 'TheNaturalNu', + 'uploader_url': 'https://%s/TheNaturalNu' % current_instance, + 'timestamp': 1617626329, + 'upload_date': '20210405' + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + parsed_url = compat_urlparse.urlparse(url) + base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc) + + self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') + full_webpage = self._download_webpage(url, video_id) + + main_tweet_start = full_webpage.find('class="main-tweet"') + if main_tweet_start > 0: + webpage = full_webpage[main_tweet_start:] + if not webpage: + webpage = full_webpage + + video_url = '%s%s' % (base_url, self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')) + ext = determine_ext(video_url) + + if ext == 'unknown_video': + formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4') + else: + formats = [{ + 'url': video_url, + 'ext': ext + }] + + title = self._og_search_description(full_webpage) + if not title: + title = self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title') + description = title + + mobj = self._match_valid_url(url) + uploader_id = ( + mobj.group('uploader_id') + or self._html_search_regex(r'<a 
class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + ) + + if uploader_id: + uploader_url = '%s/%s' % (base_url, uploader_id) + + uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + + if uploader: + title = '%s - %s' % (uploader, title) + + view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False)) + like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False)) + repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) + comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) + + thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url') + if not thumbnail: + thumbnail = '%s%s' % (base_url, self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)) + thumbnail = remove_end(thumbnail, '%3Asmall') + + thumbnails = [] + thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig') + for id in thumbnail_ids: + thumbnails.append({ + 'id': id, + 'url': thumbnail + '%3A' + id, + }) + + date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False) + upload_date = unified_strdate(date) + timestamp = unified_timestamp(date) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'view_count': view_count, + 'like_count': like_count, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'formats': formats, + 'thumbnails': thumbnails, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py index 3639d142f..3639d142f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/yt_dlp/extractor/njpwworld.py diff --git a/youtube_dl/extractor/nobelprize.py b/yt_dlp/extractor/nobelprize.py index 4dfdb09d6..4dfdb09d6 100644 --- a/youtube_dl/extractor/nobelprize.py +++ b/yt_dlp/extractor/nobelprize.py diff --git a/yt_dlp/extractor/noco.py b/yt_dlp/extractor/noco.py new file mode 100644 index 000000000..78c4952f4 --- /dev/null +++ b/yt_dlp/extractor/noco.py @@ -0,0 +1,235 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time +import hashlib + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + float_or_none, + parse_iso8601, + parse_qs, + sanitized_Request, + urlencode_postdata, +) + + +class NocoIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' + _LOGIN_URL = 'https://noco.tv/do.php' + _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' + _SUB_LANG_TEMPLATE = '&sub_lang=%s' + _NETRC_MACHINE = 'noco' + + _TESTS = [ + { + 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/', + 'md5': '0a993f0058ddbcd902630b2047ef710e', + 'info_dict': { + 'id': '11538', + 'ext': 'mp4', + 'title': 'Ami Ami Idol - Hello! 
France', + 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86', + 'upload_date': '20140412', + 'uploader': 'Nolife', + 'uploader_id': 'NOL', + 'duration': 2851.2, + }, + 'skip': 'Requires noco account', + }, + { + 'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call', + 'md5': 'c190f1f48e313c55838f1f412225934d', + 'info_dict': { + 'id': '12610', + 'ext': 'mp4', + 'title': 'The Guild #1 - Wake-Up Call', + 'timestamp': 1403863200, + 'upload_date': '20140627', + 'uploader': 'LBL42', + 'uploader_id': 'LBL', + 'duration': 233.023, + }, + 'skip': 'Requires noco account', + } + ] + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login = self._download_json( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata({ + 'a': 'login', + 'cookie': '1', + 'username': username, + 'password': password, + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + + if 'erreur' in login: + raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + + @staticmethod + def _ts(): + return int(time.time() * 1000) + + def _call_api(self, path, video_id, note, sub_lang=None): + ts = compat_str(self._ts() + self._ts_offset) + tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() + url = self._API_URL_TEMPLATE % (path, ts, tk) + if sub_lang: + url += self._SUB_LANG_TEMPLATE % sub_lang + + request = sanitized_Request(url) + request.add_header('Referer', self._referer) + + resp = self._download_json(request, video_id, note) + + if isinstance(resp, dict) and resp.get('error'): + self._raise_error(resp['error'], resp['description']) + + return resp + + def _raise_error(self, error, description): + raise ExtractorError( + '%s returned error: %s - %s' % (self.IE_NAME, error, description), + expected=True) + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Timestamp adjustment offset between server time and local time + # must be calculated in order to use timestamps closest to server's + # in all API requests (see https://github.com/ytdl-org/youtube-dl/issues/7864) + webpage = self._download_webpage(url, video_id) + + player_url = self._search_regex( + r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', + webpage, 'noco player', group='player', + default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') + + qs = parse_qs(player_url) + ts = int_or_none(qs.get('ts', [None])[0]) + self._ts_offset = ts - self._ts() if ts else 0 + self._referer = player_url + + medias = self._call_api( + 'shows/%s/medias' % video_id, + video_id, 'Downloading video JSON') + + show = self._call_api( + 'shows/by_id/%s' % video_id, + video_id, 'Downloading show JSON')[0] + + options = self._call_api( + 'users/init', video_id, + 'Downloading user options JSON')['options'] + audio_lang_pref = options.get('audio_language') or options.get('language', 'fr') + + if audio_lang_pref == 'original': + audio_lang_pref = show['original_lang'] + if len(medias) == 1: + audio_lang_pref = list(medias.keys())[0] + elif audio_lang_pref not in medias: + audio_lang_pref = 'fr' + + qualities = self._call_api( + 'qualities', + video_id, 'Downloading qualities JSON') + + formats = [] + + for audio_lang, audio_lang_dict in medias.items(): + preference = 1 if audio_lang == audio_lang_pref else 0 + for sub_lang, lang_dict in audio_lang_dict['video_list'].items(): + 
for format_id, fmt in lang_dict['quality_list'].items(): + format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, sub_lang, format_id) + + video = self._call_api( + 'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang), + video_id, 'Downloading %s video JSON' % format_id_extended, + sub_lang if sub_lang != 'none' else None) + + file_url = video['file'] + if not file_url: + continue + + if file_url in ['forbidden', 'not found']: + popmessage = video['popmessage'] + self._raise_error(popmessage['title'], popmessage['message']) + + formats.append({ + 'url': file_url, + 'format_id': format_id_extended, + 'width': int_or_none(fmt.get('res_width')), + 'height': int_or_none(fmt.get('res_lines')), + 'abr': int_or_none(fmt.get('audiobitrate'), 1000), + 'vbr': int_or_none(fmt.get('videobitrate'), 1000), + 'filesize': int_or_none(fmt.get('filesize')), + 'format_note': qualities[format_id].get('quality_name'), + 'quality': qualities[format_id].get('priority'), + 'language_preference': preference, + }) + + self._sort_formats(formats) + + timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ') + + if timestamp is not None and timestamp < 0: + timestamp = None + + uploader = show.get('partner_name') + uploader_id = show.get('partner_key') + duration = float_or_none(show.get('duration_ms'), 1000) + + thumbnails = [] + for thumbnail_key, thumbnail_url in show.items(): + m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key) + if not m: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + + episode = show.get('show_TT') or show.get('show_OT') + family = show.get('family_TT') or show.get('family_OT') + episode_number = show.get('episode_number') + + title = '' + if family: + title += family + if episode_number: + title += ' #' + compat_str(episode_number) + if episode: + title += ' - ' + compat_str(episode) + + description = show.get('show_resume') or show.get('family_resume') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nonktube.py b/yt_dlp/extractor/nonktube.py index ca1424e06..ca1424e06 100644 --- a/youtube_dl/extractor/nonktube.py +++ b/yt_dlp/extractor/nonktube.py diff --git a/youtube_dl/extractor/noovo.py b/yt_dlp/extractor/noovo.py index b40770d07..b40770d07 100644 --- a/youtube_dl/extractor/noovo.py +++ b/yt_dlp/extractor/noovo.py diff --git a/youtube_dl/extractor/normalboots.py b/yt_dlp/extractor/normalboots.py index 61fe571df..61fe571df 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/yt_dlp/extractor/normalboots.py diff --git a/youtube_dl/extractor/nosvideo.py b/yt_dlp/extractor/nosvideo.py index 53c500c35..53c500c35 100644 --- a/youtube_dl/extractor/nosvideo.py +++ b/yt_dlp/extractor/nosvideo.py diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py new file mode 100644 index 000000000..3acb88121 --- /dev/null +++ b/yt_dlp/extractor/nova.py @@ -0,0 +1,305 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + js_to_json, + qualities, + unified_strdate, + url_or_none, +) + + +class NovaEmbedIE(InfoExtractor): + _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 
'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', + 'md5': 'ee009bafcc794541570edd44b71cbea3', + 'info_dict': { + 'id': '8o0n0r', + 'ext': 'mp4', + 'title': '2180. díl', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2578, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + duration = None + formats = [] + + player = self._parse_json( + self._search_regex( + r'Player\.init\s*\([^,]+,\s*(?:\w+\s*\?\s*{.+?}\s*:\s*)?({.+})\s*,\s*{.+?}\s*\)\s*;', + webpage, 'player', default='{}'), video_id, fatal=False) + if player: + for format_id, format_list in player['tracks'].items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + duration = int_or_none(player.get('duration')) + else: + # Old path, not actual as of 08.04.2020 + bitrates = self._parse_json( + self._search_regex( + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + + self._sort_formats(formats) + + title = self._og_search_title( + webpage, default=None) or self._search_regex( + (r'<value>(?P<title>[^<]+)', + r'videoTitle\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( + r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='value') + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', + default=duration)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class NovaIE(InfoExtractor): + IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' + _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' + _TESTS = [{ + 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', + 'md5': 
'249baab7d0104e186e78b0899c7d5f28', + 'info_dict': { + 'id': '1757139', + 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', + 'ext': 'mp4', + 'title': 'Podzemní nemocnice v pražské Krči', + 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', + 'thumbnail': r're:^https?://.*\.(?:jpg)', + } + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'info_dict': { + 'id': '1753621', + 'ext': 'mp4', + 'title': 'Zaklínač 3: Divoký hon', + 'description': 're:.*Pokud se stejně jako my nemůžete.*', + 'thumbnail': r're:https?://.*\.jpg(\?.*)?', + 'upload_date': '20150521', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'gone', + }, { + # media.cms.nova.cz embed + 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', + 'info_dict': { + 'id': '8o0n0r', + 'ext': 'mp4', + 'title': '2180. díl', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2578, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [NovaEmbedIE.ie_key()], + 'skip': 'CHYBA 404: STRÁNKA NENALEZENA', + }, { + 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', + 'only_matching': True, + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'only_matching': True, + }, { + 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', + 'only_matching': True, + }, { + 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', + 'only_matching': True, + }, { + 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + site = mobj.group('site') + + webpage = self._download_webpage(url, display_id) + + description = clean_html(self._og_search_description(webpage, default=None)) + if site == 'novaplus': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) + elif site == 'fanda': + upload_date = unified_strdate(self._search_regex( + r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) + else: + upload_date = None + + # novaplus + embed_id = self._search_regex( + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', + webpage, 'embed url', default=None) + if embed_id: + return { + '_type': 'url_transparent', + 'url': 'https://media.cms.nova.cz/embed/%s' % embed_id, + 'ie_key': NovaEmbedIE.ie_key(), + 'id': embed_id, + 'description': description, + 'upload_date': upload_date + } + + video_id = self._search_regex( + [r"(?:media|video_id)\s*:\s*'(\d+)'", + r'media=(\d+)', + r'id="article_video_(\d+)"', + r'id="player_(\d+)"'], + webpage, 'video id') + + config_url = self._search_regex( + r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + webpage, 'config url', default=None) + config_params = {} + + if not config_url: + player = self._parse_json( + self._search_regex( + r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage, + 'player', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if player: + config_url = url_or_none(player.get('configUrl')) + params = player.get('configParams') + if 
isinstance(params, dict): + config_params = params + + if not config_url: + DEFAULT_SITE_ID = '23000' + SITES = { + 'tvnoviny': DEFAULT_SITE_ID, + 'novaplus': DEFAULT_SITE_ID, + 'vymena': DEFAULT_SITE_ID, + 'krasna': DEFAULT_SITE_ID, + 'fanda': '30', + 'tn': '30', + 'doma': '30', + } + + site_id = self._search_regex( + r'site=(\d+)', webpage, 'site id', default=None) or SITES.get( + site, DEFAULT_SITE_ID) + + config_url = 'https://api.nova.cz/bin/player/videojs/config.php' + config_params = { + 'site': site_id, + 'media': video_id, + 'quality': 3, + 'version': 1, + } + + config = self._download_json( + config_url, display_id, + 'Downloading config JSON', query=config_params, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + mediafile = config['mediafile'] + video_url = mediafile['src'] + + m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) + if m: + formats = [{ + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', + 'ext': 'flv', + }] + else: + formats = [{ + 'url': video_url, + }] + self._sort_formats(formats) + + title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) + thumbnail = config.get('poster') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py new file mode 100644 index 000000000..724986a06 --- /dev/null +++ b/yt_dlp/extractor/novaplay.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none, parse_duration, parse_iso8601 + + +class NovaPlayIE(InfoExtractor): + _VALID_URL = r'https://play.nova\.bg/video/.*/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://play.nova.bg/video/bratya/season-3/bratq-2021-10-08/548677', + 'md5': 'b1127a84e61bed1632b7c2ca9cbb4153', + 'info_dict': { + 'id': '548677', + 'ext': 'mp4', + 'title': 'Братя', + 'alt_title': 'bratya/season-3/bratq-2021-10-08', + 'duration': 1603.0, + 'timestamp': 1633724150, + 'upload_date': '20211008', + 'thumbnail': 'https://nbg-img.fite.tv/img/548677_460x260.jpg', + 'description': 'Сезон 3 Епизод 25' + }, + }, + { + 'url': 'https://play.nova.bg/video/igri-na-volqta/season-3/igri-na-volqta-2021-09-20-1/548227', + 'md5': '5fd61b8ecbe582fc021019d570965d58', + 'info_dict': { + 'id': '548227', + 'ext': 'mp4', + 'title': 'Игри на волята: България (20.09.2021) - част 1', + 'alt_title': 'gri-na-volqta/season-3/igri-na-volqta-2021-09-20-1', + 'duration': 4060.0, + 'timestamp': 1632167564, + 'upload_date': '20210920', + 'thumbnail': 'https://nbg-img.fite.tv/img/548227_460x260.jpg', + 'description': 'Сезон 3 Епизод 13' + }, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_props = self._parse_json(self._search_regex( + r'<script\s?id=\"__NEXT_DATA__\"\s?type=\"application/json\">({.+})</script>', + webpage, 'video_props'), video_id)['props']['pageProps']['video'] + m3u8_url = self._download_json( + f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams', + video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_props['title'], + 
'alt_title': video_props.get('slug'), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'formats': formats, + 'duration': parse_duration(video_props['duration']), + 'timestamp': parse_iso8601(video_props['published_at']), + 'view_count': int_or_none(video_props['view_count']), + } diff --git a/yt_dlp/extractor/nowness.py b/yt_dlp/extractor/nowness.py new file mode 100644 index 000000000..b2c715f41 --- /dev/null +++ b/yt_dlp/extractor/nowness.py @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + sanitized_Request, +) + + +class NownessBaseIE(InfoExtractor): + def _extract_url_result(self, post): + if post['type'] == 'video': + for media in post['media']: + if media['type'] == 'video': + video_id = media['content'] + source = media['source'] + if source == 'brightcove': + player_code = self._download_webpage( + 'http://www.nowness.com/iframe?id=%s' % video_id, video_id, + note='Downloading player JavaScript', + errnote='Unable to download player JavaScript') + bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + bc_url = BrightcoveNewIE._extract_url(self, player_code) + if bc_url: + return self.url_result(bc_url, BrightcoveNewIE.ie_key()) + raise ExtractorError('Could not find player definition') + elif source == 'vimeo': + return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') + elif source == 'youtube': + return self.url_result(video_id, 'Youtube') + elif source == 'cinematique': + # yt-dlp currently doesn't support cinematique + # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique') + pass + + def _api_request(self, url, request_path): + display_id = self._match_id(url) + request = sanitized_Request( + 'http://api.nowness.com/api/' + request_path % display_id, + headers={ + 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us', + }) + return display_id, self._download_json(request, display_id) + + +class NownessIE(NownessBaseIE): + IE_NAME = 'nowness' + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P<id>[^/]+?)(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation', + 'md5': '068bc0202558c2e391924cb8cc470676', + 'info_dict': { + 'id': '2520295746001', + 'ext': 'mp4', + 'title': 'Candor: The Art of Gesticulation', + 'description': 'Candor: The Art of Gesticulation', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1446745676, + 'upload_date': '20151105', + 'uploader_id': '2385340575001', + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', + 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', + 'info_dict': { + 'id': '3716354522001', + 'ext': 'mp4', + 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', + 'description': 'Kasper Bjørke ft. 
Jaakko Eino Kalevi: TNR', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1407315371, + 'upload_date': '20140806', + 'uploader_id': '2385340575001', + }, + 'add_ie': ['BrightcoveNew'], + }, { + # vimeo + 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', + 'md5': '9a5a6a8edf806407e411296ab6bc2a49', + 'info_dict': { + 'id': '130020913', + 'ext': 'mp4', + 'title': 'Bleu, Blanc, Rouge - A Godard Supercut', + 'description': 'md5:f0ea5f1857dffca02dbd37875d742cec', + 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20150607', + 'uploader': 'Cinema Sem Lei', + 'uploader_id': 'cinemasemlei', + }, + 'add_ie': ['Vimeo'], + }] + + def _real_extract(self, url): + _, post = self._api_request(url, 'post/getBySlug/%s') + return self._extract_url_result(post) + + +class NownessPlaylistIE(NownessBaseIE): + IE_NAME = 'nowness:playlist' + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues', + 'info_dict': { + 'id': '3286', + }, + 'playlist_mincount': 8, + } + + def _real_extract(self, url): + playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s') + entries = [self._extract_url_result(item) for item in playlist['items']] + return self.playlist_result(entries, playlist_id) + + +class NownessSeriesIE(NownessBaseIE): + IE_NAME = 'nowness:series' + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P<id>[^/]+?)(?:$|[?#])' + _TEST = { + 'url': 'https://www.nowness.com/series/60-seconds', + 'info_dict': { + 'id': '60', + 'title': '60 Seconds', + 'description': 'One-minute wisdom in a new NOWNESS series', + }, + 'playlist_mincount': 4, + } + + def _real_extract(self, url): + display_id, series = self._api_request(url, 'series/getBySlug/%s') + entries = [self._extract_url_result(post) for post in series['posts']] + series_title = None + series_description = None + translations = series.get('translations', []) + if translations: + series_title = translations[0].get('title') or translations[0]['seoTitle'] + series_description = translations[0].get('seoDescription') + return self.playlist_result( + entries, compat_str(series['id']), series_title, series_description) diff --git a/youtube_dl/extractor/noz.py b/yt_dlp/extractor/noz.py index ccafd7723..ccafd7723 100644 --- a/youtube_dl/extractor/noz.py +++ b/yt_dlp/extractor/noz.py diff --git a/yt_dlp/extractor/npo.py b/yt_dlp/extractor/npo.py new file mode 100644 index 000000000..ed547d04b --- /dev/null +++ b/yt_dlp/extractor/npo.py @@ -0,0 +1,766 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + determine_ext, + ExtractorError, + fix_xml_ampersands, + int_or_none, + merge_dicts, + orderedSet, + parse_duration, + qualities, + str_or_none, + strip_jsonp, + unified_strdate, + unified_timestamp, + url_or_none, + urlencode_postdata, +) + + +class NPOBaseIE(InfoExtractor): + def _get_token(self, video_id): + return self._download_json( + 'http://ida.omroep.nl/app.php/auth', video_id, + note='Downloading token')['token'] + + +class NPOIE(NPOBaseIE): + IE_NAME = 'npo' + IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl' + _VALID_URL = r'''(?x) + (?: + npo:| + https?:// + (?:www\.)? 
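+                            # (?x) verbose mode, so this inline comment is ignored
+                            # by the regex engine: each branch below matches one
+                            # supported site's path prefix before the final
+                            # (?P<id>...) segment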
+ (?: + npo\.nl/(?:[^/]+/)*| + (?:ntr|npostart)\.nl/(?:[^/]+/){2,}| + omroepwnl\.nl/video/fragment/[^/]+__| + (?:zapp|npo3)\.nl/(?:[^/]+/){2,} + ) + ) + (?P<id>[^/?#]+) + ''' + + _TESTS = [{ + 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', + 'md5': '4b3f9c429157ec4775f2c9cb7b911016', + 'info_dict': { + 'id': 'VPWON_1220719', + 'ext': 'm4v', + 'title': 'Nieuwsuur', + 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', + 'upload_date': '20140622', + }, + }, { + 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', + 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', + 'info_dict': { + 'id': 'VARA_101191800', + 'ext': 'm4v', + 'title': 'De Mega Mike & Mega Thomas show: The best of.', + 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', + 'upload_date': '20090227', + 'duration': 2400, + }, + }, { + 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', + 'upload_date': '20130225', + 'duration': 3000, + }, + }, { + 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', + 'info_dict': { + 'id': 'WO_VPRO_043706', + 'ext': 'm4v', + 'title': 'De nieuwe mens - Deel 1', + 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', + 'duration': 4680, + }, + 'params': { + 'skip_download': True, + } + }, { + # non asf in streams + 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', + 'info_dict': { + 'id': 'WO_NOS_762771', + 'ext': 'mp4', + 'title': 'Hoe gaat Europa verder na Parijs?', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', + 'info_dict': { + 'id': 'VPWON_1233944', + 'ext': 'm4v', + 'title': 'Aap, poot, pies', + 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', + 'upload_date': '20150508', + 'duration': 599, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', + 'info_dict': { + 'id': 'POW_00996502', + 'ext': 'm4v', + 'title': '''"Dit is wel een 'landslide'..."''', + 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', + 'upload_date': '20150508', + 'duration': 462, + }, + 'params': { + 'skip_download': True, + } + }, { + # audio + 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', + 'info_dict': { + 'id': 'RBX_FUNX_6683215', + 'ext': 'mp3', + 'title': 'Jouw Stad Rotterdam', + 'description': 'md5:db251505244f097717ec59fabc372d9f', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', + 'only_matching': True, + }, { + 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', + 'only_matching': True, + }, { + 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', + 'only_matching': True, + }, { + 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870', + 'only_matching': True, + }, { + # live stream + 'url': 'npo:LI_NL1_4188102', + 'only_matching': True, + }, { + 'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373', + 'only_matching': True, + }, { + 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', + 'only_matching': True, + }, { + 'url': 
'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996', + 'only_matching': True, + }, { + 'url': 'https://npo.nl/KN_1698996', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if any(ie.suitable(url) + for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) + else super(NPOIE, cls).suitable(url)) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._get_info(url, video_id) or self._get_old_info(video_id) + + def _get_info(self, url, video_id): + token = self._download_json( + 'https://www.npostart.nl/api/token', video_id, + 'Downloading token', headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + })['token'] + + player = self._download_json( + 'https://www.npostart.nl/player/%s' % video_id, video_id, + 'Downloading player JSON', data=urlencode_postdata({ + 'autoplay': 0, + 'share': 1, + 'pageUrl': url, + 'hasAdConsent': 0, + '_token': token, + })) + + player_token = player['token'] + + drm = False + format_urls = set() + formats = [] + for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): + streams = self._download_json( + 'https://start-player.npo.nl/video/%s/streams' % video_id, + video_id, 'Downloading %s profile JSON' % profile, fatal=False, + query={ + 'profile': profile, + 'quality': 'npo', + 'tokenId': player_token, + 'streamType': 'broadcast', + }) + if not streams: + continue + stream = streams.get('stream') + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('src')) + if not stream_url or stream_url in format_urls: + continue + format_urls.add(stream_url) + if stream.get('protection') is not None or stream.get('keySystemOptions') is not None: + drm = True + continue + stream_type = stream.get('type') + stream_ext = determine_ext(stream_url) + if stream_type == 'application/dash+xml' or stream_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif re.search(r'\.isml?/Manifest', stream_url): + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': stream_url, + }) + + if not formats: + if not self.get_param('allow_unplayable_formats') and drm: + self.report_drm(video_id) + + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + embed_url = url_or_none(player.get('embedUrl')) + if embed_url: + webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed page', fatal=False) + if webpage: + video = self._parse_json( + self._search_regex( + r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video', + default='{}'), video_id) + if video: + title = video.get('episodeTitle') + subtitles = {} + subtitles_list = video.get('subtitles') + if isinstance(subtitles_list, list): + for cc in subtitles_list: + cc_url = url_or_none(cc.get('src')) + if not cc_url: + continue + lang = str_or_none(cc.get('language')) or 'nl' + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + return merge_dicts({ + 'title': title, + 'description': video.get('description'), + 'thumbnail': url_or_none( + video.get('still_image_url') or video.get('orig_image_url')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': 
unified_timestamp(video.get('broadcastDate')), + 'creator': video.get('channel'), + 'series': video.get('title'), + 'episode': title, + 'episode_number': int_or_none(video.get('episodeNumber')), + 'subtitles': subtitles, + }, info) + + return info + + def _get_old_info(self, video_id): + metadata = self._download_json( + 'http://e.omroep.nl/metadata/%s' % video_id, + video_id, + # We have to remove the javascript callback + transform_source=strip_jsonp, + ) + + error = metadata.get('error') + if error: + raise ExtractorError(error, expected=True) + + # For some videos actual video id (prid) is different (e.g. for + # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 + # video id is POMS_WNL_853698 but prid is POW_00996502) + video_id = metadata.get('prid') or video_id + + # titel is too generic in some cases so utilize aflevering_titel as well + # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) + title = metadata['titel'] + sub_title = metadata.get('aflevering_titel') + if sub_title and sub_title != title: + title += ': %s' % sub_title + + token = self._get_token(video_id) + + formats = [] + urls = set() + + def is_legal_url(format_url): + return format_url and format_url not in urls and re.match( + r'^(?:https?:)?//', format_url) + + QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog') + QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std') + + quality_from_label = qualities(QUALITY_LABELS) + quality_from_format_id = qualities(QUALITY_FORMATS) + items = self._download_json( + 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, + 'Downloading formats JSON', query={ + 'adaptive': 'yes', + 'token': token, + })['items'][0] + for num, item in enumerate(items): + item_url = item.get('url') + if not is_legal_url(item_url): + continue + urls.add(item_url) + format_id = self._search_regex( + r'video/ida/([^/]+)', item_url, 'format id', + default=None) + + item_label = item.get('label') + + def add_format_url(format_url): + width = int_or_none(self._search_regex( + r'(\d+)[xX]\d+', format_url, 'width', default=None)) + height = int_or_none(self._search_regex( + r'\d+[xX](\d+)', format_url, 'height', default=None)) + if item_label in QUALITY_LABELS: + quality = quality_from_label(item_label) + f_id = item_label + elif item_label in QUALITY_FORMATS: + quality = quality_from_format_id(format_id) + f_id = format_id + else: + quality, f_id = [None] * 2 + formats.append({ + 'url': format_url, + 'format_id': f_id, + 'width': width, + 'height': height, + 'quality': quality, + }) + + # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 + if item.get('contentType') in ('url', 'audio'): + add_format_url(item_url) + continue + + try: + stream_info = self._download_json( + item_url + '&type=json', video_id, + 'Downloading %s stream JSON' + % item_label or item.get('format') or format_id or num) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error = (self._parse_json( + ee.cause.read().decode(), video_id, + fatal=False) or {}).get('errorstring') + if error: + raise ExtractorError(error, expected=True) + raise + # Stream URL instead of JSON, example: npo:LI_NL1_4188102 + if isinstance(stream_info, compat_str): + if not stream_info.startswith('http'): + continue + video_url = stream_info + # JSON + else: + video_url = stream_info.get('url') + if not video_url or 'vodnotavailable.' 
in video_url or video_url in urls: + continue + urls.add(video_url) + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + else: + add_format_url(video_url) + + is_live = metadata.get('medium') == 'live' + + if not is_live: + for num, stream in enumerate(metadata.get('streams', [])): + stream_url = stream.get('url') + if not is_legal_url(stream_url): + continue + urls.add(stream_url) + # smooth streaming is not supported + stream_type = stream.get('type', '').lower() + if stream_type in ['ss', 'ms']: + continue + if stream_type == 'hds': + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, fatal=False) + # f4m downloader downloads only piece of live stream + for f4m_format in f4m_formats: + f4m_format['preference'] = -5 + formats.extend(f4m_formats) + elif stream_type == 'hls': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', fatal=False)) + # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 + elif '.asf' in stream_url: + asx = self._download_xml( + stream_url, video_id, + 'Downloading stream %d ASX playlist' % num, + transform_source=fix_xml_ampersands, fatal=False) + if not asx: + continue + ref = asx.find('./ENTRY/Ref') + if ref is None: + continue + video_url = ref.get('href') + if not video_url or video_url in urls: + continue + urls.add(video_url) + formats.append({ + 'url': video_url, + 'ext': stream.get('formaat', 'asf'), + 'quality': stream.get('kwaliteit'), + 'preference': -10, + }) + else: + formats.append({ + 'url': stream_url, + 'quality': stream.get('kwaliteit'), + }) + + self._sort_formats(formats) + + subtitles = {} + if metadata.get('tt888') == 'ja': + subtitles['nl'] = [{ + 'ext': 'vtt', + 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, + }] + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': metadata.get('info'), + 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], + 'upload_date': unified_strdate(metadata.get('gidsdatum')), + 'duration': parse_duration(metadata.get('tijdsduur')), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } + + +class NPOLiveIE(NPOBaseIE): + IE_NAME = 'npo.nl:live' + _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?' 
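+    # The channel slug after /live is optional; _real_extract falls back to
+    # 'npo-1' when it is missing, scrapes the live id (media-id / data-prid)
+    # from the channel page and delegates playback to NPOIE via an 'npo:' URL.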
+ + _TESTS = [{ + 'url': 'http://www.npo.nl/live/npo-1', + 'info_dict': { + 'id': 'LI_NL1_4188102', + 'display_id': 'npo-1', + 'ext': 'mp4', + 'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.npo.nl/live', + 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/live/npo-1', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) or 'npo-1' + + webpage = self._download_webpage(url, display_id) + + live_id = self._search_regex( + [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id') + + return { + '_type': 'url_transparent', + 'url': 'npo:%s' % live_id, + 'ie_key': NPOIE.ie_key(), + 'id': live_id, + 'display_id': display_id, + } + + +class NPORadioIE(InfoExtractor): + IE_NAME = 'npo.nl:radio' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)' + + _TEST = { + 'url': 'http://www.npo.nl/radio/radio-1', + 'info_dict': { + 'id': 'radio-1', + 'ext': 'mp3', + 'title': 're:^NPO Radio 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + @classmethod + def suitable(cls, url): + return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + + @staticmethod + def _html_get_attribute_regex(attribute): + return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + self._html_get_attribute_regex('data-channel'), webpage, 'title') + + stream = self._parse_json( + self._html_search_regex(self._html_get_attribute_regex('data-streams'), webpage, 'data-streams'), + video_id) + + codec = stream.get('codec') + + return { + 'id': video_id, + 'url': stream['url'], + 'title': self._live_title(title), + 'acodec': codec, + 'ext': codec, + 'is_live': True, + } + + +class NPORadioFragmentIE(InfoExtractor): + IE_NAME = 'npo.nl:radio:fragment' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/[^/]+/fragment/(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.npo.nl/radio/radio-5/fragment/174356', + 'md5': 'dd8cc470dad764d0fdc70a9a1e2d18c2', + 'info_dict': { + 'id': '174356', + 'ext': 'mp3', + 'title': 'Jubileumconcert Willeke Alberti', + }, + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + title = self._html_search_regex( + r'href="/radio/[^/]+/fragment/%s" title="([^"]+)"' % audio_id, + webpage, 'title') + + audio_url = self._search_regex( + r"data-streams='([^']+)'", webpage, 'audio url') + + return { + 'id': audio_id, + 'url': audio_url, + 'title': title, + } + + +class NPODataMidEmbedIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video_id', group='id') + return { + '_type': 'url_transparent', + 'ie_key': 'NPO', + 'url': 'npo:%s' % video_id, + 'display_id': display_id + } + + +class SchoolTVIE(NPODataMidEmbedIE): + IE_NAME = 'schooltv' + _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)' + + _TEST = { + 'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', + 'info_dict': { + 'id': 'WO_NTR_429477', + 'display_id': 
'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', + 'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?', + 'ext': 'mp4', + 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + } + } + + +class HetKlokhuisIE(NPODataMidEmbedIE): + IE_NAME = 'hetklokhuis' + _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/[^/]+/\d+/(?P<id>[^/?#&]+)' + + _TEST = { + 'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven', + 'info_dict': { + 'id': 'VPWON_1260528', + 'display_id': 'Zwaartekrachtsgolven', + 'ext': 'm4v', + 'title': 'Het Klokhuis: Zwaartekrachtsgolven', + 'description': 'md5:c94f31fb930d76c2efa4a4a71651dd48', + 'upload_date': '20170223', + }, + 'params': { + 'skip_download': True + } + } + + +class NPOPlaylistBaseIE(NPOIE): + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) + for video_id in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, webpage)) + ] + + playlist_title = self._html_search_regex( + self._PLAYLIST_TITLE_RE, webpage, 'playlist title', + default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) + + +class VPROIE(NPOPlaylistBaseIE): + IE_NAME = 'vpro' + _VALID_URL = r'https?://(?:www\.)?(?:(?:tegenlicht\.)?vpro|2doc)\.nl/(?:[^/]+/)*(?P<id>[^/]+)\.html' + _PLAYLIST_TITLE_RE = (r'<h1[^>]+class=["\'].*?\bmedia-platform-title\b.*?["\'][^>]*>([^<]+)', + r'<h5[^>]+class=["\'].*?\bmedia-platform-subtitle\b.*?["\'][^>]*>([^<]+)') + _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"' + + _TESTS = [ + { + 'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'De toekomst komt uit Afrika', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', + 'upload_date': '20130225', + }, + 'skip': 'Video gone', + }, + { + 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', + 'info_dict': { + 'id': 'sergio-herman', + 'title': 'sergio herman: fucking perfect', + }, + 'playlist_count': 2, + }, + { + # playlist with youtube embed + 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', + 'info_dict': { + 'id': 'education-education', + 'title': 'education education', + }, + 'playlist_count': 2, + }, + { + 'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html', + 'info_dict': { + 'id': 'de-tegenprestatie', + 'title': 'De Tegenprestatie', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.2doc.nl/speel~VARA_101375237~mh17-het-verdriet-van-nederland~.html', + 'info_dict': { + 'id': 'VARA_101375237', + 'ext': 'm4v', + 'title': 'MH17: Het verdriet van Nederland', + 'description': 'md5:09e1a37c1fdb144621e22479691a9f18', + 'upload_date': '20150716', + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + }, + } + ] + + +class WNLIE(NPOPlaylistBaseIE): + IE_NAME = 'wnl' + _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+' + _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>' + _PLAYLIST_ENTRY_RE = r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+' + + _TESTS = [{ + 'url': 
'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', + 'info_dict': { + 'id': 'vandaag-de-dag-6-mei', + 'title': 'Vandaag de Dag 6 mei', + }, + 'playlist_count': 4, + }] + + +class AndereTijdenIE(NPOPlaylistBaseIE): + IE_NAME = 'anderetijden' + _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/programma/(?:[^/]+/)+(?P<id>[^/?#&]+)' + _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class=["\'].*?\bpage-title\b.*?["\'][^>]*>(.+?)</h1>' + _PLAYLIST_ENTRY_RE = r'<figure[^>]+class=["\']episode-container episode-page["\'][^>]+data-prid=["\'](.+?)["\']' + + _TESTS = [{ + 'url': 'http://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'info_dict': { + 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'title': 'Duitse soldaten over de Slag bij Arnhem', + }, + 'playlist_count': 3, + }] diff --git a/youtube_dl/extractor/npr.py b/yt_dlp/extractor/npr.py index 9d1122f0c..9d1122f0c 100644 --- a/youtube_dl/extractor/npr.py +++ b/yt_dlp/extractor/npr.py diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py new file mode 100644 index 000000000..b556bc6aa --- /dev/null +++ b/yt_dlp/extractor/nrk.py @@ -0,0 +1,873 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import random +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_duration, + str_or_none, + try_get, + urljoin, + url_or_none, +) + + +class NRKBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' + + def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats(asset_url, video_id) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats + + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('https://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query, + headers={'Accept-Encoding': 'gzip, deflate, br'}) + + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| 
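+                        # (?x) verbose mode, so this inline comment is ignored:
+                        # accept either an internal 'nrk:' id (see the nrk:*
+                        # tests below) or a public URL form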
+ https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P<id>[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }, { + # podcast + 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + # clip + 'url': 'nrk:150533', + 'only_matching': True, + }, { + 'url': 'nrk:clip/150533', + 'only_matching': True, + }, { + # program + 'url': 'nrk:MDDP12000117', + 'only_matching': True, + }, { + 'url': 'nrk:program/ENRK10100318', + 'only_matching': True, + }, { + # direkte + 'url': 'nrk:nrk1', + 'only_matching': True, + }, { + 'url': 'nrk:channel/nrk1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url).split('/')[-1] + + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + asset_format = (asset.get('format') or '').lower() + if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_nrk_formats(format_url, video_id)) + elif asset_format == 'mp3': + formats.append({ + 'url': format_url, + 'format_id': asset_format, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + data = call_playback_api('metadata') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 
'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + subtitles = {} + for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + sub_url = url_or_none(sub.get('webVtt')) + if not sub_url: + continue + sub_key = str_or_none(sub.get('language')) or 'nb' + sub_type = str_or_none(sub.get('type')) + if sub_type: + sub_key += '-%s' % sub_type + subtitles.setdefault(sub_key, []).append({ + 'url': sub_url, + }) + + legal_age = try_get( + data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + # https://en.wikipedia.org/wiki/Norwegian_Media_Authority + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) + + is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + + info = { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'age_limit': age_limit, + 'formats': formats, + 'subtitles': subtitles, + } + + if is_series: + series = season_id = season_number = episode = episode_number = None + programs = self._call_api( + 'programs/%s' % video_id, video_id, 'programs', fatal=False) + if programs and isinstance(programs, dict): + series = str_or_none(programs.get('seriesTitle')) + season_id = str_or_none(programs.get('seasonId')) + season_number = int_or_none(programs.get('seasonNumber')) + episode = str_or_none(programs.get('episodeTitle')) + episode_number = int_or_none(programs.get('episodeNumber')) + if not series: + series = title + if alt_title: + title += ' - %s' % alt_title + if not season_number: + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + if not episode: + episode = alt_title if is_series else None + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'^(\d+)\.', episode or '', 'episode number', + default=None)) + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', + 'episode number', default=None)) + info.update({ + 'title': title, + 'series': series, + 'season_id': season_id, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info + + +class NRKTVIE(InfoExtractor): + IE_DESC = 'NRK TV and NRK Radio' + _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', + 'info_dict': { + 'id': 'MDDP12000117', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223.44, + 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, + }, + }, { + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314', + 'ext': 'mp4', + 'title': '20 spørsmål - 23. mai 2014', + 'alt_title': '23. mai 2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23. 
mai 2014', + 'age_limit': 0, + }, + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'MDFP15000514', + 'ext': 'mp4', + 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605.08, + 'series': 'Kunnskapskanalen', + 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'info_dict': { + 'id': 'MSPO40010515', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'particular part is not supported currently', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'info_dict': { + 'id': 'MSPO40010515', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'Ikke tilgjengelig utenfor Norge', + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317', + 'ext': 'mp4', + 'title': 'Anno - 13. episode', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13. episode', + 'season_number': 3, + 'episode_number': 13, + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'ProgramRightsHasExpired', + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + +class NRKTVEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/(?P<season_number>\d+)/episode/(?P<episode_number>\d+))' + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', + 'info_dict': { + 'id': 'MUHH36005220', + 'ext': 'mp4', + 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', + 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'duration': 1563.92, + 'series': 'Hellums kro', + 'season_number': 1, + 'episode_number': 2, + 'episode': '2. Kro, krig og kjærlighet', + 'age_limit': 6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', + 'info_dict': { + 'id': 'MSUI14000816', + 'ext': 'mp4', + 'title': 'Backstage - 8. 
episode', + 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', + 'duration': 1320, + 'series': 'Backstage', + 'season_number': 1, + 'episode_number': 8, + 'episode': '8. episode', + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'ProgramRightsHasExpired', + }] + + def _real_extract(self, url): + display_id, season_number, episode_number = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, display_id) + + info = self._search_json_ld(webpage, display_id, default={}) + nrk_id = info.get('@id') or self._html_search_meta( + 'nrk:program-id', webpage, default=None) or self._search_regex( + r'data-program-id=["\'](%s)' % NRKTVIE._EPISODE_RE, webpage, + 'nrk id') + assert re.match(NRKTVIE._EPISODE_RE, nrk_id) + + info.update({ + '_type': 'url', + 'id': nrk_id, + 'url': 'nrk:%s' % nrk_id, + 'ie_key': NRKIE.ie_key(), + 'season_number': int(season_number), + 'episode_number': int(episode_number), + }) + return info + + +class NRKTVSerieBaseIE(NRKBaseIE): + def _extract_entries(self, entry_list): + if not isinstance(entry_list, list): + return [] + entries = [] + for episode in entry_list: + nrk_id = episode.get('prfId') or episode.get('episodeId') + if not nrk_id or not isinstance(nrk_id, compat_str): + continue + entries.append(self.url_result( + 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) + return entries + + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + @staticmethod + def _catalog_name(serie_kind): + return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') or data + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: + break + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + + +class NRKTVSeasonIE(NRKTVSerieBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?P<domain>tv|radio)\.nrk\.no/ + (?P<serie_kind>serie|pod[ck]ast)/ + (?P<serie>[^/]+)/ + (?: + (?:sesong/)?(?P<id>\d+)| + sesong/(?P<id_2>[^/?#&]+) + ) + ''' + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', + 'info_dict': { + 'id': 'backstage/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 30, + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 
'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', + 'info_dict': { + 'id': 'hele_historien/diagnose-kverulant', + 'title': 'Diagnose kverulant', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) + else super(NRKTVSeasonIE, cls).suitable(url)) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + domain = mobj.group('domain') + serie_kind = mobj.group('serie_kind') + serie = mobj.group('serie') + season_id = mobj.group('id') or mobj.group('id_2') + display_id = '%s/%s' % (serie, season_id) + + data = self._call_api( + '%s/catalog/%s/%s/seasons/%s' + % (domain, self._catalog_name(serie_kind), serie, season_id), + display_id, 'season', query={'pageSize': 50}) + + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id + return self.playlist_result( + self._entries(data, display_id), + display_id, title) + + +class NRKTVSeriesIE(NRKTVSerieBaseIE): + _VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?P<serie_kind>serie|pod[ck]ast)/(?P<id>[^/]+)' + _TESTS = [{ + # new layout, instalments + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/blank', + 'info_dict': { + 'id': 'blank', + 'title': 'Blank', + 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', + }, + 'playlist_mincount': 30, + }, { + # new layout, seasons + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', + }, + 'playlist_mincount': 60, + }, { + # old layout + 'url': 'https://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 'title': 'Labyrint', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', + 'info_dict': { + 'id': 'ulrikkes_univers', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return ( + False if any(ie.suitable(url) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) + else super(NRKTVSeriesIE, cls).suitable(url)) + + def _real_extract(self, url): + site, 
serie_kind, series_id = self._match_valid_url(url).groups() + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' + + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' + series = self._call_api( + '%s/catalog/%s/%s' + % (domain, self._catalog_name(serie_kind), series_id), + series_id, 'serie', query={size_prefix + 'ageSize': 50}) + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} + + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_url = urljoin(url, season.get('href')) + if not season_url: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_url: + entries.append(self.url_result( + season_url, ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) + + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) + + +class NRKTVDirekteIE(NRKTVIE): + IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' + _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://tv.nrk.no/direkte/nrk1', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/direkte/p1_oslo_akershus', + 'only_matching': True, + }] + + +class NRKRadioPodkastIE(InfoExtractor): + _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?P<id>l_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + +class NRKPlaylistBaseIE(InfoExtractor): + def _extract_description(self, webpage): + pass + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('nrk:%s' % video_id, NRKIE.ie_key()) + for video_id in re.findall(self._ITEM_RE, webpage) + ] + + playlist_title = self. 
_extract_title(webpage) + playlist_description = self._extract_description(webpage) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + +class NRKPlaylistIE(NRKPlaylistBaseIE): + _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' + _ITEM_RE = r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"' + _TESTS = [{ + 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', + 'info_dict': { + 'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763', + 'title': 'Gjenopplev den historiske solformørkelsen', + 'description': 'md5:c2df8ea3bac5654a26fc2834a542feed', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449', + 'info_dict': { + 'id': 'rivertonprisen-til-karin-fossum-1.12266449', + 'title': 'Rivertonprisen til Karin Fossum', + 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', + }, + 'playlist_count': 2, + }] + + def _extract_title(self, webpage): + return self._og_search_title(webpage, fatal=False) + + def _extract_description(self, webpage): + return self._og_search_description(webpage) + + +class NRKTVEpisodesIE(NRKPlaylistBaseIE): + _VALID_URL = r'https?://tv\.nrk\.no/program/[Ee]pisodes/[^/]+/(?P<id>\d+)' + _ITEM_RE = r'data-episode=["\']%s' % NRKTVIE._EPISODE_RE + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/episodes/nytt-paa-nytt/69031', + 'info_dict': { + 'id': '69031', + 'title': 'Nytt på nytt, sesong: 201210', + }, + 'playlist_count': 4, + }] + + def _extract_title(self, webpage): + return self._html_search_regex( + r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) + + +class NRKSkoleIE(InfoExtractor): + IE_DESC = 'NRK Skole' + _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099', + 'md5': '18c12c3d071953c3bf8d54ef6b2587b7', + 'info_dict': { + 'id': '6021', + 'ext': 'mp4', + 'title': 'Genetikk og eneggede tvillinger', + 'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d', + 'duration': 399, + }, + }, { + 'url': 'https://www.nrk.no/skole/?page=objectives&subject=naturfag&objective=K15114&mediaId=19355', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + video_id)['psId'] + + return self.url_result('nrk:%s' % nrk_id) diff --git a/youtube_dl/extractor/nrl.py b/yt_dlp/extractor/nrl.py index 22a2df8d3..22a2df8d3 100644 --- a/youtube_dl/extractor/nrl.py +++ b/yt_dlp/extractor/nrl.py diff --git a/youtube_dl/extractor/ntvcojp.py b/yt_dlp/extractor/ntvcojp.py index 0c8221b22..0c8221b22 100644 --- a/youtube_dl/extractor/ntvcojp.py +++ b/yt_dlp/extractor/ntvcojp.py diff --git a/yt_dlp/extractor/ntvde.py b/yt_dlp/extractor/ntvde.py new file mode 100644 index 000000000..035582ee8 --- /dev/null +++ b/yt_dlp/extractor/ntvde.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_duration, +) + + +class NTVDeIE(InfoExtractor): + IE_NAME = 'n-tv.de' + _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/videos/[^/?#]+/[^/?#]+-article(?P<id>.+)\.html' + + _TESTS = [{ + 'url': 
'http://www.n-tv.de/mediathek/videos/panorama/Schnee-und-Glaette-fuehren-zu-zahlreichen-Unfaellen-und-Staus-article14438086.html', + 'md5': '6ef2514d4b1e8e03ca24b49e2f167153', + 'info_dict': { + 'id': '14438086', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Schnee und Glätte führen zu zahlreichen Unfällen und Staus', + 'alt_title': 'Winterchaos auf deutschen Straßen', + 'description': 'Schnee und Glätte sorgen deutschlandweit für einen chaotischen Start in die Woche: Auf den Straßen kommt es zu kilometerlangen Staus und Dutzenden Glätteunfällen. In Düsseldorf und München wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende Flüge landen zu spät, einige fallen ganz aus.', + 'duration': 4020, + 'timestamp': 1422892797, + 'upload_date': '20150202', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + info = self._parse_json(self._search_regex( + r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'), + video_id, transform_source=js_to_json) + timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) + vdata = self._parse_json(self._search_regex( + r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', + webpage, 'player data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) + duration = parse_duration(vdata.get('duration')) + + formats = [] + if vdata.get('video'): + formats.append({ + 'format_id': 'flash', + 'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'], + }) + if vdata.get('videoMp4'): + formats.append({ + 'format_id': 'mobile', + 'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']), + 'tbr': 400, # estimation + }) + if vdata.get('videoM3u8'): + m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8']) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + quality=1, m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info['headline'], + 'description': info.get('intro'), + 'alt_title': info.get('kicker'), + 'timestamp': timestamp, + 'thumbnail': vdata.get('html5VideoPoster'), + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ntvru.py b/yt_dlp/extractor/ntvru.py index c47d1dfa4..c47d1dfa4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/yt_dlp/extractor/ntvru.py diff --git a/youtube_dl/extractor/nuevo.py b/yt_dlp/extractor/nuevo.py index be1e09d37..be1e09d37 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/yt_dlp/extractor/nuevo.py diff --git a/yt_dlp/extractor/nuvid.py b/yt_dlp/extractor/nuvid.py new file mode 100644 index 000000000..7487824f9 --- /dev/null +++ b/yt_dlp/extractor/nuvid.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, + try_get, +) + + +class NuvidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.nuvid.com/video/6513023/italian-babe', + 'md5': '772d2f8288f3d3c5c45f7a41761c7844', + 'info_dict': { + 'id': '6513023', + 'ext': 'mp4', + 'title': 'italian babe', + 'duration': 321.0, + 'age_limit': 18, + } + }, { + 'url': 'https://m.nuvid.com/video/6523263', + 'info_dict': { + 'id': '6523263', + 'ext': 'mp4', + 'age_limit': 18, + 'title': 'Slut brunette college student anal dorm', + } + }] + + def _real_extract(self, url): + 
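+        # Sketch of the flow below, as read from this extractor: the
+        # player_config_json endpoint returns a 'files' dict keyed by quality
+        # ('lq'/'hq') plus 'title', 'thumbs', 'poster' and duration fields;
+        # everything returned here is derived from that one JSON response.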
video_id = self._match_id(url)
+
+        qualities = {
+            'lq': '360p',
+            'hq': '720p',
+        }
+
+        json_url = f'https://www.nuvid.com/player_config_json/?vid={video_id}&aid=0&domain_id=0&embed=0&check_speed=0'
+        video_data = self._download_json(
+            json_url, video_id, headers={
+                'Accept': 'application/json, text/javascript, */*; q = 0.01',
+                'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
+            })
+
+        formats = [{
+            'url': source,
+            'format_id': qualities.get(quality),
+            'height': int_or_none((qualities.get(quality) or '')[:-1]),
+        } for quality, source in (video_data.get('files') or {}).items() if source]
+
+        self._check_formats(formats, video_id)
+        self._sort_formats(formats)
+
+        title = video_data.get('title')
+        thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url'])
+        thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension'])
+        # The poster URL has the form .../media/videos/tmb/<video_id>/preview/<id><ext>;
+        # match on the actual video id rather than a hardcoded one, and escape the digits properly
+        thumbnail_id = self._search_regex(
+            r'/media/videos/tmb/%s/preview/(\d+)' % video_id, video_data.get('poster', ''), 'thumbnail id', default=19)
+        thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}'
+        duration = parse_duration(video_data.get('duration') or video_data.get('duration_format'))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': 18,
+        }
diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py
new file mode 100644
index 000000000..99964737d
--- /dev/null
+++ b/yt_dlp/extractor/nytimes.py
@@ -0,0 +1,265 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hmac
+import hashlib
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    parse_iso8601,
+    remove_start,
+)
+
+
+class NYTimesBaseIE(InfoExtractor):
+    _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'
+
+    def _extract_video_from_id(self, video_id):
+        # Authorization generation algorithm is reverse engineered from `signer` in
+        # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
+        path = '/svc/video/api/v3/video/' + video_id
+        hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
+        video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
+            'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
+            'X-NYTV': 'vhs',
+        }, fatal=False)
+        if not video_data:
+            video_data = self._download_json(
+                'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
+                video_id, 'Downloading video JSON')
+
+        title = video_data['headline']
+
+        def get_file_size(file_size):
+            if isinstance(file_size, int):
+                return file_size
+            elif isinstance(file_size, dict):
+                return int(file_size.get('value', 0))
+            else:
+                return None
+
+        urls = []
+        formats = []
+        subtitles = {}
+        for video in video_data.get('renditions', []):
+            video_url = video.get('url')
+            format_id = video.get('type')
+            if not video_url or format_id == 'thumbs' or video_url in urls:
+                continue
+            urls.append(video_url)
+            ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
+            if ext == 'm3u8':
+                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+                    video_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id or 'hls', fatal=False)
+                formats.extend(m3u8_fmts)
+                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+            elif ext == 'mpd':
+                continue
+                # formats.extend(self._extract_mpd_formats(
+                #     video_url, video_id, format_id or 'dash', 
fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'vcodec': video.get('videoencoding') or video.get('video_codec'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), + 'tbr': int_or_none(video.get('bitrate'), 1000) or None, + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': 'http://www.nytimes.com/' + image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('summary'), + 'timestamp': timestamp, + 'uploader': video_data.get('byline'), + 'duration': float_or_none(video_data.get('duration'), 1000), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } + + +class NYTimesIE(NYTimesBaseIE): + _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', + 'md5': 'd665342765db043f7e225cff19df0f2d', + 'info_dict': { + 'id': '100000002847155', + 'ext': 'mov', + 'title': 'Verbatim: What Is a Photocopier?', + 'description': 'md5:93603dada88ddbda9395632fdc5da260', + 'timestamp': 1398631707, + 'upload_date': '20140427', + 'uploader': 'Brett Weiner', + 'duration': 419, + } + }, { + 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + return self._extract_video_from_id(video_id) + + +class NYTimesArticleIE(NYTimesBaseIE): + _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?' + _TESTS = [{ + 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', + 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', + 'info_dict': { + 'id': '100000003628438', + 'ext': 'mov', + 'title': 'New Minimum Wage: $70,000 a Year', + 'description': 'Dan Price, C.E.O. 
of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.', + 'timestamp': 1429033037, + 'upload_date': '20150414', + 'uploader': 'Matthew Williams', + } + }, { + 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', + 'md5': 'e0d52040cafb07662acf3c9132db3575', + 'info_dict': { + 'id': '100000004709062', + 'title': 'The Run-Up: ‘He Was Like an Octopus’', + 'ext': 'mp3', + 'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4', + 'series': 'The Run-Up', + 'episode': '‘He Was Like an Octopus’', + 'episode_number': 20, + 'duration': 2130, + } + }, { + 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', + 'info_dict': { + 'id': '100000004709479', + 'title': 'The Rise of Hitler', + 'ext': 'mp3', + 'description': 'md5:bce877fd9e3444990cb141875fab0028', + 'creator': 'Pamela Paul', + 'duration': 3475, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', + 'only_matching': True, + }] + + def _extract_podcast_from_json(self, json, page_id, webpage): + podcast_audio = self._parse_json( + json, page_id, transform_source=js_to_json) + + audio_data = podcast_audio['data'] + track = audio_data['track'] + + episode_title = track['title'] + video_url = track['source'] + + description = track.get('description') or self._html_search_meta( + ['og:description', 'twitter:description'], webpage) + + podcast_title = audio_data.get('podcast', {}).get('title') + title = ('%s: %s' % (podcast_title, episode_title) + if podcast_title else episode_title) + + episode = audio_data.get('podcast', {}).get('episode') or '' + episode_number = int_or_none(self._search_regex( + r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None)) + + return { + 'id': remove_start(podcast_audio.get('target'), 'FT') or page_id, + 'url': video_url, + 'title': title, + 'description': description, + 'creator': track.get('credit'), + 'series': podcast_title, + 'episode': episode_title, + 'episode_number': episode_number, + 'duration': int_or_none(track.get('duration')), + } + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-videoid=["\'](\d+)', webpage, 'video id', + default=None, fatal=False) + if video_id is not None: + return self._extract_video_from_id(video_id) + + podcast_data = self._search_regex( + (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script', + r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), + webpage, 'podcast data') + return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 
'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) diff --git a/yt_dlp/extractor/nzherald.py b/yt_dlp/extractor/nzherald.py new file mode 100644 index 000000000..e5601b495 --- /dev/null +++ b/yt_dlp/extractor/nzherald.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..compat import compat_str +from ..utils import ( + ExtractorError, + traverse_obj +) + + +class NZHeraldIE(InfoExtractor): + IE_NAME = 'nzherald' + _VALID_URL = r'https?://(?:www\.)?nzherald\.co\.nz/[\w\/-]+\/(?P<id>[A-Z0-9]+)' + _TESTS = [ + { + 'url': 'https://www.nzherald.co.nz/nz/weather-heavy-rain-gales-across-nz-most-days-this-week/PTG7QWY4E2225YHZ5NAIRBTYTQ/', + 'info_dict': { + 'id': '6271084466001', + 'ext': 'mp4', + 'title': 'MetService severe weather warning: September 6th - 7th', + 'timestamp': 1630891576, + 'upload_date': '20210906', + 'uploader_id': '1308227299001', + 'description': 'md5:db6ca335a22e2cdf37ab9d2bcda52902' + } + + }, { + # Webpage has brightcove embed player url + 'url': 'https://www.nzherald.co.nz/travel/pencarrow-coastal-trail/HDVTPJEPP46HJ2UEMK4EGD2DFI/', + 'info_dict': { + 'id': '6261791733001', + 'ext': 'mp4', + 'title': 'Pencarrow Coastal Trail', + 'timestamp': 1625102897, + 'upload_date': '20210701', + 'uploader_id': '1308227299001', + 'description': 'md5:d361aaa0c6498f7ac1bc4fc0a0aec1e4' + } + + }, { + # two video embeds of the same video + 'url': 'https://www.nzherald.co.nz/nz/truck-driver-captured-cutting-off-motorist-on-state-highway-1-in-canterbury/FIHNJB7PLLPHWQPK4S7ZBDUC4I/', + 'info_dict': { + 'id': '6251114530001', + 'ext': 'mp4', + 'title': 'Truck travelling north from Rakaia runs car off road', + 'timestamp': 1619730509, + 'upload_date': '20210429', + 'uploader_id': '1308227299001', + 'description': 'md5:4cae7dfb7613ac4c73b9e73a75c6b5d7' + } + }, { + 'url': 'https://www.nzherald.co.nz/kahu/kaupapa-companies-my-taiao-supporting-maori-in-study-and-business/PQBO2J25WCG77VGRX7W7BVYEAI/', + 'only_matching': True + }, { + 'url': 'https://nzherald.co.nz/the-country/video/focus-nzs-first-mass-covid-19-vaccination-event/N5I7IL3BRFLZSD33TLDLYJDGK4/', + 'only_matching': True + }, { + 'url': 'https://www.nzherald.co.nz/the-vision-is-clear/news/tvic-damian-roper-planting-trees-an-addiction/AN2AAEPNRK5VLISDWQAJZB6ATQ', + 'only_matching': True + } + ] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1308227299001/S1BXZn8t_default/index.html?videoId=%s' + + def _extract_bc_embed_url(self, webpage): + """The initial webpage may include the brightcove player embed url""" + bc_url = BrightcoveNewIE._extract_url(self, webpage) + return bc_url or self._search_regex( + r'(?:embedUrl)\"\s*:\s*\"(?P<embed_url>%s)' % BrightcoveNewIE._VALID_URL, + webpage, 'embed url', default=None, group='embed_url') + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + bc_url = self._extract_bc_embed_url(webpage) + + if not bc_url: + fusion_metadata = self._parse_json( + 
self._search_regex(r'Fusion\.globalContent\s*=\s*({.+?})\s*;', webpage, 'fusion metadata'), article_id) + + video_metadata = fusion_metadata.get('video') + bc_video_id = traverse_obj( + video_metadata or fusion_metadata, # fusion metadata is the video metadata for video-only pages + 'brightcoveId', ('content_elements', ..., 'referent', 'id'), + get_all=False, expected_type=compat_str) + + if not bc_video_id: + if isinstance(video_metadata, dict) and len(video_metadata) == 0: + raise ExtractorError('This article does not have a video.', expected=True) + else: + raise ExtractorError('Failed to extract brightcove video id') + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_video_id + + return self.url_result(bc_url, 'BrightcoveNew') diff --git a/youtube_dl/extractor/nzz.py b/yt_dlp/extractor/nzz.py index 61ee77adb..61ee77adb 100644 --- a/youtube_dl/extractor/nzz.py +++ b/yt_dlp/extractor/nzz.py diff --git a/youtube_dl/extractor/odatv.py b/yt_dlp/extractor/odatv.py index 314527f98..314527f98 100644 --- a/youtube_dl/extractor/odatv.py +++ b/yt_dlp/extractor/odatv.py diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py new file mode 100644 index 000000000..9cacd3815 --- /dev/null +++ b/yt_dlp/extractor/odnoklassniki.py @@ -0,0 +1,267 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) +from ..utils import ( + ExtractorError, + unified_strdate, + int_or_none, + qualities, + unescapeHTML, + urlencode_postdata, +) + + +class OdnoklassnikiIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m|mobile)\.)? + (?:odnoklassniki|ok)\.ru/ + (?: + video(?:embed)?/| + web-api/video/moviePlayer/| + live/| + dk\?.*?st\.mvId= + ) + (?P<id>[\d-]+) + ''' + _TESTS = [{ + # metadata in JSON + 'url': 'http://ok.ru/video/20079905452', + 'md5': '0b62089b479e06681abaaca9d204f152', + 'info_dict': { + 'id': '20079905452', + 'ext': 'mp4', + 'title': 'Культура меняет нас (прекрасный ролик!))', + 'duration': 100, + 'upload_date': '20141207', + 'uploader_id': '330537914540', + 'uploader': 'Виталий Добровольский', + 'like_count': int, + 'age_limit': 0, + }, + }, { + # metadataUrl + 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', + 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc', + 'info_dict': { + 'id': '63567059965189-0', + 'ext': 'mp4', + 'title': 'Девушка без комплексов ...', + 'duration': 191, + 'upload_date': '20150518', + 'uploader_id': '534380003155', + 'uploader': '☭ Андрей Мещанинов ☭', + 'like_count': int, + 'age_limit': 0, + 'start_time': 5, + }, + }, { + # YouTube embed (metadataUrl, provider == USER_YOUTUBE) + 'url': 'http://ok.ru/video/64211978996595-1', + 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', + 'info_dict': { + 'id': 'V_VztHT5BzY', + 'ext': 'mp4', + 'title': 'Космическая среда от 26 августа 2015', + 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', + 'duration': 440, + 'upload_date': '20150826', + 'uploader_id': 'tvroscosmos', + 'uploader': 'Телестудия Роскосмоса', + 'age_limit': 0, + }, + }, { + # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field) + 'url': 'http://ok.ru/video/62036049272859-0', + 'info_dict': { + 'id': '62036049272859-0', + 'ext': 'mp4', + 'title': 'МУЗЫКА ДОЖДЯ .', + 'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0', + 'upload_date': '20120106', + 'uploader_id': '473534735899', + 'uploader': 'МARINA D', + 
'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Video has not been found', + }, { + 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', + 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/video/20648036891', + 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/videoembed/20648036891', + 'only_matching': True, + }, { + 'url': 'http://m.ok.ru/video/20079905452', + 'only_matching': True, + }, { + 'url': 'http://mobile.ok.ru/video/20079905452', + 'only_matching': True, + }, { + 'url': 'https://www.ok.ru/live/484531969818', + 'only_matching': True, + }, { + 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', + 'only_matching': True, + }, { + # Paid video + 'url': 'https://ok.ru/video/954886983203', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + start_time = int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) + + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://ok.ru/video/%s' % video_id, video_id) + + error = self._search_regex( + r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + player = self._parse_json( + unescapeHTML(self._search_regex( + r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id, + webpage, 'player', group='player')), + video_id) + + flashvars = player['flashvars'] + + metadata = flashvars.get('metadata') + if metadata: + metadata = self._parse_json(metadata, video_id) + else: + data = {} + st_location = flashvars.get('location') + if st_location: + data['st.location'] = st_location + metadata = self._download_json( + compat_urllib_parse_unquote(flashvars['metadataUrl']), + video_id, 'Downloading metadata JSON', + data=urlencode_postdata(data)) + + movie = metadata['movie'] + + # Some embedded videos may not contain title in movie dict (e.g. + # http://ok.ru/video/62036049272859-0) thus we allow missing title + # here and it's going to be extracted later by an extractor that + # will process the actual embed. 
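+        # Known 'provider' values, all handled below: UPLOADED_ODKL (native
+        # uploads, title required), USER_YOUTUBE (embedded YouTube, resolved
+        # via movie['contentId']) and LIVE_TV_APP (live streams).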
+ provider = metadata.get('provider') + title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title') + + thumbnail = movie.get('poster') + duration = int_or_none(movie.get('duration')) + + author = metadata.get('author', {}) + uploader_id = author.get('id') + uploader = author.get('name') + + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', default=None)) + + age_limit = None + adult = self._html_search_meta( + 'ya:ovs:adult', webpage, 'age limit', default=None) + if adult: + age_limit = 18 if adult == 'true' else 0 + + like_count = int_or_none(metadata.get('likeCount')) + + info = { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': age_limit, + 'start_time': start_time, + } + + if provider == 'USER_YOUTUBE': + info.update({ + '_type': 'url_transparent', + 'url': movie['contentId'], + }) + return info + + assert title + if provider == 'LIVE_TV_APP': + info['title'] = self._live_title(title) + + quality = qualities(('4', '0', '1', '2', '3', '5')) + + formats = [{ + 'url': f['url'], + 'ext': 'mp4', + 'format_id': f['name'], + } for f in metadata['videos']] + + m3u8_url = metadata.get('hlsManifestUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + dash_manifest = metadata.get('metadataEmbedded') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(dash_manifest), 'mpd')) + + for fmt in formats: + fmt_type = self._search_regex( + r'\btype[/=](\d)', fmt['url'], + 'format type', default=None) + if fmt_type: + fmt['quality'] = quality(fmt_type) + + # Live formats + m3u8_url = metadata.get('hlsMasterPlaylistUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + rtmp_url = metadata.get('rtmpUrl') + if rtmp_url: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + + if not formats: + payment_info = metadata.get('paymentInfo') + if payment_info: + self.raise_no_formats('This video is paid, subscribe to download it', expected=True) + + self._sort_formats(formats) + + info['formats'] = formats + return info diff --git a/youtube_dl/extractor/oktoberfesttv.py b/yt_dlp/extractor/oktoberfesttv.py index a914068f9..a914068f9 100644 --- a/youtube_dl/extractor/oktoberfesttv.py +++ b/yt_dlp/extractor/oktoberfesttv.py diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py new file mode 100644 index 000000000..0bc9206ed --- /dev/null +++ b/yt_dlp/extractor/olympics.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class OlympicsReplayIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?olympics\.com/tokyo-2020/(?:[a-z]{2}/)?replay/(?P<id>[^/#&?]+)' + _TESTS = [{ + 'url': 'https://olympics.com/tokyo-2020/en/replay/300622eb-abc0-43ea-b03b-c5f2d429ec7b/jumping-team-qualifier', + 'info_dict': { + 'id': '300622eb-abc0-43ea-b03b-c5f2d429ec7b', + 'ext': 'mp4', + 'title': 'Jumping Team Qualifier', + 'release_date': '20210806', + 'upload_date': '20210713', + }, + 'params': { + 'format': 'bv', + }, + }, { + 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', + 
'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        # The token request parameters are hardcoded in the webpage, so there is
+        # no need to download the webpage just for them. If the webpage download
+        # ever serves other purposes as well, extract these parameters from it.
+        token_url = 'https://appovptok.ovpobs.tv/api/identity/app/token?api_key=OTk5NDcxOjpvY3N3LWFwaXVzZXI%3D&api_secret=ODY4ODM2MjE3ODMwYmVjNTAxMWZlMDJiMTYxZmY0MjFiMjMwMjllMjJmNDA1YWRiYzA5ODcxYTZjZTljZDkxOTo6NTM2NWIzNjRlMTM1ZmI2YWNjNmYzMGMzOGM3NzZhZTY%3D'
+        token = self._download_webpage(token_url, video_id)
+        headers = {'x-obs-app-token': token}
+        data_json = self._download_json(f'https://appocswtok.ovpobs.tv/api/schedule-sessions/{video_id}?include=stream',
+                                        video_id, headers=headers)
+        meta_data = data_json['data']['attributes']
+        stream_data = None
+        for t_dict in data_json['included']:
+            if t_dict.get('type') == 'Stream':
+                stream_data = t_dict['attributes']
+        if not stream_data:
+            self.raise_no_formats('No Stream found in the API response', expected=True)
+        m3u8_url = self._download_json(
+            'https://meteringtok.ovpobs.tv/api/playback-sessions', video_id, headers=headers, query={
+                'alias': stream_data['alias'],
+                'stream': stream_data['stream'],
+                'type': 'vod'
+            })['data']['attributes']['url']
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': meta_data['title'],
+            'release_date': unified_strdate(meta_data.get('start') or meta_data.get('broadcastPublished')),
+            'upload_date': unified_strdate(meta_data.get('publishedAt')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/yt_dlp/extractor/on24.py b/yt_dlp/extractor/on24.py
new file mode 100644
index 000000000..d4d824430
--- /dev/null
+++ b/yt_dlp/extractor/on24.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    strip_or_none,
+    try_get,
+    urljoin,
+)
+
+
+class On24IE(InfoExtractor):
+    IE_NAME = 'on24'
+    IE_DESC = 'ON24'
+
+    _VALID_URL = r'''(?x)
+                    https?://event\.on24\.com/(?:
+                        wcc/r/(?P<id_1>\d{7})/(?P<key_1>[0-9A-F]{32})|
+                        eventRegistration/(?:console/EventConsoleApollo|EventLobbyServlet\?target=lobby30)
+                        \.jsp\?(?:[^/#?]*&)?eventid=(?P<id_2>\d{7})[^/#?]*&key=(?P<key_2>[0-9A-F]{32})
+                    )'''
+
+    _TESTS = [{
+        'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?uimode=nextgeneration&eventid=2197467&sessionid=1&key=5DF57BE53237F36A43B478DD36277A84&contenttype=A&eventuserid=305999&playerwidth=1000&playerheight=650&caller=previewLobby&text_language_id=en&format=fhaudio&newConsole=false',
+        'info_dict': {
+            'id': '2197467',
+            'ext': 'wav',
+            'title': 'Pearson Test of English General/Pearson English International Certificate Teacher Training Guide',
+            'upload_date': '20200219',
+            'timestamp': 1582149600.0,
+            'view_count': int,
+        }
+    }, {
+        'url': 'https://event.on24.com/wcc/r/2639291/82829018E813065A122363877975752E?mode=login&email=johnsmith@gmail.com',
+        'only_matching': True,
+    }, {
+        'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?&eventid=2639291&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=82829018E813065A122363877975752E&newConsole=true&nxChe=true&newTabCon=true&text_language_id=en&playerwidth=748&playerheight=526&eventuserid=338788762&contenttype=A&mediametricsessionid=384764716&mediametricid=3558192&usercd=369267058&mode=launch',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
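+        # Both supported URL shapes carry the same identifiers: the short
+        # wcc/r/<id>/<key> links populate id_1/key_1, while the long
+        # EventConsoleApollo/EventLobbyServlet registration URLs populate
+        # id_2/key_2, so exactly one pair is set for any match.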
+        event_id = mobj.group('id_1') or mobj.group('id_2')
+        event_key = mobj.group('key_1') or mobj.group('key_2')
+
+        event_data = self._download_json(
+            'https://event.on24.com/apic/utilApp/EventConsoleCachedServlet',
+            event_id, query={
+                'eventId': event_id,
+                'displayProfile': 'player',
+                'key': event_key,
+                'contentType': 'A'
+            })
+        # str(try_get(...)) would turn a missing id into the truthy string
+        # 'None'; guard the fallback before stringifying instead.
+        event_id = str(try_get(event_data, lambda x: x['presentationLogInfo']['eventid']) or event_id)
+        language = event_data.get('localelanguagecode')
+
+        formats = []
+        for media in event_data.get('mediaUrlInfo', []):
+            media_url = urljoin('https://event.on24.com/media/news/corporatevideo/events/', str(media.get('url')))
+            if not media_url:
+                continue
+            media_type = media.get('code')
+            if media_type == 'fhvideo1':
+                formats.append({
+                    'format_id': 'video',
+                    'url': media_url,
+                    'language': language,
+                    'ext': 'mp4',
+                    'vcodec': 'avc1.640020',
+                    'acodec': 'mp4a.40.2',
+                })
+            elif media_type == 'audio':
+                formats.append({
+                    'format_id': 'audio',
+                    'url': media_url,
+                    'language': language,
+                    'ext': 'wav',
+                    'vcodec': 'none',
+                    'acodec': 'wav'
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': event_id,
+            'title': strip_or_none(event_data.get('description')),
+            'timestamp': int_or_none(try_get(event_data, lambda x: x['session']['startdate']), 1000),
+            'webpage_url': f'https://event.on24.com/wcc/r/{event_id}/{event_key}',
+            'view_count': event_data.get('registrantcount'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/once.py b/yt_dlp/extractor/once.py
index 3e44b7829..3e44b7829 100644
--- a/youtube_dl/extractor/once.py
+++ b/yt_dlp/extractor/once.py
diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py
new file mode 100644
index 000000000..cc3c587bc
--- /dev/null
+++ b/yt_dlp/extractor/ondemandkorea.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    js_to_json,
+)
+
+
+class OnDemandKoreaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html'
+    _GEO_COUNTRIES = ['US', 'CA']
+    _TESTS = [{
+        'url': 'https://www.ondemandkorea.com/ask-us-anything-e43.html',
+        'info_dict': {
+            'id': 'ask-us-anything-e43',
+            'ext': 'mp4',
+            'title': 'Ask Us Anything : Gain, Ji Soo - 09/24/2016',
+            'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': 'm3u8 download'
+        }
+    }, {
+        'url': 'https://www.ondemandkorea.com/confession-e01-1.html',
+        'info_dict': {
+            'id': 'confession-e01-1',
+            'ext': 'mp4',
+            'title': 'Confession : E01',
+            'description': 'Choi Do-hyun, a criminal attorney, is the son of a death row convict. Ever since Choi Pil-su got arrested for murder, Do-hyun has wanted to solve his ',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': {
+                'English': 'mincount:1',
+            },
+        },
+        'params': {
+            'skip_download': 'm3u8 download'
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id, fatal=False)
+
+        if not webpage:
+            # Page sometimes returns captcha page with HTTP 403
+            raise ExtractorError(
+                'Unable to access page. You may have been blocked.',
+                expected=True)
+
+        if 'msg_block_01.png' in webpage:
+            self.raise_geo_restricted(
+                msg='This content is not available in your region',
+                countries=self._GEO_COUNTRIES)
+
+        if 'This video is only available to ODK PLUS members.' 
in webpage: + raise ExtractorError( + 'This video is only available to ODK PLUS members.', + expected=True) + + if 'ODK PREMIUM Members Only' in webpage: + raise ExtractorError( + 'This video is only available to ODK PREMIUM members.', + expected=True) + + title = self._search_regex( + r'class=["\']episode_title["\'][^>]*>([^<]+)', + webpage, 'episode_title', fatal=False) or self._og_search_title(webpage) + + jw_config = self._parse_json( + self._search_regex( + r'(?s)odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', + webpage, 'jw config', group='options'), + video_id, transform_source=js_to_json) + info = self._parse_jwplayer_data( + jw_config, video_id, require_title=False, m3u8_id='hls', + base_url=url) + + info.update({ + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage) + }) + return info diff --git a/yt_dlp/extractor/onet.py b/yt_dlp/extractor/onet.py new file mode 100644 index 000000000..bf53ea0b0 --- /dev/null +++ b/yt_dlp/extractor/onet.py @@ -0,0 +1,268 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + get_element_by_class, + int_or_none, + js_to_json, + NO_DEFAULT, + parse_iso8601, + remove_start, + strip_or_none, + url_basename, +) + + +class OnetBaseIE(InfoExtractor): + _URL_BASE_RE = r'https?://(?:(?:www\.)?onet\.tv|onet100\.vod\.pl)/[a-z]/' + + def _search_mvp_id(self, webpage): + return self._search_regex( + r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + + def _extract_from_id(self, video_id, webpage=None): + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for format_type, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + video_url = f.get('url') + if not video_url: + continue + ext = determine_ext(video_url) + if format_id.startswith('ism'): + formats.extend(self._extract_ism_formats( + video_url, video_id, 'mss', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif format_id.startswith('hls'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + http_f = { + 'url': video_url, + 'format_id': format_id, + 'abr': float_or_none(f.get('audio_bitrate')), + } + if format_type == 'audio': + http_f['vcodec'] = 'none' + else: + http_f.update({ + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + formats.append(http_f) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = (self._og_search_title( + webpage, default=None) if webpage else None) or meta['title'] + description = (self._og_search_description( + webpage, 
default=None) if webpage else None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } + + +class OnetMVPIE(OnetBaseIE): + _VALID_URL = r'onetmvp:(?P<id>\d+\.\d+)' + + _TEST = { + 'url': 'onetmvp:381027.1509591944', + 'only_matching': True, + } + + def _real_extract(self, url): + return self._extract_from_id(self._match_id(url)) + + +class OnetIE(OnetBaseIE): + _VALID_URL = OnetBaseIE._URL_BASE_RE + r'[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)' + IE_NAME = 'onet.tv' + + _TESTS = [{ + 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'md5': '436102770fb095c75b8bb0392d3da9ff', + 'info_dict': { + 'id': 'qbpyqc', + 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', + 'ext': 'mp4', + 'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd', + 'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...', + 'upload_date': '20160705', + 'timestamp': 1467721580, + }, + }, { + 'url': 'https://onet100.vod.pl/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + mvp_id = self._search_mvp_id(webpage) + + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + + return info_dict + + +class OnetChannelIE(OnetBaseIE): + _VALID_URL = OnetBaseIE._URL_BASE_RE + r'(?P<id>[a-z]+)(?:[?#]|$)' + IE_NAME = 'onet.tv:channel' + + _TESTS = [{ + 'url': 'http://onet.tv/k/openerfestival', + 'info_dict': { + 'id': 'openerfestival', + 'title': "Open'er Festival", + 'description': "Tak było na Open'er Festival 2016! 
Oglądaj nasze reportaże i wywiady z artystami.", + }, + 'playlist_mincount': 35, + }, { + 'url': 'https://onet100.vod.pl/k/openerfestival', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + webpage = self._download_webpage(url, channel_id) + + current_clip_info = self._parse_json(self._search_regex( + r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id, + transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s))) + video_id = remove_start(current_clip_info['ckmId'], 'mvp:') + video_name = url_basename(current_clip_info['url']) + + if self.get_param('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_name) + return self._extract_from_id(video_id, webpage) + + self.to_screen( + 'Downloading channel %s - add --no-playlist to just download video %s' % ( + channel_id, video_name)) + matches = re.findall( + r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE, + webpage) + entries = [ + self.url_result(video_link, OnetIE.ie_key()) + for video_link in matches] + + channel_title = strip_or_none(get_element_by_class('o_channelName', webpage)) + channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) + return self.playlist_result(entries, channel_id, channel_title, channel_description) + + +class OnetPlIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?(?:onet|businessinsider\.com|plejada)\.pl/(?:[^/]+/)+(?P<id>[0-9a-z]+)' + IE_NAME = 'onet.pl' + + _TESTS = [{ + 'url': 'http://eurosport.onet.pl/zimowe/skoki-narciarskie/ziobro-wygral-kwalifikacje-w-pjongczangu/9ckrly', + 'md5': 'b94021eb56214c3969380388b6e73cb0', + 'info_dict': { + 'id': '1561707.1685479', + 'ext': 'mp4', + 'title': 'Ziobro wygrał kwalifikacje w Pjongczangu', + 'description': 'md5:61fb0740084d2d702ea96512a03585b4', + 'upload_date': '20170214', + 'timestamp': 1487078046, + }, + }, { + # embedded via pulsembed + 'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0', + 'info_dict': { + 'id': '501235.965429946', + 'ext': 'mp4', + 'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu', + 'upload_date': '20170622', + 'timestamp': 1498159955, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3', + 'only_matching': True, + }, { + 'url': 'http://moto.onet.pl/jak-wybierane-sa-miejsca-na-fotoradary/6rs04e', + 'only_matching': True, + }, { + 'url': 'http://businessinsider.com.pl/wideo/scenariusz-na-koniec-swiata-wedlug-nasa/dwnqptk', + 'only_matching': True, + }, { + 'url': 'http://plejada.pl/weronika-rosati-o-swoim-domniemanym-slubie/n2bq89', + 'only_matching': True, + }] + + def _search_mvp_id(self, webpage, default=NO_DEFAULT): + return self._search_regex( + r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage, 'mvp id', + default=default) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + mvp_id = self._search_mvp_id(webpage, default=None) + + if not mvp_id: + pulsembed_url = self._search_regex( + r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1', + webpage, 'pulsembed url', group='url') + webpage = self._download_webpage( + pulsembed_url, video_id, 'Downloading pulsembed webpage') + mvp_id = self._search_mvp_id(webpage) + + return self.url_result( + 'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) diff --git 
a/youtube_dl/extractor/onionstudios.py b/yt_dlp/extractor/onionstudios.py
index cf5c39e66..cf5c39e66 100644
--- a/youtube_dl/extractor/onionstudios.py
+++ b/yt_dlp/extractor/onionstudios.py
diff --git a/yt_dlp/extractor/ooyala.py b/yt_dlp/extractor/ooyala.py
new file mode 100644
index 000000000..20cfa0a87
--- /dev/null
+++ b/yt_dlp/extractor/ooyala.py
@@ -0,0 +1,209 @@
+from __future__ import unicode_literals
+
+import base64
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_b64decode,
+ compat_str,
+)
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ try_get,
+ unsmuggle_url,
+)
+
+
+class OoyalaBaseIE(InfoExtractor):
+ _PLAYER_BASE = 'http://player.ooyala.com/'
+ _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
+ _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s'
+
+ def _extract(self, content_tree_url, video_id, domain=None, supportedformats=None, embed_token=None):
+ content_tree = self._download_json(content_tree_url, video_id)['content_tree']
+ metadata = content_tree[list(content_tree)[0]]
+ embed_code = metadata['embed_code']
+ pcode = metadata.get('asset_pcode') or embed_code
+ title = metadata['title']
+
+ auth_data = self._download_json(
+ self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code),
+ video_id, headers=self.geo_verification_headers(), query={
+ 'domain': domain or 'player.ooyala.com',
+ 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth',
+ 'embedToken': embed_token,
+ })['authorization_data'][embed_code]
+
+ urls = []
+ formats = []
+ streams = auth_data.get('streams') or [{
+ 'delivery_type': 'hls',
+ 'url': {
+ 'data': base64.b64encode(('http://player.ooyala.com/hls/player/all/%s.m3u8' % embed_code).encode()).decode(),
+ }
+ }]
+ for stream in streams:
+ url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
+ if not url_data:
+ continue
+ s_url = compat_b64decode(url_data).decode('utf-8')
+ if not s_url or s_url in urls:
+ continue
+ urls.append(s_url)
+ ext = determine_ext(s_url, None)
+ delivery_type = stream.get('delivery_type')
+ if delivery_type == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif delivery_type == 'hds' or ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
+ elif delivery_type == 'dash' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ s_url, embed_code, mpd_id='dash', fatal=False))
+ elif delivery_type == 'smooth':
+ formats.extend(self._extract_ism_formats(
+ s_url, embed_code, ism_id='mss', fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ s_url, embed_code, fatal=False))
+ else:
+ formats.append({
+ 'url': s_url,
+ 'ext': ext or delivery_type,
+ 'vcodec': stream.get('video_codec'),
+ 'format_id': delivery_type,
+ 'width': int_or_none(stream.get('width')),
+ 'height': int_or_none(stream.get('height')),
+ 'abr': int_or_none(stream.get('audio_bitrate')),
+ 'vbr': int_or_none(stream.get('video_bitrate')),
+ 'fps': float_or_none(stream.get('framerate')),
+ })
+ if not formats and not auth_data.get('authorized'):
+ self.raise_no_formats('%s said: %s' % (
+ self.IE_NAME, auth_data['message']), expected=True)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
+ sub_url = 
sub.get('url') + if not sub_url: + continue + subtitles[lang] = [{ + 'url': sub_url, + }] + + return { + 'id': embed_code, + 'title': title, + 'description': metadata.get('description'), + 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), + 'duration': float_or_none(metadata.get('duration'), 1000), + 'subtitles': subtitles, + 'formats': formats, + } + + +class OoyalaIE(OoyalaBaseIE): + _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' + + _TESTS = [ + { + # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video + 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + 'info_dict': { + 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + 'ext': 'mp4', + 'title': 'Explaining Data Recovery from Hard Drives and SSDs', + 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + 'duration': 853.386, + }, + # The video in the original webpage now uses PlayWire + 'skip': 'Ooyala said: movie expired', + }, { + # Only available for ipad + 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', + 'info_dict': { + 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', + 'ext': 'mp4', + 'title': 'Simulation Overview - Levels of Simulation', + 'duration': 194.948, + }, + }, + { + # Information available only through SAS api + # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 + 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', + 'md5': 'a84001441b35ea492bc03736e59e7935', + 'info_dict': { + 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', + 'ext': 'mp4', + 'title': 'Divide Tool Path.mp4', + 'duration': 204.405, + } + }, + { + # empty stream['url']['data'] + 'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', + 'only_matching': True, + } + ] + + @staticmethod + def _url_for_embed_code(embed_code): + return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + + @classmethod + def _build_url_result(cls, embed_code): + return cls.url_result(cls._url_for_embed_code(embed_code), + ie=cls.ie_key()) + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + embed_code = self._match_id(url) + domain = smuggled_data.get('domain') + supportedformats = smuggled_data.get('supportedformats') + embed_token = smuggled_data.get('embed_token') + content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code) + return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token) + + +class OoyalaExternalIE(OoyalaBaseIE): + _VALID_URL = r'''(?x) + (?: + ooyalaexternal:| + https?://.+?\.ooyala\.com/.*?\bexternalId= + ) + (?P<partner_id>[^:]+) + : + (?P<id>.+) + (?: + :| + .*?&pcode= + ) + (?P<pcode>.+?) 
+ (?:&|$)
+ '''
+
+ _TEST = {
+ 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'duration': 1302.0,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ partner_id, video_id, pcode = self._match_valid_url(url).groups()
+ content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id)
+ return self._extract(content_tree_url, video_id)
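+
+ # Sketch of how the smuggled parameters reach _extract() above, assuming an
+ # embedding extractor wants to pass a site domain and an embed token
+ # ('example.com' and `token` are illustrative only):
+ #
+ # from yt_dlp.utils import smuggle_url
+ # url = smuggle_url('ooyala:' + embed_code,
+ #                   {'domain': 'example.com', 'embed_token': token})
+ # # OoyalaIE unsmuggles these values and forwards them to the authorization API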
diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py
new file mode 100644
index 000000000..dfdd0e526
--- /dev/null
+++ b/yt_dlp/extractor/openload.py
@@ -0,0 +1,239 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import os
+import subprocess
+import tempfile
+
+from ..compat import (
+ compat_urlparse,
+ compat_kwargs,
+)
+from ..utils import (
+ check_executable,
+ encodeArgument,
+ ExtractorError,
+ get_exe_version,
+ is_outdated_version,
+ std_headers,
+ process_communicate_or_kill,
+)
+
+
+def cookie_to_dict(cookie):
+ cookie_dict = {
+ 'name': cookie.name,
+ 'value': cookie.value,
+ }
+ if cookie.port_specified:
+ cookie_dict['port'] = cookie.port
+ if cookie.domain_specified:
+ cookie_dict['domain'] = cookie.domain
+ if cookie.path_specified:
+ cookie_dict['path'] = cookie.path
+ if cookie.expires is not None:
+ cookie_dict['expires'] = cookie.expires
+ if cookie.secure is not None:
+ cookie_dict['secure'] = cookie.secure
+ if cookie.discard is not None:
+ cookie_dict['discard'] = cookie.discard
+ try:
+ if (cookie.has_nonstandard_attr('httpOnly')
+ or cookie.has_nonstandard_attr('httponly')
+ or cookie.has_nonstandard_attr('HttpOnly')):
+ cookie_dict['httponly'] = True
+ except TypeError:
+ pass
+ return cookie_dict
+
+
+def cookie_jar_to_list(cookie_jar):
+ return [cookie_to_dict(cookie) for cookie in cookie_jar]
+
+
+class PhantomJSwrapper(object):
+ """PhantomJS wrapper class
+
+ This class is experimental.
+ """
+
+ _TEMPLATE = r'''
+ phantom.onError = function(msg, trace) {{
+ var msgStack = ['PHANTOM ERROR: ' + msg];
+ if(trace && trace.length) {{
+ msgStack.push('TRACE:');
+ trace.forEach(function(t) {{
+ msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line +
+ (t.function ? ' (in function ' + t.function +')' : ''));
+ }});
+ }}
+ console.error(msgStack.join('\n'));
+ phantom.exit(1);
+ }};
+ var page = require('webpage').create();
+ var fs = require('fs');
+ var read = {{ mode: 'r', charset: 'utf-8' }};
+ var write = {{ mode: 'w', charset: 'utf-8' }};
+ JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
+ phantom.addCookie(x);
+ }});
+ page.settings.resourceTimeout = {timeout};
+ page.settings.userAgent = "{ua}";
+ page.onLoadStarted = function() {{
+ page.evaluate(function() {{
+ delete window._phantom;
+ delete window.callPhantom;
+ }});
+ }};
+ var saveAndExit = function() {{
+ fs.write("{html}", page.content, write);
+ fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
+ phantom.exit();
+ }};
+ page.onLoadFinished = function(status) {{
+ if(page.url === "") {{
+ page.setContent(fs.read("{html}", read), "{url}");
+ }}
+ else {{
+ {jscode}
+ }}
+ }};
+ page.open("");
+ '''
+
+ _TMP_FILE_NAMES = ['script', 'html', 'cookies']
+
+ @staticmethod
+ def _version():
+ return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
+
+ def __init__(self, extractor, required_version=None, timeout=10000):
+ self._TMP_FILES = {}
+
+ self.exe = check_executable('phantomjs', ['-v'])
+ if not self.exe:
+ raise ExtractorError('PhantomJS executable not found in PATH, '
+ 'download it from http://phantomjs.org',
+ expected=True)
+
+ self.extractor = extractor
+
+ if required_version:
+ version = self._version()
+ if is_outdated_version(version, required_version):
+ self.extractor._downloader.report_warning(
+ 'Your copy of PhantomJS is outdated, update it to version '
+ '%s or newer if you encounter any errors.' % required_version)
+
+ self.options = {
+ 'timeout': timeout,
+ }
+ for name in self._TMP_FILE_NAMES:
+ tmp = tempfile.NamedTemporaryFile(delete=False)
+ tmp.close()
+ self._TMP_FILES[name] = tmp
+
+ def __del__(self):
+ for name in self._TMP_FILE_NAMES:
+ try:
+ os.remove(self._TMP_FILES[name].name)
+ except (IOError, OSError, KeyError):
+ pass
+
+ def _save_cookies(self, url):
+ cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
+ for cookie in cookies:
+ if 'path' not in cookie:
+ cookie['path'] = '/'
+ if 'domain' not in cookie:
+ cookie['domain'] = compat_urlparse.urlparse(url).netloc
+ with open(self._TMP_FILES['cookies'].name, 'wb') as f:
+ f.write(json.dumps(cookies).encode('utf-8'))
+
+ def _load_cookies(self):
+ with open(self._TMP_FILES['cookies'].name, 'rb') as f:
+ cookies = json.loads(f.read().decode('utf-8'))
+ for cookie in cookies:
+ if cookie['httponly'] is True:
+ cookie['rest'] = {'httpOnly': None}
+ if 'expiry' in cookie:
+ cookie['expire_time'] = cookie['expiry']
+ self.extractor._set_cookie(**compat_kwargs(cookie))
+
+ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
+ """
+ Downloads webpage (if needed) and executes JS
+
+ Params:
+ url: website url
+ html: optional, html code of website
+ video_id: video id
+ note: optional, displayed when downloading webpage
+ note2: optional, displayed when executing JS
+ headers: custom http headers
+ jscode: code to be executed when page is loaded
+
+ Returns tuple with:
+ * downloaded website (after JS execution)
+ * anything you print with `console.log` (but not inside `page.evaluate`!)
+
+ In most cases you don't need to add any `jscode`.
+ It is executed in `page.onLoadFinished`.
+ `saveAndExit();` is mandatory; use it instead of `phantom.exit()`.
+ It is possible to wait for some element on the webpage, for example:
+ var check = function() {
+ var elementFound = page.evaluate(function() {
+ return document.querySelector('#b.done') !== null;
+ });
+ if(elementFound)
+ saveAndExit();
+ else
+ window.setTimeout(check, 500);
+ }
+
+ page.evaluate(function(){
+ document.querySelector('#a').click();
+ });
+ check();
+ """
+ if 'saveAndExit();' not in jscode:
+ raise ExtractorError('`saveAndExit();` not found in `jscode`')
+ if not html:
+ html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
+ with open(self._TMP_FILES['html'].name, 'wb') as f:
+ f.write(html.encode('utf-8'))
+
+ self._save_cookies(url)
+
+ replaces = self.options
+ replaces['url'] = url
+ user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+ replaces['ua'] = user_agent.replace('"', '\\"')
+ replaces['jscode'] = jscode
+
+ for x in self._TMP_FILE_NAMES:
+ replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
+
+ with open(self._TMP_FILES['script'].name, 'wb') as f:
+ f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))
+
+ if video_id is None:
+ self.extractor.to_screen('%s' % (note2,))
+ else:
+ self.extractor.to_screen('%s: %s' % (video_id, note2))
+
+ p = subprocess.Popen([
+ self.exe, '--ssl-protocol=any',
+ self._TMP_FILES['script'].name
+ ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = process_communicate_or_kill(p)
+ if p.returncode != 0:
+ raise ExtractorError(
+ 'Executing JS failed:\n' + encodeArgument(err))
+ with open(self._TMP_FILES['html'].name, 'rb') as f:
+ html = f.read().decode('utf-8')
+
+ self._load_cookies()
+
+ return (html, encodeArgument(out))
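+
+ # Usage sketch (hypothetical call site; PhantomJSwrapper is an internal helper,
+ # see the docstring of get() above for the full contract):
+ #
+ # phantom = PhantomJSwrapper(self, required_version='2.0')
+ # webpage, logs = phantom.get(url, video_id=video_id)
+ # # `webpage` is the HTML after JS execution; `logs` is whatever the top-level
+ # # PhantomJS script printed via console.log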
diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py
new file mode 100644
index 000000000..d7073ab44
--- /dev/null
+++ b/yt_dlp/extractor/openrec.py
@@ -0,0 +1,126 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ try_get,
+ unified_strdate
+)
+from ..compat import compat_str
+
+
+class OpenRecIE(InfoExtractor):
+ IE_NAME = 'openrec'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/live/2p8v31qe4zy',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.openrec.tv/live/wez93eqvjzl',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id)
+
+ window_stores = self._parse_json(
+ self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ movie_store = traverse_obj(
+ window_stores,
+ ('v8', 'state', 'movie'),
+ ('v8', 'movie'),
+ expected_type=dict)
+ if not movie_store:
+ raise ExtractorError('Failed to extract live info')
+
+ title = movie_store.get('title')
+ description = movie_store.get('introduction')
+ thumbnail = movie_store.get('thumbnailUrl')
+
+ channel_user = movie_store.get('channel', {}).get('user')
+ uploader = try_get(channel_user, lambda x: x['name'], compat_str)
+ uploader_id = try_get(channel_user, lambda x: x['id'], compat_str)
+
+ timestamp = traverse_obj(movie_store, ('startedAt', 'time'), expected_type=int)
+
+ # 'media' may be absent for some streams; fall back to an empty dict
+ m3u8_playlists = movie_store.get('media') or {}
+ formats = []
+ for (name, m3u8_url) in m3u8_playlists.items():
+ if not m3u8_url:
+ continue
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8',
+ m3u8_id='hls-%s' % name, live=True))
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'timestamp': timestamp,
+ 'is_live': True,
+ }
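+
+ # For reference, a rough sketch of the window.pageStore JSON that the OpenRec
+ # extractors parse (hypothetical, inferred from the lookups in this file; real
+ # pages carry many more fields): live pages nest the movie object under
+ # {"v8": {"state": {"movie": {...}}}} or {"v8": {"movie": {...}}}, while
+ # capture pages expose "movie" and "capture" objects at the top level.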
+
+
+class OpenRecCaptureIE(InfoExtractor):
+ IE_NAME = 'openrec:capture'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/capture/l9nk2x4gn14',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.openrec.tv/capture/mldjr82p7qk',
+ 'info_dict': {
+ 'id': 'mldjr82p7qk',
+ 'title': 'たいじの恥ずかしい英語力',
+ 'uploader': 'たいちゃんねる',
+ 'uploader_id': 'Yaritaiji',
+ 'upload_date': '20210803',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.openrec.tv/capture/%s' % video_id, video_id)
+
+ window_stores = self._parse_json(
+ self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ # guard against a missing 'movie' object so the channel lookups below don't crash
+ movie_store = window_stores.get('movie') or {}
+
+ capture_data = window_stores.get('capture')
+ if not capture_data:
+ raise ExtractorError('Cannot extract title')
+ title = capture_data.get('title')
+ thumbnail = capture_data.get('thumbnailUrl')
+ upload_date = unified_strdate(capture_data.get('createdAt'))
+
+ channel_info = movie_store.get('channel') or {}
+ uploader = channel_info.get('name')
+ uploader_id = channel_info.get('id')
+
+ m3u8_url = capture_data.get('source')
+ if not m3u8_url:
+ raise ExtractorError('Cannot extract m3u8 url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ }
diff --git a/yt_dlp/extractor/ora.py b/yt_dlp/extractor/ora.py
new file mode 100644
index 000000000..422d0b330
--- /dev/null
+++ b/yt_dlp/extractor/ora.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ get_element_by_attribute,
+ qualities,
+ unescapeHTML,
+)
+
+
+class OraTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
+ 'md5': 'fa33717591c631ec93b04b0e330df786',
+ 'info_dict': {
+ 'id': '50178',
+ 'ext': 'mp4',
+ 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
+ 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
+ }
+ }, {
+ 'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ video_data = self._search_regex(
+ r'"(?:video|current)"\s*:\s*({[^}]+?})', webpage, 'current video')
+ m3u8_url = self._search_regex(
+ r'hls_stream"?\s*:\s*"([^"]+)', video_data, 'm3u8 url', None)
+ if m3u8_url:
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, 'mp4', 
'm3u8_native', + m3u8_id='hls', fatal=False) + # similar to GameSpotIE + m3u8_path = compat_urlparse.urlparse(m3u8_url).path + QUALITIES_RE = r'((,[a-z]+\d+)+,?)' + available_qualities = self._search_regex( + QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',') + http_path = m3u8_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/master.m3u8', '') + http_template = compat_urlparse.urljoin( + 'http://videocdn-pmd.ora.tv/', http_template) + preference = qualities( + ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080']) + for q in available_qualities: + formats.append({ + 'url': http_template % q, + 'format_id': q, + 'quality': preference(q), + }) + self._sort_formats(formats) + else: + return self.url_result(self._search_regex( + r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube') + + return { + 'id': self._search_regex( + r'"id"\s*:\s*(\d+)', video_data, 'video id', default=display_id), + 'display_id': display_id, + 'title': unescapeHTML(self._og_search_title(webpage)), + 'description': get_element_by_attribute( + 'class', 'video_txt_decription', webpage), + 'thumbnail': self._proto_relative_url(self._search_regex( + r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)), + 'formats': formats, + } diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py new file mode 100644 index 000000000..428ec97e4 --- /dev/null +++ b/yt_dlp/extractor/orf.py @@ -0,0 +1,592 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + determine_ext, + float_or_none, + HEADRequest, + int_or_none, + orderedSet, + remove_end, + str_or_none, + strip_jsonp, + unescapeHTML, + unified_strdate, + url_or_none, +) + + +class ORFTVthekIE(InfoExtractor): + IE_NAME = 'orf:tvthek' + IE_DESC = 'ORF TVthek' + _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', + 'playlist': [{ + 'md5': '2942210346ed779588f428a92db88712', + 'info_dict': { + 'id': '8896777', + 'ext': 'mp4', + 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde', + 'description': 'md5:c1272f0245537812d4e36419c207b67d', + 'duration': 2668, + 'upload_date': '20141208', + }, + }], + 'skip': 'Blocked outside of Austria / Germany', + }, { + 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256', + 'info_dict': { + 'id': '7982259', + 'ext': 'mp4', + 'title': 'Best of Ingrid Thurnher', + 'upload_date': '20140527', + 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. 
Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".', + }, + 'params': { + 'skip_download': True, # rtsp downloads + }, + 'skip': 'Blocked outside of Austria / Germany', + }, { + 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', + 'only_matching': True, + }, { + 'url': 'http://tvthek.orf.at/profile/Universum/35429', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + data_jsb = self._parse_json( + self._search_regex( + r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2', + webpage, 'playlist', group='json'), + playlist_id, transform_source=unescapeHTML)['playlist']['videos'] + + entries = [] + for sd in data_jsb: + video_id, title = sd.get('id'), sd.get('title') + if not video_id or not title: + continue + video_id = compat_str(video_id) + formats = [] + for fd in sd['sources']: + src = url_or_none(fd.get('src')) + if not src: + continue + format_id_list = [] + for key in ('delivery', 'quality', 'quality_string'): + value = fd.get(key) + if value: + format_id_list.append(value) + format_id = '-'.join(format_id_list) + ext = determine_ext(src) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id=format_id, fatal=False) + if any('/geoprotection' in f['url'] for f in m3u8_formats): + self.raise_geo_restricted() + formats.extend(m3u8_formats) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id=format_id, fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': src, + 'protocol': fd.get('protocol'), + }) + + # Check for geoblocking. + # There is a property is_geoprotection, but that's always false + geo_str = sd.get('geoprotection_string') + if geo_str: + try: + http_url = next( + f['url'] + for f in formats + if re.match(r'^https?://.*\.mp4$', f['url'])) + except StopIteration: + pass + else: + req = HEADRequest(http_url) + self._request_webpage( + req, video_id, + note='Testing for geoblocking', + errnote=(( + 'This video seems to be blocked outside of %s. 
' + 'You may want to try the streaming-* formats.') + % geo_str), + fatal=False) + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + subtitles = {} + for sub in sd.get('subtitles', []): + sub_src = sub.get('src') + if not sub_src: + continue + subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ + 'url': sub_src, + }) + + upload_date = unified_strdate(sd.get('created_date')) + + thumbnails = [] + preview = sd.get('preview_image_url') + if preview: + thumbnails.append({ + 'id': 'preview', + 'url': preview, + 'preference': 0, + }) + image = sd.get('image_full_url') + if not image and len(data_jsb) == 1: + image = self._og_search_thumbnail(webpage) + if image: + thumbnails.append({ + 'id': 'full', + 'url': image, + 'preference': 1, + }) + + entries.append({ + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'description': sd.get('description'), + 'duration': int_or_none(sd.get('duration_in_seconds')), + 'upload_date': upload_date, + 'thumbnails': thumbnails, + }) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': playlist_id, + } + + +class ORFRadioIE(InfoExtractor): + def _real_extract(self, url): + mobj = self._match_valid_url(url) + show_date = mobj.group('date') + show_id = mobj.group('show') + + data = self._download_json( + 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' + % (self._API_STATION, show_id, show_date), show_id) + + entries = [] + for info in data['streams']: + loop_stream_id = str_or_none(info.get('loopStreamId')) + if not loop_stream_id: + continue + title = str_or_none(data.get('title')) + if not title: + continue + start = int_or_none(info.get('start'), scale=1000) + end = int_or_none(info.get('end'), scale=1000) + duration = end - start if end and start else None + entries.append({ + 'id': loop_stream_id.replace('.mp3', ''), + 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), + 'title': title, + 'description': clean_html(data.get('subtitle')), + 'duration': duration, + 'timestamp': start, + 'ext': 'mp3', + 'series': data.get('programTitle'), + }) + + return { + '_type': 'playlist', + 'id': show_id, + 'title': data.get('title'), + 'description': clean_html(data.get('subtitle')), + 'entries': entries, + } + + +class ORFFM4IE(ORFRadioIE): + IE_NAME = 'orf:fm4' + IE_DESC = 'radio FM4' + _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)' + _API_STATION = 'fm4' + _LOOP_STATION = 'fm4' + + _TEST = { + 'url': 'http://fm4.orf.at/player/20170107/4CC', + 'md5': '2b0be47375432a7ef104453432a19212', + 'info_dict': { + 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', + 'ext': 'mp3', + 'title': 'Solid Steel Radioshow', + 'description': 'Die Mixshow von Coldcut und Ninja Tune.', + 'duration': 3599, + 'timestamp': 1483819257, + 'upload_date': '20170107', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.', + 'only_matching': True, + } + + +class ORFNOEIE(ORFRadioIE): + IE_NAME = 'orf:noe' + IE_DESC = 'Radio Niederösterreich' + _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'noe' + _LOOP_STATION = 'oe2n' + + _TEST = { + 'url': 'https://noe.orf.at/player/20200423/NGM', + 'only_matching': True, + } + + +class ORFWIEIE(ORFRadioIE): + IE_NAME = 'orf:wien' + IE_DESC = 'Radio Wien' + _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'wie' + _LOOP_STATION = 'oe2w' + + 
_TEST = { + 'url': 'https://wien.orf.at/player/20200423/WGUM', + 'only_matching': True, + } + + +class ORFBGLIE(ORFRadioIE): + IE_NAME = 'orf:burgenland' + IE_DESC = 'Radio Burgenland' + _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'bgl' + _LOOP_STATION = 'oe2b' + + _TEST = { + 'url': 'https://burgenland.orf.at/player/20200423/BGM', + 'only_matching': True, + } + + +class ORFOOEIE(ORFRadioIE): + IE_NAME = 'orf:oberoesterreich' + IE_DESC = 'Radio Oberösterreich' + _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'ooe' + _LOOP_STATION = 'oe2o' + + _TEST = { + 'url': 'https://ooe.orf.at/player/20200423/OGMO', + 'only_matching': True, + } + + +class ORFSTMIE(ORFRadioIE): + IE_NAME = 'orf:steiermark' + IE_DESC = 'Radio Steiermark' + _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'stm' + _LOOP_STATION = 'oe2st' + + _TEST = { + 'url': 'https://steiermark.orf.at/player/20200423/STGMS', + 'only_matching': True, + } + + +class ORFKTNIE(ORFRadioIE): + IE_NAME = 'orf:kaernten' + IE_DESC = 'Radio Kärnten' + _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'ktn' + _LOOP_STATION = 'oe2k' + + _TEST = { + 'url': 'https://kaernten.orf.at/player/20200423/KGUMO', + 'only_matching': True, + } + + +class ORFSBGIE(ORFRadioIE): + IE_NAME = 'orf:salzburg' + IE_DESC = 'Radio Salzburg' + _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'sbg' + _LOOP_STATION = 'oe2s' + + _TEST = { + 'url': 'https://salzburg.orf.at/player/20200423/SGUM', + 'only_matching': True, + } + + +class ORFTIRIE(ORFRadioIE): + IE_NAME = 'orf:tirol' + IE_DESC = 'Radio Tirol' + _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'tir' + _LOOP_STATION = 'oe2t' + + _TEST = { + 'url': 'https://tirol.orf.at/player/20200423/TGUMO', + 'only_matching': True, + } + + +class ORFVBGIE(ORFRadioIE): + IE_NAME = 'orf:vorarlberg' + IE_DESC = 'Radio Vorarlberg' + _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'vbg' + _LOOP_STATION = 'oe2v' + + _TEST = { + 'url': 'https://vorarlberg.orf.at/player/20200423/VGUM', + 'only_matching': True, + } + + +class ORFOE3IE(ORFRadioIE): + IE_NAME = 'orf:oe3' + IE_DESC = 'Radio Österreich 3' + _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'oe3' + _LOOP_STATION = 'oe3' + + _TEST = { + 'url': 'https://oe3.orf.at/player/20200424/3WEK', + 'only_matching': True, + } + + +class ORFOE1IE(ORFRadioIE): + IE_NAME = 'orf:oe1' + IE_DESC = 'Radio Österreich 1' + _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + _API_STATION = 'oe1' + _LOOP_STATION = 'oe1' + + _TEST = { + 'url': 'http://oe1.orf.at/player/20170108/456544', + 'md5': '34d8a6e67ea888293741c86a099b745b', + 'info_dict': { + 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', + 'ext': 'mp3', + 'title': 'Morgenjournal', + 'duration': 609, + 'timestamp': 1483858796, + 'upload_date': '20170108', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' 
+ } + + +class ORFIPTVIE(InfoExtractor): + IE_NAME = 'orf:iptv' + IE_DESC = 'iptv.ORF.at' + _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + + _TEST = { + 'url': 'http://iptv.orf.at/stories/2275236/', + 'md5': 'c8b22af4718a4b4af58342529453e3e5', + 'info_dict': { + 'id': '350612', + 'ext': 'flv', + 'title': 'Weitere Evakuierungen um Vulkan Calbuco', + 'description': 'md5:d689c959bdbcf04efeddedbf2299d633', + 'duration': 68.197, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20150425', + }, + } + + def _real_extract(self, url): + story_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://iptv.orf.at/stories/%s' % story_id, story_id) + + video_id = self._search_regex( + r'data-video(?:id)?="(\d+)"', webpage, 'video id') + + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['default'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + } + + +class ORFFM4StoryIE(InfoExtractor): + IE_NAME = 'orf:fm4:story' + IE_DESC = 'fm4.orf.at stories' + _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' + + _TEST = { + 'url': 'http://fm4.orf.at/stories/2865738/', + 'playlist': [{ + 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', + 'info_dict': { + 'id': '547792', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + 'duration': 1748.52, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + }, { + 'md5': 'c6dd2179731f86f4f55a7b49899d515f', + 'info_dict': { + 'id': '547798', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live (2)', + 'duration': 1504.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. 
Hier gibt es Fotos und die gesamte Session als Video.', + }, + }], + } + + def _real_extract(self, url): + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + + entries = [] + all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) + for idx, video_id in enumerate(all_ids): + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['q8c'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') + if idx >= 1: + # Titles are duplicates, make them unique + title += ' (' + str(idx + 1) + ')' + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + entries.append({ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + + return self.playlist_result(entries) diff --git a/youtube_dl/extractor/outsidetv.py b/yt_dlp/extractor/outsidetv.py index c5333b08c..c5333b08c 100644 --- a/youtube_dl/extractor/outsidetv.py +++ b/yt_dlp/extractor/outsidetv.py diff --git a/yt_dlp/extractor/packtpub.py b/yt_dlp/extractor/packtpub.py new file mode 100644 index 000000000..c06fca795 --- /dev/null +++ b/yt_dlp/extractor/packtpub.py @@ -0,0 +1,163 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import ( + # compat_str, + compat_HTTPError, +) +from ..utils import ( + clean_html, + ExtractorError, + # remove_end, + str_or_none, + strip_or_none, + unified_timestamp, + # urljoin, +) + + +class PacktPubBaseIE(InfoExtractor): + # _PACKT_BASE = 'https://www.packtpub.com' + _STATIC_PRODUCTS_BASE = 'https://static.packt-cdn.com/products/' + + +class PacktPubIE(PacktPubBaseIE): + _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>[^/]+)/(?P<id>[^/]+)(?:/(?P<display_id>[^/?&#]+))?' 
+ + _TESTS = [{ + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', + 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', + 'info_dict': { + 'id': '20530', + 'ext': 'mp4', + 'title': 'Project Intro', + 'thumbnail': r're:(?i)^https?://.*\.jpg', + 'timestamp': 1490918400, + 'upload_date': '20170331', + }, + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro', + 'only_matching': True, + }, { + 'url': 'https://subscription.packtpub.com/video/programming/9781838988906/p1/video1_1/business-card-project', + 'only_matching': True, + }] + _NETRC_MACHINE = 'packtpub' + _TOKEN = None + + def _real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + try: + self._TOKEN = self._download_json( + 'https://services.packtpub.com/auth-v1/users/tokens', None, + 'Downloading Authorization Token', data=json.dumps({ + 'username': username, + 'password': password, + }).encode())['data']['access'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404): + message = self._parse_json(e.cause.read().decode(), None)['message'] + raise ExtractorError(message, expected=True) + raise + + def _real_extract(self, url): + course_id, chapter_id, video_id, display_id = self._match_valid_url(url).groups() + + headers = {} + if self._TOKEN: + headers['Authorization'] = 'Bearer ' + self._TOKEN + try: + video_url = self._download_json( + 'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id, + 'Downloading JSON video', headers=headers)['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self.raise_login_required('This video is locked') + raise + + # TODO: find a better way to avoid duplicating course requests + # metadata = self._download_json( + # '%s/products/%s/chapters/%s/sections/%s/metadata' + # % (self._MAPT_REST, course_id, chapter_id, video_id), + # video_id)['data'] + + # title = metadata['pageTitle'] + # course_title = metadata.get('title') + # if course_title: + # title = remove_end(title, ' - %s' % course_title) + # timestamp = unified_timestamp(metadata.get('publicationDate')) + # thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) + + return { + 'id': video_id, + 'url': video_url, + 'title': display_id or video_id, # title, + # 'thumbnail': thumbnail, + # 'timestamp': timestamp, + } + + +class PacktPubCourseIE(PacktPubBaseIE): + _VALID_URL = r'(?P<url>https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', + 'info_dict': { + 'id': '9781787122215', + 'title': 'Learn Nodejs by building 12 projects [Video]', + 'description': 'md5:489da8d953f416e51927b60a1c7db0aa', + }, + 'playlist_count': 90, + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PacktPubIE.suitable(url) else super( + PacktPubCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + url, course_id = mobj.group('url', 'id') + + course = self._download_json( + self._STATIC_PRODUCTS_BASE + '%s/toc' % course_id, course_id) + metadata = self._download_json( + self._STATIC_PRODUCTS_BASE + '%s/summary' % course_id, + course_id, fatal=False) or {} 
+ + entries = [] + for chapter_num, chapter in enumerate(course['chapters'], 1): + chapter_id = str_or_none(chapter.get('id')) + sections = chapter.get('sections') + if not chapter_id or not isinstance(sections, list): + continue + chapter_info = { + 'chapter': chapter.get('title'), + 'chapter_number': chapter_num, + 'chapter_id': chapter_id, + } + for section in sections: + section_id = str_or_none(section.get('id')) + if not section_id or section.get('contentType') != 'video': + continue + entry = { + '_type': 'url_transparent', + 'url': '/'.join([url, chapter_id, section_id]), + 'title': strip_or_none(section.get('title')), + 'description': clean_html(section.get('summary')), + 'thumbnail': metadata.get('coverImage'), + 'timestamp': unified_timestamp(metadata.get('publicationDate')), + 'ie_key': PacktPubIE.ie_key(), + } + entry.update(chapter_info) + entries.append(entry) + + return self.playlist_result( + entries, course_id, metadata.get('title'), + clean_html(metadata.get('about'))) diff --git a/yt_dlp/extractor/palcomp3.py b/yt_dlp/extractor/palcomp3.py new file mode 100644 index 000000000..d0a62fb17 --- /dev/null +++ b/yt_dlp/extractor/palcomp3.py @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, +) + + +class PalcoMP3BaseIE(InfoExtractor): + _GQL_QUERY_TMPL = '''{ + artist(slug: "%s") { + %s + } +}''' + _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { + %s + }''' + _MUSIC_FIELDS = '''duration + hls + mp3File + musicID + plays + title''' + + def _call_api(self, artist_slug, artist_fields): + return self._download_json( + 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ + 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), + })['data'] + + def _parse_music(self, music): + music_id = compat_str(music['musicID']) + title = music['title'] + + formats = [] + hls_url = music.get('hls') + if hls_url: + formats.append({ + 'url': hls_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + mp3_file = music.get('mp3File') + if mp3_file: + formats.append({ + 'url': mp3_file, + }) + + return { + 'id': music_id, + 'title': title, + 'formats': formats, + 'duration': int_or_none(music.get('duration')), + 'view_count': int_or_none(music.get('plays')), + } + + def _real_initialize(self): + self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS + + def _real_extract(self, url): + artist_slug, music_slug = self._match_valid_url(url).groups() + artist_fields = self._ARTIST_FIELDS_TMPL % music_slug + music = self._call_api(artist_slug, artist_fields)['artist']['music'] + return self._parse_music(music) + + +class PalcoMP3IE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:song' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', + 'info_dict': { + 'id': '3162927', + 'ext': 'mp3', + 'title': 'Nossas Composições - CUIDA BEM DELA', + 'duration': 210, + 'view_count': int, + } + }] + + @classmethod + def suitable(cls, url): + return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + + +class PalcoMP3ArtistIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:artist' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com.br/condedoforro/', + 
'info_dict': { + 'id': '358396', + 'title': 'Conde do Forró', + }, + 'playlist_mincount': 188, + }] + _ARTIST_FIELDS_TMPL = '''artistID + musics { + nodes { + %s + } + } + name''' + + @classmethod + def suitable(cls, url): + return False if PalcoMP3IE._match_valid_url(url) else super(PalcoMP3ArtistIE, cls).suitable(url) + + def _real_extract(self, url): + artist_slug = self._match_id(url) + artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] + + def entries(): + for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + yield self._parse_music(music) + + return self.playlist_result( + entries(), str_or_none(artist.get('artistID')), artist.get('name')) + + +class PalcoMP3VideoIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:video' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)/?#clipe' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_pD1nR2qqPg', + 'ext': 'mp4', + 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', + 'description': 'md5:7043342c09a224598e93546e98e49282', + 'upload_date': '20161107', + 'uploader_id': 'maiaramaraisaoficial', + 'uploader': 'Maiara e Maraisa', + } + }] + _MUSIC_FIELDS = 'youtubeID' + + def _parse_music(self, music): + youtube_id = music['youtubeID'] + return self.url_result(youtube_id, 'Youtube', youtube_id) diff --git a/yt_dlp/extractor/pandoratv.py b/yt_dlp/extractor/pandoratv.py new file mode 100644 index 000000000..623005338 --- /dev/null +++ b/yt_dlp/extractor/pandoratv.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + ExtractorError, + float_or_none, + parse_duration, + parse_qs, + str_to_int, + urlencode_postdata, +) + + +class PandoraTVIE(InfoExtractor): + IE_NAME = 'pandora.tv' + IE_DESC = '판도라TV' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format + (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format + m\.pandora\.tv/?\? 
# mobile + ) + ''' + _TESTS = [{ + 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', + 'info_dict': { + 'id': '53294230', + 'ext': 'flv', + 'title': '頭を撫でてくれる?', + 'description': '頭を撫でてくれる?', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 39, + 'upload_date': '20151218', + 'uploader': 'カワイイ動物まとめ', + 'uploader_id': 'mikakim', + 'view_count': int, + 'like_count': int, + } + }, { + 'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744', + 'info_dict': { + 'id': '54721744', + 'ext': 'flv', + 'title': '[HD] JAPAN COUNTDOWN 170423', + 'description': '[HD] JAPAN COUNTDOWN 170423', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1704.9, + 'upload_date': '20170423', + 'uploader': 'GOGO_UCC', + 'uploader_id': 'gogoucc', + 'view_count': int, + 'like_count': int, + }, + 'params': { + # Test metadata only + 'skip_download': True, + }, + }, { + 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', + 'only_matching': True, + }, { + 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user_id = mobj.group('user_id') + video_id = mobj.group('id') + + if not user_id or not video_id: + qs = parse_qs(url) + video_id = qs.get('prgid', [None])[0] + user_id = qs.get('ch_userid', [None])[0] + if any(not f for f in (video_id, user_id,)): + raise ExtractorError('Invalid URL', expected=True) + + data = self._download_json( + 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' + % (user_id, video_id), video_id) + + info = data['data']['rows']['vod_play_info']['result'] + + formats = [] + for format_id, format_url in info.items(): + if not format_url: + continue + height = self._search_regex( + r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) + if not height: + continue + + play_url = self._download_json( + 'http://m.pandora.tv/?c=api&m=play_url', video_id, + data=urlencode_postdata({ + 'prgid': video_id, + 'runtime': info.get('runtime'), + 'vod_url': format_url, + }), + headers={ + 'Origin': url, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + format_url = play_url.get('url') + if not format_url: + continue + + formats.append({ + 'format_id': '%sp' % height, + 'url': format_url, + 'height': int(height), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info['subject'], + 'description': info.get('body'), + 'thumbnail': info.get('thumbnail') or info.get('poster'), + 'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), + 'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None, + 'uploader': info.get('nickname'), + 'uploader_id': info.get('upload_userid'), + 'view_count': str_to_int(info.get('hit')), + 'like_count': str_to_int(info.get('likecnt')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py new file mode 100644 index 000000000..338b84d5b --- /dev/null +++ b/yt_dlp/extractor/paramountplus.py @@ -0,0 +1,145 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .cbs import CBSBaseIE +from ..utils import ( + int_or_none, + url_or_none, +) + + +class ParamountPlusIE(CBSBaseIE): + _VALID_URL = r'''(?x) + (?: + paramountplus:| + https?://(?:www\.)?(?: + paramountplus\.com/(?:shows/[^/]+/video|movies/[^/]+)/ + )(?P<id>[\w-]+))''' + + # All tests are 
blocked outside US + _TESTS = [{ + 'url': 'https://www.paramountplus.com/shows/catdog/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/catdog-climb-every-catdog-the-canine-mutiny/', + 'info_dict': { + 'id': 'Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k', + 'ext': 'mp4', + 'title': 'CatDog - Climb Every CatDog/The Canine Mutiny', + 'description': 'md5:7ac835000645a69933df226940e3c859', + 'duration': 1418, + 'timestamp': 920264400, + 'upload_date': '19990301', + 'uploader': 'CBSI-NEW', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://www.paramountplus.com/shows/tooning-out-the-news/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/7-23-21-week-in-review-rep-jahana-hayes-howard-fineman-sen-michael-bennet-sheera-frenkel-cecilia-kang-/', + 'info_dict': { + 'id': '6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd', + 'ext': 'mp4', + 'title': '7/23/21 WEEK IN REVIEW (Rep. Jahana Hayes/Howard Fineman/Sen. Michael Bennet/Sheera Frenkel & Cecilia Kang)', + 'description': 'md5:f4adcea3e8b106192022e121f1565bae', + 'duration': 2506, + 'timestamp': 1627063200, + 'upload_date': '20210723', + 'uploader': 'CBSI-NEW', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://www.paramountplus.com/movies/daddys-home/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC', + 'info_dict': { + 'id': 'vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC', + 'ext': 'mp4', + 'title': 'Daddy\'s Home', + 'upload_date': '20151225', + 'description': 'md5:a0beaf24e8d3b0e81b2ee41d47c06f33', + 'uploader': 'CBSI-NEW', + 'timestamp': 1451030400, + }, + 'params': { + 'skip_download': 'm3u8', + 'format': 'bestvideo', + }, + 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this + }, { + 'url': 'https://www.paramountplus.com/movies/sonic-the-hedgehog/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc', + 'info_dict': { + 'id': '5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc', + 'ext': 'mp4', + 'uploader': 'CBSI-NEW', + 'description': 'md5:bc7b6fea84ba631ef77a9bda9f2ff911', + 'timestamp': 1577865600, + 'title': 'Sonic the Hedgehog', + 'upload_date': '20200101', + }, + 'params': { + 'skip_download': 'm3u8', + 'format': 'bestvideo', + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + }, { + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq', + 'only_matching': True, + }] + + def _extract_video_info(self, content_id, mpx_acc=2198311517): + items_data = self._download_json( + 'https://www.paramountplus.com/apps-api/v2.0/androidtv/video/cid/%s.json' % content_id, + content_id, query={'locale': 'en-us', 'at': 'ABCqWNNSwhIqINWIIAG+DFzcFUvF8/vcN6cNyXFFfNzWAIvXuoVgX+fK4naOC7V8MLI='}, headers=self.geo_verification_headers()) + + asset_types = { + item.get('assetType'): { + 'format': 'SMIL', + 'formats': 'MPEG4,M3U', + } for item in items_data['itemList'] + } + item = items_data['itemList'][-1] + return self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info={ + 'title': item.get('title'), + 'series': item.get('seriesTitle'), + 'season_number': int_or_none(item.get('seasonNum')), + 'episode_number': int_or_none(item.get('episodeNum')), + 'duration': int_or_none(item.get('duration')), + 'thumbnail': url_or_none(item.get('thumbnail')), + }) + + +class ParamountPlusSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountplus\.com/shows/(?P<id>[a-zA-Z0-9-_]+)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 
'https://www.paramountplus.com/shows/drake-josh', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'drake-josh', + } + }, { + 'url': 'https://www.paramountplus.com/shows/hawaii_five_0/', + 'playlist_mincount': 240, + 'info_dict': { + 'id': 'hawaii_five_0', + } + }, { + 'url': 'https://www.paramountplus.com/shows/spongebob-squarepants/', + 'playlist_mincount': 248, + 'info_dict': { + 'id': 'spongebob-squarepants', + } + }] + _API_URL = 'https://www.paramountplus.com/shows/{}/xhr/episodes/page/0/size/100000/xs/0/season/0/' + + def _entries(self, show_name): + show_json = self._download_json(self._API_URL.format(show_name), video_id=show_name) + if show_json.get('success'): + for episode in show_json['result']['data']: + yield self.url_result( + 'https://www.paramountplus.com%s' % episode['url'], + ie=ParamountPlusIE.ie_key(), video_id=episode['content_id']) + + def _real_extract(self, url): + show_name = self._match_id(url) + return self.playlist_result(self._entries(show_name), playlist_id=show_name) diff --git a/yt_dlp/extractor/parliamentliveuk.py b/yt_dlp/extractor/parliamentliveuk.py new file mode 100644 index 000000000..869ebd865 --- /dev/null +++ b/yt_dlp/extractor/parliamentliveuk.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import uuid + +from .common import InfoExtractor +from ..utils import ( + unified_timestamp, + try_get, +) + + +class ParliamentLiveUKIE(InfoExtractor): + IE_NAME = 'parliamentlive.tv' + IE_DESC = 'UK parliament videos' + _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'info_dict': { + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'timestamp': 1395153872, + 'upload_date': '20140318', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_json(f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id) + _DEVICE_ID = str(uuid.uuid4()) + auth = 'Bearer ' + self._download_json( + 'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/auth/anonymous', + video_id, headers={ + 'Origin': 'https://videoplayback.parliamentlive.tv', + 'Accept': 'application/json, text/plain, */*', + 'Content-Type': 'application/json;charset=utf-8' + }, data=json.dumps({ + 'deviceId': _DEVICE_ID, + 'device': { + 'deviceId': _DEVICE_ID, + 'width': 653, + 'height': 368, + 'type': 'WEB', + 'name': ' Mozilla Firefox 91' + } + }).encode('utf-8'))['sessionToken'] + + video_urls = self._download_json( + f'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/entitlement/{video_id}/play', + video_id, headers={'Authorization': auth, 'Accept': 'application/json, text/plain, */*'})['formats'] + + formats = [] + for format in video_urls: + if not format.get('mediaLocator'): + continue + if format.get('format') == 'DASH': + formats.extend(self._extract_mpd_formats( + format['mediaLocator'], video_id, mpd_id='dash', fatal=False)) + elif format.get('format') == 'SMOOTHSTREAMING': + formats.extend(self._extract_ism_formats( + format['mediaLocator'], video_id, ism_id='ism', fatal=False)) + elif format.get('format') == 'HLS': + 
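+                # the HLS mediaLocator is an .m3u8 playlist URL; each variant
+                # stream it references becomes a separate entry in formats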
formats.extend(self._extract_m3u8_formats( + format['mediaLocator'], video_id, m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': video_info['event']['title'], + 'timestamp': unified_timestamp(try_get(video_info, lambda x: x['event']['publishedStartTime'])), + 'thumbnail': video_info.get('thumbnailUrl'), + } diff --git a/yt_dlp/extractor/parlview.py b/yt_dlp/extractor/parlview.py new file mode 100644 index 000000000..c85eaa7dc --- /dev/null +++ b/yt_dlp/extractor/parlview.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class ParlviewIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?parlview\.aph\.gov\.au/(?:[^/]+)?\bvideoID=(?P<id>\d{6})' + _TESTS = [{ + 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=542661', + 'info_dict': { + 'id': '542661', + 'ext': 'mp4', + 'title': "Australia's Family Law System [Part 2]", + 'duration': 5799, + 'description': 'md5:7099883b391619dbae435891ca871a62', + 'timestamp': 1621430700, + 'upload_date': '20210519', + 'uploader': 'Joint Committee', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=539936', + 'only_matching': True, + }] + _API_URL = 'https://parlview.aph.gov.au/api_v3/1/playback/getUniversalPlayerConfig?videoID=%s&format=json' + _MEDIA_INFO_URL = 'https://parlview.aph.gov.au/ajaxPlayer.php?videoID=%s&tabNum=4&action=loadTab' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + media = self._download_json(self._API_URL % video_id, video_id).get('media') + timestamp = try_get(media, lambda x: x['timeMap']['source']['timecode_offsets'][0], compat_str) or '/' + + stream = try_get(media, lambda x: x['renditions'][0], dict) + if not stream: + self.raise_no_formats('No streams were detected') + elif stream.get('streamType') != 'VOD': + self.raise_no_formats('Unknown type of stream was detected: "%s"' % str(stream.get('streamType'))) + formats = self._extract_m3u8_formats(stream['url'], video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + media_info = self._download_webpage( + self._MEDIA_INFO_URL % video_id, video_id, note='Downloading media info', fatal=False) + + return { + 'id': video_id, + 'url': url, + 'title': self._html_search_regex(r'<h2>([^<]+)<', webpage, 'title', fatal=False), + 'formats': formats, + 'duration': int_or_none(media.get('duration')), + 'timestamp': unified_timestamp(timestamp.split('/', 1)[1].replace('_', ' ')), + 'description': self._html_search_regex( + r'<div[^>]+class="descripti?on"[^>]*>[^>]+<strong>[^>]+>[^>]+>([^<]+)', + webpage, 'description', fatal=False), + 'uploader': self._html_search_regex( + r'<td>[^>]+>Channel:[^>]+>([^<]+)', media_info, 'channel', fatal=False), + 'thumbnail': media.get('staticImage'), + } diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py new file mode 100644 index 000000000..a189c0237 --- /dev/null +++ b/yt_dlp/extractor/patreon.py @@ -0,0 +1,242 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from .vimeo import VimeoIE + +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + KNOWN_EXTENSIONS, + mimetype2ext, + parse_iso8601, + 
str_or_none, + try_get, + url_or_none, +) + + +class PatreonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.patreon.com/creation?hid=743933', + 'md5': 'e25505eec1053a6e6813b8ed369875cc', + 'info_dict': { + 'id': '743933', + 'ext': 'mp3', + 'title': 'Episode 166: David Smalley of Dogma Debate', + 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + 'timestamp': 1406473987, + 'upload_date': '20140727', + 'uploader_id': '87145', + }, + }, { + 'url': 'http://www.patreon.com/creation?hid=754133', + 'md5': '3eb09345bf44bf60451b8b0b81759d0a', + 'info_dict': { + 'id': '754133', + 'ext': 'mp3', + 'title': 'CD 167 Extra', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + }, + 'skip': 'Patron-only content', + }, { + 'url': 'https://www.patreon.com/creation?hid=1682498', + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'uploader': 'TraciJHines', + 'thumbnail': 're:^https?://.*$', + 'upload_date': '20150211', + 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', + 'uploader_id': 'TraciJHines', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + } + }, { + 'url': 'https://www.patreon.com/posts/episode-166-of-743933', + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/posts/743933', + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/posts/kitchen-as-seen-51706779', + 'md5': '96656690071f6d64895866008484251b', + 'info_dict': { + 'id': '555089736', + 'ext': 'mp4', + 'title': 'KITCHEN AS SEEN ON DEEZ NUTS EXTENDED!', + 'uploader': 'Cold Ones', + 'thumbnail': 're:^https?://.*$', + 'upload_date': '20210526', + 'description': 'md5:557a409bd79d3898689419094934ba79', + 'uploader_id': '14936315', + }, + 'skip': 'Patron-only content' + }] + + # Currently Patreon exposes download URL via hidden CSS, so login is not + # needed. Keeping this commented for when this inevitably changes. 
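+    # Note: reviving the snippet below would also require re-importing re,
+    # sanitized_Request, compat_urllib_parse_urlencode and ExtractorError,
+    # none of which are imported at the top of this file any more.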
+ ''' + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_form = { + 'redirectUrl': 'http://www.patreon.com/', + 'email': username, + 'password': password, + } + + request = sanitized_Request( + 'https://www.patreon.com/processLogin', + compat_urllib_parse_urlencode(login_form).encode('utf-8') + ) + login_page = self._download_webpage(request, None, note='Logging in') + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() + ''' + + def _real_extract(self, url): + video_id = self._match_id(url) + post = self._download_json( + 'https://www.patreon.com/api/posts/' + video_id, video_id, query={ + 'fields[media]': 'download_url,mimetype,size_bytes', + 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title', + 'fields[user]': 'full_name,url', + 'json-api-use-default-includes': 'false', + 'include': 'media,user', + }) + attributes = post['data']['attributes'] + title = attributes['title'].strip() + image = attributes.get('image') or {} + info = { + 'id': video_id, + 'title': title, + 'description': clean_html(attributes.get('content')), + 'thumbnail': image.get('large_url') or image.get('url'), + 'timestamp': parse_iso8601(attributes.get('published_at')), + 'like_count': int_or_none(attributes.get('like_count')), + 'comment_count': int_or_none(attributes.get('comment_count')), + } + + for i in post.get('included', []): + i_type = i.get('type') + if i_type == 'media': + media_attributes = i.get('attributes') or {} + download_url = media_attributes.get('download_url') + ext = mimetype2ext(media_attributes.get('mimetype')) + if download_url and ext in KNOWN_EXTENSIONS: + info.update({ + 'ext': ext, + 'filesize': int_or_none(media_attributes.get('size_bytes')), + 'url': download_url, + }) + elif i_type == 'user': + user_attributes = i.get('attributes') + if user_attributes: + info.update({ + 'uploader': user_attributes.get('full_name'), + 'uploader_id': str_or_none(i.get('id')), + 'uploader_url': user_attributes.get('url'), + }) + + if not info.get('url'): + # handle Vimeo embeds + if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': + embed_html = try_get(attributes, lambda x: x['embed']['html']) + v_url = url_or_none(compat_urllib_parse_unquote( + self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False))) + if v_url: + info.update({ + '_type': 'url_transparent', + 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'), + 'ie_key': 'Vimeo', + }) + + if not info.get('url'): + embed_url = try_get(attributes, lambda x: x['embed']['url']) + if embed_url: + info.update({ + '_type': 'url', + 'url': embed_url, + }) + + if not info.get('url'): + post_file = attributes['post_file'] + ext = determine_ext(post_file.get('name')) + if ext in KNOWN_EXTENSIONS: + info.update({ + 'ext': ext, + 'url': post_file['url'], + }) + + return info + + +class PatreonUserIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>[-_\w\d]+)/?(?:posts/?)?' + + _TESTS = [{ + 'url': 'https://www.patreon.com/dissonancepod/', + 'info_dict': { + 'title': 'dissonancepod', + }, + 'playlist_mincount': 68, + 'expected_warnings': 'Post not viewable by current user! 
Skipping!',
+    }, {
+        'url': 'https://www.patreon.com/dissonancepod/posts',
+        'only_matching': True
+    }, ]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PatreonIE.suitable(url) else super(PatreonUserIE, cls).suitable(url)
+
+    def _entries(self, campaign_id, user_id):
+        cursor = None
+        params = {
+            'fields[campaign]': 'show_audio_post_download_links,name,url',
+            'fields[post]': 'current_user_can_view,embed,image,is_paid,post_file,published_at,patreon_url,url,post_type,thumbnail_url,title',
+            'filter[campaign_id]': campaign_id,
+            'filter[is_draft]': 'false',
+            'sort': '-published_at',
+            'json-api-version': 1.0,
+            'json-api-use-default-includes': 'false',
+        }
+
+        for page in itertools.count(1):
+            params.update({'page[cursor]': cursor} if cursor else {})
+            posts_json = self._download_json('https://www.patreon.com/api/posts', user_id, note='Downloading posts page %d' % page, query=params, headers={'Cookie': '.'})
+
+            cursor = try_get(posts_json, lambda x: x['meta']['pagination']['cursors']['next'])
+
+            for post in posts_json.get('data') or []:
+                yield self.url_result(url_or_none(try_get(post, lambda x: x['attributes']['patreon_url'])), 'Patreon')
+
+            if cursor is None:
+                break
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        webpage = self._download_webpage(url, user_id, headers={'Cookie': '.'})
+        campaign_id = self._search_regex(r'https://www\.patreon\.com/api/campaigns/(\d+)/?', webpage, 'Campaign ID')
+        return self.playlist_result(self._entries(campaign_id, user_id), playlist_title=user_id)
diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py
new file mode 100644
index 000000000..0eabf9bee
--- /dev/null
+++ b/yt_dlp/extractor/pbs.py
@@ -0,0 +1,699 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+    float_or_none,
+    js_to_json,
+    orderedSet,
+    strip_jsonp,
+    strip_or_none,
+    unified_strdate,
+    url_or_none,
+    US_RATINGS,
+)
+
+
+class PBSIE(InfoExtractor):
+    _STATIONS = (
+        (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/
+        (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/
+        (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/
+        (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org
+        (r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org
+        (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/
+        (r'video\.wsre\.org', 'WSRE (WSRE)'), # http://www.wsre.org
+        (r'video\.wtcitv\.org', 'WTCI (WTCI)'), # http://www.wtcitv.org
+        (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/
+        (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm
+        # (r'kuac\.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/
+        # (r'ktoo\.org', '360 North (KTOO)'), # http://www.ktoo.org/
+        # (r'azpm\.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/
+        (r'video\.azpbs\.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org
+        (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/
+        (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/
+        (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/
+        (r'video\.ket\.org', 'KET (WKLE)'), # http://www.ket.org/
+        (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/
+
(r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ + (r'videos\.oeta\.tv', 'OETA (KETA)'), # http://www.oeta.tv + (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ + (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ + (r'video\.keet\.org', 'KEET TV (KEET)'), # http://www.keet.org + (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ + (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ + (r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org + (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org + (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ + (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ + (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org + (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ + (r'video\.soptv\.org', 'SOPTV (KSYS)'), # http://www.soptv.org + # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org + # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org + # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org + (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org + (r'video\.kenw\.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org + (r'video\.kued\.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org + (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org + (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ + (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ + (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org + (r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org + (r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org + (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ + # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ + (r'watch\.wliw\.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ + (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org + (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org + (r'video\.whyy\.org', 'WHYY (WHYY)'), # http://www.whyy.org + (r'video\.wlvt\.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ + (r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! 
(WVPT)'), # http://www.wvpt.net + (r'video\.whut\.org', 'Howard University Television (WHUT)'), # http://www.whut.org + (r'video\.wedu\.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org + (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ + # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org + (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org + (r'video\.wucftv\.org', 'WUCF TV (WUCF)'), # http://wucftv.org + (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org + (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ + (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ + (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ + (r'video\.scetv\.org', 'ETV (WRLK)'), # http://www.scetv.org + (r'video\.unctv\.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ + # (r'pbsguam\.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ + (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ + (r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org + (r'video\.ksps\.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ + (r'watch\.opb\.org', 'OPB (KOPB)'), # http://www.opb.org + (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org + (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ + (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv + (r'video\.wttw\.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ + # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ + (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ + (r'video\.ninenet\.org', 'Nine Network (KETC)'), # http://www.ninenet.org + (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ + (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org + (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org + (r'video\.wnin\.org', 'WNIN (WNIN)'), # http://www.wnin.org/ + (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ + (r'video\.wpt\.org', 'WPT (WPNE)'), # http://www.wpt.org/ + (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ + (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net + (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org + (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org + # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ + (r'video\.wipb\.org', 'WIPB-TV (WIPB)'), # http://wipb.org + (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ + (r'watch\.cetconnect\.org', 'CET (WCET)'), # http://www.cetconnect.org + (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org + (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'), # http://wbgu.org + (r'video\.wgvu\.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ + (r'video\.netnebraska\.org', 'NET1 (KUON)'), # http://netnebraska.org + (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org + (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org + (r'video\.tpt\.org', 'TPT (KTCA)'), # http://www.tpt.org + (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ + (r'watch\.kpts\.org', 'KPTS/Channel 8 
(KPTS)'), # http://www.kpts.org/ + (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org + # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org + # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ + # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ + (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org + (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org + (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ + (r'video\.wosu\.org', 'WOSU TV (WOSU)'), # http://wosu.org/ + (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 + (r'video\.wvpublic\.org', 'WVPB (WVPB)'), # http://wvpublic.org/ + (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org + # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org + (r'video\.kera\.org', 'KERA 13 (KERA)'), # http://www.kera.org/ + (r'video\.mpbn\.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ + (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ + (r'video\.nhptv\.org', 'NHPTV (WENH)'), # http://nhptv.org/ + (r'video\.vpt\.org', 'Vermont PBS (WETK)'), # http://www.vpt.org + (r'video\.witf\.org', 'witf (WITF)'), # http://www.witf.org + (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ + (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ + (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org + (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ + (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org + (r'video\.wkar\.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ + (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu + (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ + (r'video\.wgte\.org', 'WGTE TV (WGTE)'), # http://www.wgte.org + (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org + # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ + (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ + (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org + (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org + (r'video\.kacvtv\.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ + (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org + (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org + (r'video\.wned\.org', 'WNED (WNED)'), # http://www.wned.org/ + (r'watch\.wpbstv\.org', 'WPBS (WPBS)'), # http://www.wpbstv.org + (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'), # http://wskg.org + (r'video\.wxxi\.org', 'WXXI (WXXI)'), # http://wxxi.org + (r'video\.wpsu\.org', 'WPSU (WPSU)'), # http://www.wpsu.org + # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org + (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ + (r'video\.wtvi\.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ + # (r'whro\.org', 'WHRO (WHRO)'), # http://whro.org + (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ + (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ + (r'video\.kcts9\.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/ 
+ (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org + (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ + # (r'tamu\.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu + # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org + (r'video\.klrn\.org', 'KLRN (KLRN)'), # http://www.klrn.org + (r'video\.klru\.tv', 'KLRU (KLRU)'), # http://www.klru.org + # (r'kmbh\.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org + # (r'knct\.org', 'KNCT (KNCT)'), # http://www.knct.org + # (r'ktxt\.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org + (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ + (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ + (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org + ) + + IE_NAME = 'pbs' + IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1]) + + _VALID_URL = r'''(?x)https?:// + (?: + # Direct video URL + (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) | + # Article with embedded player (or direct video) + (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | + # Player + (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ + ) + ''' % '|'.join(list(zip(*_STATIONS))[0]) + + _GEO_COUNTRIES = ['US'] + + _TESTS = [ + { + 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', + 'md5': '173dc391afd361fa72eab5d3d918968d', + 'info_dict': { + 'id': '2365006249', + 'ext': 'mp4', + 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', + 'description': 'md5:31b664af3c65fd07fa460d306b837d00', + 'duration': 3190, + }, + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', + 'md5': '6f722cb3c3982186d34b0f13374499c7', + 'info_dict': { + 'id': '2365297690', + 'ext': 'mp4', + 'title': 'FRONTLINE - Losing Iraq', + 'description': 'md5:5979a4d069b157f622d02bff62fbe654', + 'duration': 5050, + }, + }, + { + 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', + 'md5': 'b19856d7f5351b17a5ab1dc6a64be633', + 'info_dict': { + 'id': '2201174722', + 'ext': 'mp4', + 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', + 'description': 'md5:86ab9a3d04458b876147b355788b8781', + 'duration': 801, + }, + }, + { + 'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/', + 'md5': 'c62859342be2a0358d6c9eb306595978', + 'info_dict': { + 'id': '2365297708', + 'ext': 'mp4', + 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', + 'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b', + 'duration': 6559, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', + 'md5': '908f3e5473a693b266b84e25e1cf9703', + 'info_dict': { + 'id': '2365160389', + 'display_id': 'killer-typhoon', + 'ext': 'mp4', + 'description': 'md5:c741d14e979fc53228c575894094f157', + 'title': 'NOVA - Killer Typhoon', + 'duration': 3172, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140122', + 'age_limit': 10, + }, + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', + 'info_dict': { + 'id': 'united-states-of-secrets', + }, + 'playlist_count': 2, + }, + { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 
'great-war',
+            },
+            'playlist_count': 3,
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+            'info_dict': {
+                'id': '2276541483',
+                'display_id': 'player',
+                'ext': 'mp4',
+                'title': 'American Experience - Death and the Civil War, Chapter 1',
+                'description': 'md5:67fa89a9402e2ee7d08f53b920674c18',
+                'duration': 682,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True, # requires ffmpeg
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/video/2365245528/',
+            'md5': '115223d41bd55cda8ae5cd5ed4e11497',
+            'info_dict': {
+                'id': '2365245528',
+                'display_id': '2365245528',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - United States of Secrets (Part One)',
+                'description': 'md5:55756bd5c551519cc4b7703e373e217e',
+                'duration': 6851,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            # Video embedded in iframe containing angle brackets as attribute's value (e.g.
+            # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
+            # https://github.com/ytdl-org/youtube-dl/issues/7059)
+            'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+            'md5': '59b0ef5009f9ac8a319cc5efebcd865e',
+            'info_dict': {
+                'id': '2365546844',
+                'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
+                'ext': 'mp4',
+                'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
+                'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
+                'duration': 1480,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            # Frontline video embedded via flp2012.js
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists',
+            'info_dict': {
+                'id': '2070868960',
+                'display_id': 'the-atomic-artists',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - The Atomic Artists',
+                'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
+                'duration': 723,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True, # requires ffmpeg
+            },
+        },
+        {
+            # Serves HD only via widget/partnerplayer page
+            'url': 'http://www.pbs.org/video/2365641075/',
+            'md5': 'fdf907851eab57211dd589cf12006666',
+            'info_dict': {
+                'id': '2365641075',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - Netanyahu at War',
+                'duration': 6852,
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'formats': 'mincount:8',
+            },
+        },
+        {
+            # https://github.com/ytdl-org/youtube-dl/issues/13801
+            'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+            'info_dict': {
+                'id': '3003333873',
+                'ext': 'mp4',
+                'title': 'PBS NewsHour - full episode July 31, 2017',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 3265,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/',
+            'info_dict': {
+                'id': '2365936247',
+                'ext': 'mp4',
+                'title': 'Antiques Roadshow - Indianapolis, Hour 2',
+                'description': 'md5:524b32249db55663e7231b6b8d1671a2',
+                'duration': 3180,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
+            'info_dict': {
+                'id': '3007193718',
+                'ext': 'mp4',
+                'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
+                'description': 'md5:37efbac85e0c09b009586523ec143652',
+                'duration': 6292,
+                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            },
+            'params': {
+                'skip_download': True,
+            },
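+            # presumably one of the non-fatal manifest downloads is geo-fenced
+            # and returns 403, which surfaces as the warning expected below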
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
+            'info_dict': {
+                'id': '3011407934',
+                'ext': 'mp4',
+                'title': 'Stories from the Stage - Road Trip',
+                'duration': 1619,
+                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://watch.knpb.org/video/2365616055/',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+            'only_matching': True,
+        }
+    ]
+    _ERRORS = {
+        101: 'We\'re sorry, but this video is not yet available.',
+        403: 'We\'re sorry, but this video is not available in your region due to rights restrictions.',
+        404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.',
+        410: 'This video has expired and is no longer available for online streaming.',
+    }
+
+    def _real_initialize(self):
+        cookie = (self._download_json(
+            'http://localization.services.pbs.org/localize/auto/cookie/',
+            None, headers=self.geo_verification_headers(), fatal=False) or {}).get('cookie')
+        if cookie:
+            station = self._search_regex(r'#?s=\["([^"]+)"', cookie, 'station')
+            if station:
+                self._set_cookie('.pbs.org', 'pbsol.station', station)
+
+    def _extract_webpage(self, url):
+        mobj = self._match_valid_url(url)
+
+        description = None
+
+        presumptive_id = mobj.group('presumptive_id')
+        display_id = presumptive_id
+        if presumptive_id:
+            webpage = self._download_webpage(url, display_id)
+
+            description = strip_or_none(self._og_search_description(
+                webpage, default=None) or self._html_search_meta(
+                'description', webpage, default=None))
+            upload_date = unified_strdate(self._search_regex(
+                r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
+                webpage, 'upload date', default=None))
+
+            # tabbed frontline videos
+            MULTI_PART_REGEXES = (
+                r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
+                r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
+            )
+            for p in MULTI_PART_REGEXES:
+                tabbed_videos = orderedSet(re.findall(p, webpage))
+                if tabbed_videos:
+                    return tabbed_videos, presumptive_id, upload_date, description
+
+            MEDIA_ID_REGEXES = [
+                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
+                r'class="coveplayerid">([^<]+)<', # coveplayer
+                r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
+                r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
+                r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
+                r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
+                r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
+            ]
+
+            media_id = self._search_regex(
+                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
+            if media_id:
+                return media_id, presumptive_id, upload_date, description
+
+            # Frontline video embedded via flp
+            video_id = self._search_regex(
+                r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
+            if video_id:
+                # pkg_id calculation is reverse engineered from
+                #
http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js + prg_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:] + if 'q' in prg_id: + prg_id = prg_id.split('q')[1] + prg_id = int(prg_id, 16) + getdir = self._download_json( + 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, + presumptive_id, 'Downloading getdir JSON', + transform_source=strip_jsonp) + return getdir['mid'], presumptive_id, upload_date, description + + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): + url = self._search_regex( + r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + + if not url: + url = self._og_search_url(webpage) + + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) + + player_id = mobj.group('player_id') + if not display_id: + display_id = player_id + if player_id: + player_page = self._download_webpage( + url, display_id, note='Downloading player page', + errnote='Could not download player page') + video_id = self._search_regex( + r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', + default=None) + if not video_id: + video_info = self._extract_video_data( + player_page, 'video data', display_id) + video_id = compat_str( + video_info.get('id') or video_info['contentID']) + else: + video_id = mobj.group('id') + display_id = video_id + + return video_id, display_id, None, description + + def _extract_video_data(self, string, name, video_id, fatal=True): + return self._parse_json( + self._search_regex( + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + r'window\.videoBridge\s*=\s*({.+?});'], + string, name, default='{}'), + video_id, transform_source=js_to_json, fatal=fatal) + + def _real_extract(self, url): + video_id, display_id, upload_date, description = self._extract_webpage(url) + + if isinstance(video_id, list): + entries = [self.url_result( + 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) + for vid_id in video_id] + return self.playlist_result(entries, display_id) + + info = None + redirects = [] + redirect_urls = set() + + def extract_redirect_urls(info): + for encoding_name in ('recommended_encoding', 'alternate_encoding'): + redirect = info.get(encoding_name) + if not redirect: + continue + redirect_url = redirect.get('url') + if redirect_url and redirect_url not in redirect_urls: + redirects.append(redirect) + redirect_urls.add(redirect_url) + encodings = info.get('encodings') + if isinstance(encodings, list): + for encoding in encodings: + encoding_url = url_or_none(encoding) + if encoding_url and encoding_url not in redirect_urls: + redirects.append({'url': encoding_url}) + redirect_urls.add(encoding_url) + + chapters = [] + # Player pages may also serve different qualities + for page in ('widget/partnerplayer', 'portalplayer'): + player = self._download_webpage( + 'http://player.pbs.org/%s/%s' % (page, video_id), + display_id, 'Downloading %s page' % page, fatal=False) + if player: + video_info = self._extract_video_data( + player, '%s video data' % page, display_id, fatal=False) + if video_info: + extract_redirect_urls(video_info) + if not info: + info = video_info + if not chapters: + raw_chapters = video_info.get('chapters') or [] + if not raw_chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + raw_chapters.append(chapter) + for chapter in raw_chapters: + start_time = 
float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) + + formats = [] + http_url = None + hls_subs = {} + for num, redirect in enumerate(redirects): + redirect_id = redirect.get('eeid') + + redirect_info = self._download_json( + '%s?format=json' % redirect['url'], display_id, + 'Downloading %s video url info' % (redirect_id or num), + headers=self.geo_verification_headers()) + + if redirect_info['status'] == 'error': + message = self._ERRORS.get( + redirect_info['http_code'], redirect_info['message']) + if redirect_info['http_code'] == 403: + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + + format_url = redirect_info.get('url') + if not format_url: + continue + + if determine_ext(format_url) == 'm3u8': + hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles( + format_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_formats) + else: + formats.append({ + 'url': format_url, + 'format_id': redirect_id, + }) + if re.search(r'^https?://.*(?:\d+k|baseline)', format_url): + http_url = format_url + self._remove_duplicate_formats(formats) + m3u8_formats = list(filter( + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', + formats)) + if http_url: + for m3u8_format in m3u8_formats: + bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) + # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), + # we won't try extracting them. + # Since summer 2016 higher quality formats (4500k and 6500k) are also available + # albeit they are not documented in [2]. + # 1. https://github.com/ytdl-org/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications + if not bitrate or int(bitrate) < 400: + continue + f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) + # This may produce invalid links sometimes (e.g. + # http://www.pbs.org/wgbh/frontline/film/suicide-plan) + if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): + continue + f = m3u8_format.copy() + f.update({ + 'url': f_url, + 'format_id': m3u8_format['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + self._sort_formats(formats) + + rating_str = info.get('rating') + if rating_str is not None: + rating_str = rating_str.rpartition('-')[2] + age_limit = US_RATINGS.get(rating_str) + + subtitles = {} + captions = info.get('cc') or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({ + 'url': caption_url + }) + subtitles = self._merge_subtitles(subtitles, hls_subs) + + # info['title'] is often incomplete (e.g. 
'Full Episode', 'Episode 5', etc) + # Try turning it to 'program - title' naming scheme if possible + alt_title = info.get('program', {}).get('title') + if alt_title: + info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title']) + + description = info.get('description') or info.get( + 'program', {}).get('description') or description + + return { + 'id': video_id, + 'display_id': display_id, + 'title': info['title'], + 'description': description, + 'thumbnail': info.get('image_url'), + 'duration': int_or_none(info.get('duration')), + 'age_limit': age_limit, + 'upload_date': upload_date, + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + } diff --git a/youtube_dl/extractor/pearvideo.py b/yt_dlp/extractor/pearvideo.py index 1d777221c..1d777221c 100644 --- a/youtube_dl/extractor/pearvideo.py +++ b/yt_dlp/extractor/pearvideo.py diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py new file mode 100644 index 000000000..1e22f24e3 --- /dev/null +++ b/yt_dlp/extractor/peertube.py @@ -0,0 +1,1402 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, + OnDemandPagedList, +) + + +class PeerTubeIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + # Taken from https://instances.joinpeertube.org/instances + 40two\.tube| + a\.metube\.ch| + advtv\.ml| + algorithmic\.tv| + alimulama\.com| + arcana\.fun| + archive\.vidicon\.org| + artefac-paris\.tv| + auf1\.eu| + battlepenguin\.video| + beertube\.epgn\.ch| + befree\.nohost\.me| + bideoak\.argia\.eus| + birkeundnymphe\.de| + bitcointv\.com| + cattube\.org| + clap\.nerv-project\.eu| + climatejustice\.video| + comf\.tube| + conspiracydistillery\.com| + darkvapor\.nohost\.me| + daschauher\.aksel\.rocks| + digitalcourage\.video| + dreiecksnebel\.alex-detsch\.de| + eduvid\.org| + evangelisch\.video| + exo\.tube| + fair\.tube| + fediverse\.tv| + film\.k-prod\.fr| + flim\.txmn\.tk| + fotogramas\.politicaconciencia\.org| + ftsi\.ru| + gary\.vger\.cloud| + graeber\.video| + greatview\.video| + grypstube\.uni-greifswald\.de| + highvoltage\.tv| + hpstube\.fr| + htp\.live| + hyperreal\.tube| + juggling\.digital| + kino\.kompot\.si| + kino\.schuerz\.at| + kinowolnosc\.pl| + kirche\.peertube-host\.de| + kodcast\.com| + kolektiva\.media| + kraut\.zone| + kumi\.tube| + lastbreach\.tv| + lepetitmayennais\.fr\.nf| + lexx\.impa\.me| + libertynode\.tv| + libra\.syntazia\.org| + libremedia\.video| + live\.libratoi\.org| + live\.nanao\.moe| + live\.toobnix\.org| + livegram\.net| + lolitube\.freedomchan\.moe| + lucarne\.balsamine\.be| + maindreieck-tv\.de| + mani\.tube| + manicphase\.me| + media\.gzevd\.de| + media\.inno3\.cricket| + media\.kaitaia\.life| + media\.krashboyz\.org| + media\.over-world\.org| + media\.skewed\.de| + media\.undeadnetwork\.de| + medias\.pingbase\.net| + melsungen\.peertube-host\.de| + mirametube\.fr| + mojotube\.net| + monplaisirtube\.ddns\.net| + mountaintown\.video| + my\.bunny\.cafe| + myfreetube\.de| + mytube\.kn-cloud\.de| + mytube\.madzel\.de| + myworkoutarenapeertube\.cf| + nanawel-peertube\.dyndns\.org| + nastub\.cz| + offenes\.tv| + orgdup\.media| + ovaltube\.codinglab\.ch| + p2ptv\.ru| + p\.eertu\.be| + p\.lu| + peer\.azurs\.fr| + peertube1\.zeteo\.me| + peertube\.020\.pl| + peertube\.0x5e\.eu| + peertube\.alpharius\.io| + 
peertube\.am-networks\.fr| + peertube\.anduin\.net| + peertube\.anzui\.dev| + peertube\.arbleizez\.bzh| + peertube\.art3mis\.de| + peertube\.atilla\.org| + peertube\.atsuchan\.page| + peertube\.aukfood\.net| + peertube\.aventer\.biz| + peertube\.b38\.rural-it\.org| + peertube\.beeldengeluid\.nl| + peertube\.be| + peertube\.bgzashtita\.es| + peertube\.bitsandlinux\.com| + peertube\.biz| + peertube\.boba\.best| + peertube\.br0\.fr| + peertube\.bridaahost\.ynh\.fr| + peertube\.bubbletea\.dev| + peertube\.bubuit\.net| + peertube\.cabaal\.net| + peertube\.cats-home\.net| + peertube\.chemnitz\.freifunk\.net| + peertube\.chevro\.fr| + peertube\.chrisspiegl\.com| + peertube\.chtisurel\.net| + peertube\.cipherbliss\.com| + peertube\.cloud\.sans\.pub| + peertube\.cpge-brizeux\.fr| + peertube\.ctseuro\.com| + peertube\.cuatrolibertades\.org| + peertube\.cybercirujas\.club| + peertube\.cythin\.com| + peertube\.davigge\.com| + peertube\.dc\.pini\.fr| + peertube\.debian\.social| + peertube\.demonix\.fr| + peertube\.designersethiques\.org| + peertube\.desmu\.fr| + peertube\.devloprog\.org| + peertube\.devol\.it| + peertube\.dtmf\.ca| + peertube\.ecologie\.bzh| + peertube\.eu\.org| + peertube\.european-pirates\.eu| + peertube\.euskarabildua\.eus| + peertube\.fenarinarsa\.com| + peertube\.fomin\.site| + peertube\.forsud\.be| + peertube\.francoispelletier\.org| + peertube\.freenet\.ru| + peertube\.freetalklive\.com| + peertube\.functional\.cafe| + peertube\.gardeludwig\.fr| + peertube\.gargantia\.fr| + peertube\.gcfamily\.fr| + peertube\.genma\.fr| + peertube\.get-racing\.de| + peertube\.gidikroon\.eu| + peertube\.gruezishop\.ch| + peertube\.habets\.house| + peertube\.hackerfraternity\.org| + peertube\.ichigo\.everydayimshuflin\.com| + peertube\.ignifi\.me| + peertube\.inapurna\.org| + peertube\.informaction\.info| + peertube\.interhop\.org| + peertube\.iselfhost\.com| + peertube\.it| + peertube\.jensdiemer\.de| + peertube\.joffreyverd\.fr| + peertube\.kalua\.im| + peertube\.kathryl\.fr| + peertube\.keazilla\.net| + peertube\.klaewyss\.fr| + peertube\.kodcast\.com| + peertube\.kx\.studio| + peertube\.lagvoid\.com| + peertube\.lavallee\.tech| + peertube\.le5emeaxe\.fr| + peertube\.lestutosdeprocessus\.fr| + peertube\.librenet\.co\.za| + peertube\.logilab\.fr| + peertube\.louisematic\.site| + peertube\.luckow\.org| + peertube\.luga\.at| + peertube\.lyceeconnecte\.fr| + peertube\.manalejandro\.com| + peertube\.marud\.fr| + peertube\.mattone\.net| + peertube\.maxweiss\.io| + peertube\.monlycee\.net| + peertube\.mxinfo\.fr| + peertube\.myrasp\.eu| + peertube\.nebelcloud\.de| + peertube\.netzbegruenung\.de| + peertube\.newsocial\.tech| + peertube\.nicolastissot\.fr| + peertube\.nz| + peertube\.offerman\.com| + peertube\.opencloud\.lu| + peertube\.orthus\.link| + peertube\.patapouf\.xyz| + peertube\.pi2\.dev| + peertube\.plataformess\.org| + peertube\.pl| + peertube\.portaesgnos\.org| + peertube\.r2\.enst\.fr| + peertube\.r5c3\.fr| + peertube\.radres\.xyz| + peertube\.red| + peertube\.robonomics\.network| + peertube\.rtnkv\.cloud| + peertube\.runfox\.tk| + peertube\.satoshishop\.de| + peertube\.scic-tetris\.org| + peertube\.securitymadein\.lu| + peertube\.semweb\.pro| + peertube\.social\.my-wan\.de| + peertube\.soykaf\.org| + peertube\.stefofficiel\.me| + peertube\.stream| + peertube\.su| + peertube\.swrs\.net| + peertube\.takeko\.cyou| + peertube\.tangentfox\.com| + peertube\.taxinachtegel\.de| + peertube\.thenewoil\.xyz| + peertube\.ti-fr\.com| + peertube\.tiennot\.net| + peertube\.troback\.com| + 
peertube\.tspu\.edu\.ru| + peertube\.tux\.ovh| + peertube\.tv| + peertube\.tweb\.tv| + peertube\.ucy\.de| + peertube\.underworld\.fr| + peertube\.us\.to| + peertube\.ventresmous\.fr| + peertube\.vlaki\.cz| + peertube\.w\.utnw\.de| + peertube\.westring\.digital| + peertube\.xwiki\.com| + peertube\.zoz-serv\.org| + peervideo\.ru| + periscope\.numenaute\.org| + perron-tube\.de| + petitlutinartube\.fr| + phijkchu\.com| + pierre\.tube| + piraten\.space| + play\.rosano\.ca| + player\.ojamajo\.moe| + plextube\.nl| + pocketnetpeertube1\.nohost\.me| + pocketnetpeertube3\.nohost\.me| + pocketnetpeertube4\.nohost\.me| + pocketnetpeertube5\.nohost\.me| + pocketnetpeertube6\.nohost\.me| + pt\.24-7\.ro| + pt\.apathy\.top| + pt\.diaspodon\.fr| + pt\.fedi\.tech| + pt\.maciej\.website| + ptb\.lunarviews\.net| + ptmir1\.inter21\.net| + ptmir2\.inter21\.net| + ptmir3\.inter21\.net| + ptmir4\.inter21\.net| + ptmir5\.inter21\.net| + ptube\.horsentiers\.fr| + ptube\.xmanifesto\.club| + queermotion\.org| + re-wizja\.re-medium\.com| + regarder\.sans\.pub| + ruraletv\.ovh| + s1\.gegenstimme\.tv| + s2\.veezee\.tube| + sdmtube\.fr| + sender-fm\.veezee\.tube| + serv1\.wiki-tube\.de| + serv3\.wiki-tube\.de| + sickstream\.net| + sleepy\.tube| + sovran\.video| + spectra\.video| + stream\.elven\.pw| + stream\.k-prod\.fr| + stream\.shahab\.nohost\.me| + streamsource\.video| + studios\.racer159\.com| + testtube\.florimond\.eu| + tgi\.hosted\.spacebear\.ee| + thaitube\.in\.th| + the\.jokertv\.eu| + theater\.ethernia\.net| + thecool\.tube| + tilvids\.com| + toob\.bub\.org| + tpaw\.video| + truetube\.media| + tuba\.lhub\.pl| + tube-aix-marseille\.beta\.education\.fr| + tube-amiens\.beta\.education\.fr| + tube-besancon\.beta\.education\.fr| + tube-bordeaux\.beta\.education\.fr| + tube-clermont-ferrand\.beta\.education\.fr| + tube-corse\.beta\.education\.fr| + tube-creteil\.beta\.education\.fr| + tube-dijon\.beta\.education\.fr| + tube-education\.beta\.education\.fr| + tube-grenoble\.beta\.education\.fr| + tube-lille\.beta\.education\.fr| + tube-limoges\.beta\.education\.fr| + tube-montpellier\.beta\.education\.fr| + tube-nancy\.beta\.education\.fr| + tube-nantes\.beta\.education\.fr| + tube-nice\.beta\.education\.fr| + tube-normandie\.beta\.education\.fr| + tube-orleans-tours\.beta\.education\.fr| + tube-outremer\.beta\.education\.fr| + tube-paris\.beta\.education\.fr| + tube-poitiers\.beta\.education\.fr| + tube-reims\.beta\.education\.fr| + tube-rennes\.beta\.education\.fr| + tube-strasbourg\.beta\.education\.fr| + tube-toulouse\.beta\.education\.fr| + tube-versailles\.beta\.education\.fr| + tube1\.it\.tuwien\.ac\.at| + tube\.abolivier\.bzh| + tube\.ac-amiens\.fr| + tube\.aerztefueraufklaerung\.de| + tube\.alexx\.ml| + tube\.amic37\.fr| + tube\.anufrij\.de| + tube\.apolut\.net| + tube\.arkhalabs\.io| + tube\.arthack\.nz| + tube\.as211696\.net| + tube\.avensio\.de| + tube\.azbyka\.ru| + tube\.azkware\.net| + tube\.bachaner\.fr| + tube\.bmesh\.org| + tube\.borked\.host| + tube\.bstly\.de| + tube\.chaoszone\.tv| + tube\.chatelet\.ovh| + tube\.cloud-libre\.eu| + tube\.cms\.garden| + tube\.cowfee\.moe| + tube\.cryptography\.dog| + tube\.darknight-coffee\.org| + tube\.dev\.lhub\.pl| + tube\.distrilab\.fr| + tube\.dsocialize\.net| + tube\.ebin\.club| + tube\.fdn\.fr| + tube\.florimond\.eu| + tube\.foxarmy\.ml| + tube\.foxden\.party| + tube\.frischesicht\.de| + tube\.futuretic\.fr| + tube\.gnous\.eu| + tube\.grap\.coop| + tube\.graz\.social| + tube\.grin\.hu| + tube\.hackerscop\.org| + tube\.hordearii\.fr| + tube\.jeena\.net| + 
tube\.kai-stuht\.com| + tube\.kockatoo\.org| + tube\.kotur\.org| + tube\.lacaveatonton\.ovh| + tube\.linkse\.media| + tube\.lokad\.com| + tube\.lucie-philou\.com| + tube\.melonbread\.xyz| + tube\.mfraters\.net| + tube\.motuhake\.xyz| + tube\.mrbesen\.de| + tube\.nah\.re| + tube\.nchoco\.net| + tube\.novg\.net| + tube\.nox-rhea\.org| + tube\.nuagelibre\.fr| + tube\.nx12\.net| + tube\.octaplex\.net| + tube\.odat\.xyz| + tube\.oisux\.org| + tube\.opportunis\.me| + tube\.org\.il| + tube\.ortion\.xyz| + tube\.others\.social| + tube\.picasoft\.net| + tube\.plomlompom\.com| + tube\.pmj\.rocks| + tube\.portes-imaginaire\.org| + tube\.pyngu\.com| + tube\.rebellion\.global| + tube\.rhythms-of-resistance\.org| + tube\.rita\.moe| + tube\.rsi\.cnr\.it| + tube\.s1gm4\.eu| + tube\.saumon\.io| + tube\.schleuss\.online| + tube\.schule\.social| + tube\.seditio\.fr| + tube\.shanti\.cafe| + tube\.shela\.nu| + tube\.skrep\.in| + tube\.sp-codes\.de| + tube\.sp4ke\.com| + tube\.superseriousbusiness\.org| + tube\.systest\.eu| + tube\.tappret\.fr| + tube\.tardis\.world| + tube\.toontoet\.nl| + tube\.tpshd\.de| + tube\.troopers\.agency| + tube\.tylerdavis\.xyz| + tube\.undernet\.uy| + tube\.vigilian-consulting\.nl| + tube\.vraphim\.com| + tube\.wehost\.lgbt| + tube\.wien\.rocks| + tube\.wolfe\.casa| + tube\.xd0\.de| + tube\.xy-space\.de| + tube\.yapbreak\.fr| + tubedu\.org| + tubes\.jodh\.us| + tuktube\.com| + turkum\.me| + tututu\.tube| + tuvideo\.encanarias\.info| + tv1\.cocu\.cc| + tv1\.gomntu\.space| + tv2\.cocu\.cc| + tv\.adn\.life| + tv\.atmx\.ca| + tv\.bitma\.st| + tv\.generallyrubbish\.net\.au| + tv\.lumbung\.space| + tv\.mattchristiansenmedia\.com| + tv\.netwhood\.online| + tv\.neue\.city| + tv\.piejacker\.net| + tv\.pirateradio\.social| + tv\.undersco\.re| + tvox\.ru| + twctube\.twc-zone\.eu| + unfilter\.tube| + v\.basspistol\.org| + v\.kisombrella\.top| + v\.lastorder\.xyz| + v\.lor\.sh| + v\.phreedom\.club| + v\.sil\.sh| + v\.szy\.io| + v\.xxxapex\.com| + veezee\.tube| + vid\.dascoyote\.xyz| + vid\.garwood\.io| + vid\.ncrypt\.at| + vid\.pravdastalina\.info| + vid\.qorg11\.net| + vid\.rajeshtaylor\.com| + vid\.samtripoli\.com| + vid\.werefox\.dev| + vid\.wildeboer\.net| + video-cave-v2\.de| + video\.076\.ne\.jp| + video\.1146\.nohost\.me| + video\.altertek\.org| + video\.anartist\.org| + video\.apps\.thedoodleproject\.net| + video\.artist\.cx| + video\.asgardius\.company| + video\.balsillie\.net| + video\.bards\.online| + video\.binarydad\.com| + video\.blast-info\.fr| + video\.catgirl\.biz| + video\.cigliola\.com| + video\.cm-en-transition\.fr| + video\.cnt\.social| + video\.coales\.co| + video\.codingfield\.com| + video\.comptoir\.net| + video\.comune\.trento\.it| + video\.cpn\.so| + video\.csc49\.fr| + video\.cybre\.town| + video\.demokratischer-sommer\.de| + video\.discord-insoumis\.fr| + video\.dolphincastle\.com| + video\.dresden\.network| + video\.ecole-89\.com| + video\.elgrillolibertario\.org| + video\.emergeheart\.info| + video\.eradicatinglove\.xyz| + video\.ethantheenigma\.me| + video\.exodus-privacy\.eu\.org| + video\.fbxl\.net| + video\.fhtagn\.org| + video\.greenmycity\.eu| + video\.guerredeclasse\.fr| + video\.gyt\.is| + video\.hackers\.town| + video\.hardlimit\.com| + video\.hooli\.co| + video\.igem\.org| + video\.internet-czas-dzialac\.pl| + video\.islameye\.com| + video\.kicik\.fr| + video\.kuba-orlik\.name| + video\.kyushojitsu\.ca| + video\.lavolte\.net| + video\.lespoesiesdheloise\.fr| + video\.liberta\.vip| + video\.liege\.bike| + video\.linc\.systems| + video\.linux\.it| + 
video\.linuxtrent\.it| + video\.lokal\.social| + video\.lono\.space| + video\.lunasqu\.ee| + video\.lundi\.am| + video\.marcorennmaus\.de| + video\.mass-trespass\.uk| + video\.mugoreve\.fr| + video\.mundodesconocido\.com| + video\.mycrowd\.ca| + video\.nogafam\.es| + video\.odayacres\.farm| + video\.ozgurkon\.org| + video\.p1ng0ut\.social| + video\.p3x\.de| + video\.pcf\.fr| + video\.pony\.gallery| + video\.potate\.space| + video\.pourpenser\.pro| + video\.progressiv\.dev| + video\.resolutions\.it| + video\.rw501\.de| + video\.screamer\.wiki| + video\.sdm-tools\.net| + video\.sftblw\.moe| + video\.shitposter\.club| + video\.skyn3t\.in| + video\.soi\.ch| + video\.stuartbrand\.co\.uk| + video\.thinkof\.name| + video\.toot\.pt| + video\.triplea\.fr| + video\.turbo\.chat| + video\.vaku\.org\.ua| + video\.veloma\.org| + video\.violoncello\.ch| + video\.wilkie\.how| + video\.wsf2021\.info| + videorelay\.co| + videos-passages\.huma-num\.fr| + videos\.3d-wolf\.com| + videos\.ahp-numerique\.fr| + videos\.alexandrebadalo\.pt| + videos\.archigny\.net| + videos\.benjaminbrady\.ie| + videos\.buceoluegoexisto\.com| + videos\.capas\.se| + videos\.casually\.cat| + videos\.cloudron\.io| + videos\.coletivos\.org| + videos\.danksquad\.org| + videos\.denshi\.live| + videos\.fromouter\.space| + videos\.fsci\.in| + videos\.globenet\.org| + videos\.hauspie\.fr| + videos\.hush\.is| + videos\.john-livingston\.fr| + videos\.jordanwarne\.xyz| + videos\.lavoixdessansvoix\.org| + videos\.leslionsfloorball\.fr| + videos\.lucero\.top| + videos\.martyn\.berlin| + videos\.mastodont\.cat| + videos\.monstro1\.com| + videos\.npo\.city| + videos\.optoutpod\.com| + videos\.petch\.rocks| + videos\.pzelawski\.xyz| + videos\.rampin\.org| + videos\.scanlines\.xyz| + videos\.shmalls\.pw| + videos\.sibear\.fr| + videos\.stadtfabrikanten\.org| + videos\.tankernn\.eu| + videos\.testimonia\.org| + videos\.thisishowidontdisappear\.com| + videos\.traumaheilung\.net| + videos\.trom\.tf| + videos\.wakkerewereld\.nu| + videos\.weblib\.re| + videos\.yesil\.club| + vids\.roshless\.me| + vids\.tekdmn\.me| + vidz\.dou\.bet| + vod\.lumikko\.dev| + vs\.uniter\.network| + vulgarisation-informatique\.fr| + watch\.breadtube\.tv| + watch\.deranalyst\.ch| + watch\.ignorance\.eu| + watch\.krazy\.party| + watch\.libertaria\.space| + watch\.rt4mn\.org| + watch\.softinio\.com| + watch\.tubelab\.video| + web-fellow\.de| + webtv\.vandoeuvre\.net| + wechill\.space| + wikileaks\.video| + wiwi\.video| + worldofvids\.com| + wwtube\.net| + www4\.mir\.inter21\.net| + www\.birkeundnymphe\.de| + www\.captain-german\.com| + www\.wiki-tube\.de| + xxivproduction\.video| + xxx\.noho\.st| + + # from youtube-dl + peertube\.rainbowswingers\.net| + tube\.stanisic\.nl| + peer\.suiri\.us| + medias\.libox\.fr| + videomensoif\.ynh\.fr| + peertube\.travelpandas\.eu| + peertube\.rachetjay\.fr| + peertube\.montecsys\.fr| + tube\.eskuero\.me| + peer\.tube| + peertube\.umeahackerspace\.se| + tube\.nx-pod\.de| + video\.monsieurbidouille\.fr| + tube\.openalgeria\.org| + vid\.lelux\.fi| + video\.anormallostpod\.ovh| + tube\.crapaud-fou\.org| + peertube\.stemy\.me| + lostpod\.space| + exode\.me| + peertube\.snargol\.com| + vis\.ion\.ovh| + videosdulib\.re| + v\.mbius\.io| + videos\.judrey\.eu| + peertube\.osureplayviewer\.xyz| + peertube\.mathieufamily\.ovh| + www\.videos-libr\.es| + fightforinfo\.com| + peertube\.fediverse\.ru| + peertube\.oiseauroch\.fr| + video\.nesven\.eu| + v\.bearvideo\.win| + video\.qoto\.org| + justporn\.cc| + video\.vny\.fr| + peervideo\.club| + 
tube\.taker\.fr| + peertube\.chantierlibre\.org| + tube\.ipfixe\.info| + tube\.kicou\.info| + tube\.dodsorf\.as| + videobit\.cc| + video\.yukari\.moe| + videos\.elbinario\.net| + hkvideo\.live| + pt\.tux\.tf| + www\.hkvideo\.live| + FIGHTFORINFO\.com| + pt\.765racing\.com| + peertube\.gnumeria\.eu\.org| + nordenmedia\.com| + peertube\.co\.uk| + tube\.darfweb\.eu| + tube\.kalah-france\.org| + 0ch\.in| + vod\.mochi\.academy| + film\.node9\.org| + peertube\.hatthieves\.es| + video\.fitchfamily\.org| + peertube\.ddns\.net| + video\.ifuncle\.kr| + video\.fdlibre\.eu| + tube\.22decembre\.eu| + peertube\.harmoniescreatives\.com| + tube\.fabrigli\.fr| + video\.thedwyers\.co| + video\.bruitbruit\.com| + peertube\.foxfam\.club| + peer\.philoxweb\.be| + videos\.bugs\.social| + peertube\.malbert\.xyz| + peertube\.bilange\.ca| + libretube\.net| + diytelevision\.com| + peertube\.fedilab\.app| + libre\.video| + video\.mstddntfdn\.online| + us\.tv| + peertube\.sl-network\.fr| + peertube\.dynlinux\.io| + peertube\.david\.durieux\.family| + peertube\.linuxrocks\.online| + peerwatch\.xyz| + v\.kretschmann\.social| + tube\.otter\.sh| + yt\.is\.nota\.live| + tube\.dragonpsi\.xyz| + peertube\.boneheadmedia\.com| + videos\.funkwhale\.audio| + watch\.44con\.com| + peertube\.gcaillaut\.fr| + peertube\.icu| + pony\.tube| + spacepub\.space| + tube\.stbr\.io| + v\.mom-gay\.faith| + tube\.port0\.xyz| + peertube\.simounet\.net| + play\.jergefelt\.se| + peertube\.zeteo\.me| + tube\.danq\.me| + peertube\.kerenon\.com| + tube\.fab-l3\.org| + tube\.calculate\.social| + peertube\.mckillop\.org| + tube\.netzspielplatz\.de| + vod\.ksite\.de| + peertube\.laas\.fr| + tube\.govital\.net| + peertube\.stephenson\.cc| + bistule\.nohost\.me| + peertube\.kajalinifi\.de| + video\.ploud\.jp| + video\.omniatv\.com| + peertube\.ffs2play\.fr| + peertube\.leboulaire\.ovh| + peertube\.tronic-studio\.com| + peertube\.public\.cat| + peertube\.metalbanana\.net| + video\.1000i100\.fr| + peertube\.alter-nativ-voll\.de| + tube\.pasa\.tf| + tube\.worldofhauru\.xyz| + pt\.kamp\.site| + peertube\.teleassist\.fr| + videos\.mleduc\.xyz| + conf\.tube| + media\.privacyinternational\.org| + pt\.forty-two\.nl| + video\.halle-leaks\.de| + video\.grosskopfgames\.de| + peertube\.schaeferit\.de| + peertube\.jackbot\.fr| + tube\.extinctionrebellion\.fr| + peertube\.f-si\.org| + video\.subak\.ovh| + videos\.koweb\.fr| + peertube\.zergy\.net| + peertube\.roflcopter\.fr| + peertube\.floss-marketing-school\.com| + vloggers\.social| + peertube\.iriseden\.eu| + videos\.ubuntu-paris\.org| + peertube\.mastodon\.host| + armstube\.com| + peertube\.s2s\.video| + peertube\.lol| + tube\.open-plug\.eu| + open\.tube| + peertube\.ch| + peertube\.normandie-libre\.fr| + peertube\.slat\.org| + video\.lacaveatonton\.ovh| + peertube\.uno| + peertube\.servebeer\.com| + peertube\.fedi\.quebec| + tube\.h3z\.jp| + tube\.plus200\.com| + peertube\.eric\.ovh| + tube\.metadocs\.cc| + tube\.unmondemeilleur\.eu| + gouttedeau\.space| + video\.antirep\.net| + nrop\.cant\.at| + tube\.ksl-bmx\.de| + tube\.plaf\.fr| + tube\.tchncs\.de| + video\.devinberg\.com| + hitchtube\.fr| + peertube\.kosebamse\.com| + yunopeertube\.myddns\.me| + peertube\.varney\.fr| + peertube\.anon-kenkai\.com| + tube\.maiti\.info| + tubee\.fr| + videos\.dinofly\.com| + toobnix\.org| + videotape\.me| + voca\.tube| + video\.heromuster\.com| + video\.lemediatv\.fr| + video\.up\.edu\.ph| + balafon\.video| + video\.ivel\.fr| + thickrips\.cloud| + pt\.laurentkruger\.fr| + video\.monarch-pass\.net| + peertube\.artica\.center| + 
video\.alternanet\.fr| + indymotion\.fr| + fanvid\.stopthatimp\.net| + video\.farci\.org| + v\.lesterpig\.com| + video\.okaris\.de| + tube\.pawelko\.net| + peertube\.mablr\.org| + tube\.fede\.re| + pytu\.be| + evertron\.tv| + devtube\.dev-wiki\.de| + raptube\.antipub\.org| + video\.selea\.se| + peertube\.mygaia\.org| + video\.oh14\.de| + peertube\.livingutopia\.org| + peertube\.the-penguin\.de| + tube\.thechangebook\.org| + tube\.anjara\.eu| + pt\.pube\.tk| + video\.samedi\.pm| + mplayer\.demouliere\.eu| + widemus\.de| + peertube\.me| + peertube\.zapashcanon\.fr| + video\.latavernedejohnjohn\.fr| + peertube\.pcservice46\.fr| + peertube\.mazzonetto\.eu| + video\.irem\.univ-paris-diderot\.fr| + video\.livecchi\.cloud| + alttube\.fr| + video\.coop\.tools| + video\.cabane-libre\.org| + peertube\.openstreetmap\.fr| + videos\.alolise\.org| + irrsinn\.video| + video\.antopie\.org| + scitech\.video| + tube2\.nemsia\.org| + video\.amic37\.fr| + peertube\.freeforge\.eu| + video\.arbitrarion\.com| + video\.datsemultimedia\.com| + stoptrackingus\.tv| + peertube\.ricostrongxxx\.com| + docker\.videos\.lecygnenoir\.info| + peertube\.togart\.de| + tube\.postblue\.info| + videos\.domainepublic\.net| + peertube\.cyber-tribal\.com| + video\.gresille\.org| + peertube\.dsmouse\.net| + cinema\.yunohost\.support| + tube\.theocevaer\.fr| + repro\.video| + tube\.4aem\.com| + quaziinc\.com| + peertube\.metawurst\.space| + videos\.wakapo\.com| + video\.ploud\.fr| + video\.freeradical\.zone| + tube\.valinor\.fr| + refuznik\.video| + pt\.kircheneuenburg\.de| + peertube\.asrun\.eu| + peertube\.lagob\.fr| + videos\.side-ways\.net| + 91video\.online| + video\.valme\.io| + video\.taboulisme\.com| + videos-libr\.es| + tv\.mooh\.fr| + nuage\.acostey\.fr| + video\.monsieur-a\.fr| + peertube\.librelois\.fr| + videos\.pair2jeux\.tube| + videos\.pueseso\.club| + peer\.mathdacloud\.ovh| + media\.assassinate-you\.net| + vidcommons\.org| + ptube\.rousset\.nom\.fr| + tube\.cyano\.at| + videos\.squat\.net| + video\.iphodase\.fr| + peertube\.makotoworkshop\.org| + peertube\.serveur\.slv-valbonne\.fr| + vault\.mle\.party| + hostyour\.tv| + videos\.hack2g2\.fr| + libre\.tube| + pire\.artisanlogiciel\.net| + videos\.numerique-en-commun\.fr| + video\.netsyms\.com| + video\.die-partei\.social| + video\.writeas\.org| + peertube\.swarm\.solvingmaz\.es| + tube\.pericoloso\.ovh| + watching\.cypherpunk\.observer| + videos\.adhocmusic\.com| + tube\.rfc1149\.net| + peertube\.librelabucm\.org| + videos\.numericoop\.fr| + peertube\.koehn\.com| + peertube\.anarchmusicall\.net| + tube\.kampftoast\.de| + vid\.y-y\.li| + peertube\.xtenz\.xyz| + diode\.zone| + tube\.egf\.mn| + peertube\.nomagic\.uk| + visionon\.tv| + videos\.koumoul\.com| + video\.rastapuls\.com| + video\.mantlepro\.com| + video\.deadsuperhero\.com| + peertube\.musicstudio\.pro| + peertube\.we-keys\.fr| + artitube\.artifaille\.fr| + peertube\.ethernia\.net| + tube\.midov\.pl| + peertube\.fr| + watch\.snoot\.tube| + peertube\.donnadieu\.fr| + argos\.aquilenet\.fr| + tube\.nemsia\.org| + tube\.bruniau\.net| + videos\.darckoune\.moe| + tube\.traydent\.info| + dev\.videos\.lecygnenoir\.info| + peertube\.nayya\.org| + peertube\.live| + peertube\.mofgao\.space| + video\.lequerrec\.eu| + peertube\.amicale\.net| + aperi\.tube| + tube\.ac-lyon\.fr| + video\.lw1\.at| + www\.yiny\.org| + videos\.pofilo\.fr| + tube\.lou\.lt| + choob\.h\.etbus\.ch| + tube\.hoga\.fr| + peertube\.heberge\.fr| + video\.obermui\.de| + videos\.cloudfrancois\.fr| + betamax\.video| + video\.typica\.us| + tube\.piweb\.be| 
+ video\.blender\.org| + peertube\.cat| + tube\.kdy\.ch| + pe\.ertu\.be| + peertube\.social| + videos\.lescommuns\.org| + tv\.datamol\.org| + videonaute\.fr| + dialup\.express| + peertube\.nogafa\.org| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.bootlicker\.party| + skeptikon\.fr| + video\.blueline\.mg| + tube\.homecomputing\.fr| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + tube\.kher\.nl| + peertube\.qtg\.fr| + video\.migennes\.net| + tube\.p2p\.legal| + troll\.tv| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.cemea\.org| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + sikke\.fi| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + videos\.benpro\.fr| + peertube\.parleur\.net| + peertube\.heraut\.eu| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + tube\.mochi\.academy| + media\.zat\.im| + video\.colibris-outilslibres\.org| + tube\.svnet\.fr| + peertube\.video| + peertube2\.cpy\.re| + peertube3\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re| + canard\.tube + )''' + _UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' + _VALID_URL = r'''(?x) + (?: + peertube:(?P<host>[^:]+):| + https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos|w)/ + ) + (?P<id>%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _TESTS = [{ + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '8563064d245a4be5705bddb22bb00a28', + 'info_dict': { + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'ext': 'mp4', + 'title': 'What is PeerTube?', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'A propos de PeerTube', + 'channel_id': '2215', + 'channel_url': 'https://framatube.org/video-channels/joinpeertube', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], + } + }, { + 'url': 'https://peertube2.cpy.re/w/122d093a-1ede-43bd-bd34-59d2931ffc5e', + 'info_dict': { + 'id': '122d093a-1ede-43bd-bd34-59d2931ffc5e', + 'ext': 'mp4', + 'title': 'E2E tests', + 'uploader_id': '37855', + 'timestamp': 1589276219, + 'upload_date': '20200512', + 'uploader': 'chocobozzz', + } + }, { + 'url': 'https://peertube2.cpy.re/w/3fbif9S3WmtTP8gGsC5HBd', + 'info_dict': { + 'id': '3fbif9S3WmtTP8gGsC5HBd', + 'ext': 'mp4', + 'title': 'E2E tests', + 'uploader_id': '37855', + 'timestamp': 1589276219, + 'upload_date': '20200512', + 'uploader': 'chocobozzz', + }, + }, { + 'url': 'https://peertube2.cpy.re/api/v1/videos/3fbif9S3WmtTP8gGsC5HBd', + 'info_dict': { + 'id': '3fbif9S3WmtTP8gGsC5HBd', + 'ext': 'mp4', + 'title': 'E2E tests', + 'uploader_id': '37855', + 'timestamp': 1589276219, + 'upload_date': '20200512', + 'uploader': 'chocobozzz', + }, + }, { + # Issue #26002 + 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', + 'info_dict': { + 'id': 
'd8943b2d-8280-497b-85ec-bc282ec2afdc', + 'ext': 'mp4', + 'title': 'Dot matrix printer shell demo', + 'uploader_id': '3', + 'timestamp': 1587401293, + 'upload_date': '20200420', + 'uploader': 'Drew DeVault', + } + }, { + 'url': 'https://peertube.debian.social/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://vod.ksite.de/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://vod.ksite.de/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://peertube.tv/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:framatube.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|w)/(?P<id>%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'meta property="og:platform" content="PeerTube"', + '<title>PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries + + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return + subtitles = {} + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) + return subtitles + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._call_api( + host, video_id, '', note='Downloading video JSON') + + title = video['name'] + + formats = [] + files = video.get('files') or [] + for playlist in (video.get('streamingPlaylists') or []): + if not isinstance(playlist, dict): + continue + playlist_files = playlist.get('files') + if not (playlist_files and isinstance(playlist_files, list)): + continue + files.extend(playlist_files) + for file_ in files: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + if format_id == '0p': + f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) + 
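+            # NOTE: PeerTube appears to use the '0p' resolution label for
+            # audio-only files, hence vcodec is cleared above; for real video
+            # labels, parse_resolution() already recovers the height, e.g.
+            # parse_resolution('720p') == {'height': 720}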
+            formats.append(f)
+        self._sort_formats(formats)
+
+        description = video.get('description')
+        if description and len(description) >= 250:
+            # description is shortened
+            full_description = self._call_api(
+                host, video_id, 'description', note='Downloading description JSON',
+                fatal=False)
+
+            if isinstance(full_description, dict):
+                description = str_or_none(full_description.get('description')) or description
+
+        subtitles = self.extract_subtitles(host, video_id)
+
+        def data(section, field, type_):
+            return try_get(video, lambda x: x[section][field], type_)
+
+        def account_data(field, type_):
+            return data('account', field, type_)
+
+        def channel_data(field, type_):
+            return data('channel', field, type_)
+
+        category = data('category', 'label', compat_str)
+        categories = [category] if category else None
+
+        nsfw = video.get('nsfw')
+        if isinstance(nsfw, bool):
+            age_limit = 18 if nsfw else 0
+        else:
+            age_limit = None
+
+        webpage_url = 'https://%s/videos/watch/%s' % (host, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')),
+            'timestamp': unified_timestamp(video.get('publishedAt')),
+            'uploader': account_data('displayName', compat_str),
+            'uploader_id': str_or_none(account_data('id', int)),
+            'uploader_url': url_or_none(account_data('url', compat_str)),
+            'channel': channel_data('displayName', compat_str),
+            'channel_id': str_or_none(channel_data('id', int)),
+            'channel_url': url_or_none(channel_data('url', compat_str)),
+            'language': data('language', 'id', compat_str),
+            'license': data('licence', 'label', compat_str),
+            'duration': int_or_none(video.get('duration')),
+            'view_count': int_or_none(video.get('views')),
+            'like_count': int_or_none(video.get('likes')),
+            'dislike_count': int_or_none(video.get('dislikes')),
+            'age_limit': age_limit,
+            'tags': try_get(video, lambda x: x['tags'], list),
+            'categories': categories,
+            'formats': formats,
+            'subtitles': subtitles,
+            'webpage_url': webpage_url,
+        }
+
+
+class PeerTubePlaylistIE(InfoExtractor):
+    IE_NAME = 'PeerTube:Playlist'
+    _TYPES = {
+        'a': 'accounts',
+        'c': 'video-channels',
+        'w/p': 'video-playlists',
+    }
+    _VALID_URL = r'''(?x)
+                    https?://(?P<host>%s)/(?P<type>(?:%s))/
+                    (?P<id>[^/]+)
+                    ''' % (PeerTubeIE._INSTANCES_RE, '|'.join(_TYPES.keys()))
+    _TESTS = [{
+        'url': 'https://peertube.tux.ovh/w/p/3af94cba-95e8-4b74-b37a-807ab6d82526',
+        'info_dict': {
+            'id': '3af94cba-95e8-4b74-b37a-807ab6d82526',
+            'description': 'playlist',
+            'timestamp': 1611171863,
+            'title': 'playlist',
+        },
+        'playlist_mincount': 6,
+    }, {
+        'url': 'https://peertube.tux.ovh/w/p/wkyqcQBnsvFxtUB2pkYc1e',
+        'info_dict': {
+            'id': 'wkyqcQBnsvFxtUB2pkYc1e',
+            'description': 'Cette liste de vidéos contient uniquement les jeux qui peuvent être terminés en une seule vidéo.',
+            'title': 'Let\'s Play',
+            'timestamp': 1604147331,
+        },
+        'playlist_mincount': 6,
+    }, {
+        'url': 'https://peertube.debian.social/w/p/hFdJoTuyhNJVa1cDWd1d12',
+        'info_dict': {
+            'id': 'hFdJoTuyhNJVa1cDWd1d12',
+            'description': 'Diversas palestras do Richard Stallman no Brasil.',
+            'title': 'Richard Stallman no Brasil',
+            'timestamp': 1599676222,
+        },
+        'playlist_mincount': 9,
+    }, {
+        'url': 'https://peertube2.cpy.re/a/chocobozzz/videos',
+        'info_dict': {
+            'id': 'chocobozzz',
+            'timestamp': 1553874564,
+            'title': 'chocobozzz',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'https://framatube.org/c/bf54d359-cfad-4935-9d45-9d6be93f63e8/videos',
+        'info_dict': {
+            'id':
'bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'timestamp': 1519917377, + 'title': 'Les vidéos de Framasoft', + }, + 'playlist_mincount': 345, + }, { + 'url': 'https://peertube2.cpy.re/c/blender_open_movies@video.blender.org/videos', + 'info_dict': { + 'id': 'blender_open_movies@video.blender.org', + 'timestamp': 1542287810, + 'title': 'Official Blender Open Movies', + }, + 'playlist_mincount': 11, + }] + _API_BASE = 'https://%s/api/v1/%s/%s%s' + _PAGE_SIZE = 30 + + def call_api(self, host, name, path, base, **kwargs): + return self._download_json( + self._API_BASE % (host, base, name, path), name, **kwargs) + + def fetch_page(self, host, id, type, page): + page += 1 + video_data = self.call_api( + host, id, + f'/videos?sort=-createdAt&start={self._PAGE_SIZE * (page - 1)}&count={self._PAGE_SIZE}&nsfw=both', + type, note=f'Downloading page {page}').get('data', []) + for video in video_data: + shortUUID = video.get('shortUUID') or try_get(video, lambda x: x['video']['shortUUID']) + video_title = video.get('name') or try_get(video, lambda x: x['video']['name']) + yield self.url_result( + f'https://{host}/w/{shortUUID}', PeerTubeIE.ie_key(), + video_id=shortUUID, video_title=video_title) + + def _extract_playlist(self, host, type, id): + info = self.call_api(host, id, '', type, note='Downloading playlist information', fatal=False) + + playlist_title = info.get('displayName') + playlist_description = info.get('description') + playlist_timestamp = unified_timestamp(info.get('createdAt')) + channel = try_get(info, lambda x: x['ownerAccount']['name']) or info.get('displayName') + channel_id = try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id') + thumbnail = info.get('thumbnailPath') + thumbnail = f'https://{host}{thumbnail}' if thumbnail else None + + entries = OnDemandPagedList(functools.partial( + self.fetch_page, host, id, type), self._PAGE_SIZE) + + return self.playlist_result( + entries, id, playlist_title, playlist_description, + timestamp=playlist_timestamp, channel=channel, channel_id=channel_id, thumbnail=thumbnail) + + def _real_extract(self, url): + type, host, id = self._match_valid_url(url).group('type', 'host', 'id') + type = self._TYPES[type] + return self._extract_playlist(host, type, id) diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py new file mode 100644 index 000000000..287d341c9 --- /dev/null +++ b/yt_dlp/extractor/peloton.py @@ -0,0 +1,222 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse, +) +from ..utils import ( + ExtractorError, + float_or_none, + str_or_none, + traverse_obj, + url_or_none, +) + + +class PelotonIE(InfoExtractor): + IE_NAME = 'peloton' + _NETRC_MACHINE = 'peloton' + _VALID_URL = r'https?://members\.onepeloton\.com/classes/player/(?P<id>[a-f0-9]+)' + _TESTS = [{ + 'url': 'https://members.onepeloton.com/classes/player/0e9653eb53544eeb881298c8d7a87b86', + 'info_dict': { + 'id': '0e9653eb53544eeb881298c8d7a87b86', + 'title': '20 min Chest & Back Strength', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:fcd5be9b9eda0194b470e13219050a66', + 'creator': 'Chase Tucker', + 'release_timestamp': 1556141400, + 'timestamp': 1556141400, + 'upload_date': '20190424', + 'duration': 1389, + 'categories': ['Strength'], + 'tags': ['Workout Mat', 'Light Weights', 'Medium Weights'], + 'is_live': False, + 'chapters': 'count:1', + 'subtitles': {'en': [{ + 'url': r're:^https?://.+', 
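+                # (expected values prefixed with 're:' are matched as regular
+                # expressions by the test helper rather than compared literally)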
+                'ext': 'vtt'
+            }]},
+        }, 'params': {
+            'skip_download': 'm3u8',
+        },
+        'skip': 'Account needed'
+    }, {
+        'url': 'https://members.onepeloton.com/classes/player/26603d53d6bb4de1b340514864a6a6a8',
+        'info_dict': {
+            'id': '26603d53d6bb4de1b340514864a6a6a8',
+            'title': '30 min Earth Day Run',
+            'ext': 'm4a',
+            'thumbnail': r're:https://.+\.jpg',
+            'description': 'md5:adc065a073934d7ee0475d217afe0c3d',
+            'creator': 'Selena Samuela',
+            'release_timestamp': 1587567600,
+            'timestamp': 1587567600,
+            'upload_date': '20200422',
+            'duration': 1802,
+            'categories': ['Running'],
+            'is_live': False,
+            'chapters': 'count:3'
+        }, 'params': {
+            'skip_download': 'm3u8',
+        },
+        'skip': 'Account needed'
+    }]
+
+    _MANIFEST_URL_TEMPLATE = '%s?hdnea=%s'
+
+    def _start_session(self, video_id):
+        self._download_webpage('https://api.onepeloton.com/api/started_client_session', video_id, note='Starting session')
+
+    def _login(self, video_id):
+        username, password = self._get_login_info()
+        if not (username and password):
+            self.raise_login_required()
+        try:
+            self._download_json(
+                'https://api.onepeloton.com/auth/login', video_id, note='Logging in',
+                data=json.dumps({
+                    'username_or_email': username,
+                    'password': password,
+                    'with_pubsub': False
+                }).encode(),
+                headers={'Content-Type': 'application/json', 'User-Agent': 'web'})
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                json_string = self._webpage_read_content(e.cause, None, video_id)
+                res = self._parse_json(json_string, video_id)
+                raise ExtractorError(res['message'], expected=res['message'] == 'Login failed')
+            else:
+                raise
+
+    def _get_token(self, video_id):
+        try:
+            subscription = self._download_json(
+                'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token',
+                data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'})
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                json_string = self._webpage_read_content(e.cause, None, video_id)
+                res = self._parse_json(json_string, video_id)
+                raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached')
+            else:
+                raise
+        return subscription['token']
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        try:
+            self._start_session(video_id)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                self._login(video_id)
+                self._start_session(video_id)
+            else:
+                raise
+
+        metadata = self._download_json('https://api.onepeloton.com/api/ride/%s/details?stream_source=multichannel' % video_id, video_id)
+        ride_data = metadata.get('ride')
+        if not ride_data:
+            raise ExtractorError('Missing stream metadata')
+        token = self._get_token(video_id)
+
+        is_live = False
+        if ride_data.get('content_format') == 'audio':
+            url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), compat_urllib_parse.quote(token))
+            formats = [{
+                'url': url,
+                'ext': 'm4a',
+                'format_id': 'audio',
+                'vcodec': 'none',
+            }]
+            subtitles = {}
+        else:
+            if ride_data.get('vod_stream_url'):
+                url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % (
+                    ','.join([re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption) for caption in ride_data['captions']]),
+                    ride_data['vod_stream_url'],
+                    compat_urllib_parse.quote(compat_urllib_parse.quote(token)))
+            elif ride_data.get('live_stream_url'):
+                url = self._MANIFEST_URL_TEMPLATE %
(ride_data.get('live_stream_url'), compat_urllib_parse.quote(token))
+                is_live = True
+            else:
+                raise ExtractorError('Missing video URL')
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+
+        if metadata.get('instructor_cues'):
+            subtitles['cues'] = [{
+                'data': json.dumps(metadata.get('instructor_cues')),
+                'ext': 'json'
+            }]
+
+        category = ride_data.get('fitness_discipline_display_name')
+        chapters = [{
+            'start_time': segment.get('start_time_offset'),
+            'end_time': segment.get('start_time_offset') + segment.get('length'),
+            'title': segment.get('name')
+        } for segment in traverse_obj(metadata, ('segments', 'segment_list')) or []]
+
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': ride_data.get('title'),
+            'formats': formats,
+            'thumbnail': url_or_none(ride_data.get('image_url')),
+            'description': str_or_none(ride_data.get('description')),
+            'creator': traverse_obj(ride_data, ('instructor', 'name')),
+            'release_timestamp': ride_data.get('original_air_time'),
+            'timestamp': ride_data.get('original_air_time'),
+            'subtitles': subtitles,
+            'duration': float_or_none(ride_data.get('length')),
+            'categories': [category] if category else None,
+            'tags': traverse_obj(ride_data, ('equipment_tags', ..., 'name')),
+            'is_live': is_live,
+            'chapters': chapters
+        }
+
+
+class PelotonLiveIE(InfoExtractor):
+    IE_NAME = 'peloton:live'
+    IE_DESC = 'Peloton Live'
+    _VALID_URL = r'https?://members\.onepeloton\.com/player/live/(?P<id>[a-f0-9]+)'
+    _TEST = {
+        'url': 'https://members.onepeloton.com/player/live/eedee2d19f804a9788f53aa8bd38eb1b',
+        'info_dict': {
+            'id': '32edc92d28044be5bf6c7b6f1f8d1cbc',
+            'title': '30 min HIIT Ride: Live from Home',
+            'ext': 'mp4',
+            'thumbnail': r're:^https?://.+\.png',
+            'description': 'md5:f0d7d8ed3f901b7ee3f62c1671c15817',
+            'creator': 'Alex Toussaint',
+            'release_timestamp': 1587736620,
+            'timestamp': 1587736620,
+            'upload_date': '20200424',
+            'duration': 2014,
+            'categories': ['Cycling'],
+            'is_live': False,
+            'chapters': 'count:3'
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': 'm3u8',
+        },
+        'skip': 'Account needed'
+    }
+
+    def _real_extract(self, url):
+        workout_id = self._match_id(url)
+        peloton = self._download_json(f'https://api.onepeloton.com/api/peloton/{workout_id}', workout_id)
+
+        if peloton.get('ride_id'):
+            if not peloton.get('is_live') or peloton.get('is_encore') or peloton.get('status') != 'PRE_START':
+                return self.url_result('https://members.onepeloton.com/classes/player/%s' % peloton['ride_id'])
+            else:
+                raise ExtractorError('Ride has not started', expected=True)
+        else:
+            raise ExtractorError('Missing video ID')
diff --git a/youtube_dl/extractor/people.py b/yt_dlp/extractor/people.py
index 6ca95715e..6ca95715e 100644
--- a/youtube_dl/extractor/people.py
+++ b/yt_dlp/extractor/people.py
diff --git a/yt_dlp/extractor/performgroup.py b/yt_dlp/extractor/performgroup.py
new file mode 100644
index 000000000..c00d39375
--- /dev/null
+++ b/yt_dlp/extractor/performgroup.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PerformGroupIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})'
+    _TESTS = [{
+        # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html
+        'url':
'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', + 'md5': '259cb03d142e2e52471e8837ecacb29f', + 'info_dict': { + 'id': 'xgrwobuzumes1lwjxtcdpwgxd', + 'ext': 'mp4', + 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', + 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', + 'timestamp': 1511533477, + 'upload_date': '20171124', + } + }] + + def _call_api(self, service, auth_token, content_id, referer_url): + return self._download_json( + 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), + content_id, headers={ + 'Referer': referer_url, + 'Origin': 'http://player.performgroup.com', + }, query={ + '_fmt': 'json', + }) + + def _real_extract(self, url): + player_id, auth_token = self._match_valid_url(url).groups() + bootstrap = self._call_api('bootstrap', auth_token, player_id, url) + video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0] + video_id = video['uuid'] + vod = self._call_api('vod', auth_token, video_id, url) + media = vod['videos']['video'][0]['media'] + + formats = [] + hls_url = media.get('hls', {}).get('url') + if hls_url: + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + hds_url = media.get('hds', {}).get('url') + if hds_url: + formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)) + + for c in media.get('content', []): + c_url = c.get('url') + if not c_url: + continue + tbr = int_or_none(c.get('bitrate'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': c_url, + 'tbr': tbr, + 'width': int_or_none(c.get('width')), + 'height': int_or_none(c.get('height')), + 'filesize': int_or_none(c.get('fileSize')), + 'vcodec': c.get('type'), + 'fps': int_or_none(c.get('videoFrameRate')), + 'vbr': int_or_none(c.get('videoRate'), 1000), + 'abr': int_or_none(c.get('audioRate'), 1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video['title'], + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': int_or_none(video.get('publishedTime'), 1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/periscope.py b/yt_dlp/extractor/periscope.py index b93a02b7d..b93a02b7d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/yt_dlp/extractor/periscope.py diff --git a/yt_dlp/extractor/philharmoniedeparis.py b/yt_dlp/extractor/philharmoniedeparis.py new file mode 100644 index 000000000..9f4899c09 --- /dev/null +++ b/yt_dlp/extractor/philharmoniedeparis.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + urljoin, +) + + +class PhilharmonieDeParisIE(InfoExtractor): + IE_DESC = 'Philharmonie de Paris' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/doc/CIMU/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', + 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'info_dict': { + 'id': '1086697', + 'ext': 'mp4', + 'title': 'Jazz à la Villette : Knower', + }, + }, { + 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', + 'info_dict': { + 'id': '1032066', + 'title': 
'md5:0a031b81807b3593cffa3c9a87a167a0',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr',
+        'only_matching': True,
+    }, {
+        'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR',
+        'only_matching': True,
+    }, {
+        'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR',
+        'only_matching': True,
+    }]
+    _LIVE_URL = 'https://live.philharmoniedeparis.fr'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        config = self._download_json(
+            '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={
+                'id': video_id,
+                'lang': 'fr-FR',
+            })
+
+        def extract_entry(source):
+            if not isinstance(source, dict):
+                return
+            title = source.get('title')
+            if not title:
+                return
+            files = source.get('files')
+            if not isinstance(files, dict):
+                return
+            format_urls = set()
+            formats = []
+            for format_id in ('mobile', 'desktop'):
+                format_url = try_get(
+                    files, lambda x: x[format_id]['file'], compat_str)
+                if not format_url or format_url in format_urls:
+                    continue
+                format_urls.add(format_url)
+                m3u8_url = urljoin(self._LIVE_URL, format_url)
+                formats.extend(self._extract_m3u8_formats(
+                    m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            if not formats and not self.get_param('ignore_no_formats'):
+                return
+            self._sort_formats(formats)
+            return {
+                'title': title,
+                'formats': formats,
+            }
+
+        thumbnail = urljoin(self._LIVE_URL, config.get('image'))
+
+        info = extract_entry(config)
+        if info:
+            info.update({
+                'id': video_id,
+                'thumbnail': thumbnail,
+            })
+            return info
+
+        entries = []
+        for num, chapter in enumerate(config['chapters'], start=1):
+            entry = extract_entry(chapter)
+            if entry is None:
+                continue
+            entry['id'] = '%s-%d' % (video_id, num)
+            entries.append(entry)
+
+        return self.playlist_result(entries, video_id, config.get('title'))
diff --git a/youtube_dl/extractor/phoenix.py b/yt_dlp/extractor/phoenix.py
index e3ea01443..e3ea01443 100644
--- a/youtube_dl/extractor/phoenix.py
+++ b/yt_dlp/extractor/phoenix.py
diff --git a/yt_dlp/extractor/photobucket.py b/yt_dlp/extractor/photobucket.py
new file mode 100644
index 000000000..53aebe2d9
--- /dev/null
+++ b/yt_dlp/extractor/photobucket.py
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class PhotobucketIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
+    _TEST = {
+        'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
+        'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
+        'info_dict': {
+            'id': 'zpsc0c3b9fa',
+            'ext': 'mp4',
+            'timestamp': 1367669341,
+            'upload_date': '20130504',
+            'uploader': 'rachaneronas',
+            'title': 'Tired of Link Building?
Try BacklinkMyDomain.com!', + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + video_extension = mobj.group('ext') + + webpage = self._download_webpage(url, video_id) + + # Extract URL, uploader, and title from webpage + self.report_extraction(video_id) + info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', + webpage, 'info json') + info = json.loads(info_json) + url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) + return { + 'id': video_id, + 'url': url, + 'uploader': info['username'], + 'timestamp': info['creationDate'], + 'title': info['title'], + 'ext': video_extension, + 'thumbnail': info['thumbUrl'], + } diff --git a/youtube_dl/extractor/picarto.py b/yt_dlp/extractor/picarto.py index e6c51e16b..e6c51e16b 100644 --- a/youtube_dl/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py new file mode 100644 index 000000000..a362664b2 --- /dev/null +++ b/yt_dlp/extractor/piksel.py @@ -0,0 +1,187 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + dict_get, + ExtractorError, + int_or_none, + parse_iso8601, + try_get, + unescapeHTML, +) + + +class PikselIE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?: + (?: + player\. + (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' + _TESTS = [ + { + 'url': 'http://player.piksel.com/v/ums2867l', + 'md5': '34e34c8d89dc2559976a6079db531e85', + 'info_dict': { + 'id': 'ums2867l', + 'ext': 'mp4', + 'title': 'GX-005 with Caption', + 'timestamp': 1481335659, + 'upload_date': '20161210' + } + }, + { + # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al + 'url': 'https://player.piksel.com/v/v80kqp41', + 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', + 'info_dict': { + 'id': 'v80kqp41', + 'ext': 'mp4', + 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', + 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. 
Robart presiding.', + 'timestamp': 1486171129, + 'upload_date': '20170204' + } + }, + { + # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ + 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', + 'only_matching': True, + } + ] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', + webpage) + if mobj: + return mobj.group('url') + + def _call_api(self, app_token, resource, display_id, query, fatal=True): + response = (self._download_json( + 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), + display_id, query=query, fatal=fatal) or {}).get('response') + failure = try_get(response, lambda x: x['failure']['reason']) + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + + def _real_extract(self, url): + ref_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + app_token = self._search_regex([ + r'clientAPI\s*:\s*"([^"]+)"', + r'data-de-api-key\s*=\s*"([^"]+)"' + ], webpage, 'app token') + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] + title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) + + formats = [] + + def process_asset_file(asset_file): + if not asset_file: + return + # TODO: extract rtmp formats + http_url = asset_file.get('http_url') + if not http_url: + return + tbr = None + vbr = int_or_none(asset_file.get('videoBitrate'), 1024) + abr = int_or_none(asset_file.get('audioBitrate'), 1024) + if asset_type == 'video': + tbr = vbr + abr + elif asset_type == 'audio': + tbr = abr + + format_id = ['http'] + if tbr: + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': unescapeHTML(http_url), + 'vbr': vbr, + 'abr': abr, + 'width': int_or_none(asset_file.get('videoWidth')), + 'height': int_or_none(asset_file.get('videoHeight')), + 'filesize': int_or_none(asset_file.get('filesize')), + 'tbr': tbr, + }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) + if smil_url: + transform_source = None + if ref_id == 'nhkworld': + # TODO: figure out if this is something to be fixed in urljoin, + # _parse_smil_formats or keep it here + transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') + formats.extend(self._extract_smil_formats( + re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, + 
transform_source=transform_source, fatal=False)) + + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + if caption_url: + subtitles.setdefault(caption.get('locale', 'en'), []).append({ + 'url': caption_url}) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_data.get('dateadd')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/pinkbike.py b/yt_dlp/extractor/pinkbike.py index 9f3501f77..9f3501f77 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/yt_dlp/extractor/pinkbike.py diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py new file mode 100644 index 000000000..80e9cd00e --- /dev/null +++ b/yt_dlp/extractor/pinterest.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + urls = [] + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url or format_url in urls: + continue + urls.append(format_url) + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats(formats) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not 
thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + username, slug = self._match_valid_url(url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None + entries = [] + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break + return self.playlist_result( + entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py new file mode 100644 index 000000000..dc2030017 --- /dev/null +++ b/yt_dlp/extractor/pladform.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils 
import ( + determine_ext, + ExtractorError, + int_or_none, + parse_qs, + xpath_text, + qualities, +) + + +class PladformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + out\.pladform\.ru/player| + static\.pladform\.ru/player\.swf + ) + \?.*\bvideoid=| + video\.pladform\.ru/catalog/video/videoid/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', + 'info_dict': { + 'id': '3777899', + 'ext': 'mp4', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3190, + }, + }, { + 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', + 'only_matching': True, + }, { + 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + qs = parse_qs(url) + pl = qs.get('pl', ['1'])[0] + + video = self._download_xml( + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) + + def fail(text): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, text), + expected=True) + + if video.tag == 'error': + fail(video.text) + + quality = qualities(('ld', 'sd', 'hd')) + + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + + self._sort_formats(formats) + + webpage = self._download_webpage( + 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + video_id) + + title = self._og_search_title(webpage, fatal=False) or xpath_text( + video, './/title', 'title', fatal=True) + description = self._search_regex( + r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) or xpath_text( + video, './/cover', 'cover') + + duration = int_or_none(xpath_text(video, './/time', 'duration')) + age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/platzi.py b/yt_dlp/extractor/platzi.py index 23c8256b5..23c8256b5 100644 --- a/youtube_dl/extractor/platzi.py +++ b/yt_dlp/extractor/platzi.py diff --git a/yt_dlp/extractor/playfm.py b/yt_dlp/extractor/playfm.py new file mode 100644 index 000000000..4298cbe30 --- /dev/null +++ b/yt_dlp/extractor/playfm.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + 
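# Aside: Pladform ranks its 'ld'/'sd'/'hd' variants through the qualities()
# utility, which maps an ordered sequence of ids onto sortable integers, with
# unknown ids sorting lowest. A local equivalent, for illustration only:
def make_quality_key(quality_ids):
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q

quality = make_quality_key(('ld', 'sd', 'hd'))
assert quality('hd') > quality('sd') > quality('ld') > quality('unknown')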
parse_iso8601, +) + + +class PlayFMIE(InfoExtractor): + IE_NAME = 'play.fm' + _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' + + _TEST = { + 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', + 'md5': 'c505f8307825a245d0c7ad1850001f22', + 'info_dict': { + 'id': '71276', + 'ext': 'mp3', + 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'description': '', + 'duration': 5627, + 'timestamp': 1406033781, + 'upload_date': '20140722', + 'uploader': 'Dan Drastic', + 'uploader_id': '71170', + 'view_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + slug = mobj.group('slug') + + recordings = self._download_json( + 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) + + error = recordings.get('error') + if isinstance(error, dict): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error.get('message')), + expected=True) + + audio_url = recordings['audio'] + video_id = compat_str(recordings.get('id') or video_id) + title = recordings['title'] + description = recordings.get('description') + duration = int_or_none(recordings.get('recordingDuration')) + timestamp = parse_iso8601(recordings.get('created_at')) + uploader = recordings.get('page', {}).get('title') + uploader_id = compat_str(recordings.get('page', {}).get('id')) + view_count = int_or_none(recordings.get('playCount')) + comment_count = int_or_none(recordings.get('commentCount')) + categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + } diff --git a/yt_dlp/extractor/playplustv.py b/yt_dlp/extractor/playplustv.py new file mode 100644 index 000000000..fd72a3717 --- /dev/null +++ b/yt_dlp/extractor/playplustv.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + PUTRequest, +) + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 'email': email, + 'password': password, + 
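# Aside: this login step sends the credentials as a JSON body over HTTP PUT;
# PUTRequest from the utils module only exists to force the request method.
# The same request built with the standard library (credentials are
# placeholders):
import json
from urllib.request import Request

def build_login_request(email, password):
    body = json.dumps({'email': email, 'password': password}).encode()
    return Request(
        'https://api.playplus.tv/api/web/login', data=body,
        headers={'Content-Type': 'application/json; charset=utf-8'},
        method='PUT')

# The response carries the bearer token under 'token', which _call_api then
# sends as 'Authorization: Bearer <token>'.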
}).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(self._parse_json( + e.cause.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_extract(self, url): + project_id, media_id = self._match_valid_url(url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } diff --git a/youtube_dl/extractor/plays.py b/yt_dlp/extractor/plays.py index ddfc6f148..ddfc6f148 100644 --- a/youtube_dl/extractor/plays.py +++ b/yt_dlp/extractor/plays.py diff --git a/youtube_dl/extractor/playstuff.py b/yt_dlp/extractor/playstuff.py index 5a329957f..5a329957f 100644 --- a/youtube_dl/extractor/playstuff.py +++ b/yt_dlp/extractor/playstuff.py diff --git a/yt_dlp/extractor/playtvak.py b/yt_dlp/extractor/playtvak.py new file mode 100644 index 000000000..84e92dda4 --- /dev/null +++ b/yt_dlp/extractor/playtvak.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + qualities, +) + + +class PlaytvakIE(InfoExtractor): + IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' + _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' + _TESTS = [{ + 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', + 'md5': '4525ae312c324b4be2f4603cc78ceb4a', + 'info_dict': { + 'id': 'A150730_150323_hodinovy-manzel_kuko', + 'ext': 'mp4', + 'title': 'Vyžeňte vosy a sršně ze zahrady', + 'description': 'md5:4436e61b7df227a093778efb7e373571', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 279, + 'timestamp': 1438732860, + 'upload_date': '20150805', + 'is_live': False, + } + }, { # live video test + 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', + 'info_dict': { + 'id': 'A150624_164934_planespotting_cat', + 'ext': 'flv', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'is_live': True, 
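# Aside: 'publishDate' above arrives in milliseconds, and int_or_none(v, 1000)
# divides by its second (scale) argument while tolerating missing values.
# A minimal equivalent of that behaviour:
def int_or_none(v, scale=1):
    try:
        return int(v) // scale
    except (TypeError, ValueError):
        return None

assert int_or_none(1529992740000, 1000) == 1529992740
assert int_or_none(None, 1000) is None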
+ }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # idnes.cz + 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', + 'md5': '819832ba33cd7016e58a6658577fe289', + 'info_dict': { + 'id': 'A150809_104116_domaci_pku', + 'ext': 'mp4', + 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', + 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 39, + 'timestamp': 1438969140, + 'upload_date': '20150807', + 'is_live': False, + } + }, { # lidovky.cz + 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', + 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', + 'info_dict': { + 'id': 'A150808_214044_ln-video_ELE', + 'ext': 'mp4', + 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', + 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439052180, + 'upload_date': '20150808', + 'is_live': False, + } + }, { # metro.cz + 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', + 'md5': '84fc1deedcac37b7d4a6ccae7c716668', + 'info_dict': { + 'id': 'A141111_173251_metro-extra_row', + 'ext': 'mp4', + 'title': 'Recesisté udělali z billboardu kolotoč', + 'description': 'md5:7369926049588c3989a66c9c1a043c4c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1415725500, + 'upload_date': '20141111', + 'is_live': False, + } + }, { + 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info_url = self._html_search_regex( + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + + parsed_url = compat_urlparse.urlparse(info_url) + + qs = compat_urlparse.parse_qs(parsed_url.query) + qs.update({ + 'reklama': ['0'], + 'type': ['js'], + }) + + info_url = compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + json_info = self._download_json( + info_url, video_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + item = None + for i in json_info['items']: + if i.get('type') == 'video' or i.get('type') == 'stream': + item = i + break + if not item: + raise ExtractorError('No suitable stream found') + + quality = qualities(('low', 'middle', 'high')) + + formats = [] + for fmt in item['video']: + video_url = fmt.get('file') + if not video_url: + continue + + format_ = fmt['format'] + format_id = '%s_%s' % (format_, fmt['quality']) + preference = None + + if format_ in ('mp4', 'webm'): + ext = format_ + elif format_ == 'rtmp': + ext = 'flv' + elif format_ == 'apple': + ext = 'mp4' + # Some streams have mp3 audio which does not play + # well with ffmpeg filter aac_adtstoasc + preference = -10 + elif format_ 
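# Aside: the transform_source lambda above strips a JSONP-style wrapper by
# slicing from the first '{' to the last '}' before JSON parsing. A worked
# example with a made-up payload:
import json

def strip_jsonp(s):
    return s[s.index('{'):s.rindex('}') + 1]

payload = 'callback({"items": [{"type": "video"}]});'
assert json.loads(strip_jsonp(payload)) == {'items': [{'type': 'video'}]}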
== 'adobe': # f4m manifest fails with 404 in 80% of requests + continue + else: # Other formats not supported yet + continue + + formats.append({ + 'url': video_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality(fmt.get('quality')), + 'preference': preference, + }) + self._sort_formats(formats) + + title = item['title'] + is_live = item['type'] == 'stream' + if is_live: + title = self._live_title(title) + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + timestamp = None + duration = None + if not is_live: + duration = int_or_none(item.get('length')) + timestamp = item.get('published') + if timestamp: + timestamp = parse_iso8601(timestamp[:-5]) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': item.get('image'), + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + 'formats': formats, + } diff --git a/youtube_dl/extractor/playvid.py b/yt_dlp/extractor/playvid.py index 4aef186ea..4aef186ea 100644 --- a/youtube_dl/extractor/playvid.py +++ b/yt_dlp/extractor/playvid.py diff --git a/yt_dlp/extractor/playwire.py b/yt_dlp/extractor/playwire.py new file mode 100644 index 000000000..9c9e597b5 --- /dev/null +++ b/yt_dlp/extractor/playwire.py @@ -0,0 +1,74 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, +) + + +class PlaywireIE(InfoExtractor): + _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', + 'md5': 'e6398701e3595888125729eaa2329ed9', + 'info_dict': { + 'id': '3353705', + 'ext': 'mp4', + 'title': 'S04_RM_UCL_Rus', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 145.94, + }, + }, { + # m3u8 in f4m + 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', + 'info_dict': { + 'id': '4840492', + 'ext': 'mp4', + 'title': 'ITV EL SHOW FULL', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Multiple resolutions while bitrates missing + 'url': 'http://cdn.playwire.com/11625/embed/85228.html', + 'only_matching': True, + }, { + 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', + 'only_matching': True, + }, { + 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + + player = self._download_json( + 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + video_id) + + title = player['settings']['title'] + duration = float_or_none(player.get('duration'), 1000) + + content = player['content'] + thumbnail = content.get('poster') + src = content['media']['f4m'] + + formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') + for a_format in formats: + if not dict_get(a_format, ['tbr', 'width', 'height']): + a_format['quality'] = 1 if '-hd.' 
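# Aside: when Playwire reports neither tbr nor width/height, the expression
# above falls back to a naming convention: URLs containing '-hd.' outrank the
# rest. The heuristic in isolation (any() approximates dict_get here):
def rank_by_name(formats):
    for f in formats:
        if not any(f.get(k) for k in ('tbr', 'width', 'height')):
            f['quality'] = 1 if '-hd.' in f['url'] else 0
    return formats

fmts = rank_by_name([{'url': 'a-hd.mp4'}, {'url': 'a-sd.mp4'}])
assert (fmts[0]['quality'], fmts[1]['quality']) == (1, 0)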
in a_format['url'] else 0 + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/yt_dlp/extractor/pluralsight.py b/yt_dlp/extractor/pluralsight.py new file mode 100644 index 000000000..801057ee1 --- /dev/null +++ b/yt_dlp/extractor/pluralsight.py @@ -0,0 +1,502 @@ +from __future__ import unicode_literals + +import collections +import json +import os +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + parse_qs, + qualities, + srt_subtitles_timecode, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PluralsightBaseIE(InfoExtractor): + _API_BASE = 'https://app.pluralsight.com' + + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) + + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + + +class PluralsightIE(PluralsightBaseIE): + IE_NAME = 'pluralsight' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' 
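# Aside: _download_course_rpc above talks to a GraphQL endpoint: the course id
# is interpolated into the query template and POSTed as JSON. The request
# shape, reduced to its essentials (query trimmed for brevity):
import json
from urllib.request import Request

GRAPHQL_EP = 'https://app.pluralsight.com/player/api/graphql'

def course_request(course_id):
    query = ('query BootstrapPlayer { rpc { bootstrapPlayer {'
             ' course(courseId: "%s") { name title } } } }' % course_id)
    return Request(
        GRAPHQL_EP,
        data=json.dumps({'query': query, 'variables': {}}).encode('utf-8'),
        headers={'Content-Type': 'application/json;charset=UTF-8'})

# On failure the 'course' key is missing and the error text surfaces under
# response['error']['message'], which the code above re-raises.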
+ _LOGIN_URL = 'https://app.pluralsight.com/id/' + + _NETRC_MACHINE = 'pluralsight' + + _TESTS = [{ + 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', + 'md5': '4d458cf5cf4c593788672419a8dd4cf8', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', + 'ext': 'mp4', + 'title': 'Demo Monitoring', + 'duration': 338, + }, + 'skip': 'Requires pluralsight account credentials', + }, { + 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', + 'only_matching': True, + }, { + # available without pluralsight account + 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', + 'only_matching': True, + }] + + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'Username': username, + 'Password': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + error = self._search_regex( + r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): + BLOCKED = 'Your account has been blocked due to suspicious activity' + if BLOCKED in response: + raise ExtractorError( + 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' 
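# Aside: the _login flow above is the usual hidden-form dance: scrape the
# hidden <input> fields, overlay the credentials, resolve a possibly relative
# form action, then POST urlencoded data. A bare-bones sketch (unlike
# _hidden_inputs, this regex assumes a fixed attribute order):
import re
from urllib.parse import urlencode, urljoin

def build_login_post(login_page, login_url, username, password):
    form = dict(re.findall(
        r'<input[^>]+name=["\']([^"\']+)["\'][^>]+value=["\']([^"\']*)["\']',
        login_page))
    form.update({'Username': username, 'Password': password})
    action = re.search(r'<form[^>]+action=(["\'])(.+?)\1', login_page)
    post_url = urljoin(login_url, action.group(2)) if action else login_url
    return post_url, urlencode(form).encode()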
+ % MUST_AGREE, expected=True) + + raise ExtractorError('Unable to log in') + + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') + TEXT_KEYS = ('text', 'Text') + for num, current in enumerate(subs): + current = subs[num] + start, text = ( + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), + dict_get(current, TEXT_KEYS)) + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + + def _real_extract(self, url): + qs = parse_qs(url) + + author = qs.get('author', [None])[0] + name = qs.get('name', [None])[0] + clip_idx = qs.get('clip', [None])[0] + course_name = qs.get('course', [None])[0] + + if any(not f for f in (author, name, clip_idx, course_name,)): + raise ExtractorError('Invalid URL', expected=True) + + display_id = '%s-%s' % (name, clip_idx) + + course = self._download_course(course_name, url, display_id) + + collection = course['modules'] + + clip = None + + for module_ in collection: + if name in (module_.get('moduleName'), module_.get('name')): + for clip_ in module_.get('clips', []): + clip_index = clip_.get('clipIndex') + if clip_index is None: + clip_index = clip_.get('index') + if clip_index is None: + continue + if compat_str(clip_index) == clip_idx: + clip = clip_ + break + + if not clip: + raise ExtractorError('Unable to resolve clip') + + title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] + + QUALITIES = { + 'low': {'width': 640, 'height': 480}, + 'medium': {'width': 848, 'height': 640}, + 'high': {'width': 1024, 'height': 768}, + 'high-widescreen': {'width': 1280, 'height': 720}, + } + + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + quality_key = qualities(QUALITIES_PREFERENCE) + + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + + ALLOWED_QUALITIES = ( + AllowedQuality('webm', ['high', ]), + AllowedQuality('mp4', ['low', 'medium', 'high', ]), + ) + + # Some courses also offer widescreen resolution for high quality (see + # https://github.com/ytdl-org/youtube-dl/issues/7766) + widescreen = course.get('supportsWideScreenVideoFormats') is True + best_quality = 'high-widescreen' if widescreen else 'high' + if widescreen: + for allowed_quality in ALLOWED_QUALITIES: + 
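# Aside: _convert_subtitles above renders the caption JSON as SRT; each cue
# ends where the next one starts (or at the clip duration for the last cue),
# and offsets are formatted as HH:MM:SS,mmm. The timecode helper boils down
# to:
def srt_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % (
        seconds / 3600, (seconds % 3600) / 60, seconds % 60,
        (seconds % 1) * 1000)

assert srt_timecode(83.5) == '00:01:23,500'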
allowed_quality.qualities.append(best_quality) + + # In order to minimize the number of calls to ViewClip API and reduce + # the probability of being throttled or banned by Pluralsight we will request + # only single format until formats listing was explicitly requested. + if self.get_param('listformats', False): + allowed_qualities = ALLOWED_QUALITIES + else: + def guess_allowed_qualities(): + req_format = self.get_param('format') or 'best' + req_format_split = req_format.split('-', 1) + if len(req_format_split) > 1: + req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) + for allowed_quality in ALLOWED_QUALITIES: + if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + return (AllowedQuality(req_ext, (req_quality, )), ) + req_ext = 'webm' if self.get_param('prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, (best_quality, )), ) + allowed_qualities = guess_allowed_qualities() + + formats = [] + for ext, qualities_ in allowed_qualities: + for quality in qualities_: + f = QUALITIES[quality].copy() + clip_post = { + 'author': author, + 'includeCaptions': 'false', + 'clipIndex': int(clip_idx), + 'courseName': course_name, + 'locale': 'en', + 'moduleName': name, + 'mediaType': ext, + 'quality': '%dx%d' % (f['width'], f['height']), + } + format_id = '%s-%s' % (ext, quality) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + + # Pluralsight tracks multiple sequential calls to ViewClip API and start + # to return 429 HTTP errors after some time (see + # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead + # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). + # To somewhat reduce the probability of these consequences + # we will sleep random amount of time before each call to ViewClip. + self._sleep( + random.randint(5, 10), display_id, + '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') + + if not viewclip: + continue + + clip_urls = viewclip.get('urls') + if not isinstance(clip_urls, list): + continue + + for clip_url_data in clip_urls: + clip_url = clip_url_data.get('url') + if not clip_url: + continue + cdn = clip_url_data.get('cdn') + clip_f = f.copy() + clip_f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'quality': quality_key(quality), + 'source_preference': int_or_none(clip_url_data.get('rank')), + }) + formats.append(clip_f) + + self._sort_formats(formats) + + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? 
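# Aside: the randomized 5-10 second sleep above is deliberate rate-limit
# hygiene; per the comments, rapid sequential ViewClip calls have drawn HTTP
# 429s and even account bans. The jitter pattern in isolation:
import random
import time

def polite_call(fn, *args, min_wait=5, max_wait=10, **kwargs):
    time.sleep(random.randint(min_wait, max_wait))  # jitter before each call
    return fn(*args, **kwargs)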
+ subtitles = self.extract_subtitles( + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) + + return { + 'id': clip_id, + 'title': title, + 'duration': duration, + 'creator': author, + 'formats': formats, + 'subtitles': subtitles, + } + + +class PluralsightCourseIE(PluralsightBaseIE): + IE_NAME = 'pluralsight:course' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' + _TESTS = [{ + # Free course from Pluralsight Starter Subscription for Microsoft TechNet + # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz + 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas', + 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', + 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', + }, + 'playlist_count': 31, + }, { + # available without pluralsight account + 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + # TODO: PSM cookie + + course = self._download_course(course_id, url, course_id) + + title = course['title'] + course_name = course['name'] + course_data = course['modules'] + description = course.get('description') or course.get('shortDescription') + + entries = [] + for num, module in enumerate(course_data, 1): + author = module.get('author') + module_name = module.get('name') + if not author or not module_name: + continue + for clip in module.get('clips', []): + clip_index = int_or_none(clip.get('index')) + if clip_index is None: + continue + clip_url = update_url_query( + '%s/player' % self._API_BASE, query={ + 'mode': 'live', + 'course': course_name, + 'author': author, + 'name': module_name, + 'clip': clip_index, + }) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) + + return self.playlist_result(entries, course_id, title, description) diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py new file mode 100644 index 000000000..0cf82466a --- /dev/null +++ b/yt_dlp/extractor/plutotv.py @@ -0,0 +1,184 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import uuid + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + try_get, + url_or_none, +) + + +class PlutoTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?pluto\.tv(?:/en)?/on-demand + /(?P<video_type>movies|series) + /(?P<series_or_movie_slug>[^/]+) + (?: + /seasons?/(?P<season_no>\d+) + (?:/episode/(?P<episode_slug>[^/]+))? + )? 
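# Aside: PluralsightCourseIE above never fetches clips itself; it synthesizes
# player URLs with update_url_query and defers to PluralsightIE through
# url_transparent entries. The URL construction, approximated with the
# standard library:
from urllib.parse import urlencode

def clip_url(course, author, module, clip_index):
    query = {'mode': 'live', 'course': course, 'author': author,
             'name': module, 'clip': clip_index}
    return 'https://app.pluralsight.com/player?' + urlencode(query)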
+ /?(?:$|[#?])''' + + _INFO_URL = 'https://service-vod.clusters.pluto.tv/v3/vod/slugs/' + _INFO_QUERY_PARAMS = { + 'appName': 'web', + 'appVersion': 'na', + 'clientID': compat_str(uuid.uuid1()), + 'clientModelNumber': 'na', + 'serverSideAds': 'false', + 'deviceMake': 'unknown', + 'deviceModel': 'web', + 'deviceType': 'web', + 'deviceVersion': 'unknown', + 'sid': compat_str(uuid.uuid1()), + } + _TESTS = [ + { + 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/2/episode/its-in-the-cards-2009-2-3', + 'md5': 'ebcdd8ed89aaace9df37924f722fd9bd', + 'info_dict': { + 'id': '5de6c598e9379ae4912df0a8', + 'ext': 'mp4', + 'title': 'It\'s In The Cards', + 'episode': 'It\'s In The Cards', + 'description': 'The teams face off against each other in a 3-on-2 soccer showdown. Strategy comes into play, though, as each team gets to select their opposing teams’ two defenders.', + 'series': 'I Love Money', + 'season_number': 2, + 'episode_number': 3, + 'duration': 3600, + } + }, { + 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/1/', + 'playlist_count': 11, + 'info_dict': { + 'id': '5de6c582e9379ae4912dedbd', + 'title': 'I Love Money - Season 1', + } + }, { + 'url': 'https://pluto.tv/on-demand/series/i-love-money/', + 'playlist_count': 26, + 'info_dict': { + 'id': '5de6c582e9379ae4912dedbd', + 'title': 'I Love Money', + } + }, { + 'url': 'https://pluto.tv/on-demand/movies/arrival-2015-1-1', + 'md5': '3cead001d317a018bf856a896dee1762', + 'info_dict': { + 'id': '5e83ac701fa6a9001bb9df24', + 'ext': 'mp4', + 'title': 'Arrival', + 'description': 'When mysterious spacecraft touch down across the globe, an elite team - led by expert translator Louise Banks (Academy Award® nominee Amy Adams) – races against time to decipher their intent.', + 'duration': 9000, + } + }, { + 'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1', + 'only_matching': True, + } + ] + + def _to_ad_free_formats(self, video_id, formats, subtitles): + ad_free_formats, ad_free_subtitles, m3u8_urls = [], {}, set() + for fmt in formats: + res = self._download_webpage( + fmt.get('url'), video_id, note='Downloading m3u8 playlist', + fatal=False) + if not res: + continue + first_segment_url = re.search( + r'^(https?://.*/)0\-(end|[0-9]+)/[^/]+\.ts$', res, + re.MULTILINE) + if first_segment_url: + m3u8_urls.add( + compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8')) + continue + first_segment_url = re.search( + r'^(https?://.*/).+\-0+\.ts$', res, + re.MULTILINE) + if first_segment_url: + m3u8_urls.add( + compat_urlparse.urljoin(first_segment_url.group(1), 'master.m3u8')) + continue + + for m3u8_url in m3u8_urls: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + ad_free_formats.extend(fmts) + ad_free_subtitles = self._merge_subtitles(ad_free_subtitles, subs) + if ad_free_formats: + formats, subtitles = ad_free_formats, ad_free_subtitles + else: + self.report_warning('Unable to find ad-free formats') + return formats, subtitles + + def _get_video_info(self, video_json, slug, series_name=None): + video_id = video_json.get('_id', slug) + formats, subtitles = [], {} + for video_url in try_get(video_json, lambda x: x['stitched']['urls'], list) or []: + if video_url.get('type') != 'hls': + continue + url = url_or_none(video_url.get('url')) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + 
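# Aside: _to_ad_free_formats above reads each variant playlist, finds its
# first segment URL, and rewrites that into a sibling master.m3u8, relying on
# Pluto's ad-stitched and clean streams living next to each other. The
# rewrite step in isolation (same two URL layouts as the regexes above):
import re
from urllib.parse import urljoin

def ad_free_master(m3u8_text):
    m = re.search(r'^(https?://.*/)0\-(end|[0-9]+)/[^/]+\.ts$', m3u8_text, re.MULTILINE)
    if m:
        return urljoin(m.group(1), '0-end/master.m3u8')
    m = re.search(r'^(https?://.*/).+\-0+\.ts$', m3u8_text, re.MULTILINE)
    return urljoin(m.group(1), 'master.m3u8') if m else None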
formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + formats, subtitles = self._to_ad_free_formats(video_id, formats, subtitles) + self._sort_formats(formats) + + info = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': video_json.get('name'), + 'description': video_json.get('description'), + 'duration': float_or_none(video_json.get('duration'), scale=1000), + } + if series_name: + info.update({ + 'series': series_name, + 'episode': video_json.get('name'), + 'season_number': int_or_none(video_json.get('season')), + 'episode_number': int_or_none(video_json.get('number')), + }) + return info + + def _real_extract(self, url): + mobj = self._match_valid_url(url).groupdict() + info_slug = mobj['series_or_movie_slug'] + video_json = self._download_json(self._INFO_URL + info_slug, info_slug, query=self._INFO_QUERY_PARAMS) + + if mobj['video_type'] == 'series': + series_name = video_json.get('name', info_slug) + season_number, episode_slug = mobj.get('season_number'), mobj.get('episode_slug') + + videos = [] + for season in video_json['seasons']: + if season_number is not None and season_number != int_or_none(season.get('number')): + continue + for episode in season['episodes']: + if episode_slug is not None and episode_slug != episode.get('slug'): + continue + videos.append(self._get_video_info(episode, episode_slug, series_name)) + if not videos: + raise ExtractorError('Failed to find any videos to extract') + if episode_slug is not None and len(videos) == 1: + return videos[0] + playlist_title = series_name + if season_number is not None: + playlist_title += ' - Season %d' % season_number + return self.playlist_result(videos, + playlist_id=video_json.get('_id', info_slug), + playlist_title=playlist_title) + return self._get_video_info(video_json, info_slug) diff --git a/yt_dlp/extractor/podomatic.py b/yt_dlp/extractor/podomatic.py new file mode 100644 index 000000000..673a3ab94 --- /dev/null +++ b/yt_dlp/extractor/podomatic.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PodomaticIE(InfoExtractor): + IE_NAME = 'podomatic' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' + + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. 
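# Aside: the series branch above walks every season, keeping episodes that
# match the optional season number and episode slug parsed from the URL; a
# single survivor is returned directly, anything else becomes a playlist.
# The selection logic on its own:
def select_episodes(seasons, season_number=None, episode_slug=None):
    selected = []
    for season in seasons:
        if season_number is not None and season_number != season.get('number'):
            continue
        for episode in season.get('episodes', []):
            if episode_slug is not None and episode_slug != episode.get('slug'):
                continue
            selected.append(episode)
    return selected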
When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + channel = mobj.group('channel') or mobj.group('channel_2') + + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % + (mobj.group('proto'), channel, video_id)) + data_json = self._download_webpage( + json_url, video_id, 'Downloading video info') + data = json.loads(data_json) + + video_url = data['downloadLink'] + if not video_url: + video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) + uploader = data['podcast'] + title = data['title'] + thumbnail = data['imageLocation'] + duration = int_or_none(data.get('length'), 1000) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': uploader, + 'uploader_id': channel, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py new file mode 100644 index 000000000..402b574a7 --- /dev/null +++ b/yt_dlp/extractor/pokemon.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + js_to_json, + merge_dicts, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', + 'info_dict': { + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', + 'ext': 'mp4', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', + 'info_dict': { + 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', + 'ext': 'mp4', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or 
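# Aside: Podomatic above prefers the direct 'downloadLink' and, when that is
# empty, rebuilds an HTTP URL from the RTMP streamer base plus the media
# path. The fallback in isolation (host is a placeholder):
def podomatic_media_url(data):
    if data.get('downloadLink'):
        return data['downloadLink']
    return '%s/%s' % (data['streamer'].replace('rtmp', 'http'),
                      data['mediaLocation'])

assert (podomatic_media_url({'streamer': 'rtmp://cdn.example', 'mediaLocation': 'a.mp3'})
        == 'http://cdn.example/a.mp3')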
self._search_regex( + r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } + + +class PokemonWatchIE(InfoExtractor): + _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P<id>[a-z0-9]{32})' + _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}' + _TESTS = [{ + 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667', + 'md5': '62833938a31e61ab49ada92f524c42ff', + 'info_dict': { + 'id': '8309a40969894a8e8d5bc1311e9c5667', + 'ext': 'mp4', + 'title': 'Lillier and the Staff!', + 'description': 'md5:338841b8c21b283d24bdc9b568849f04', + } + }, { + 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2', + 'only_matching': True + }, { + 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', + 'only_matching': True + }] + + def _extract_media(self, channel_array, video_id): + for channel in channel_array: + for media in channel.get('media'): + if media.get('id') == video_id: + return media + return None + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = { + '_type': 'url', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + # API call can be avoided entirely if we are listing formats + if self.get_param('listformats', False): + return info + + webpage = self._download_webpage(url, video_id) + build_vars = self._parse_json(self._search_regex( + r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'), + video_id, transform_source=js_to_json) + region = build_vars.get('region') + channel_array = self._download_json(self._API_URL.format(region), video_id) + video_data = self._extract_media(channel_array, video_id) + + if video_data is None: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + info['_type'] = 'url_transparent' + images = video_data.get('images') + + return merge_dicts(info, { + 'title': video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': images.get('medium') or images.get('small'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('season')), + 'episode': video_data.get('title'), + 'episode_number': int_or_none(video_data.get('episode')), + }) diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py new file mode 100644 index 000000000..53fe0340a --- /dev/null +++ b/yt_dlp/extractor/polskieradio.py @@ -0,0 +1,209 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, + compat_urlparse +) +from ..utils import ( + extract_attributes, + int_or_none, + strip_or_none, + unified_timestamp, + unescapeHTML, +) + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' + _TESTS = [{ # Old-style single broadcast. 
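# Aside: PokemonWatchIE above lifts a JavaScript object literal (buildVars)
# out of the page and runs it through js_to_json so json.loads can digest
# unquoted keys and single-quoted strings. An illustration (import path as in
# this tree; the exact output string is the utility's concern, not a contract):
import json
from yt_dlp.utils import js_to_json

build_vars = json.loads(js_to_json("{region: 'us', version: 2}"))
assert build_vars == {'region': 'us', 'version': 2}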
+ 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { # New-style single broadcast. + 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', + 'info_dict': { + 'id': '2534482', + 'title': 'Żagaryści. Poezja jak spoiwo', + 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695', + }, + 'playlist': [{ + 'md5': 'd07559829f61d5a93a75755987ded760', + 'info_dict': { + 'id': '2516679', + 'ext': 'mp3', + 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c', + 'timestamp': 1592654400, + 'upload_date': '20200620', + 'duration': 1430, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { # Old-style multiple broadcast playlist. + 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate', + 'info_dict': { + 'id': '2487823', + 'title': 'Marek Kondrat czyta "Mistrza i Małgorzatę"', + 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39', + }, + 'playlist_mincount': 50, + }, { # New-style multiple broadcast playlist. + 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2541317,Czytamy-Kalendarz-i-klepsydre-Tadeusza-Konwickiego', + 'info_dict': { + 'id': '2541317', + 'title': 'Czytamy "Kalendarz i klepsydrę" Tadeusza Konwickiego', + 'description': 'md5:0baeaa46d877f1351fb2eeed3e871f9f', + }, + 'playlist_mincount': 15, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', + webpage, 'content') + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) + + thumbnail_url = self._og_search_thumbnail(webpage) + + entries = [] + + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content): + media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url + }) + + title = self._og_search_title(webpage).strip() + description = 
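# Aside: the data-media loop above applies two guards before emitting an
# entry: protocol-relative links ('//...') are upgraded to http, and repeated
# blocks are de-duplicated by URL. Both reduced to a few lines:
def collect_media_urls(raw_urls):
    seen, urls = set(), []
    for raw in raw_urls:
        url = 'http:' + raw if raw.startswith('//') else raw
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls

assert collect_media_urls(['//a/x.mp3', 'http://a/x.mp3']) == ['http://a/x.mp3']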
strip_or_none(self._og_search_description(webpage)) + description = description.replace('\xa0', ' ') if description is not None else None + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioCategoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'info_dict': { + 'id': '5102', + 'title': 'HISTORIA ŻYWA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'http://www.polskieradio.pl/7/4807', + 'info_dict': { + 'id': '4807', + 'title': 'Vademecum 1050. rocznicy Chrztu Polski' + }, + 'playlist_mincount': 5 + }, { + 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', + 'only_matching': True + }, { + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + + def _entries(self, url, page, category_id): + content = page + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + content): + entry = extract_attributes(a_entry) + href = entry.get('href') + if not href: + continue + yield self.url_result( + compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + entry_id, entry.get('title')) + mobj = re.search( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content) + if not mobj: + break + next_url = compat_urlparse.urljoin(url, mobj.group('url')) + content = self._download_webpage( + next_url, category_id, 'Downloading page %s' % page_num) + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage(url, category_id) + title = self._html_search_regex( + r'<title>([^<]+) - [^<]+ - [^<]+</title>', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) diff --git a/yt_dlp/extractor/popcorntimes.py b/yt_dlp/extractor/popcorntimes.py new file mode 100644 index 000000000..5f9d0e720 --- /dev/null +++ b/yt_dlp/extractor/popcorntimes.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_chr, +) +from ..utils import int_or_none + + +class PopcorntimesIE(InfoExtractor): + _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', + 'md5': '93f210991ad94ba8c3485950a2453257', + 'info_dict': { + 'id': 'A1XCFvz', + 'display_id': 'haensel-und-gretel-opera-fantasy', + 'ext': 'mp4', + 'title': 'Hänsel und Gretel', + 'description': 'md5:1b8146791726342e7b22ce8125cf6945', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'John Paul', + 'release_date': '19541009', + 'duration': 4260, + 'tbr': 5380, + 'width': 720, + 
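# Aside: the category extractor above crawls listing pages by following the
# "next" anchor until it disappears; itertools.count only numbers the pages
# for progress messages. The skeleton of that walk (fetch is a hypothetical
# page downloader):
import itertools
import re
from urllib.parse import urljoin

def iter_pages(base_url, first_page, fetch):
    content = first_page
    for page_num in itertools.count(2):
        yield content
        m = re.search(
            r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
            content)
        if not m:
            break
        content = fetch(urljoin(base_url, m.group('url')), page_num)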
'height': 540, + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._search_regex( + r'<h1>([^<]+)', webpage, 'title', + default=None) or self._html_search_meta( + 'ya:ovs:original_name', webpage, 'title', fatal=True) + + loc = self._search_regex( + r'PCTMLOC\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'loc', + group='value') + + loc_b64 = '' + for c in loc: + c_ord = ord(c) + if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): + upper = ord('Z') if c_ord <= ord('Z') else ord('z') + c_ord += 13 + if upper < c_ord: + c_ord -= 26 + loc_b64 += compat_chr(c_ord) + + video_url = compat_b64decode(loc_b64).decode('utf-8') + + description = self._html_search_regex( + r'(?s)<div[^>]+class=["\']pt-movie-desc[^>]+>(.+?)</div>', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r'<img[^>]+class=["\']video-preview[^>]+\bsrc=(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'thumbnail', default=None, + group='value') or self._og_search_thumbnail(webpage) + + creator = self._html_search_meta( + 'video:director', webpage, 'creator', default=None) + + release_date = self._html_search_meta( + 'video:release_date', webpage, default=None) + if release_date: + release_date = release_date.replace('-', '') + + def int_meta(name): + return int_or_none(self._html_search_meta( + name, webpage, default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'creator': creator, + 'release_date': release_date, + 'duration': int_meta('video:duration'), + 'tbr': int_meta('ya:ovs:bitrate'), + 'width': int_meta('og:video:width'), + 'height': int_meta('og:video:height'), + 'http_headers': { + 'Referer': url, + }, + } diff --git a/yt_dlp/extractor/popcorntv.py b/yt_dlp/extractor/popcorntv.py new file mode 100644 index 000000000..66d2e5094 --- /dev/null +++ b/yt_dlp/extractor/popcorntv.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P<display_id>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(<link[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r'<h1[^>]+itemprop=["\']name[^>]*>([^<]+)', 
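# Aside: the character loop above is plain ROT13 over ASCII letters, so
# PCTMLOC is a rot13'd base64 string hiding the video URL. The same decode
# with the standard library (sample URL is made up):
import base64
import codecs

def decode_loc(loc):
    return base64.b64decode(codecs.decode(loc, 'rot_13')).decode('utf-8')

sample = codecs.encode(base64.b64encode(b'https://v.example/x.mp4').decode(), 'rot_13')
assert decode_loc(sample) == 'https://v.example/x.mp4'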
webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)<article[^>]+itemprop=["\']description[^>]*>(.+?)</article>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/porn91.py b/yt_dlp/extractor/porn91.py index 20eac647a..20eac647a 100644 --- a/youtube_dl/extractor/porn91.py +++ b/yt_dlp/extractor/porn91.py diff --git a/yt_dlp/extractor/porncom.py b/yt_dlp/extractor/porncom.py new file mode 100644 index 000000000..83df22141 --- /dev/null +++ b/yt_dlp/extractor/porncom.py @@ -0,0 +1,103 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count 
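# Aside: when the porn.com player config is missing, the fallback above
# scrapes the download list instead, taking the height from the label and an
# approximate size via parse_filesize, which turns human-readable sizes into
# bytes. Illustration (import path as in this tree):
from yt_dlp.utils import parse_filesize

assert parse_filesize('10 MB') == 10 * 1000 ** 2
assert parse_filesize('10 MiB') == 10 * 1024 ** 2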
= str_to_int(self._search_regex( + (r'Views:\s*</span>\s*<span>\s*([\d,.]+)', + r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(), + r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()), + webpage, kind, fatal=False) + return re.findall(r'<a[^>]+>([^<]+)</a>', s or '') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), + } diff --git a/yt_dlp/extractor/pornflip.py b/yt_dlp/extractor/pornflip.py new file mode 100644 index 000000000..d0aefa2dd --- /dev/null +++ b/yt_dlp/extractor/pornflip.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601 +) + + +class PornFlipIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:(embed|sv|v)/)?(?P<id>[^/]+)' + _TESTS = [ + { + 'url': 'https://www.pornflip.com/dzv9Mtw1qj2/sv/brazzers-double-dare-two-couples-fucked-jenna-reid-maya-bijou', + 'info_dict': { + 'id': 'dzv9Mtw1qj2', + 'ext': 'mp4', + 'title': 'Brazzers - Double Dare Two couples fucked Jenna Reid Maya Bijou', + 'description': 'md5:d2b69e6cc743c5fd158e162aa7f05821', + 'duration': 476, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'timestamp': 1617846819, + 'upload_date': '20210408', + 'uploader': 'Brazzers', + 'age_limit': 18, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, + { + 'url': 'https://www.pornflip.com/v/IrJEC40i21L', + 'only_matching': True, + }, + { + 'url': 'https://www.pornflip.com/Z3jzbChC5-P/sexintaxi-e-sereyna-gomez-czech-naked-couple', + 'only_matching': True, + }, + { + 'url': 'https://www.pornflip.com/embed/bLcDFxnrZnU', + 'only_matching': True, + }, + ] + _HOST = 'www.pornflip.com' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://{}/sv/{}'.format(self._HOST, video_id), video_id, headers={'host': self._HOST}) + description = self._html_search_regex(r'&p\[summary\]=(.*?)\s*&p', webpage, 'description', fatal=False) + duration = self._search_regex(r'"duration":\s+"([^"]+)",', webpage, 'duration', fatal=False) + view_count = self._search_regex(r'"interactionCount":\s+"([^"]+)"', webpage, 'view_count', fatal=False) + title = self._html_search_regex(r'id="mediaPlayerTitleLink"[^>]*>(.+)</a>', webpage, 'title', fatal=False) + uploader = self._html_search_regex(r'class="title-chanel"[^>]*>[^<]*<a[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + upload_date = self._search_regex(r'"uploadDate":\s+"([^"]+)",', webpage, 'upload_date', fatal=False) + likes = self._html_search_regex( + r'class="btn btn-up-rating[^>]*>[^<]*<i[^>]*>[^<]*</i>[^>]*<span[^>]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'like_count', fatal=False) + dislikes = self._html_search_regex( + r'class="btn btn-down-rating[^>]*>[^<]*<i[^>]*>[^<]*</i>[^>]*<span[^>]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'dislike_count', fatal=False) + mpd_url = self._search_regex(r'"([^"]+userscontent.net/dash/[0-9]+/manifest.mpd[^"]*)"', webpage, 'mpd_url').replace('&amp;', '&') + formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash') + self._sort_formats(formats) + + return { + 'age_limit': 18,
'description': description, + 'dislike_count': int_or_none(dislikes), + 'duration': parse_duration(duration), + 'formats': formats, + 'id': video_id, + 'like_count': int_or_none(likes), + 'timestamp': parse_iso8601(upload_date), + 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'uploader': uploader, + 'view_count': int_or_none(view_count), + } diff --git a/yt_dlp/extractor/pornhd.py b/yt_dlp/extractor/pornhd.py new file mode 100644 index 000000000..9dbd72f1d --- /dev/null +++ b/yt_dlp/extractor/pornhd.py @@ -0,0 +1,120 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + js_to_json, + merge_dicts, + urljoin, +) + + +class PornHdIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' + _TESTS = [{ + 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'md5': '87f1540746c1d32ec7a2305c12b96b25', + 'info_dict': { + 'id': '9864', + 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'ext': 'mp4', + 'title': 'Restroom selfie masturbation', + 'description': 'md5:3748420395e03e31ac96857a8f125b2b', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', + 'info_dict': { + 'id': '1962', + 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'ext': 'mp4', + 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', + 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id or video_id) + + title = self._html_search_regex( + [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)', + r'<title>(.+?) 
- .*?[Pp]ornHD.*?</title>'], webpage, 'title') + + sources = self._parse_json(js_to_json(self._search_regex( + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", + webpage, 'sources', default='{}')), video_id) + + info = {} + if not sources: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info = entries[0] + + if not sources and not info: + message = self._html_search_regex( + r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1', + webpage, 'error message', group='value') + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + + formats = [] + for format_id, video_url in sources.items(): + video_url = urljoin(url, video_url) + if not video_url: + continue + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': video_url, + 'ext': determine_ext(video_url, 'mp4'), + 'format_id': format_id, + 'height': height, + }) + if formats: + info['formats'] = formats + self._sort_formats(info['formats']) + + description = self._html_search_regex( + (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>', + r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1'), + webpage, 'description', fatal=False, + group='value') or self._html_search_meta( + 'description', webpage, default=None) or self._og_search_description(webpage) + view_count = int_or_none(self._html_search_regex( + r'(\d+) views\s*<', webpage, 'view count', fatal=False)) + thumbnail = self._search_regex( + r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage, + 'thumbnail', default=None, group='url') + + like_count = int_or_none(self._search_regex( + (r'(\d+)</span>\s*likes', + r'(\d+)\s*</11[^>]+>(?:&nbsp;|\s)*\blikes', + r'class=["\']save-count["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + + return merge_dicts(info, { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'like_count': like_count, + 'formats': formats, + 'age_limit': 18, + }) diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py new file mode 100644 index 000000000..6d894affd --- /dev/null +++ b/yt_dlp/extractor/pornhub.py @@ -0,0 +1,817 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import itertools +import math +import operator +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urllib_request, +) +from .openload import PhantomJSwrapper +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + int_or_none, + merge_dicts, + NO_DEFAULT, + orderedSet, + remove_quotes, + str_to_int, + update_url_query, + urlencode_postdata, + url_or_none, +) + + +class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)' + + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret + + if any(re.search(p, webpage) for p in ( + r'<body\b[^>]+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom =
PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + + +class PornHubIE(PornHubBaseIE): + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[^/]+\.)? 
+ %s + /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P<id>[\da-z]+) + ''' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', + 'md5': 'a6391306d050e4547f62b3f485dd9ba9', + 'info_dict': { + 'id': '648719015', + 'ext': 'mp4', + 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', + 'uploader': 'Babes', + 'upload_date': '20130628', + 'timestamp': 1372447216, + 'duration': 361, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'cast': list, + }, + }, { + # non-ASCII title + 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', + 'info_dict': { + 'id': '1331683002', + 'ext': 'mp4', + 'title': '重庆婷婷女王足交', + 'upload_date': '20150213', + 'timestamp': 1423804862, + 'duration': 1753, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video has been disabled', + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', + 'only_matching': True, + }, { + # removed at the request of cam4.com + 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', + 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. 
this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + # geo restricted + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, + }, { + 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', + webpage) + + def _extract_count(self, pattern, webpage, name): + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') + + self._login(host) + + self._set_cookie(host, 'age_verified', '1') + + def dl_webpage(platform): + self._set_cookie(host, 'platform', platform) + return self._download_webpage( + 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), + video_id, 'Downloading %s webpage' % platform) + + webpage = dl_webpage('pc') + + error_msg = self._html_search_regex( + (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', + r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'), + webpage, 'error message', default=None, group='error') + if error_msg: + error_msg = re.sub(r'\s+', ' ', error_msg) + raise ExtractorError( + 'PornHub said: %s' % error_msg, + expected=True, video_id=video_id) + + if any(re.search(p, webpage) for p in ( + r'class=["\']geoBlocked["\']', + r'>\s*This content is unavailable in your country')): + self.raise_geo_restricted() + + # video_title from flashvars contains whitespace instead of non-ASCII (see + # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying + # on that anymore. 
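+ # Illustrative sketch only (hypothetical values, not captured from the site): + # the flashvars blob consumed further below is assumed to look roughly like + # var flashvars_12345 = {"closedCaptionsFile": "https://example.com/cc.srt", + # "image_url": "https://example.com/thumb.jpg", "video_duration": "361", + # "mediaDefinitions": [{"videoUrl": "https://example.com/v.m3u8", "quality": "720"}]};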
+ title = self._html_search_meta( + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title') + + video_urls = [] + video_urls_set = set() + subtitles = {} + + flashvars = self._parse_json( + self._search_regex( + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + video_id) + if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) + else: + thumbnail, duration = [None] * 2 + + def extract_js_vars(webpage, pattern, default=NO_DEFAULT): + assignments = self._search_regex( + pattern, webpage, 'encoded url', default=default) + if not assignments: + return {} + + assignments = assignments.split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + return js_vars + + def add_video_url(video_url): + v_url = url_or_none(video_url) + if not v_url: + return + if v_url in video_urls_set: + return + video_urls.append((v_url, None)) + video_urls_set.add(v_url) + + def parse_quality_items(quality_items): + q_items = self._parse_json(quality_items, video_id, fatal=False) + if not isinstance(q_items, list): + return + for item in q_items: + if isinstance(item, dict): + add_video_url(item.get('url')) + + if not video_urls: + FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') + js_vars = extract_js_vars( + webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + default=None) + if js_vars: + for key, format_url in js_vars.items(): + if key.startswith(FORMAT_PREFIXES[-1]): + parse_quality_items(format_url) + elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): + add_video_url(format_url) + if not video_urls and re.search( + r'<[^>]+\bid=["\']lockedPlayer', webpage): + raise ExtractorError( + 'Video %s is locked' % video_id, expected=True) + + if not video_urls: + js_vars = extract_js_vars( + dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') + add_video_url(js_vars['mediastring']) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + upload_date = None + formats = [] + + def add_format(format_url, height=None): + ext 
= determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return + if not height: + height = int_or_none(self._search_regex( + r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height', + default=None)) + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + }) + + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) + continue + add_format(video_url) + + # field_preference is unnecessary here, but kept for code-similarity with youtube-dl + self._sort_formats( + formats, field_preference=('height', 'width', 'fps', 'format_id')) + + video_uploader = self._html_search_regex( + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', + webpage, 'uploader', default=None) + + def extract_vote_count(kind, name): + return self._extract_count( + (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind, + r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind), + webpage, name) + + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view') + like_count = extract_vote_count('Up', 'like') + dislike_count = extract_vote_count('Down', 'dislike') + comment_count = self._extract_count( + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') + + def extract_list(meta_key): + div = self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' + % meta_key, webpage, meta_key, default=None) + if div: + return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)] + + info = self._search_json_ld(webpage, video_id, default={}) + # description provided in JSON-LD is irrelevant + info['description'] = None + + return merge_dicts({ + 'id': video_id, + 'uploader': video_uploader, + 'upload_date': upload_date, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'formats': formats, + 'age_limit': 18, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), + 'cast': extract_list('pornstars'), + 'subtitles': subtitles, + }, info) + + +class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + + def _extract_entries(self, webpage, host): + # Only process container div with main playlist content skipping + # drop-down menu that uses similar pattern for videos (see + # https://github.com/ytdl-org/youtube-dl/issues/11594). 
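+ # Hypothetical example (invented markup, not sampled from the site) of an + # anchor the entry regex below is meant to match: + # <a href="/view_video.php?viewkey=ph5abcdef012345" class="..." title="Some video title">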
+ container = self._search_regex( + r'(?s)(<div[^>]+class=["\']container.+)', webpage, + 'container', default=webpage) + + return [ + self.url_result( + 'http://www.%s/%s' % (host, video_url), + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) + ] + + +class PornHubUserIE(PornHubPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph', + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious', + 'info_dict': { + 'id': 'liz-vicious', + }, + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/users/russianveet69', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/channels/povd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', + 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, + }, { + 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) + return self.url_result( + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) + + +class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): + @staticmethod + def _has_more(webpage): + return re.search( + r'''(?x) + <li[^>]+\bclass=["\']page_next| + <link[^>]+\brel=["\']next| + <button[^>]+\bid=["\']moreDataBtn + ''', webpage) is not None + + def _entries(self, url, host, item_id): + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 + + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): + try: + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num, fallback=True) + else: + raise + except ExtractorError as e: + if is_404(e) and page_num != first_page: + break + raise + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + for e in page_entries: + yield e + if not self._has_more(webpage): + break + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) + + +class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph/videos', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 149, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 40, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', + 'info_dict': { + 'id': 'channels/povd/videos', + }, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv', + 'only_matching': True, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr', + 'only_matching': True, + }, { + # Longest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg', + 'only_matching': True, + }, { + # Newest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/search?search=123', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video?page=2', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', + 
'only_matching': True, + }, { + 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False + if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) + else super(PornHubPagedVideoListIE, cls).suitable(url)) + + +class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'info_dict': { + 'id': 'jenny-blighe', + }, + 'playlist_mincount': 129, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', + 'only_matching': True, + }, { + 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, + }] + + +class PornHubPlaylistIE(PornHubPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/playlist/44121572', + 'info_dict': { + 'id': '44121572', + }, + 'playlist_count': 77, + }, { + 'url': 'https://www.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351?page=2', + 'only_matching': True, + }] + + def _entries(self, url, host, item_id): + webpage = self._download_webpage(url, item_id, 'Downloading page 1') + playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id') + video_count = int_or_none( + self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count')) + token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token') + page_count = math.ceil((video_count - 36) / 40.) 
+ 1 + page_entries = self._extract_entries(webpage, host) + + def download_page(page_num): + note = 'Downloading page {}'.format(page_num) + page_url = 'https://www.{}/playlist/viewChunked'.format(host) + return self._download_webpage(page_url, item_id, note, query={ + 'id': playlist_id, + 'page': page_num, + 'token': token, + }) + + for page_num in range(1, page_count + 1): + if page_num > 1: + webpage = download_page(page_num) + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + for e in page_entries: + yield e + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id) diff --git a/youtube_dl/extractor/pornotube.py b/yt_dlp/extractor/pornotube.py index 1b5b9a320..1b5b9a320 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/yt_dlp/extractor/pornotube.py diff --git a/yt_dlp/extractor/pornovoisines.py b/yt_dlp/extractor/pornovoisines.py new file mode 100644 index 000000000..18459fc94 --- /dev/null +++ b/yt_dlp/extractor/pornovoisines.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, + unified_strdate, +) + + +class PornoVoisinesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' + + _TEST = { + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', + 'info_dict': { + 'id': '919', + 'display_id': 'recherche-appartement', + 'ext': 'mp4', + 'title': 'Recherche appartement', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140925', + 'duration': 120, + 'view_count': int, + 'average_rating': float, + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], + 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + # The webpage has a bug - there's no space between "thumb" and src= + thumbnail = self._html_search_regex( + r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') + + upload_date = unified_strdate(self._search_regex( + r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') + view_count = int_or_none(self._search_regex( + r'(\d+) vues', webpage, 'view count', fatal=False)) + average_rating = 
self._search_regex( + r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) + if average_rating: + average_rating = float_or_none(average_rating.replace(',', '.')) + + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) + if categories: + categories = [category.strip() for category in categories.split(',')] + + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'categories': categories, + 'age_limit': 18, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/pornoxo.py b/yt_dlp/extractor/pornoxo.py new file mode 100644 index 000000000..489dc2b25 --- /dev/null +++ b/yt_dlp/extractor/pornoxo.py @@ -0,0 +1,57 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + str_to_int, +) + + +class PornoXOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' + _TEST = { + 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', + 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', + 'info_dict': { + 'id': '7564', + 'ext': 'flv', + 'title': 'Striptease From Sexy Secretary!', + 'display_id': 'striptease-from-sexy-secretary', + 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', + 'categories': list, # NSFW + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, display_id = mobj.groups() + + webpage = self._download_webpage(url, video_id) + video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) + + title = self._html_search_regex( + r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') + + view_count = str_to_int(self._html_search_regex( + r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) + + categories_str = self._html_search_regex( + r'<meta name="description" content=".*featuring\s*([^"]+)"', + webpage, 'categories', fatal=False) + categories = ( + None if categories_str is None + else categories_str.split(',')) + + video_data.update({ + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'description': self._html_search_meta('description', webpage), + 'categories': categories, + 'view_count': view_count, + 'age_limit': 18, + }) + + return video_data diff --git a/yt_dlp/extractor/presstv.py b/yt_dlp/extractor/presstv.py new file mode 100644 index 000000000..bfb2eb71e --- /dev/null +++ b/yt_dlp/extractor/presstv.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import remove_start + + +class PressTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?' 
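+ # Note: the y/m/d groups double as the upload date; e.g. for the test URL below, + # /Detail/2016/04/09/459911/... yields upload_date '20160409' (see _real_extract).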
+ + _TEST = { + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', + 'info_dict': { + 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', + 'ext': 'mp4', + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + # extract video URL from webpage + video_url = self._hidden_inputs(webpage)['inpPlayback'] + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + _formats = [ + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') + ] + + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] + + # extract video metadata + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + upload_date = '%04d%02d%02d' % ( + int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), + ) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } diff --git a/yt_dlp/extractor/projectveritas.py b/yt_dlp/extractor/projectveritas.py new file mode 100644 index 000000000..1d832a679 --- /dev/null +++ b/yt_dlp/extractor/projectveritas.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_strdate, +) + + +class ProjectVeritasIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/', + 'info_dict': { + 'id': '51910aab-365a-5cf1-88f2-8eb1ca5fd3c6', + 'ext': 'mp4', + 'title': 'Exclusive: Inside The New York and New Jersey Hospitals Battling Coronavirus', + 'upload_date': '20200327', + 'thumbnail': 'md5:6076477fe50b03eb8708be9415e18e1c', + } + }, { + 'url': 'https://www.projectveritas.com/video/ilhan-omar-connected-ballot-harvester-in-cash-for-ballots-scheme-car-is-full/', + 'info_dict': { + 'id': 'c5aab304-a56b-54b1-9f0b-03b77bc5f2f6', + 'ext': 'mp4', + 'title': 'Ilhan Omar connected Ballot Harvester in cash-for-ballots scheme: "Car is full" of absentee ballots', + 'upload_date': '20200927', + 'thumbnail': 'md5:194b8edf0e2ba64f25500ff4378369a4', + } + }] + + def _real_extract(self, url): + id, type = self._match_valid_url(url).group('id', 'type') + api_url = f'https://www.projectveritas.com/page-data/{type}/{id}/page-data.json' + data_json = self._download_json(api_url, id)['result']['data'] + main_data = traverse_obj(data_json, 'video', 'post') + video_id = main_data['id'] + thumbnail = traverse_obj(main_data, ('image', 'ogImage', 'src')) + mux_asset = traverse_obj(main_data, + 'muxAsset', ('body', 'json', 'content', ..., 'data', 'target', 'fields', 
'muxAsset'), + get_all=False, expected_type=dict) + if not mux_asset: + raise ExtractorError('No video on the provided url.', expected=True) + playback_id = traverse_obj(mux_asset, 'playbackId', ('en-US', 'playbackId')) + formats = self._extract_m3u8_formats(f'https://stream.mux.com/{playback_id}.m3u8', video_id) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': main_data['title'], + 'upload_date': unified_strdate(main_data.get('date')), + 'thumbnail': thumbnail.replace('//', ''), + 'formats': formats, + } diff --git a/yt_dlp/extractor/prosiebensat1.py b/yt_dlp/extractor/prosiebensat1.py new file mode 100644 index 000000000..e89bbfd27 --- /dev/null +++ b/yt_dlp/extractor/prosiebensat1.py @@ -0,0 +1,500 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from hashlib import sha1 +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + merge_dicts, + unified_strdate, +) + + +class ProSiebenSat1BaseIE(InfoExtractor): + _GEO_BYPASS = False + _ACCESS_ID = None + _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' + _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' + + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if not self.get_param('allow_unplayable_formats') and video.get('is_protected') is True: + self.report_drm(clip_id) + + formats = [] + if self._ACCESS_ID: + raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID + protocols = self._download_json( + self._V4_BASE_URL + 'protocols', clip_id, + 'Downloading protocols JSON', + headers=self.geo_verification_headers(), query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'video_id': clip_id, + }, fatal=False, expected_status=(403,)) or {} + error = protocols.get('error') or {} + if error.get('title') == 'Geo check failed': + self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) + server_token = protocols.get('server_token') + if server_token: + urls = (self._download_json( + self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'protocols': self._SUPPORTED_PROTOCOLS, + 'server_token': server_token, + 'video_id': clip_id, + }, fatal=False) or {}).get('urls') or {} + for protocol, variant in urls.items(): + source_url = variant.get('clear', {}).get('url') + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + if not formats: + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 
'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + self._sort_formats(formats) + + return { + 'duration': float_or_none(video.get('duration')), + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): + IE_NAME = 'prosiebensat1' + IE_DESC = 'ProSiebenSat.1 Digital' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?:beta\.)? 
+ (?: + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia + )\.(?:de|at|ch)| + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video + ) + /(?P<id>.+) + ''' + + _TESTS = [ + { + # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 + # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction + 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', + 'info_dict': { + 'id': '2104602', + 'ext': 'mp4', + 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', + 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', + 'upload_date': '20131231', + 'duration': 5845.04, + 'series': 'CIRCUS HALLIGALLI', + 'season_number': 2, + 'episode': 'Episode 18 - Staffel 2', + 'episode_number': 18, + }, + }, + { + 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', + 'info_dict': { + 'id': '2570327', + 'ext': 'mp4', + 'title': 'Lady-Umstyling für Audrina', + 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', + 'upload_date': '20131014', + 'duration': 606.76, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Seems to be broken', + }, + { + 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', + 'info_dict': { + 'id': '2429369', + 'ext': 'mp4', + 'title': 'Countdown für die Autowerkstatt', + 'description': 'md5:809fc051a457b5d8666013bc40698817', + 'upload_date': '20140223', + 'duration': 2595.04, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', + 'info_dict': { + 'id': '2904997', + 'ext': 'mp4', + 'title': 'Sexy laufen in Ugg Boots', + 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', + 'upload_date': '20140122', + 'duration': 245.32, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', + 'info_dict': { + 'id': '2906572', + 'ext': 'mp4', + 'title': 'Im Interview: Kai Wiesinger', + 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', + 'upload_date': '20140203', + 'duration': 522.56, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', + 'info_dict': { + 'id': '2992323', + 'ext': 'mp4', + 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', + 'description': 'md5:2669cde3febe9bce13904f701e774eb6', + 'upload_date': '20141014', + 'duration': 2410.44, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', + 'info_dict': { + 'id': '3004256', + 'ext': 'mp4', + 'title': 'Schalke: Tönnies möchte Raul zurück', + 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', + 'upload_date': '20140226', + 'duration': 228.96, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This 
video is unavailable', + }, + { + 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', + 'info_dict': { + 'id': '2572814', + 'ext': 'mp4', + 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', + 'description': 'md5:6ddb02b0781c6adf778afea606652e38', + 'timestamp': 1382041620, + 'upload_date': '20131017', + 'duration': 469.88, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', + 'info_dict': { + 'id': '2156342', + 'ext': 'mp4', + 'title': 'Kurztrips zum Valentinstag', + 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', + 'duration': 307.24, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', + 'info_dict': { + 'id': '439664', + 'title': 'Episode 8 - Ganze Folge - Playlist', + 'description': 'md5:63b8963e71f481782aeea877658dec84', + }, + 'playlist_count': 2, + 'skip': 'This video is unavailable', + }, + { + # title in <h2 class="subtitle"> + 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', + 'info_dict': { + 'id': '4895826', + 'ext': 'mp4', + 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', + 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', + 'upload_date': '20170302', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'geo restricted to Germany', + }, + { + # geo restricted to Germany + 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', + 'only_matching': True, + }, + { + 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', + 'only_matching': True, + }, + { + 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', + 'only_matching': True, + }, + ] + + _TOKEN = 'prosieben' + _SALT = '01!8d8F_)r9]4s[qeuXfP%' + _CLIENT_NAME = 'kolibri-2.0.19-splec4' + + _ACCESS_ID = 'x_prosiebenmaxx-de' + _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' + _IV = 'Aeluchoc6aevechuipiexeeboowedaok' + + _CLIPID_REGEXES = [ + r'"clip_id"\s*:\s+"(\d+)"', + r'clipid: "(\d+)"', + r'clip[iI]d=(\d+)', + r'clip[iI][dD]\s*=\s*["\'](\d+)', + r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", + r'proMamsId"\s*:\s*"(\d+)', + r'proMamsId"\s*:\s*"(\d+)', + ] + _TITLE_REGEXES = [ + r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', + r'<header class="clearfix">\s*<h3>(.+?)</h3>', + r'<!-- start video -->\s*<h1>(.+?)</h1>', + r'<h1 class="att-name">\s*(.+?)</h1>', + r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', + r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', + r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', + r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', + ] + _DESCRIPTION_REGEXES = [ + r'<p itemprop="description">\s*(.+?)</p>', + r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', + r'<div class="g-plusone" 
data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', + r'<p class="att-description">\s*(.+?)\s*</p>', + r'<p class="video-description" itemprop="description">\s*(.+?)</p>', + r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', + ] + _UPLOAD_DATE_REGEXES = [ + r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', + r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', + r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', + r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', + ] + _PAGE_TYPE_REGEXES = [ + r'<meta name="page_type" content="([^"]+)">', + r"'itemType'\s*:\s*'([^']*)'", + ] + _PLAYLIST_ID_REGEXES = [ + r'content[iI]d=(\d+)', + r"'itemId'\s*:\s*'([^']*)'", + ] + _PLAYLIST_CLIP_REGEXES = [ + r'(?s)data-qvt=.+?<a href="([^"]+)"', + ] + + def _extract_clip(self, url, webpage): + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title', + default=None) or self._og_search_title(webpage) + info = self._extract_video_info(url, clip_id) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate( + self._html_search_meta('og:published_time', webpage, + 'upload date', default=None) + or self._html_search_regex(self._UPLOAD_DATE_REGEXES, + webpage, 'upload date', default=None)) + + json_ld = self._search_json_ld(webpage, clip_id, default={}) + + return merge_dicts(info, { + 'id': clip_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + }, json_ld) + + def _extract_playlist(self, url, webpage): + playlist_id = self._html_search_regex( + self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') + playlist = self._parse_json( + self._search_regex( + r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', + webpage, 'playlist'), + playlist_id) + entries = [] + for item in playlist: + clip_id = item.get('id') or item.get('upc') + if not clip_id: + continue + info = self._extract_video_info(url, clip_id) + info.update({ + 'id': clip_id, + 'title': item.get('title') or item.get('teaser', {}).get('headline'), + 'description': item.get('teaser', {}).get('description'), + 'thumbnail': item.get('poster'), + 'duration': float_or_none(item.get('duration')), + 'series': item.get('tvShowTitle'), + 'uploader': item.get('broadcastPublisher'), + }) + entries.append(info) + return self.playlist_result(entries, playlist_id) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_type = self._search_regex( + self._PAGE_TYPE_REGEXES, webpage, + 'page type', default='clip').lower() + if page_type == 'clip': + return self._extract_clip(url, webpage) + elif page_type == 'playlist': + return self._extract_playlist(url, webpage) + else: + raise ExtractorError( + 'Unsupported page type %s' % page_type, expected=True) diff --git a/youtube_dl/extractor/puhutv.py b/yt_dlp/extractor/puhutv.py index ca71665e0..ca71665e0 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/yt_dlp/extractor/puhutv.py diff --git a/youtube_dl/extractor/puls4.py b/yt_dlp/extractor/puls4.py index 80091b85f..80091b85f 100644 --- a/youtube_dl/extractor/puls4.py +++ b/yt_dlp/extractor/puls4.py diff --git a/yt_dlp/extractor/pyvideo.py 
b/yt_dlp/extractor/pyvideo.py new file mode 100644 index 000000000..869619723 --- /dev/null +++ b/yt_dlp/extractor/pyvideo.py @@ -0,0 +1,72 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class PyvideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' + + _TESTS = [{ + 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', + }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', + }, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + category = mobj.group('category') + video_id = mobj.group('id') + + entries = [] + + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) + + if data: + for video in data['videos']: + video_url = video.get('url') + if video_url: + if video.get('type') == 'youtube': + entries.append(self.url_result(video_url, 'Youtube')) + else: + entries.append({ + 'id': compat_str(data.get('id') or video_id), + 'url': video_url, + 'title': data['title'], + 'description': data.get('description') or data.get('summary'), + 'thumbnail': data.get('thumbnail_url'), + 'duration': int_or_none(data.get('duration')), + }) + else: + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + media_urls = self._search_regex( + r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') + for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) + + return self.playlist_result(entries, video_id) diff --git a/yt_dlp/extractor/qqmusic.py b/yt_dlp/extractor/qqmusic.py new file mode 100644 index 000000000..0106d166f --- /dev/null +++ b/yt_dlp/extractor/qqmusic.py @@ -0,0 +1,369 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re +import time + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + strip_jsonp, + unescapeHTML, +) + + +class QQMusicIE(InfoExtractor): + IE_NAME = 'qqmusic' + IE_DESC = 'QQ音乐' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', + 'info_dict': { + 'id': '004295Et37taLD', + 'ext': 'mp3', + 'title': '可惜没如果', + 'release_date': '20141227', + 'creator': '林俊杰', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'note': 'There is no mp3-320 version of this song.', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', + 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'info_dict': { + 'id': '004MsGEo3DdNxV', + 'ext': 'mp3', + 'title': '如果', + 'release_date': '20050626', + 'creator': '李季美', + 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'note': 
'lyrics not in .lrc format',
+        'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
+        'info_dict': {
+            'id': '001JyApY11tIp6',
+            'ext': 'mp3',
+            'title': 'Shadows Over Transylvania',
+            'release_date': '19970225',
+            'creator': 'Dark Funeral',
+            'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    _FORMATS = {
+        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+    }
+
+    # Reference: m_r_GetRUin() in top_player.js
+    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
+    @staticmethod
+    def m_r_get_ruin():
+        curMs = int(time.time() * 1000) % 1000
+        return int(round(random.random() * 2147483647) * curMs % 1E10)
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        detail_info_page = self._download_webpage(
+            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
+            mid, note='Download song detail info',
+            errnote='Unable to get song detail info', encoding='gbk')
+
+        song_name = self._html_search_regex(
+            r"songname:\s*'([^']+)'", detail_info_page, 'song name')
+
+        publish_time = self._html_search_regex(
+            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page,
+            'publish time', default=None)
+        if publish_time:
+            publish_time = publish_time.replace('-', '')
+
+        singer = self._html_search_regex(
+            r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None)
+
+        lrc_content = self._html_search_regex(
+            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
+            detail_info_page, 'LRC lyrics', default=None)
+        if lrc_content:
+            lrc_content = lrc_content.replace('\\n', '\n')
+
+        thumbnail_url = None
+        albummid = self._search_regex(
+            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
+            detail_info_page, 'album mid', default=None)
+        if albummid:
+            thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
+                % (albummid[-2:-1], albummid[-1], albummid)
+
+        guid = self.m_r_get_ruin()
+
+        vkey = self._download_json(
+            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
+            mid, note='Retrieve vkey', errnote='Unable to get vkey',
+            transform_source=strip_jsonp)['key']
+
+        formats = []
+        for format_id, details in self._FORMATS.items():
+            formats.append({
+                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                       % (details['prefix'], mid, details['ext'], vkey, guid),
+                'format': format_id,
+                'format_id': format_id,
+                'quality': details['preference'],
+                'abr': details.get('abr'),
+            })
+        self._check_formats(formats, mid)
+        self._sort_formats(formats)
+
+        # lyrics may be missing or not in LRC format; keep only timestamped lines
+        actual_lrc_lyrics = ''.join(
+            line + '\n' for line in re.findall(
+                r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])',
+                lrc_content)) if lrc_content else None
+
+        info_dict = {
+            'id': mid,
+            'formats': formats,
+            'title': song_name,
+            'release_date': publish_time,
+            'creator': singer,
+            'description': lrc_content,
+            'thumbnail': thumbnail_url
+        }
+        if actual_lrc_lyrics:
+            info_dict['subtitles'] = {
+                'origin': [{
+                    'ext': 'lrc',
+                    'data': actual_lrc_lyrics,
+                }]
+            }
+        return info_dict
+
+
+class QQPlaylistBaseIE(InfoExtractor):
+    @staticmethod
+    def qq_static_url(category, mid):
+        return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
+
+    def get_singer_all_songs(self, singmid, num):
+        return self._download_webpage(
+            'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid,
+            query={
+                'format': 'json',
+                'inCharset': 'utf8',
+                'outCharset': 'utf-8',
+                'platform': 'yqq',
+                'needNewCode': 0,
+                'singermid': singmid,
+                'order': 'listen',
+                'begin': 0,
+                'num': num,
+                'songstatus': 1,
+            })
+
+    def get_entries_from_page(self, singmid):
+        entries = []
+
+        default_num = 1
+        json_text = self.get_singer_all_songs(singmid, default_num)
+        json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        if json_obj_all_songs['code'] == 0:
+            total = json_obj_all_songs['data']['total']
+            json_text = self.get_singer_all_songs(singmid, total)
+            json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        for item in json_obj_all_songs['data']['list']:
+            if item['musicData'].get('songmid') is not None:
+                songmid = item['musicData']['songmid']
+                entries.append(self.url_result(
+                    'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))
+
+        return entries
+
+
+class QQMusicSingerIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:singer'
+    IE_DESC = 'QQ音乐 - 歌手'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'
+    _TEST = {
+        'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
+        'info_dict': {
+            'id': '001BLpXF2DyJe2',
+            'title': '林俊杰',
+            'description': 'md5:870ec08f7d8547c29c93010899103751',
+        },
+        'playlist_mincount': 12,
+    }
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        entries = self.get_entries_from_page(mid)
+        singer_page = self._download_webpage(url, mid, 'Download singer page')
+        singer_name = self._html_search_regex(
+            r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
+        singer_desc = None
+
+        if mid:
+            singer_desc_page = self._download_xml(
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+                'Download singer description XML',
+                query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+                headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
+
+            singer_desc = singer_desc_page.find('./data/info/desc').text
+
+        return self.playlist_result(entries, mid, singer_name, singer_desc)
+
+
+class QQMusicAlbumIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:album'
+    IE_DESC = 'QQ音乐 - 专辑'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'
+
+    _TESTS = [{
+        'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',
+        'info_dict': {
+            'id': '000gXCTb2AhRR1',
+            'title': '我们都是这样长大的',
+            'description': 'md5:179c5dce203a5931970d306aa9607ea6',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',
+        'info_dict': {
+            'id': '002Y5a3b3AlCu3',
+            'title': '그리고...',
+            'description': 'md5:a48823755615508a95080e81b51ba729',
+        },
+        'playlist_count': 8,
+    }]
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        album = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid,
+            mid, 'Download album page')['data']
+
+        entries = [
+            self.url_result(
+                'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']
+            ) for song in album['list']
+        ]
+        album_name = album.get('name')
+        album_detail = album.get('desc')
+        if album_detail is not None:
+            album_detail = album_detail.strip()
+
+        return self.playlist_result(entries, mid, album_name, album_detail)
+
+
+class QQMusicToplistIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:toplist'
+    IE_DESC = 'QQ音乐 - 排行榜'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'
+
+    _TESTS = [{
+        'url': 
'https://y.qq.com/n/yqq/toplist/123.html', + 'info_dict': { + 'id': '123', + 'title': '美国iTunes榜', + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', + 'info_dict': { + 'id': '3', + 'title': '巅峰榜·欧美', + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', + 'info_dict': { + 'id': '106', + 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + toplist_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) + + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] + + topinfo = toplist_json.get('topinfo', {}) + list_name = topinfo.get('ListName') + list_description = topinfo.get('info') + return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + IE_DESC = 'QQ音乐 - 歌单' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + 'skip': 'playlist gone', + }, { + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', + 'info_dict': { + 'id': '1374105607', + 'title': '易入人心的华语民谣', + 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, + transform_source=strip_jsonp) + if not len(list_json.get('cdlist', [])): + if list_json.get('code'): + raise ExtractorError( + 'QQ Music said: error %d in fetching playlist info' % list_json['code'], + expected=True) + raise ExtractorError('Unable to get playlist info') + + cdlist = list_json['cdlist'][0] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] + + list_name = cdlist.get('dissname') + list_description = clean_html(unescapeHTML(cdlist.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/r7.py b/yt_dlp/extractor/r7.py index e2202d603..e2202d603 100644 --- a/youtube_dl/extractor/r7.py +++ b/yt_dlp/extractor/r7.py diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py new file mode 100644 index 000000000..1e60de153 --- /dev/null +++ b/yt_dlp/extractor/radiko.py @@ -0,0 +1,234 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import base64 +import calendar +import datetime + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + update_url_query, + clean_html, + unified_timestamp, +) +from ..compat import compat_urllib_parse + + +class RadikoBaseIE(InfoExtractor): + _FULL_KEY 
= None
+
+    def _auth_client(self):
+        auth_cache = self._downloader.cache.load('radiko', 'auth_data')
+        if auth_cache:
+            return auth_cache
+
+        _, auth1_handle = self._download_webpage_handle(
+            'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page',
+            headers={
+                'x-radiko-app': 'pc_html5',
+                'x-radiko-app-version': '0.0.1',
+                'x-radiko-device': 'pc',
+                'x-radiko-user': 'dummy_user',
+            })
+        auth1_header = auth1_handle.info()
+
+        auth_token = auth1_header['X-Radiko-AuthToken']
+        kl = int(auth1_header['X-Radiko-KeyLength'])
+        ko = int(auth1_header['X-Radiko-KeyOffset'])
+        raw_partial_key = self._extract_full_key()[ko:ko + kl]
+        partial_key = base64.b64encode(raw_partial_key).decode()
+
+        area_id = self._download_webpage(
+            'https://radiko.jp/v2/api/auth2', None, 'Authenticating',
+            headers={
+                'x-radiko-device': 'pc',
+                'x-radiko-user': 'dummy_user',
+                'x-radiko-authtoken': auth_token,
+                'x-radiko-partialkey': partial_key,
+            }).split(',')[0]
+
+        auth_data = (auth_token, area_id)
+        self._downloader.cache.store('radiko', 'auth_data', auth_data)
+        return auth_data
+
+    def _extract_full_key(self):
+        if self._FULL_KEY:
+            return self._FULL_KEY
+
+        jscode = self._download_webpage(
+            'https://radiko.jp/apps/js/playerCommon.js', None,
+            note='Downloading player js code')
+        full_key = self._search_regex(
+            (r"RadikoJSPlayer\([^,]*,\s*(['\"])pc_html5\1,\s*(['\"])(?P<fullkey>[0-9a-f]+)\2,\s*{"),
+            jscode, 'full key', fatal=False, group='fullkey')
+
+        if full_key:
+            full_key = full_key.encode()
+        else:  # fall back to the last known full key
+            full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa'
+
+        self._FULL_KEY = full_key
+        return full_key
+
+    def _find_program(self, video_id, station, cursor):
+        station_program = self._download_xml(
+            'https://radiko.jp/v3/program/station/weekly/%s.xml' % station, video_id,
+            note='Downloading radio program for %s station' % station)
+
+        prog = None
+        for p in station_program.findall('.//prog'):
+            ft_str, to_str = p.attrib['ft'], p.attrib['to']
+            ft = unified_timestamp(ft_str, False)
+            to = unified_timestamp(to_str, False)
+            if ft <= cursor < to:
+                prog = p
+                break
+        if not prog:
+            raise ExtractorError('Cannot identify radio program to download!')
+        assert ft and to  # both timestamps must have parsed
+        return prog, station_program, ft, ft_str, to_str
+
+    def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, area_id, query):
+        m3u8_playlist_data = self._download_xml(
+            'https://radiko.jp/v3/station/stream/pc_html5/%s.xml' % station, video_id,
+            note='Downloading m3u8 information')
+        m3u8_urls = m3u8_playlist_data.findall('.//url')
+
+        formats = []
+        found = set()
+        for url_tag in m3u8_urls:
+            pcu = url_tag.find('playlist_create_url')
+            url_attrib = url_tag.attrib
+            playlist_url = update_url_query(pcu.text, {
+                'station_id': station,
+                **query,
+                'l': '15',
+                'lsid': '77d0678df93a1034659c14d6fc89f018',
+                'type': 'b',
+            })
+            if playlist_url in found:
+                continue
+            found.add(playlist_url)
+
+            time_to_skip = None if is_onair else cursor - ft
+
+            subformats = self._extract_m3u8_formats(
+                playlist_url, video_id, ext='m4a',
+                live=True, fatal=False, m3u8_id=None,
+                headers={
+                    'X-Radiko-AreaId': area_id,
+                    'X-Radiko-AuthToken': auth_token,
+                })
+            for sf in subformats:
+                domain = sf['format_id'] = compat_urllib_parse.urlparse(sf['url']).netloc
+                if re.match(r'^[cf]-radiko\.smartstream\.ne\.jp$', domain):
+                    # Prioritize live radio vs playback based on extractor
+                    sf['preference'] = 100 if is_onair else -100
+                if not is_onair and url_attrib['timefree'] == '1' and 
time_to_skip: + sf['_ffmpeg_args'] = ['-ss', time_to_skip] + formats.extend(subformats) + + self._sort_formats(formats) + return formats + + +class RadikoIE(RadikoBaseIE): + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<id>\d+)' + + _TESTS = [{ + # QRR (文化放送) station provides <desc> + 'url': 'https://radiko.jp/#!/ts/QRR/20210425101300', + 'only_matching': True, + }, { + # FMT (TOKYO FM) station does not provide <desc> + 'url': 'https://radiko.jp/#!/ts/FMT/20210810150000', + 'only_matching': True, + }, { + 'url': 'https://radiko.jp/#!/ts/JOAK-FM/20210509090000', + 'only_matching': True, + }] + + def _real_extract(self, url): + station, video_id = self._match_valid_url(url).groups() + vid_int = unified_timestamp(video_id, False) + + auth_token, area_id = self._auth_client() + + prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) + + title = prog.find('title').text + description = clean_html(prog.find('info').text) + station_name = station_program.find('.//name').text + + formats = self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': station_name, + 'uploader_id': station, + 'timestamp': vid_int, + 'formats': formats, + 'is_live': True, + } + + +class RadikoRadioIE(RadikoBaseIE): + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/live/(?P<id>[A-Z0-9-]+)' + + _TESTS = [{ + # QRR (文化放送) station provides <desc> + 'url': 'https://radiko.jp/#!/live/QRR', + 'only_matching': True, + }, { + # FMT (TOKYO FM) station does not provide <desc> + 'url': 'https://radiko.jp/#!/live/FMT', + 'only_matching': True, + }, { + 'url': 'https://radiko.jp/#!/live/JOAK-FM', + 'only_matching': True, + }] + + def _real_extract(self, url): + station = self._match_id(url) + self.report_warning('Downloader will not stop at the end of the program! 
Press Ctrl+C to stop') + + auth_token, area_id = self._auth_client() + # get current time in JST (GMT+9:00 w/o DST) + vid_now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9))) + vid_now = calendar.timegm(vid_now.timetuple()) + + prog, station_program, ft, _, _ = self._find_program(station, station, vid_now) + + title = prog.find('title').text + description = clean_html(prog.find('info').text) + station_name = station_program.find('.//name').text + + formats = self._extract_formats( + video_id=station, station=station, is_onair=True, + ft=ft, cursor=vid_now, auth_token=auth_token, area_id=area_id, + query={}) + + return { + 'id': station, + 'title': title, + 'description': description, + 'uploader': station_name, + 'uploader_id': station, + 'timestamp': ft, + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/radiobremen.py b/yt_dlp/extractor/radiobremen.py index 2c35f9845..2c35f9845 100644 --- a/youtube_dl/extractor/radiobremen.py +++ b/yt_dlp/extractor/radiobremen.py diff --git a/yt_dlp/extractor/radiocanada.py b/yt_dlp/extractor/radiocanada.py new file mode 100644 index 000000000..4b4445c30 --- /dev/null +++ b/yt_dlp/extractor/radiocanada.py @@ -0,0 +1,170 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class RadioCanadaIE(InfoExtractor): + IE_NAME = 'radiocanada' + _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, + } + ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None + + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ + 'appCode': app_code, + 'idMedia': video_id, + }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): + data = self._parse_json(e.cause.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] + 
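+        # 'Metas' is a list of {'name': ..., 'text': ...} entries; get_meta()
+        # below returns the first non-empty text for a given name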
+
+        def get_meta(name):
+            for meta in metas:
+                if meta.get('name') == name:
+                    text = meta.get('text')
+                    if text:
+                        return text
+
+        # protectionType does not necessarily mean the video is DRM protected (see
+        # https://github.com/ytdl-org/youtube-dl/pull/18609).
+        if get_meta('protectionType'):
+            self.report_warning('This video is probably DRM protected.')
+
+        query = {
+            'connectionType': 'hd',
+            'deviceType': 'ipad',
+            'multibitrate': 'true',
+        }
+        if self._claims:
+            query['claims'] = self._claims
+        v_data = self._call_api('validation/v2/', video_id, app_code, query)
+        v_url = v_data.get('url')
+        if not v_url:
+            error = v_data['message']
+            if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
+                self.raise_geo_restricted(error, self._GEO_COUNTRIES)
+            if error == 'Le contenu sélectionné est disponible seulement en premium':
+                self.raise_login_required(error)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5')
+        if closed_caption_url:
+            subtitles['fr'] = [{
+                'url': closed_caption_url,
+                'ext': determine_ext(closed_caption_url, 'vtt'),
+            }]
+
+        return {
+            'id': video_id,
+            'title': get_meta('Title') or get_meta('AV-nomEmission'),
+            'description': get_meta('Description') or get_meta('ShortDescription'),
+            'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
+            'duration': int_or_none(get_meta('length')),
+            'series': get_meta('Emission'),
+            'season_number': int_or_none(get_meta('SrcSaison')),
+            'episode_number': int_or_none(get_meta('SrcEpisode')),
+            'upload_date': unified_strdate(get_meta('Date')),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_info(*self._match_valid_url(url).groups())
+
+
+class RadioCanadaAudioVideoIE(InfoExtractor):
+    IE_NAME = 'radiocanada:audiovideo'
+    _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
+        'info_dict': {
+            'id': '7527184',
+            'ext': 'mp4',
+            'title': 'Barack Obama au Vietnam',
+            'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+            'upload_date': '20160523',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
diff --git a/youtube_dl/extractor/radiode.py b/yt_dlp/extractor/radiode.py
index 2c06c8b1e..2c06c8b1e 100644
--- a/youtube_dl/extractor/radiode.py
+++ b/yt_dlp/extractor/radiode.py
diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
new file mode 100644
index 000000000..082238bbc
--- /dev/null
+++ b/yt_dlp/extractor/radiofrance.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RadioFranceIE(InfoExtractor):
+    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
+    IE_NAME = 'radiofrance'
+
+    _TEST = {
+        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
+        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+        'info_dict': {
+            'id': 'one-one',
+            'ext': 'ogg',
+            'title': 'One to 
one', + 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + 'uploader': 'Thomas Hercouët', + }, + } + + def _real_extract(self, url): + m = self._match_valid_url(url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') + description = self._html_search_regex( + r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', + webpage, 'description', fatal=False) + uploader = self._html_search_regex( + r'<div class="credit"> © (.*?)</div>', + webpage, 'uploader', fatal=False) + + formats_str = self._html_search_regex( + r'class="jp-jplayer[^"]*" data-source="([^"]+)">', + webpage, 'audio URLs') + formats = [ + { + 'format_id': fm[0], + 'url': fm[1], + 'vcodec': 'none', + 'quality': i, + } + for i, fm in + enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + } diff --git a/youtube_dl/extractor/radiojavan.py b/yt_dlp/extractor/radiojavan.py index 3f74f0c01..3f74f0c01 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/yt_dlp/extractor/radiojavan.py diff --git a/yt_dlp/extractor/radlive.py b/yt_dlp/extractor/radlive.py new file mode 100644 index 000000000..2de7ab04a --- /dev/null +++ b/yt_dlp/extractor/radlive.py @@ -0,0 +1,179 @@ +import json + +from ..utils import ExtractorError, traverse_obj, try_get, unified_timestamp +from .common import InfoExtractor + + +class RadLiveIE(InfoExtractor): + IE_NAME = 'radlive' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/(?P<content_type>feature|episode)/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/feature/dc5acfbc-761b-4bec-9564-df999905116a', + 'md5': '6219d5d31d52de87d21c9cf5b7cb27ff', + 'info_dict': { + 'id': 'dc5acfbc-761b-4bec-9564-df999905116a', + 'ext': 'mp4', + 'title': 'Deathpact - Digital Mirage 2 [Full Set]', + 'language': 'en', + 'thumbnail': 'https://static.12core.net/cb65ae077a079c68380e38f387fbc438.png', + 'description': '', + 'release_timestamp': 1600185600.0, + 'channel': 'Proximity', + 'channel_id': '9ce6dd01-70a4-4d59-afb6-d01f807cd009', + 'channel_url': 'https://rad.live/content/channel/9ce6dd01-70a4-4d59-afb6-d01f807cd009', + } + }, { + 'url': 'https://rad.live/content/episode/bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf', + 'md5': '40b2175f347592125d93e9a344080125', + 'info_dict': { + 'id': 'bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf', + 'ext': 'mp4', + 'title': 'E01: Bad Jokes 1', + 'language': 'en', + 'thumbnail': 'https://lsp.littlstar.com/channels/WHISTLE/BAD_JOKES/SEASON_1/BAD_JOKES_101/poster.jpg', + 'description': 'Bad Jokes - Champions, Adam Pally, Super Troopers, Team Edge and 2Hype', + 'release_timestamp': None, + 'channel': None, + 'channel_id': None, + 'channel_url': None, + 'episode': 'E01: Bad Jokes 1', + 'episode_number': 1, + 'episode_id': '336', + }, + }] + + def _real_extract(self, url): + content_type, video_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, video_id) + + content_info = json.loads(self._search_regex( + r'<script[^>]*type=([\'"])application/json\1[^>]*>(?P<json>{.+?})</script>', + webpage, 'video info', group='json'))['props']['pageProps']['initialContentData'] + 
video_info = content_info[content_type] + + if not video_info: + raise ExtractorError('Unable to extract video info, make sure the URL is valid') + + formats = self._extract_m3u8_formats(video_info['assets']['videos'][0]['url'], video_id) + self._sort_formats(formats) + + data = video_info.get('structured_data', {}) + + release_date = unified_timestamp(traverse_obj(data, ('releasedEvent', 'startDate'))) + channel = next(iter(content_info.get('channels', [])), {}) + channel_id = channel.get('lrn', '').split(':')[-1] or None + + result = { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'language': traverse_obj(data, ('potentialAction', 'target', 'inLanguage')), + 'thumbnail': traverse_obj(data, ('image', 'contentUrl')), + 'description': data.get('description'), + 'release_timestamp': release_date, + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': f'https://rad.live/content/channel/{channel_id}' if channel_id else None, + + } + if content_type == 'episode': + result.update({ + # TODO: Get season number when downloading single episode + 'episode': video_info.get('title'), + 'episode_number': video_info.get('number'), + 'episode_id': video_info.get('id'), + }) + + return result + + +class RadLiveSeasonIE(RadLiveIE): + IE_NAME = 'radlive:season' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/season/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/season/08a290f7-c9ef-4e22-9105-c255995a2e75', + 'md5': '40b2175f347592125d93e9a344080125', + 'info_dict': { + 'id': '08a290f7-c9ef-4e22-9105-c255995a2e75', + 'title': 'Bad Jokes - Season 1', + }, + 'playlist_mincount': 5, + }] + + @classmethod + def suitable(cls, url): + return False if RadLiveIE.suitable(url) else super(RadLiveSeasonIE, cls).suitable(url) + + def _real_extract(self, url): + season_id = self._match_id(url) + webpage = self._download_webpage(url, season_id) + + content_info = json.loads(self._search_regex( + r'<script[^>]*type=([\'"])application/json\1[^>]*>(?P<json>{.+?})</script>', + webpage, 'video info', group='json'))['props']['pageProps']['initialContentData'] + video_info = content_info['season'] + + entries = [{ + '_type': 'url_transparent', + 'id': episode['structured_data']['url'].split('/')[-1], + 'url': episode['structured_data']['url'], + 'series': try_get(content_info, lambda x: x['series']['title']), + 'season': video_info['title'], + 'season_number': video_info.get('number'), + 'season_id': video_info.get('id'), + 'ie_key': RadLiveIE.ie_key(), + } for episode in video_info['episodes']] + + return self.playlist_result(entries, season_id, video_info.get('title')) + + +class RadLiveChannelIE(RadLiveIE): + IE_NAME = 'radlive:channel' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/channel/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/channel/5c4d8df4-6fa0-413c-81e3-873479b49274', + 'md5': '625156a08b7f2b0b849f234e664457ac', + 'info_dict': { + 'id': '5c4d8df4-6fa0-413c-81e3-873479b49274', + 'title': 'Whistle Sports', + }, + 'playlist_mincount': 7, + }] + + _QUERY = ''' +query WebChannelListing ($lrn: ID!) 
{ + channel (id:$lrn) { + name + features { + structured_data + } + } +}''' + + @classmethod + def suitable(cls, url): + return False if RadLiveIE.suitable(url) else super(RadLiveChannelIE, cls).suitable(url) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + graphql = self._download_json( + 'https://content.mhq.12core.net/graphql', channel_id, + headers={'Content-Type': 'application/json'}, + data=json.dumps({ + 'query': self._QUERY, + 'variables': {'lrn': f'lrn:12core:media:content:channel:{channel_id}'} + }).encode('utf-8')) + + data = traverse_obj(graphql, ('data', 'channel')) + if not data: + raise ExtractorError('Unable to extract video info, make sure the URL is valid') + + entries = [{ + '_type': 'url_transparent', + 'url': feature['structured_data']['url'], + 'ie_key': RadLiveIE.ie_key(), + } for feature in data['features']] + + return self.playlist_result(entries, channel_id, data.get('name')) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py new file mode 100644 index 000000000..27cd01801 --- /dev/null +++ b/yt_dlp/extractor/rai.py @@ -0,0 +1,587 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + determine_ext, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + GeoRestrictedError, + HEADRequest, + int_or_none, + parse_duration, + remove_start, + strip_or_none, + try_get, + unified_strdate, + unified_timestamp, + update_url_query, + urljoin, + xpath_text, +) + + +class RaiBaseIE(InfoExtractor): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + + formats = [] + geoprotection = None + is_live = None + duration = None + + for platform in ('mon', 'flash', 'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. 
+            # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
+            if '/video_no_available.mp4' in media_url:
+                continue
+
+            ext = determine_ext(media_url)
+            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
+                continue
+
+            if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m' or platform == 'flash':
+                manifest_url = update_url_query(
+                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
+                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                # the bitrate element may be missing; treat that as 0
+                bitrate = int_or_none(xpath_text(relinker, 'bitrate')) or 0
+                formats.append({
+                    'url': media_url,
+                    'tbr': bitrate if bitrate > 0 else None,
+                    'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
+                })
+
+        if not formats and geoprotection is True:
+            self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+
+        formats.extend(self._create_http_urls(relinker_url, formats))
+
+        return dict((k, v) for k, v in {
+            'is_live': is_live,
+            'duration': duration,
+            'formats': formats,
+        }.items() if v is not None)
+
+    def _create_http_urls(self, relinker_url, fmts):
+        _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+        _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
+        _QUALITY = {
+            # tbr: w, h
+            '250': [352, 198],
+            '400': [512, 288],
+            '700': [512, 288],
+            '800': [700, 394],
+            '1200': [736, 414],
+            '1800': [1024, 576],
+            '2400': [1280, 720],
+            '3200': [1440, 810],
+            '3600': [1440, 810],
+            '5000': [1920, 1080],
+            '10000': [1920, 1080],
+        }
+
+        def test_url(url):
+            resp = self._request_webpage(
+                HEADRequest(url), None, headers={'User-Agent': 'Rai'},
+                fatal=False, errnote=False, note=False)
+
+            if resp is False:
+                return False
+
+            if resp.code == 200:
+                return False if resp.url == url else resp.url
+            return None
+
+        def get_format_info(tbr):
+            import math
+            br = int_or_none(tbr)
+            if len(fmts) == 1 and not br:
+                br = fmts[0].get('tbr')
+            if not br:
+                # unknown bitrate (e.g. the '*' wildcard): assume the lowest quality
+                br = 250
+            if br > 300:
+                tbr = compat_str(math.floor(br / 100) * 100)
+            else:
+                tbr = '250'
+
+            # try extracting info from available m3u8 formats
+            format_copy = None
+            for f in fmts:
+                if f.get('tbr'):
+                    br_limit = math.floor(br / 100)
+                    if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1:
+                        format_copy = f.copy()
+            return {
+                'width': format_copy.get('width'),
+                'height': format_copy.get('height'),
+                'tbr': format_copy.get('tbr'),
+                'vcodec': format_copy.get('vcodec'),
+                'acodec': format_copy.get('acodec'),
+                'fps': format_copy.get('fps'),
+                'format_id': 'https-%s' % tbr,
+            } if format_copy else {
+                'width': _QUALITY[tbr][0],
+                'height': _QUALITY[tbr][1],
+                'format_id': 'https-%s' % tbr,
+                'tbr': int(tbr),
+            }
+
+        loc = test_url(_MP4_TMPL % (relinker_url, '*'))
+        if not isinstance(loc, compat_str):
+            return []
+
+        mobj = re.match(
+            _RELINKER_REG,
+            test_url(relinker_url) or '')
+        if not mobj:
+            return []
+
+        available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*']
+        available_qualities = [i for i in available_qualities if i]
+
+        formats = []
+        for q in available_qualities:
+            fmt = {
+                'url': _MP4_TMPL % (relinker_url, q),
+                'protocol': 'https',
+                'ext': 'mp4',
+            }
+            fmt.update(get_format_info(q))
+            formats.append(fmt)
+        return 
formats + + @staticmethod + def _extract_subtitles(url, video_data): + STL_EXT = 'stl' + SRT_EXT = 'srt' + subtitles = {} + subtitles_array = video_data.get('subtitlesArray') or [] + for k in ('subtitles', 'subtitlesUrl'): + subtitles_array.append({'url': video_data.get(k)}) + for subtitle in subtitles_array: + sub_url = subtitle.get('url') + if sub_url and isinstance(sub_url, compat_str): + sub_lang = subtitle.get('language') or 'it' + sub_url = urljoin(url, sub_url) + sub_ext = determine_ext(sub_url, SRT_EXT) + subtitles.setdefault(sub_lang, []).append({ + 'ext': sub_ext, + 'url': sub_url, + }) + if STL_EXT == sub_ext: + subtitles[sub_lang].append({ + 'ext': SRT_EXT, + 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, + }) + return subtitles + + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai Gulp', + 'duration': 6160, + 'series': 'Report', + 'season': '2013/14', + 'subtitles': { + 'it': 'count:2', + }, + }, + 'params': { + 'skip_download': True, + }, + }, { + # 1080p direct mp4 url + 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html', + 'md5': '2e501e8651d72f05ffe8f5d286ad560b', + 'info_dict': { + 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642', + 'ext': 'mp4', + 'title': 'Leonardo - S1E1', + 'alt_title': 'St 1 Ep 1 - Episodio 1', + 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 1', + 'duration': 3229, + 'series': 'Leonardo', + 'season': 'Season 1', + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, + }, { + # subtitles at 'subtitlesArray' key (see #27698) + 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', + 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + base, video_id = self._match_valid_url(url).groups() + + media = self._download_json( + base + '.json', video_id, 'Downloading video JSON') + + if not self.get_param('allow_unplayable_formats'): + if try_get( + media, + (lambda x: x['rights_management']['rights']['drm'], + lambda x: x['program_info']['rights_management']['rights']['drm']), + dict): + self.report_drm(video_id) + + title = media['name'] + video = media['video'] + + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) + + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published + + subtitles = self._extract_subtitles(url, video) 
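+        # 'season' can be either a number ('1') or a label like '2013/14';
+        # only the numeric form is mapped to season_number below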
+ + program_info = media.get('program_info') or {} + season = media.get('season') + + info = { + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'alt_title': strip_or_none(media.get('subtitle')), + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor') or None), + 'duration': parse_duration(video.get('duration')), + 'timestamp': unified_timestamp(date_published), + 'thumbnails': thumbnails, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), + 'subtitles': subtitles, + } + + info.update(relinker_info) + return info + + +class RaiPlayLiveIE(RaiPlayIE): + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', + 'uploader': 'Rai News 24', + 'creator': 'Rai News 24', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }] + + +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + base, playlist_id = self._match_valid_url(url).groups() + + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') + + entries = [] + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _TESTS = [{ + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." 
+ 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1758, + 'upload_date': '20140612', + }, + 'skip': 'This content is available only in Italy', + }, { + # with ContentItem in many metas + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103', + } + }, { + # with ContentItem in og:url + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '06345bd97c932f19ffb129973d07a020', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, + 'upload_date': '20161103', + } + }, { + # initEdizione('ContentItem-...' + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HLS live stream with ContentItem in og:url + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, + }, { + 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', + 'only_matching': True, + }] + + def _extract_from_content_id(self, content_id, url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + title = media['name'].strip() + + media_type = media['type'] + if 'Audio' in media_type: + relinker_info = { + 'formats': [{ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }] + } + elif 'Video' in media_type: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(url, thumbnail_url), + }) + + subtitles = self._extract_subtitles(url, media) + + info = { + 'id': content_id, + 'title': title, + 'description': strip_or_none(media.get('desc')), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 
'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'subtitles': subtitles, + } + + info.update(relinker_info) + + return info + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + content_item_id = None + + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) + + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)=| + <iframe[^>]+\bsrc= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') + + content_item_ids = set() + if content_item_id: + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._proto_relative_url(self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? + (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url')) + + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + info = { + 'id': video_id, + 'title': title, + } + + info.update(relinker_info) + + return info diff --git a/yt_dlp/extractor/raywenderlich.py b/yt_dlp/extractor/raywenderlich.py new file mode 100644 index 000000000..f04d51f7b --- /dev/null +++ b/yt_dlp/extractor/raywenderlich.py @@ -0,0 +1,179 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, + urljoin, +) + + +class RayWenderlichIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', + 'info_dict': { + 'id': '248377018', + 'ext': 'mp4', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', + 'duration': 133, + 'uploader': 'Ray Wenderlich', + 'uploader_id': 'user3304672', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + 'add_ie': [VimeoIE.ie_key()], + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group 
in groups:
+            if not isinstance(group, dict):
+                continue
+            contents = try_get(group, lambda x: x['contents'], list) or []
+            for content in contents:
+                if not isinstance(content, dict):
+                    continue
+                ordinal = int_or_none(content.get('ordinal'))
+                # lesson_id comes from the URL as a string
+                if ordinal != int_or_none(lesson_id):
+                    continue
+                video_id = content.get('identifier')
+                if video_id:
+                    return compat_str(video_id)
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        course_id, lesson_id = mobj.group('course_id', 'id')
+        display_id = '%s/%s' % (course_id, lesson_id)
+
+        webpage = self._download_webpage(url, display_id)
+
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:image', webpage, 'thumbnail')
+
+        if '>Subscribe to unlock' in webpage:
+            raise ExtractorError(
+                'This content is only available for subscribers',
+                expected=True)
+
+        info = {
+            'thumbnail': thumbnail,
+        }
+
+        vimeo_id = self._search_regex(
+            r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None)
+
+        if not vimeo_id:
+            data = self._parse_json(
+                self._search_regex(
+                    r'data-collection=(["\'])(?P<data>{.+?})\1', webpage,
+                    'data collection', default='{}', group='data'),
+                display_id, transform_source=unescapeHTML, fatal=False)
+            video_id = self._extract_video_id(
+                data, lesson_id) or self._search_regex(
+                r'/videos/(\d+)/', thumbnail, 'video id')
+            headers = {
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+            }
+            csrf_token = self._html_search_meta(
+                'csrf-token', webpage, 'csrf token', default=None)
+            if csrf_token:
+                headers['X-CSRF-Token'] = csrf_token
+            video = self._download_json(
+                'https://videos.raywenderlich.com/api/v1/videos/%s.json'
+                % video_id, display_id, headers=headers)['video']
+            vimeo_id = video['clips'][0]['provider_id']
+            info.update({
+                '_type': 'url_transparent',
+                'title': video.get('name'),
+                'description': video.get('description') or video.get(
+                    'meta_description'),
+                'duration': int_or_none(video.get('duration')),
+                'timestamp': unified_timestamp(video.get('created_at')),
+            })
+
+        return merge_dicts(info, self.url_result(
+            VimeoIE._smuggle_referrer(
+                'https://player.vimeo.com/video/%s' % vimeo_id, url),
+            ie=VimeoIE.ie_key(), video_id=vimeo_id))
+
+
+class RayWenderlichCourseIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                    (?:
+                        videos\.raywenderlich\.com/courses|
+                        (?:www\.)?raywenderlich\.com
+                    )/
+                    (?P<id>[^/]+)
+                    '''
+
+    _TEST = {
+        'url': 'https://www.raywenderlich.com/3530-testing-in-ios',
+        'info_dict': {
+            'title': 'Testing in iOS',
+            'id': '3530-testing-in-ios',
+        },
+        'params': {
+            'noplaylist': False,
+        },
+        'playlist_count': 29,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if RayWenderlichIE.suitable(url) else super(
+            RayWenderlichCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_id)
+
+        entries = []
+        lesson_urls = set()
+        for lesson_url in re.findall(
+                r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage):
+            if lesson_url in lesson_urls:
+                continue
+            lesson_urls.add(lesson_url)
+            entries.append(self.url_result(
+                urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key()))
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title', default=None)
+
+        return self.playlist_result(entries, course_id, title)
diff --git a/yt_dlp/extractor/rbmaradio.py b/yt_dlp/extractor/rbmaradio.py
new file mode 100644
index 000000000..9642fbbe1
--- /dev/null
+++ 
b/yt_dlp/extractor/rbmaradio.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + int_or_none, + unified_timestamp, + update_url_query, +) + + +class RBMARadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', + 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', + 'info_dict': { + 'id': 'ford-lopatin-live-at-primavera-sound-2011', + 'ext': 'mp3', + 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') + + webpage = self._download_webpage(url, episode_id) + + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] + + title = episode['title'] + + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 192, 256)] + self._check_formats(formats, episode_id) + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = unified_timestamp(episode.get('publishedAt')) + + return { + 'id': episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py new file mode 100644 index 000000000..ace611bc9 --- /dev/null +++ b/yt_dlp/extractor/rcs.py @@ -0,0 +1,427 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + js_to_json, + base_url, + url_basename, + urljoin, +) + + +class RCSBaseIE(InfoExtractor): + # based on VideoPlayerLoader.prototype.getVideoSrc + # and VideoPlayerLoader.prototype.transformSrc from + # https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs + _ALL_REPLACE = { + 'media2vam.corriere.it.edgesuite.net': + 'media2vam-corriere-it.akamaized.net', + 'media.youreporter.it.edgesuite.net': + 'media-youreporter-it.akamaized.net', + 'corrierepmd.corriere.it.edgesuite.net': + 'corrierepmd-corriere-it.akamaized.net', + 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/': + 'video.corriere.it/vr360/videos/', + '.net//': '.net/', + } + _MP4_REPLACE = { + 'media2vam.corbologna.corriere.it.edgesuite.net': + 'media2vam-bologna-corriere-it.akamaized.net', + 'media2vam.corfiorentino.corriere.it.edgesuite.net': + 'media2vam-fiorentino-corriere-it.akamaized.net', + 'media2vam.cormezzogiorno.corriere.it.edgesuite.net': + 'media2vam-mezzogiorno-corriere-it.akamaized.net', + 'media2vam.corveneto.corriere.it.edgesuite.net': + 'media2vam-veneto-corriere-it.akamaized.net', + 'media2.oggi.it.edgesuite.net': + 
'media2-oggi-it.akamaized.net', + 'media2.quimamme.it.edgesuite.net': + 'media2-quimamme-it.akamaized.net', + 'media2.amica.it.edgesuite.net': + 'media2-amica-it.akamaized.net', + 'media2.living.corriere.it.edgesuite.net': + 'media2-living-corriere-it.akamaized.net', + 'media2.style.corriere.it.edgesuite.net': + 'media2-style-corriere-it.akamaized.net', + 'media2.iodonna.it.edgesuite.net': + 'media2-iodonna-it.akamaized.net', + 'media2.leitv.it.edgesuite.net': + 'media2-leitv-it.akamaized.net', + } + _MIGRATION_MAP = { + 'videoamica-vh.akamaihd': 'amica', + 'media2-amica-it.akamaized': 'amica', + 'corrierevam-vh.akamaihd': 'corriere', + 'media2vam-corriere-it.akamaized': 'corriere', + 'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno', + 'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno', + 'corveneto-vh.akamaihd': 'corrieredelveneto', + 'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto', + 'corbologna-vh.akamaihd': 'corrieredibologna', + 'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna', + 'corfiorentino-vh.akamaihd': 'corrierefiorentino', + 'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino', + 'corinnovazione-vh.akamaihd': 'corriereinnovazione', + 'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet', + 'videogazzanet-vh.akamaihd': 'gazzanet', + 'videogazzaworld-vh.akamaihd': 'gazzaworld', + 'gazzettavam-vh.akamaihd': 'gazzetta', + 'media2vam-gazzetta-it.akamaized': 'gazzetta', + 'videoiodonna-vh.akamaihd': 'iodonna', + 'media2-leitv-it.akamaized': 'leitv', + 'videoleitv-vh.akamaihd': 'leitv', + 'videoliving-vh.akamaihd': 'living', + 'media2-living-corriere-it.akamaized': 'living', + 'media2-oggi-it.akamaized': 'oggi', + 'videooggi-vh.akamaihd': 'oggi', + 'media2-quimamme-it.akamaized': 'quimamme', + 'quimamme-vh.akamaihd': 'quimamme', + 'videorunning-vh.akamaihd': 'running', + 'media2-style-corriere-it.akamaized': 'style', + 'style-vh.akamaihd': 'style', + 'videostyle-vh.akamaihd': 'style', + 'media2-stylepiccoli-it.akamaized': 'stylepiccoli', + 'stylepiccoli-vh.akamaihd': 'stylepiccoli', + 'doveviaggi-vh.akamaihd': 'viaggi', + 'media2-doveviaggi-it.akamaized': 'viaggi', + 'media2-vivimilano-corriere-it.akamaized': 'vivimilano', + 'vivimilano-vh.akamaihd': 'vivimilano', + 'media2-youreporter-it.akamaized': 'youreporter' + } + _MIGRATION_MEDIA = { + 'advrcs-vh.akamaihd': '', + 'corriere-f.akamaihd': '', + 'corrierepmd-corriere-it.akamaized': '', + 'corrprotetto-vh.akamaihd': '', + 'gazzetta-f.akamaihd': '', + 'gazzettapmd-gazzetta-it.akamaized': '', + 'gazzprotetto-vh.akamaihd': '', + 'periodici-f.akamaihd': '', + 'periodicisecure-vh.akamaihd': '', + 'videocoracademy-vh.akamaihd': '' + } + + def _get_video_src(self, video): + mediaFiles = video.get('mediaProfile').get('mediaFile') + src = {} + # audio + if video.get('mediaType') == 'AUDIO': + for aud in mediaFiles: + # todo: check + src['mp3'] = aud.get('value') + # video + else: + for vid in mediaFiles: + if vid.get('mimeType') == 'application/vnd.apple.mpegurl': + src['m3u8'] = vid.get('value') + if vid.get('mimeType') == 'video/mp4': + src['mp4'] = vid.get('value') + + # replace host + for t in src: + for s, r in self._ALL_REPLACE.items(): + src[t] = src[t].replace(s, r) + for s, r in self._MP4_REPLACE.items(): + src[t] = src[t].replace(s, r) + + # switch cdn + if 'mp4' in src and 'm3u8' in src: + if ('-lh.akamaihd' not in src.get('m3u8') + and 'akamai' in src.get('mp4')): + if 'm3u8' in src: + matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', 
src.get('m3u8')) + src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % ( + self._MIGRATION_MAP[matches.group('host')], + matches.group('path').replace( + '///', '/').replace( + '//', '/').replace( + '.csmil', '.urlset' + ) + ) + if 'mp4' in src: + matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('mp4')) + if matches: + if matches.group('host') in self._MIGRATION_MEDIA: + vh_stream = 'https://media2.corriereobjects.it' + if 'fcs.quotidiani_!' in src['mp4']: + vh_stream = 'https://media2-it.corriereobjects.it' + src['mp4'] = '%s%s' % ( + vh_stream, + matches.group('path').replace( + '///', '/').replace( + '//', '/').replace( + '/fcs.quotidiani/mediacenter', '').replace( + '/fcs.quotidiani_!/mediacenter', '').replace( + 'corriere/content/mediacenter/', '').replace( + 'gazzetta/content/mediacenter/', '') + ) + else: + src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % ( + self._MIGRATION_MAP[matches.group('host')], + matches.group('path').replace('///', '/').replace('//', '/') + ) + + if 'mp3' in src: + src['mp3'] = src.get('mp3').replace( + 'media2vam-corriere-it.akamaized.net', + 'vod.rcsobjects.it/corriere') + if 'mp4' in src: + if 'fcs.quotidiani_!' in src['mp4']: + src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'm3u8' in src: + if 'fcs.quotidiani_!' in src['m3u8']: + src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') + + if 'geoblocking' in video.get('mediaProfile'): + if 'm3u8' in src: + src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'mp4' in src: + src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'm3u8' in src: + if 'csmil' in src['m3u8'] and 'vod' in src['m3u8']: + src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset') + + return src + + def _create_formats(self, urls, video_id): + formats = [] + if urls.get('m3u8'): + formats = self._extract_m3u8_formats( + urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + + if urls.get('mp4'): + formats.append({ + 'format_id': 'http-mp4', + 'url': urls['mp4'] + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + if 'cdn' not in mobj.groupdict(): + raise ExtractorError('CDN not found in url: %s' % url) + + # for leitv/youreporter/viaggi don't use the embed page + if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it']) + and (mobj.group('vid') == 'video')): + url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id) + + page = self._download_webpage(url, video_id) + + video_data = None + # look for json video data url + json = self._search_regex( + r'''(?x)url\s*=\s*(["']) + (?P<url> + (?:https?:)?//video\.rcs\.it + /fragment-includes/video-includes/.+?\.json + )\1;''', + page, video_id, group='url', default=None) + if json: + if json.startswith('//'): + json = 'https:%s' % json + video_data = self._download_json(json, video_id) + + # if json url not found, look for json video data directly in the page + else: + # RCS normal pages and most of the embeds + json = self._search_regex( + r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)', + page, video_id, default=None) + if not json and 'video-embed' in url: + page = self._download_webpage(url.replace('video-embed', 'video-json'), video_id) + json = self._search_regex( + r'##start-video##({[\s\S]+?})##end-video##', + page, video_id, default=None) + if not json: + # if no 
video data is found, try searching for iframes + emb = RCSEmbedsIE._extract_url(page) + if emb: + return { + '_type': 'url_transparent', + 'url': emb, + 'ie_key': RCSEmbedsIE.ie_key() + } + if json: + video_data = self._parse_json( + json, video_id, transform_source=js_to_json) + + if not video_data: + raise ExtractorError('Video data not found in the page') + + formats = self._create_formats( + self._get_video_src(video_data), video_id) + + description = (video_data.get('description') + or clean_html(video_data.get('htmlDescription')) + or self._html_search_meta('description', page)) + uploader = video_data.get('provider') or mobj.group('cdn') + + return { + 'id': video_id, + 'title': video_data.get('title'), + 'description': description, + 'uploader': uploader, + 'formats': formats + } + + +class RCSEmbedsIE(RCSBaseIE): + _VALID_URL = r'''(?x) + https?://(?P<vid>video)\. + (?P<cdn> + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + )\.it) + /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)''' + _TESTS = [{ + 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', + 'md5': '623ecc8ffe7299b2d0c1046d8331a9df', + 'info_dict': { + 'id': 'iodonna-0001585037', + 'ext': 'mp4', + 'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"', + 'description': 'md5:65b09633df9ffee57f48b39e34c9e067', + 'uploader': 'rcs.it', + } + }, { + # redownload the page changing 'video-embed' to 'video-json' + 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', + 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440', + 'info_dict': { + 'id': 'gazzanet-mo05-0000260789', + 'ext': 'mp4', + 'title': 'Valentino Rossi e papà Graziano si divertono col drifting', + 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a', + 'uploader': 'rcd', + } + }, { + 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', + 'only_matching': True + }, { + 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', + 'only_matching': True + }] + + @staticmethod + def _sanitize_urls(urls): + # add protocol if missing + for i, e in enumerate(urls): + if e.startswith('//'): + urls[i] = 'https:%s' % e + # clean iframe urls + for i, e in enumerate(urls): + urls[i] = urljoin(base_url(e), url_basename(e)) + return urls + + @staticmethod + def _extract_urls(webpage): + entries = [ + mobj.group('url') + for mobj in re.finditer(r'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["']) + (?P<url>(?:https?:)?//video\. + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + ) + \.it/video-embed/.+?) + \1''', webpage)] + return RCSEmbedsIE._sanitize_urls(entries) + + @staticmethod + def _extract_url(webpage): + urls = RCSEmbedsIE._extract_urls(webpage) + return urls[0] if urls else None + + +class RCSIE(RCSBaseIE): + _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\. + (?P<cdn> + (?: + corrieredelmezzogiorno\. + |corrieredelveneto\. + |corrieredibologna\. + |corrierefiorentino\. 
)?corriere\.it + |(?:gazzanet\.)?gazzetta\.it) + /(?!video-embed/).+?/(?P<id>[^/\?]+)(?=\?|/$|$)''' + _TESTS = [{ + 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb', + 'md5': '0f4ededc202b0f00b6e509d831e2dcda', + 'info_dict': { + 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb', + 'ext': 'mp4', + 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante', + 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152', + 'uploader': 'Corriere Tv', + } + }, { + # video data inside iframe + 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/', + 'md5': 'da378e4918d2afbf7d61c35abb948d4c', + 'info_dict': { + 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2', + 'ext': 'mp4', + 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen', + 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8', + 'uploader': 'DOVE Viaggi', + } + }, { + 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar', + 'md5': 'eedc1b5defd18e67383afef51ff7bdf9', + 'info_dict': { + 'id': '49612410-00ca-11eb-bcd8-30d4253e0140', + 'ext': 'mp4', + 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra', + 'description': 'md5:8c6e905dc3b9413218beca11ebd69778', + 'uploader': 'AMorici', + } + }, { + 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', + 'only_matching': True + }] + + +class RCSVariousIE(RCSBaseIE): + _VALID_URL = r'''(?x)https?://www\. + (?P<cdn> + leitv\.it| + youreporter\.it + )/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)''' + _TESTS = [{ + 'url': 'https://www.leitv.it/benessere/mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa/', + 'md5': '92b4e63667b8f95acb0a04da25ae28a1', + 'info_dict': { + 'id': 'mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa', + 'ext': 'mp4', + 'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto', + 'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5', + 'uploader': 'leitv.it', + } + }, { + 'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/', + 'md5': '8dccd436b47a830bab5b4a88232f391a', + 'info_dict': { + 'id': 'fiume-sesia-3-ottobre-2020', + 'ext': 'mp4', + 'title': 'Fiume Sesia 3 ottobre 2020', + 'description': 'md5:0070eef1cc884d13c970a4125063de55', + 'uploader': 'youreporter.it', + } + }] diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py new file mode 100644 index 000000000..31d9779dd --- /dev/null +++ b/yt_dlp/extractor/rcti.py @@ -0,0 +1,354 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json +import random +import time + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + dict_get, + ExtractorError, + strip_or_none, + try_get +) + + +class RCTIPlusBaseIE(InfoExtractor): + def _real_initialize(self): + self._AUTH_KEY = self._download_json( + 'https://api.rctiplus.com/api/v1/visitor?platform=web', # platform can be web, mweb, android, ios + None, 'Fetching authorization key')['data']['access_token'] + + def _call_api(self, url, video_id, note=None): + json = self._download_json( + url, video_id, note=note, headers={'Authorization': self._AUTH_KEY}) + if json.get('status', {}).get('code', 0) != 0: + raise ExtractorError('%s said: 
%s' % (self.IE_NAME, json["status"]["message_client"]), cause=json) + return json.get('data'), json.get('meta') + + +class RCTIPlusIE(RCTIPlusBaseIE): + _VALID_URL = r'https://www\.rctiplus\.com/(?:programs/\d+?/.*?/)?(?P<type>episode|clip|extra|live-event|missed-event)/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.rctiplus.com/programs/1259/kiko-untuk-lola/episode/22124/untuk-lola', + 'md5': '56ed45affad45fa18d5592a1bc199997', + 'info_dict': { + 'id': 'v_e22124', + 'title': 'Untuk Lola', + 'display_id': 'untuk-lola', + 'description': 'md5:2b809075c0b1e071e228ad6d13e41deb', + 'ext': 'mp4', + 'duration': 1400, + 'timestamp': 1615978800, + 'upload_date': '20210317', + 'series': 'Kiko : Untuk Lola', + 'season_number': 1, + 'episode_number': 1, + 'channel': 'RCTI', + }, + 'params': { + 'fixup': 'never', + }, + }, { # Clip; Series title doesn't appear on metadata JSON + 'url': 'https://www.rctiplus.com/programs/316/cahaya-terindah/clip/3921/make-a-wish', + 'md5': 'd179b2ff356f0e91a53bcc6a4d8504f0', + 'info_dict': { + 'id': 'v_c3921', + 'title': 'Make A Wish', + 'display_id': 'make-a-wish', + 'description': 'Make A Wish', + 'ext': 'mp4', + 'duration': 288, + 'timestamp': 1571652600, + 'upload_date': '20191021', + 'series': 'Cahaya Terindah', + 'channel': 'RCTI', + }, + 'params': { + 'fixup': 'never', + }, + }, { # Extra + 'url': 'https://www.rctiplus.com/programs/616/inews-malam/extra/9438/diungkapkan-melalui-surat-terbuka-ceo-ruangguru-belva-devara-mundur-dari-staf-khusus-presiden', + 'md5': 'c48106afdbce609749f5e0c007d9278a', + 'info_dict': { + 'id': 'v_ex9438', + 'title': 'md5:2ede828c0f8bde249e0912be150314ca', + 'display_id': 'md5:62b8d4e9ff096db527a1ad797e8a9933', + 'description': 'md5:2ede828c0f8bde249e0912be150314ca', + 'ext': 'mp4', + 'duration': 93, + 'timestamp': 1587561540, + 'upload_date': '20200422', + 'series': 'iNews Malam', + 'channel': 'INews', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { # Missed event/replay + 'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib', + 'md5': '649c5f27250faed1452ca8b91e06922d', + 'info_dict': { + 'id': 'v_pe2507', + 'title': 'MOU Signing Ceremony | 27 Juli 2021 | 14.00 WIB', + 'display_id': 'mou-signing-ceremony-27-juli-2021-1400-wib', + 'ext': 'mp4', + 'timestamp': 1627142400, + 'upload_date': '20210724', + 'was_live': True, + 'release_timestamp': 1627369200, + }, + 'params': { + 'fixup': 'never', + }, + }, { # Live event; Cloudfront CDN + 'url': 'https://www.rctiplus.com/live-event/2530/dai-muda-charging-imun-dengan-iman-4-agustus-2021-1600-wib', + 'info_dict': { + 'id': 'v_le2530', + 'title': 'Dai Muda : Charging Imun dengan Iman | 4 Agustus 2021 | 16.00 WIB', + 'display_id': 'dai-muda-charging-imun-dengan-iman-4-agustus-2021-1600-wib', + 'ext': 'mp4', + 'timestamp': 1627898400, + 'upload_date': '20210802', + 'release_timestamp': 1628067600, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This live event has ended.', + }, { # TV; live_at is null + 'url': 'https://www.rctiplus.com/live-event/1/rcti', + 'info_dict': { + 'id': 'v_lt1', + 'title': 'RCTI', + 'display_id': 'rcti', + 'ext': 'mp4', + 'timestamp': 1546344000, + 'upload_date': '20190101', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + 'format': 'bestvideo', + }, + }] + _CONVIVA_JSON_TEMPLATE = { + 't': 'CwsSessionHb', + 'cid': 'ff84ae928c3b33064b76dec08f12500465e59a6f', + 'clid': '0', + 'sid': 0, + 'seq': 0, + 'caps': 0, + 'sf': 7, + 'sdk': True, + } + + def _real_extract(self, 
url): + match = self._match_valid_url(url).groupdict() + video_type, video_id, display_id = match['type'], match['id'], match['display_id'] + + url_api_version = 'v2' if video_type == 'missed-event' else 'v1' + appier_id = '23984824_' + str(random.randint(0, 10000000000)) # Based on the webpage's uuidRandom generator + video_json = self._call_api( + f'https://api.rctiplus.com/api/{url_api_version}/{video_type}/{video_id}/url?appierid={appier_id}', display_id, 'Downloading video URL JSON')[0] + video_url = video_json['url'] + + is_upcoming = try_get(video_json, lambda x: x['current_date'] < x['live_at']) + if is_upcoming is None: + is_upcoming = try_get(video_json, lambda x: x['current_date'] < x['start_date']) + if is_upcoming: + self.raise_no_formats( + 'This event will start at %s.' % video_json['live_label'] if video_json.get('live_label') else 'This event has not started yet.', expected=True) + if 'akamaized' in video_url: + # For some videos hosted on Akamai's CDN (possibly AES-encrypted ones?), a session needs to at least be made via Conviva's API + conviva_json_data = { + **self._CONVIVA_JSON_TEMPLATE, + 'url': video_url, + 'sst': int(time.time()) + } + conviva_json_res = self._download_json( + 'https://ff84ae928c3b33064b76dec08f12500465e59a6f.cws.conviva.com/0/wsg', display_id, + 'Creating Conviva session', 'Failed to create Conviva session', + fatal=False, data=json.dumps(conviva_json_data).encode('utf-8')) + if conviva_json_res and conviva_json_res.get('err') != 'ok': + self.report_warning('Conviva said: %s' % str(conviva_json_res.get('err'))) + + video_meta, meta_paths = self._call_api( + 'https://api.rctiplus.com/api/v1/%s/%s' % (video_type, video_id), display_id, 'Downloading video metadata') + + thumbnails, image_path = [], meta_paths.get('image_path', 'https://rstatic.akamaized.net/media/') + if video_meta.get('portrait_image'): + thumbnails.append({ + 'id': 'portrait_image', + 'url': '%s%d%s' % (image_path, 2000, video_meta['portrait_image']) # 2000px seems to be the highest resolution that can be given + }) + if video_meta.get('landscape_image'): + thumbnails.append({ + 'id': 'landscape_image', + 'url': '%s%d%s' % (image_path, 2000, video_meta['landscape_image']) + }) + try: + formats = self._extract_m3u8_formats(video_url, display_id, 'mp4', headers={'Referer': 'https://www.rctiplus.com/'}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted(countries=['ID'], metadata_available=True) + else: + raise e + for f in formats: + if 'akamaized' in f['url'] or 'cloudfront' in f['url']: + f.setdefault('http_headers', {})['Referer'] = 'https://www.rctiplus.com/' # Referer header is required for akamai/cloudfront CDNs + + self._sort_formats(formats) + + return { + 'id': video_meta.get('product_id') or video_json.get('product_id'), + 'title': dict_get(video_meta, ('title', 'name')) or dict_get(video_json, ('content_name', 'assets_name')), + 'display_id': display_id, + 'description': video_meta.get('summary'), + 'timestamp': video_meta.get('release_date') or video_json.get('start_date'), + 'duration': video_meta.get('duration'), + 'categories': [video_meta['genre']] if video_meta.get('genre') else None, + 'average_rating': video_meta.get('star_rating'), + 'series': video_meta.get('program_title') or video_json.get('program_title'), + 'season_number': video_meta.get('season'), + 'episode_number': video_meta.get('episode'), + 'channel': video_json.get('tv_name'), + 'channel_id': video_json.get('tv_id'), + 
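+ # liveness metadata below is derived from the URL's video type; release_timestamp is the event's scheduled start (live_at)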
'formats': formats, + 'thumbnails': thumbnails, + 'is_live': video_type == 'live-event' and not is_upcoming, + 'was_live': video_type == 'missed-event', + 'live_status': 'is_upcoming' if is_upcoming else None, + 'release_timestamp': video_json.get('live_at'), + } + + +class RCTIPlusSeriesIE(RCTIPlusBaseIE): + _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.rctiplus.com/programs/540/upin-ipin', + 'playlist_mincount': 417, + 'info_dict': { + 'id': '540', + 'title': 'Upin & Ipin', + 'description': 'md5:22cc912381f389664416844e1ec4f86b', + }, + }, { + 'url': 'https://www.rctiplus.com/programs/540/upin-ipin/episodes?utm_source=Rplusdweb&utm_medium=share_copy&utm_campaign=programsupin-ipin', + 'only_matching': True, + }] + _AGE_RATINGS = { # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings + 'S-SU': 2, + 'SU': 2, + 'P': 2, + 'A': 7, + 'R': 13, + 'R-R/1': 17, # Labelled as 17+ despite being R + 'D': 18, + } + + @classmethod + def suitable(cls, url): + return False if RCTIPlusIE.suitable(url) else super(RCTIPlusSeriesIE, cls).suitable(url) + + def _entries(self, url, display_id=None, note='Downloading entries JSON', metadata={}): + total_pages = 0 + try: + total_pages = self._call_api( + '%s&length=20&page=0' % url, + display_id, note)[1]['pagination']['total_page'] + except ExtractorError as e: + if 'not found' in str(e): + return [] + raise e + if total_pages <= 0: + return [] + + for page_num in range(1, total_pages + 1): + episode_list = self._call_api( + '%s&length=20&page=%s' % (url, page_num), + display_id, '%s page %s' % (note, page_num))[0] or [] + + for video_json in episode_list: + link = video_json['share_link'] + url_res = self.url_result(link, 'RCTIPlus', video_json.get('product_id'), video_json.get('title')) + url_res.update(metadata) + yield url_res + + def _real_extract(self, url): + series_id, display_id = self._match_valid_url(url).groups() + + series_meta, meta_paths = self._call_api( + 'https://api.rctiplus.com/api/v1/program/%s/detail' % series_id, display_id, 'Downloading series metadata') + metadata = { + 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]) + } + + cast = [] + for star in series_meta.get('starring', []): + cast.append(strip_or_none(star.get('name'))) + for star in series_meta.get('creator', []): + cast.append(strip_or_none(star.get('name'))) + for star in series_meta.get('writer', []): + cast.append(strip_or_none(star.get('name'))) + metadata['cast'] = cast + + tags = [] + for tag in series_meta.get('tag', []): + tags.append(strip_or_none(tag.get('name'))) + metadata['tag'] = tags + + entries = [] + seasons_list = self._call_api( + 'https://api.rctiplus.com/api/v1/program/%s/season' % series_id, display_id, 'Downloading seasons list JSON')[0] + for season in seasons_list: + entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/episode?season=%s' % (series_id, season['season']), + display_id, 'Downloading season %s episode entries' % season['season'], metadata)) + + entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/clip?content_id=0' % series_id, + display_id, 'Downloading clip entries', metadata)) + entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/extra?content_id=0' % series_id, + display_id, 'Downloading extra entries', metadata)) + + return self.playlist_result(itertools.chain(*entries), series_id, series_meta.get('title'), 
series_meta.get('summary'), **metadata) + + +class RCTIPlusTVIE(RCTIPlusBaseIE): + _VALID_URL = r'https://www\.rctiplus\.com/((tv/(?P<tvname>\w+))|(?P<eventname>live-event|missed-event))' + _TESTS = [{ + 'url': 'https://www.rctiplus.com/tv/rcti', + 'info_dict': { + 'id': 'v_lt1', + 'title': 'RCTI', + 'ext': 'mp4', + 'timestamp': 1546344000, + 'upload_date': '20190101', + }, + 'params': { + 'skip_download': True, + 'format': 'bestvideo', + } + }, { + # Returned video will always change + 'url': 'https://www.rctiplus.com/live-event', + 'only_matching': True, + }, { + # Returned video will also always change + 'url': 'https://www.rctiplus.com/missed-event', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if RCTIPlusIE.suitable(url) else super(RCTIPlusTVIE, cls).suitable(url) + + def _real_extract(self, url): + match = self._match_valid_url(url).groupdict() + tv_id = match.get('tvname') or match.get('eventname') + webpage = self._download_webpage(url, tv_id) + video_type, video_id = self._search_regex( + r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', webpage, 'video link', group=('type', 'id')) + return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus') diff --git a/youtube_dl/extractor/rds.py b/yt_dlp/extractor/rds.py index 0c497856e..0c497856e 100644 --- a/youtube_dl/extractor/rds.py +++ b/yt_dlp/extractor/rds.py diff --git a/yt_dlp/extractor/redbulltv.py b/yt_dlp/extractor/redbulltv.py new file mode 100644 index 000000000..e7fdcce3e --- /dev/null +++ b/yt_dlp/extractor/redbulltv.py @@ -0,0 +1,230 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class RedBullTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live|(?:film|episode)s)/(?P<id>AP-\w+)' + _TESTS = [{ + # film + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', + 'md5': 'fb0445b98aa4394e504b413d98031d1f', + 'info_dict': { + 'id': 'AP-1Q6XCDTAN1W11', + 'ext': 'mp4', + 'title': 'ABC of... WRC - ABC of... 
S1E6', + 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', + 'duration': 1582.04, + }, + }, { + # episode + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', + 'info_dict': { + 'id': 'AP-1PMHKJFCW1W11', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:5546aa612958c08a98faaad4abce484d', + 'duration': 904, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/films/AP-1ZSMAW8FH2111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/episodes/AP-1TQWK7XE11W11', + 'only_matching': True, + }] + + def extract_info(self, video_id): + session = self._download_json( + 'https://api.redbull.tv/v3/session', video_id, + note='Downloading access token', query={ + 'category': 'personal_computer', + 'os_family': 'http', + }) + if session.get('code') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, session['message'])) + token = session['token'] + + try: + video = self._download_json( + 'https://api.redbull.tv/v3/products/' + video_id, + video_id, note='Downloading video information', + headers={'Authorization': token} + ) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + error_message = self._parse_json( + e.cause.read().decode(), video_id)['error'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + title = video['title'].strip() + + formats = self._extract_m3u8_formats( + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + subtitles = {} + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) + + subheading = video.get('subheading') + if subheading: + title += ' - %s' % subheading + + return { + 'id': video_id, + 'title': title, + 'description': video.get('long_description') or video.get( + 'short_description'), + 'duration': float_or_none(video.get('duration'), scale=1000), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.extract_info(video_id) + + +class RedBullEmbedIE(RedBullTVIE): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/embed/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}:[a-z]{2}-[A-Z]{2,3})' + _TESTS = [{ + # HLS manifest accessible only using assetId + 'url': 'https://www.redbull.com/embed/rrn:content:episode-videos:f3021f4f-3ed4-51ac-915a-11987126e405:en-INT', + 'only_matching': True, + }] + _VIDEO_ESSENSE_TMPL = '''... 
on %s { + videoEssence { + attributes + } + }''' + + def _real_extract(self, url): + rrn_id = self._match_id(url) + asset_id = self._download_json( + 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', + rrn_id, headers={ + 'Accept': 'application/json', + 'API-KEY': 'e90a1ff11335423998b100c929ecc866', + }, query={ + 'query': '''{ + resource(id: "%s", enforceGeoBlocking: false) { + %s + %s + } +}''' % (rrn_id, self._VIDEO_ESSENSE_TMPL % 'LiveVideo', self._VIDEO_ESSENSE_TMPL % 'VideoResource'), + })['data']['resource']['videoEssence']['attributes']['assetId'] + return self.extract_info(asset_id) + + +class RedBullTVRrnContentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/tv/(?:video|live|film)/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/film/rrn:content:films:d1f4d00e-4c04-5d19-b510-a805ffa2ab83/follow-me', + 'only_matching': True, + }] + + def _real_extract(self, url): + region, lang, rrn_id = self._match_valid_url(url).groups() + rrn_id += ':%s-%s' % (lang, region.upper()) + return self.url_result( + 'https://www.redbull.com/embed/' + rrn_id, + RedBullEmbedIE.ie_key(), rrn_id) + + +class RedBullIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/(?P<type>(?:episode|film|(?:(?:recap|trailer)-)?video)s|live)/(?!AP-|rrn:content:)(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/episodes/grime-hashtags-s02-e04', + 'md5': 'db8271a7200d40053a1809ed0dd574ff', + 'info_dict': { + 'id': 'AA-1MT8DQWA91W14', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:5546aa612958c08a98faaad4abce484d', + }, + }, { + 'url': 'https://www.redbull.com/int-en/films/kilimanjaro-mountain-of-greatness', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/recap-videos/uci-mountain-bike-world-cup-2017-mens-xco-finals-from-vallnord', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/trailer-videos/kings-of-content', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/videos/tnts-style-red-bull-dance-your-style-s1-e12', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/live/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + # only available on the int-en website so a fallback is needed for the API + # https://www.redbull.com/v3/api/graphql/v1/v3/query/en-GB>en-INT?filter[uriSlug]=fia-wrc-saturday-recap-estonia&rb3Schema=v1:hero + 'url': 'https://www.redbull.com/gb-en/live/fia-wrc-saturday-recap-estonia', + 'only_matching': True, + }] + _INT_FALLBACK_LIST = ['de', 'en', 'es', 'fr'] + _LAT_FALLBACK_MAP = ['ar', 'bo', 'car', 'cl', 'co', 'mx', 'pe'] + + def _real_extract(self, url): + region, lang, filter_type, display_id = self._match_valid_url(url).groups() + if filter_type == 'episodes': + filter_type = 'episode-videos' + elif filter_type == 'live': + filter_type = 'live-videos' + + regions = [region.upper()] + if region != 'int': + if region in 
self._LAT_FALLBACK_MAP: + regions.append('LAT') + if lang in self._INT_FALLBACK_LIST: + regions.append('INT') + locale = '>'.join(['%s-%s' % (lang, reg) for reg in regions]) + + rrn_id = self._download_json( + 'https://www.redbull.com/v3/api/graphql/v1/v3/query/' + locale, + display_id, query={ + 'filter[type]': filter_type, + 'filter[uriSlug]': display_id, + 'rb3Schema': 'v1:hero', + })['data']['id'] + + return self.url_result( + 'https://www.redbull.com/embed/' + rrn_id, + RedBullEmbedIE.ie_key(), rrn_id) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py new file mode 100644 index 000000000..c75d95a8e --- /dev/null +++ b/yt_dlp/extractor/reddit.py @@ -0,0 +1,169 @@ +import random + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + try_get, + unescapeHTML, + url_or_none, +) + + +class RedditIE(InfoExtractor): + _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' + _TEST = { + # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '0a070c53eba7ec4534d95a5a1259e253', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'zv89llsvexdz', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + formats.extend(self._extract_mpd_formats( + 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, + mpd_id='dash', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class RedditRIE(InfoExtractor): + _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'duration': 12, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, + }, { + 'url': 'https://www.redditmedia.com/r/serbia/comments/pu9wbx/ako_vu%C4%8Di%C4%87_izgubi_izbore_ja_%C4%87u_da_crknem/', + 'only_matching': True, + }] + + @staticmethod + def _gen_session_id(): + id_length = 16 + rand_max = 1 << (id_length * 4) + return '%0.*x' % (id_length, 
random.randrange(rand_max)) + + def _real_extract(self, url): + subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id') + + self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id()) + self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D') + data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False) + if not data: + # Fall back to old.reddit.com in case the requested subdomain fails + data = self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id) + data = data[0]['data']['children'][0]['data'] + video_url = data['url'] + + # Avoid recursing into the same reddit URL + if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: + raise ExtractorError('No media found', expected=True) + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + thumbnails = [] + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) + + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': data.get('title'), + 'thumbnails': thumbnails, + 'timestamp': float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'duration': int_or_none(try_get( + data, + (lambda x: x['media']['reddit_video']['duration'], + lambda x: x['secure_media']['reddit_video']['duration']))), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py new file mode 100644 index 000000000..747ce5199 --- /dev/null +++ b/yt_dlp/extractor/redtube.py @@ -0,0 +1,137 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + merge_dicts, + str_to_int, + unified_strdate, + url_or_none, +) + + +class RedTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.redtube.com/66418', + 'md5': 'fc08071233725f26b8f014dba9590005', + 'info_dict': { + 'id': '66418', + 'ext': 'mp4', + 'title': 'Sucked on a toilet', + 'upload_date': '20110811', + 'duration': 596, + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', + 'only_matching': True, + }, { + 'url': 'http://it.redtube.com/66418', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.redtube.com/%s' % video_id, video_id) + + ERRORS = ( + (('video-deleted-info', '>This video has been removed'), 'has been removed'), 
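+ # each ERRORS entry pairs the page markers to search for with the reason appended to the error message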
+ (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), + ) + + for patterns, message in ERRORS: + if any(p in webpage for p in patterns): + raise ExtractorError( + 'Video %s %s' % (video_id, message), expected=True) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = self._html_search_regex( + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + medias = self._parse_json( + self._search_regex( + r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url, 'ext': 'mp4'}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<', + webpage, 'upload date', default=None)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) + view_count = str_to_int(self._search_regex( + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)', + r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'), + webpage, 'view count', default=None)) + + # No self-labeling, but they describe themselves as + # "Home of Videos Porno" + age_limit = 18 + + return merge_dicts(info, { + 'id': video_id, + 'ext': 'mp4', + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'age_limit': age_limit, + 'formats': formats, + }) diff --git a/youtube_dl/extractor/regiotv.py b/yt_dlp/extractor/regiotv.py index e250a52f0..e250a52f0 100644 --- a/youtube_dl/extractor/regiotv.py +++ b/yt_dlp/extractor/regiotv.py diff --git a/youtube_dl/extractor/rentv.py b/yt_dlp/extractor/rentv.py index 7c8909d95..7c8909d95 100644 --- a/youtube_dl/extractor/rentv.py +++ b/yt_dlp/extractor/rentv.py diff --git a/youtube_dl/extractor/restudy.py b/yt_dlp/extractor/restudy.py index d47fb45ca..d47fb45ca 100644 --- a/youtube_dl/extractor/restudy.py +++ b/yt_dlp/extractor/restudy.py diff --git a/youtube_dl/extractor/reuters.py b/yt_dlp/extractor/reuters.py index 
9dc482d21..9dc482d21 100644 --- a/youtube_dl/extractor/reuters.py +++ b/yt_dlp/extractor/reuters.py diff --git a/youtube_dl/extractor/reverbnation.py b/yt_dlp/extractor/reverbnation.py index 4cb99c244..4cb99c244 100644 --- a/youtube_dl/extractor/reverbnation.py +++ b/yt_dlp/extractor/reverbnation.py diff --git a/yt_dlp/extractor/rice.py b/yt_dlp/extractor/rice.py new file mode 100644 index 000000000..cf2bb1b51 --- /dev/null +++ b/yt_dlp/extractor/rice.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_iso8601, + ExtractorError, +) + + +class RICEIE(InfoExtractor): + _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' + _TEST = { + 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', + 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', + 'info_dict': { + 'id': 'YEWIvbhb40aqdjMD1ALSqw', + 'ext': 'mp4', + 'title': 'Active Learning in Archeology', + 'upload_date': '20140616', + 'timestamp': 1402926346, + } + } + _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' + + def _real_extract(self, url): + qs = compat_parse_qs(self._match_valid_url(url).group('query')) + if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): + raise ExtractorError('Invalid URL', expected=True) + + portal_id = qs['PortalID'][0] + playlist_id = qs['DestinationID'][0] + content_id = qs['ContentID'][0] + + content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ + 'portalId': portal_id, + 'playlistId': playlist_id, + 'contentId': content_id + }) + metadata = xpath_element(content_data, './/metaData', fatal=True) + title = xpath_text(metadata, 'primaryTitle', fatal=True) + encodings = xpath_element(content_data, './/encodings', fatal=True) + player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ + 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), + 'contentId': content_id, + }) + + common_fmt = {} + dimensions = xpath_text(encodings, 'dimensions') + if dimensions: + wh = dimensions.split('x') + if len(wh) == 2: + common_fmt.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + + formats = [] + rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) + if rtsp_path: + fmt = { + 'url': rtsp_path, + 'format_id': 'rtsp', + } + fmt.update(common_fmt) + formats.append(fmt) + for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): + video_url = xpath_text(source, self._xpath_ns('File', self._NS)) + if not video_url: + continue + if '.m3u8' in video_url: + formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + fmt = { + 'url': video_url, + 'format_id': video_url.split(':')[0], + } + fmt.update(common_fmt) + rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + }) + formats.append(fmt) + self._sort_formats(formats) + + thumbnails = [] + for content_asset in 
content_data.findall('.//contentAssets'): + asset_type = xpath_text(content_asset, 'type') + if asset_type == 'image': + image_url = xpath_text(content_asset, 'httpPath') + if not image_url: + continue + thumbnails.append({ + 'id': xpath_text(content_asset, 'ID'), + 'url': image_url, + }) + + return { + 'id': content_id, + 'title': title, + 'description': xpath_text(metadata, 'abstract'), + 'duration': int_or_none(xpath_text(metadata, 'duration')), + 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/yt_dlp/extractor/rmcdecouverte.py b/yt_dlp/extractor/rmcdecouverte.py new file mode 100644 index 000000000..422d47ae9 --- /dev/null +++ b/yt_dlp/extractor/rmcdecouverte.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import smuggle_url + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:[^?#]*_(?P<id>\d+)|mediaplayer-direct)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://rmcdecouverte.bfmtv.com/vestiges-de-guerre_22240/les-bunkers-secrets-domaha-beach_25303/', + 'info_dict': { + 'id': '6250879771001', + 'ext': 'mp4', + 'title': 'LES BUNKERS SECRETS D´OMAHA BEACH', + 'uploader_id': '1969646226001', + 'description': 'md5:aed573ca24abde62a148e0eba909657d', + 'timestamp': 1619622984, + 'upload_date': '20210428', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', + 'info_dict': { + 'id': '5983675500001', + 'ext': 'mp4', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', + 'uploader_id': '1969646226001', + 'upload_date': '20181226', + 'timestamp': 1545861635, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'only available for a week', + }, { + 'url': 'https://rmcdecouverte.bfmtv.com/avions-furtifs-la-technologie-de-lextreme_10598', + 'only_matching': True, + }, { + # The website accepts any URL as long as it has _\d+ at the end + 'url': 'https://rmcdecouverte.bfmtv.com/any/thing/can/go/here/_10598', + 'only_matching': True, + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') or 'direct' + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/ro220.py b/yt_dlp/extractor/ro220.py index 69934ef2b..69934ef2b 100644 --- a/youtube_dl/extractor/ro220.py +++ b/yt_dlp/extractor/ro220.py diff --git a/youtube_dl/extractor/rockstargames.py b/yt_dlp/extractor/rockstargames.py index cd6904bc9..cd6904bc9 100644 --- a/youtube_dl/extractor/rockstargames.py 
+++ b/yt_dlp/extractor/rockstargames.py diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py new file mode 100644 index 000000000..2c815bda6 --- /dev/null +++ b/yt_dlp/extractor/roosterteeth.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '9156', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But... The Game Announcement', + 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', + 'thumbnail': r're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... The Game Announcement', + }, + }, { + 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', + 'md5': 'fe8d9d976b272c18a24fe7f1f5830084', + 'info_dict': { + 'id': '31', + 'display_id': 'rwby-bonus-25', + 'title': 'Volume 2, World of Remnant 3', + 'description': 'md5:8d58d3270292ea11da00ea712bbfb009', + 'episode': 'Volume 2, World of Remnant 3', + 'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'ext': 'mp4', + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }, { + 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'only_matching': True, + }] + _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) + + def _real_initialize(self): + if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): + return + 
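+ # no cached access-token cookie was found, so fall back to username/password login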
self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + api_episode_url = self._EPISODE_BASE_URL + display_id + + try: + video_data = self._download_json( + api_episode_url + '/videos', display_id, + 'Downloading video JSON metadata')['data'][0] + m3u8_url = video_data['attributes']['url'] + # XXX: additional URL at video_data['links']['download'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + raise + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + episode = self._download_json( + api_episode_url, display_id, + 'Downloading episode JSON metadata')['data'][0] + attributes = episode['attributes'] + title = attributes.get('title') or attributes['display_title'] + video_id = compat_str(episode['id']) + + thumbnails = [] + for image in episode.get('included', {}).get('images', []): + if image.get('type') in ('episode_image', 'bonus_feature_image'): + img_attributes = image.get('attributes') or {} + for k in ('thumb', 'small', 'medium', 'large'): + img_url = img_attributes.get(k) + if img_url: + thumbnails.append({ + 'id': k, + 'url': img_url, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': attributes.get('description') or attributes.get('caption'), + 'thumbnails': thumbnails, + 'series': attributes.get('show_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': attributes.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(attributes.get('number')), + 'episode_id': str_or_none(episode.get('uuid')), + 'formats': formats, + 'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + 'subtitles': subtitles + } diff --git a/youtube_dl/extractor/rottentomatoes.py b/yt_dlp/extractor/rottentomatoes.py index 14c8e8236..14c8e8236 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/yt_dlp/extractor/rottentomatoes.py diff --git a/yt_dlp/extractor/roxwel.py b/yt_dlp/extractor/roxwel.py new file mode 100644 index 000000000..84bb1aa00 --- /dev/null +++ b/yt_dlp/extractor/roxwel.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' + + _TEST = { + 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', + 'info_dict': { + 'id': 'passionpittakeawalklive', + 'ext': 'flv', + 'title': 'Take A Walk (live)', + 'uploader': 'Passion Pit', + 'uploader_id': 'passionpit', + 'upload_date': '20120928', + 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. 
', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info = self._download_json(info_url, filename) + + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return { + 'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dl/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py index fccf69401..fccf69401 100644 --- a/youtube_dl/extractor/rozhlas.py +++ b/yt_dlp/extractor/rozhlas.py diff --git a/yt_dlp/extractor/rtbf.py b/yt_dlp/extractor/rtbf.py new file mode 100644 index 000000000..f9979d0a4 --- /dev/null +++ b/yt_dlp/extractor/rtbf.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, +) + + +class RTBFIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '8c876a1cceeb6cf31b476461ade72384', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, + }] + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } + _QUALITIES = [ + ('mobile', 'SD'), + ('web', 'MD'), + ('high', 'HD'), + ] + + def _real_extract(self, url): + live, media_id = self._match_valid_url(url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) + + error = data.get('error') + if error: + raise 
ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' + formats = [] + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': fix_url(format_url), + 'height': height, + }) + + mpd_url = data.get('urlDash') + if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + + return { + 'id': media_id, + 'formats': formats, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/rte.py b/yt_dlp/extractor/rte.py index 1fbc72915..1fbc72915 100644 --- a/youtube_dl/extractor/rte.py +++ b/yt_dlp/extractor/rte.py diff --git a/yt_dlp/extractor/rtl2.py b/yt_dlp/extractor/rtl2.py new file mode 100644 index 000000000..4e3aa0398 --- /dev/null +++ b/yt_dlp/extractor/rtl2.py @@ -0,0 +1,207 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import ( + compat_b64decode, + compat_ord, + compat_str, +) +from ..utils import ( + bytes_to_intlist, + ExtractorError, + intlist_to_bytes, + int_or_none, + strip_or_none, +) + + +class RTL2IE(InfoExtractor): + IE_NAME = 'rtl2' + _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', + 'info_dict': { + 'id': 'folge-203-0', + 'ext': 'f4v', + 'title': 'GRIP sucht den Sommerkönig', + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m 
manifest', 'Failed to download m3u8 information'], + }, { + 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', + 'info_dict': { + 'id': 'anna-erwischt-alex', + 'ext': 'mp4', + 'title': 'Anna erwischt Alex!', + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }] + + def _real_extract(self, url): + vico_id, vivi_id, display_id = self._match_valid_url(url).groups() + if not vico_id: + webpage = self._download_webpage(url, display_id) + + mobj = re.search( + r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + + info = self._download_json( + 'https://service.rtl2.de/api-player-vipo/video.php', + display_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) + video_info = info['video'] + title = video_info['titel'] + + formats = [] + + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'quality': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': title, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), + 'formats': formats, + } + + +class RTL2YouBaseIE(InfoExtractor): + _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' + + +class RTL2YouIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you' + _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', + 'info_dict': { + 'id': '15740', + 'ext': 'mp4', + 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', + 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', + 'age_limit': 12, + }, + }, { + 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', + 'only_matching': True, + }] + _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' 
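The _AES_KEY above is what _real_extract() below uses to decrypt the Backwerk API's streamUrl field. A standalone sketch of that decryption, assuming pycryptodome for the cipher where yt-dlp uses its own aes_cbc_decrypt helper; the "base64(data):base64(iv)" layout and the unpadding via the last byte follow the extractor code.

import base64
from Crypto.Cipher import AES  # pycryptodome

def decrypt_rtl2you_stream_url(stream_url_field, aes_key):
    # streamUrl is base64 of "base64(ciphertext):base64(iv)"
    data_b64, iv_b64 = base64.b64decode(stream_url_field).decode().split(':')
    cipher = AES.new(aes_key, AES.MODE_CBC, base64.b64decode(iv_b64))
    plain = cipher.decrypt(base64.b64decode(data_b64))
    # the extractor checks for b'rtl2_you_video_not_found' here before unpadding
    return plain[:-plain[-1]].decode()  # strip padding, yielding the m3u8 URL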
+ _GEO_COUNTRIES = ['DE'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) + + data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') + stream_url = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(compat_b64decode(data)), + bytes_to_intlist(self._AES_KEY), + bytes_to_intlist(compat_b64decode(iv)) + )) + if b'rtl2_you_video_not_found' in stream_url: + raise ExtractorError('video not found', expected=True) + + formats = self._extract_m3u8_formats( + stream_url[:-compat_ord(stream_url[-1])].decode(), + video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + video_data = self._download_json( + self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) + + series = video_data.get('formatTitle') + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(video_data.get('description')), + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), + 'series': series, + 'episode': episode, + 'age_limit': int_or_none(video_data.get('minimumAge')), + } + + +class RTL2YouSeriesIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you:series' + _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://you.rtl2.de/videos/115/dragon-ball', + 'info_dict': { + 'id': '115', + }, + 'playlist_mincount': 5, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'videos', + series_id, query={ + 'formatId': series_id, + 'limit': 1000000000, + }) + + entries = [] + for video in stream_data.get('videos', []): + video_id = compat_str(video['videoId']) + if not video_id: + continue + entries.append(self.url_result( + 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), + 'RTL2You', video_id)) + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/rtlnl.py b/yt_dlp/extractor/rtlnl.py index 9eaa06f25..9eaa06f25 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/yt_dlp/extractor/rtlnl.py diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py new file mode 100644 index 000000000..c165ade78 --- /dev/null +++ b/yt_dlp/extractor/rtp.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json +import re +import json +import urllib.parse +import base64 + + +class RTPIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' 
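RTP pages obfuscate the player config as atob(decodeURIComponent([...].join(""))), which _RX_OBFUSCATION and __unobfuscate below reverse. A toy round-trip of that scheme in plain Python; the percent-encoded pieces are made up for the example.

import base64
import urllib.parse

pieces = ['aHR0cHM6Ly9leGFt', 'cGxlLmNvbS9hLm0zdTg%3D']  # hypothetical payload
decoded = base64.b64decode(
    urllib.parse.unquote(''.join(pieces))).decode('iso-8859-1')
print(decoded)  # -> https://example.com/a.m3u8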
+ _TESTS = [{ + 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', + 'md5': 'e736ce0c665e459ddb818546220b4ef8', + 'info_dict': { + 'id': 'e174042', + 'ext': 'mp3', + 'title': 'Paixões Cruzadas', + 'description': 'As paixões musicais de António Cartaxo e António Macedo', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', + 'only_matching': True, + }] + + _RX_OBFUSCATION = re.compile(r'''(?xs) + atob\s*\(\s*decodeURIComponent\s*\(\s* + (\[[0-9A-Za-z%,'"]*\]) + \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\) + ''') + + def __unobfuscate(self, data, *, video_id): + if data.startswith('{'): + data = self._RX_OBFUSCATION.sub( + lambda m: json.dumps( + base64.b64decode(urllib.parse.unquote( + ''.join(self._parse_json(m.group(1), video_id)) + )).decode('iso-8859-1')), + data) + return js_to_json(data) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + 'twitter:title', webpage, display_name='title', fatal=True) + + f, config = self._search_regex( + r'''(?sx) + var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s* + var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) + ''', webpage, + 'player config', group=('f', 'config')) + + f = self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) + config = self._parse_json( + config, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) + + formats = [] + if isinstance(f, dict): + f_hls = f.get('hls') + if f_hls is not None: + formats.extend(self._extract_m3u8_formats( + f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')) + + f_dash = f.get('dash') + if f_dash is not None: + formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash')) + else: + formats.append({ + 'format_id': 'f', + 'url': f, + 'vcodec': 'none' if config.get('mediaType') == 'audio' else None, + }) + + subtitles = {} + + vtt = config.get('vtt') + if vtt is not None: + for lcode, lname, url in vtt: + subtitles.setdefault(lcode, []).append({ + 'name': lname, + 'url': url, + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': self._html_search_meta(['description', 'twitter:description'], webpage), + 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/rts.py b/yt_dlp/extractor/rts.py new file mode 100644 index 000000000..865a73024 --- /dev/null +++ b/yt_dlp/extractor/rts.py @@ -0,0 +1,235 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .srgssr import SRGSSRIE +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + unescapeHTML, + urljoin, +) + + +class RTSIE(SRGSSRIE): + IE_DESC = 'RTS.ch' + _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' + + _TESTS = [ + { + 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', + 'md5': '753b877968ad8afaeddccc374d4256a5', + 'info_dict': { + 'id': '3449373', + 'display_id': 'les-enfants-terribles', + 'ext': 'mp4', + 'duration': 1488, + 'title': 'Les Enfants Terribles', + 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', + 'uploader': 'Divers', + 'upload_date': '19680921', + 'timestamp': -40280400, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': 
int, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, + { + 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', + 'info_dict': { + 'id': '5624065', + 'title': 'Passe-moi les jumelles', + }, + 'playlist_mincount': 4, + }, + { + 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', + 'info_dict': { + 'id': '5745975', + 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', + 'ext': 'mp4', + 'duration': 48, + 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', + 'description': 'Hockey - Playoff', + 'uploader': 'Hockey', + 'upload_date': '20140403', + 'timestamp': 1396556882, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + 'skip': 'Blocked outside Switzerland', + }, + { + 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', + 'info_dict': { + 'id': '5745356', + 'display_id': 'londres-cachee-par-un-epais-smog', + 'ext': 'mp4', + 'duration': 33, + 'title': 'Londres cachée par un épais smog', + 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', + 'uploader': 'L\'actu en vidéo', + 'upload_date': '20140403', + 'timestamp': 1396537322, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, + { + 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', + 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae', + 'info_dict': { + 'id': '5706148', + 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014', + 'ext': 'mp3', + 'duration': 123, + 'title': '"Urban Hippie", de Damien Krisl', + 'description': 'Des Hippies super glam.', + 'upload_date': '20140403', + 'timestamp': 1396551600, + }, + }, + { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + m = self._match_valid_url(url) + media_id = m.group('rts_id') or m.group('id') + display_id = m.group('display_id') or media_id + + def download_json(internal_id): + return self._download_json( + 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, + display_id) + + all_info = download_json(media_id) + + # media_id extracted out of URL is not always a real id + if 'video' not in all_info and 'audio' not in all_info: + entries = [] + + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: + return self.url_result(urlh.geturl(), 'RTS') + + # article with 
videos on rhs + videos = re.findall( + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', + page) + if not videos: + videos = re.findall( + r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) + + internal_id = self._html_search_regex( + r'<(?:video|audio) data-id="([0-9]+)"', page, + 'internal video id') + all_info = download_json(internal_id) + + media_type = 'video' if 'video' in all_info else 'audio' + + # check for errors + self._get_media_data('rts', media_type, media_id) + + info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] + + title = info['title'] + + def extract_bitrate(url): + return int_or_none(self._search_regex( + r'-([0-9]+)k\.', url, 'bitrate', default=None)) + + formats = [] + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 'hds_sd' and 'hds' in streams: + continue + if format_id == 'hls_sd' and 'hls' in streams: + continue + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'tbr': extract_bitrate(format_url), + }) + + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': urljoin(download_base, media_url), + 'tbr': rate or extract_bitrate(media_url), + }) + + self._check_formats(formats, media_id) + self._sort_formats(formats) + + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + + return { + 'id': media_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': info.get('intro'), + 'duration': duration, + 'view_count': int_or_none(info.get('plays')), + 'uploader': info.get('programName'), + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), + } diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py new file mode 100644 index 000000000..59832eeac --- /dev/null +++ b/yt_dlp/extractor/rtve.py @@ -0,0 +1,267 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import io +import sys + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_struct_unpack, +) +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + qualities, + remove_end, + remove_start, + std_headers, +) + +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) + + +class RTVEALaCartaIE(InfoExtractor): + IE_NAME = 'rtve.es:alacarta' + IE_DESC = 
'RTVE a la carta' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'info_dict': { + 'id': '2491869', + 'ext': 'mp4', + 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', + 'duration': 5024.566, + 'series': 'Balonmano', + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], + }, { + 'note': 'Live stream', + 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', + 'info_dict': { + 'id': '1694255', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', + }, + }, { + 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'info_dict': { + 'id': '4236788', + 'ext': 'mp4', + 'title': 'Servir y proteger - Capítulo 104', + 'duration': 3222.0, + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], + }, { + 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', + 'only_matching': True, + }, { + 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', + 'only_matching': True, + }] + + def _real_initialize(self): + user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') + self._manager = self._download_json( + 'http://www.rtve.es/odin/loki/' + user_agent_b64, + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + info = 
self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + if info['state'] == 'DESPU': + raise ExtractorError('The video is no longer available', expected=True) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) + + subtitles = None + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnail': info.get('image'), + 'subtitles': subtitles, + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), + } + + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + + +class RTVEInfantilIE(RTVEALaCartaIE): + IE_NAME = 'rtve.es:infantil' + IE_DESC = 'RTVE infantil' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/' + + _TESTS = [{ + 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', + 'info_dict': { + 'id': '3040283', + 'ext': 'mp4', + 'title': 'Maneras de vivir', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', + 'duration': 357.958, + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], + }] + + +class RTVELiveIE(RTVEALaCartaIE): + IE_NAME = 'rtve.es:live' + IE_DESC = 'RTVE.es live streams' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/directo/la-1/', + 'info_dict': { + 'id': 'la-1', + 'ext': 'mp4', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + 'skip_download': 'live stream', + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') + title = remove_start(title, 'Estoy viendo ') + + vidplayer_id = self._search_regex( + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') + + return { + 'id': video_id, + 'title': self._live_title(title), + 'formats': self._extract_png_formats(vidplayer_id), + 'is_live': True, + } + + +class RTVETelevisionIE(InfoExtractor): + IE_NAME = 'rtve.es:television' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + + _TEST = { + 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, 
ie=RTVEALaCartaIE.ie_key()) diff --git a/youtube_dl/extractor/rtvnh.py b/yt_dlp/extractor/rtvnh.py index 6a00f7007..6a00f7007 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/yt_dlp/extractor/rtvnh.py diff --git a/youtube_dl/extractor/rtvs.py b/yt_dlp/extractor/rtvs.py index 6573b260d..6573b260d 100644 --- a/youtube_dl/extractor/rtvs.py +++ b/yt_dlp/extractor/rtvs.py diff --git a/youtube_dl/extractor/ruhd.py b/yt_dlp/extractor/ruhd.py index 3c8053a26..3c8053a26 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/yt_dlp/extractor/ruhd.py diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py new file mode 100644 index 000000000..49c1f4485 --- /dev/null +++ b/yt_dlp/extractor/rumble.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..compat import compat_str, compat_HTTPError +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, + ExtractorError, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL, + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } + + +class RumbleChannelIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))' + + _TESTS = [{ + 'url': 'https://rumble.com/c/Styxhexenhammer666', + 'playlist_mincount': 1160, + 'info_dict': { + 'id': 'Styxhexenhammer666', + }, + }, { + 'url': 'https://rumble.com/user/goldenpoodleharleyeuna', + 'playlist_count': 4, + 'info_dict': { + 'id': 'goldenpoodleharleyeuna', + }, + }] + + def entries(self, url, playlist_id): + for page in itertools.count(1): + try: + webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + raise + for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage): + yield 
self.url_result('https://rumble.com' + video_url) + + def _real_extract(self, url): + url, playlist_id = self._match_valid_url(url).groups() + return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id) diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py new file mode 100644 index 000000000..d027412c4 --- /dev/null +++ b/yt_dlp/extractor/rutube.py @@ -0,0 +1,314 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + determine_ext, + bool_or_none, + int_or_none, + parse_qs, + try_get, + unified_timestamp, + url_or_none, +) + + +class RutubeBaseIE(InfoExtractor): + def _download_api_info(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/video/%s/' % video_id, + video_id, 'Downloading video JSON', + 'Unable to download video JSON', query=query) + + @staticmethod + def _extract_info(video, video_id=None, require_title=True): + title = video['title'] if require_title else video.get('title') + + age_limit = video.get('is_adult') + if age_limit is not None: + age_limit = 18 if age_limit is True else 0 + + uploader_id = try_get(video, lambda x: x['author']['id']) + category = try_get(video, lambda x: x['category']['name']) + + return { + 'id': video.get('id') or video_id if video_id else video['id'], + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video.get('duration')), + 'uploader': try_get(video, lambda x: x['author']['name']), + 'uploader_id': compat_str(uploader_id) if uploader_id else None, + 'timestamp': unified_timestamp(video.get('created_ts')), + 'category': [category] if category else None, + 'age_limit': age_limit, + 'view_count': int_or_none(video.get('hits')), + 'comment_count': int_or_none(video.get('comments_count')), + 'is_live': bool_or_none(video.get('is_livestream')), + } + + def _download_and_extract_info(self, video_id, query=None): + return self._extract_info( + self._download_api_info(video_id, query=query), video_id) + + def _download_api_options(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/play/options/%s/' % video_id, + video_id, 'Downloading options JSON', + 'Unable to download options JSON', + headers=self.geo_verification_headers(), query=query) + + def _extract_formats(self, options, video_id): + formats = [] + for format_id, format_url in options['video_balancer'].items(): + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + return formats + + def _download_and_extract_formats(self, video_id, query=None): + return self._extract_formats( + self._download_api_options(video_id, query=query), video_id) + + +class RutubeIE(RutubeBaseIE): + IE_NAME = 'rutube' + IE_DESC = 'Rutube videos' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '1d24f180fac7a02f3900712e5a5764d6', + 
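RutubeBaseIE above wraps two JSON endpoints, one for metadata and one for the stream balancer. A bare-bones sketch of those two calls with plain urllib; the URLs and field names are as the extractor reads them, the helper itself is illustrative.

import json
import urllib.request

def rutube_api(video_id):
    def get(url):
        with urllib.request.urlopen(url) as resp:
            return json.loads(resp.read().decode())
    info = get('http://rutube.ru/api/video/%s/?format=json' % video_id)
    options = get('http://rutube.ru/api/play/options/%s/?format=json' % video_id)
    # options['video_balancer'] maps format ids (e.g. 'm3u8') to stream URLs,
    # which _extract_formats() above turns into yt-dlp format dicts
    return info.get('title'), options.get('video_balancer')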
'info_dict': { + 'id': '3eac3b4561676c17df9132a9a1e62e3e', + 'ext': 'mp4', + 'title': 'Раненный кенгуру забежал в аптеку', + 'description': 'http://www.ntdtv.ru ', + 'duration': 81, + 'uploader': 'NTDRussian', + 'uploader_id': '29790', + 'timestamp': 1381943602, + 'upload_date': '20131016', + 'age_limit': 0, + }, + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) + + @staticmethod + def _extract_urls(webpage): + return [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_and_extract_info(video_id) + info['formats'] = self._download_and_extract_formats(video_id) + return info + + +class RutubeEmbedIE(RutubeBaseIE): + IE_NAME = 'rutube:embed' + IE_DESC = 'Rutube embedded videos' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'info_dict': { + 'id': 'a10e53b86e8f349080f718582ce4c661', + 'ext': 'mp4', + 'timestamp': 1387830582, + 'upload_date': '20131223', + 'uploader_id': '297833', + 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', + 'uploader': 'subziro89 ILya', + 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://rutube.ru/play/embed/8083783', + 'only_matching': True, + }, { + # private video + 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', + 'only_matching': True, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + # Query may contain private videos token and should be passed to API + # requests (see #19163) + query = parse_qs(url) + options = self._download_api_options(embed_id, query) + video_id = options['effective_video'] + formats = self._extract_formats(options, video_id) + info = self._download_and_extract_info(video_id, query) + info.update({ + 'extractor_key': 'Rutube', + 'formats': formats, + }) + return info + + +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not isinstance(results, list): + break + + for result in results: + video_url = url_or_none(result.get('video_url')) + if not video_url: + continue + entry = self._extract_info(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry + + next_page_url = 
page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( + self._entries(playlist_id, *args, **kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channels' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/tags/video/1800/', + 'info_dict': { + 'id': '1800', + }, + 'playlist_mincount': 68, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + + +class RutubeMovieIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:movie' + IE_DESC = 'Rutube movies' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' + _TESTS = [] + + _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' + _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + + def _real_extract(self, url): + movie_id = self._match_id(url) + movie = self._download_json( + self._MOVIE_TEMPLATE % movie_id, movie_id, + 'Downloading movie JSON') + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) + + +class RutubePersonIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:person' + IE_DESC = 'Rutube person videos' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/video/person/313878/', + 'info_dict': { + 'id': '313878', + }, + 'playlist_mincount': 37, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'info_dict': { + 'id': '3097', + }, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + + @classmethod + def suitable(cls, url): + from ..utils import int_or_none, parse_qs + + if not super(RutubePlaylistIE, cls).suitable(url): + return False + params = parse_qs(url) + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + + def _real_extract(self, url): + qs = parse_qs(url) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py new file mode 100644 index 000000000..7e0de994a --- /dev/null +++ b/yt_dlp/extractor/rutv.py @@ -0,0 +1,211 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none +) + + +class RUTVIE(InfoExtractor): + IE_DESC = 'RUTV.RU' + _VALID_URL = r'''(?x) + https?:// + (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ + (?P<path> + flash\d+v/container\.swf\?id=| + iframe/(?P<type>swf|video|live)/id/| + index/iframe/cast_id/ + ) + (?P<id>\d+) + ''' + + _TESTS = [ + { + 'url': 
'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724', + 'info_dict': { + 'id': '774471', + 'ext': 'mp4', + 'title': 'Монологи на все времена', + 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5', + 'duration': 2906, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638', + 'info_dict': { + 'id': '774016', + 'ext': 'mp4', + 'title': 'Чужой в семье Сталина', + 'description': '', + 'duration': 2539, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000', + 'info_dict': { + 'id': '766888', + 'ext': 'mp4', + 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', + 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', + 'duration': 279, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169', + 'info_dict': { + 'id': '771852', + 'ext': 'mp4', + 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет', + 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8', + 'duration': 3096, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014', + 'info_dict': { + 'id': '51499', + 'ext': 'flv', + 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ', + 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', + }, + 'skip': 'Translation has finished', + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/', + 'info_dict': { + 'id': '21', + 'ext': 'mp4', + 'title': 're:^Россия 24. 
Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', + 'only_matching': True, + }, + ] + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + if mobj: + return mobj.group('url') + + mobj = re.search( + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + video_path = mobj.group('path') + + if re.match(r'flash\d+v', video_path): + video_type = 'video' + elif video_path.startswith('iframe'): + video_type = mobj.group('type') + if video_type == 'swf': + video_type = 'video' + elif video_path.startswith('index/iframe/cast_id'): + video_type = 'live' + + is_live = video_type == 'live' + + json_data = self._download_json( + 'http://player.vgtrk.com/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), + video_id, 'Downloading JSON') + + if json_data['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True) + + playlist = json_data['data']['playlist'] + medialist = playlist['medialist'] + media = medialist[0] + + if media['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True) + + view_count = playlist.get('count_views') + priority_transport = playlist['priority_transport'] + + thumbnail = media['picture'] + width = int_or_none(media['width']) + height = int_or_none(media['height']) + description = media['anons'] + title = media['title'] + duration = int_or_none(media.get('duration')) + + formats = [] + + for transport, links in media['sources'].items(): + for quality, url in links.items(): + preference = -1 if priority_transport == transport else -2 + if transport == 'rtmp': + mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url) + if not mobj: + continue + fmt = { + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': 'http://player.rutv.ru', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', + 'rtmp_live': True, + 'ext': 'flv', + 'vbr': int(quality), + 'quality': preference, + } + elif transport == 'm3u8': + formats.extend(self._extract_m3u8_formats( + url, video_id, 'mp4', quality=preference, m3u8_id='hls')) + continue + else: + fmt = { + 'url': url + } + fmt.update({ + 'width': width, + 'height': height, + 'format_id': '%s-%s' % (transport, quality), + }) + formats.append(fmt) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'duration': duration, + 'formats': formats, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py new file mode 100644 index 000000000..d9cf39d71 --- /dev/null +++ b/yt_dlp/extractor/ruutu.py @@ -0,0 +1,227 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + 
determine_ext, + ExtractorError, + find_xpath_attr, + int_or_none, + unified_strdate, + url_or_none, + xpath_attr, + xpath_text, +) + + +class RuutuIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| + static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= + ) + (?P<id>\d+) + ''' + _TESTS = [ + { + 'url': 'http://www.ruutu.fi/video/2058907', + 'md5': 'ab2093f39be1ca8581963451b3c0234f', + 'info_dict': { + 'id': '2058907', + 'ext': 'mp4', + 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', + 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 114, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.ruutu.fi/video/2057306', + 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', + 'info_dict': { + 'id': '2057306', + 'ext': 'mp4', + 'title': 'Superpesis: katso koko kausi Ruudussa', + 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 40, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.supla.fi/supla/2231370', + 'md5': 'df14e782d49a2c0df03d3be2a54ef949', + 'info_dict': { + 'id': '2231370', + 'ext': 'mp4', + 'title': 'Osa 1: Mikael Jungner', + 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + }, + # Episode where <SourceFile> is "NOT-USED", but has other + # downloadable sources available. + { + 'url': 'http://www.ruutu.fi/video/3193728', + 'only_matching': True, + }, + { + # audio podcast + 'url': 'https://www.supla.fi/supla/3382410', + 'md5': 'b9d7155fed37b2ebf6021d74c4b8e908', + 'info_dict': { + 'id': '3382410', + 'ext': 'mp3', + 'title': 'Mikä ihmeen poltergeist?', + 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + 'expected_warnings': [ + 'HTTP Error 502: Bad Gateway', + 'Failed to download m3u8 information', + ], + }, + { + 'url': 'http://www.supla.fi/audio/2231370', + 'only_matching': True, + }, + { + 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', + 'only_matching': True, + }, + { + # episode + 'url': 'https://www.ruutu.fi/video/3401964', + 'info_dict': { + 'id': '3401964', + 'ext': 'mp4', + 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17', + 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2582, + 'age_limit': 12, + 'upload_date': '20190508', + 'series': 'Temptation Island Suomi', + 'season_number': 5, + 'episode_number': 17, + 'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'], + }, + 'params': { + 'skip_download': True, + }, + }, + { + # premium + 'url': 'https://www.ruutu.fi/video/3618715', + 'only_matching': True, + }, + ] + _API_BASE = 'https://gatling.nelonenmedia.fi' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_xml = self._download_xml( + '%s/media-xml-cache' % self._API_BASE, video_id, + query={'id': video_id}) + + formats = [] + processed_urls = [] + + def extract_formats(node): + for child in node: + if child.tag.endswith('Files'): + extract_formats(child) + elif child.tag.endswith('File'): + video_url = child.text + if (not video_url or video_url in processed_urls + or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): + continue + processed_urls.append(video_url) + ext = determine_ext(video_url) + auth_video_url = 
url_or_none(self._download_webpage(
+                        '%s/auth/access/v2' % self._API_BASE, video_id,
+                        note='Downloading authenticated %s stream URL' % ext,
+                        fatal=False, query={'stream': video_url}))
+                    # the auth endpoint exchanges the raw stream URL for a tokenised
+                    # one; fall back to the plain URL if the call fails
+                    if auth_video_url:
+                        processed_urls.append(auth_video_url)
+                        video_url = auth_video_url
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4',
+                            entry_protocol='m3u8_native', m3u8_id='hls',
+                            fatal=False))
+                    elif ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            video_url, video_id, f4m_id='hds', fatal=False))
+                    elif ext == 'mpd':
+                        # DASH extraction is deliberately disabled: the video-only and
+                        # audio-only streams are of different duration, resulting in
+                        # out-of-sync playback
+                        # formats.extend(self._extract_mpd_formats(
+                        #     video_url, video_id, mpd_id='dash', fatal=False))
+                        continue
+                    elif ext == 'mp3' or child.tag == 'AudioMediaFile':
+                        formats.append({
+                            'format_id': 'audio',
+                            'url': video_url,
+                            'vcodec': 'none',
+                        })
+                    else:
+                        proto = compat_urllib_parse_urlparse(video_url).scheme
+                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
+                            continue
+                        preference = -1 if proto == 'rtmp' else 1
+                        label = child.get('label')
+                        tbr = int_or_none(child.get('bitrate'))
+                        format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto
+                        if not self._is_valid_url(video_url, video_id, format_id):
+                            continue
+                        width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
+                        formats.append({
+                            'format_id': format_id,
+                            'url': video_url,
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'preference': preference,
+                        })
+
+        extract_formats(video_xml.find('./Clip'))
+
+        def pv(name):
+            node = find_xpath_attr(
+                video_xml, './Clip/PassthroughVariables/variable', 'name', name)
+            if node is not None:
+                return node.get('value')
+
+        if not formats:
+            if (not self.get_param('allow_unplayable_formats')
+                    and xpath_text(video_xml, './Clip/DRM', default=None)):
+                self.report_drm(video_id)
+            ns_st_cds = pv('ns_st_cds')
+            if ns_st_cds != 'free':
+                raise ExtractorError('This video is %s.'
% ns_st_cds, expected=True) + + self._sort_formats(formats) + + themes = pv('themes') + + return { + 'id': video_id, + 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), + 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), + 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')), + 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'upload_date': unified_strdate(pv('date_start')), + 'series': pv('series_name'), + 'season_number': int_or_none(pv('season_number')), + 'episode_number': int_or_none(pv('episode_number')), + 'categories': themes.split(',') if themes else [], + 'formats': formats, + } diff --git a/youtube_dl/extractor/ruv.py b/yt_dlp/extractor/ruv.py index 8f3cc4095..8f3cc4095 100644 --- a/youtube_dl/extractor/ruv.py +++ b/yt_dlp/extractor/ruv.py diff --git a/yt_dlp/extractor/safari.py b/yt_dlp/extractor/safari.py new file mode 100644 index 000000000..cca4464ca --- /dev/null +++ b/yt_dlp/extractor/safari.py @@ -0,0 +1,269 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + update_url_query, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://learning.oreilly.com/api/v1' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + _, urlh = self._download_webpage_handle( + 'https://learning.oreilly.com/accounts/login-check/', None, + 'Downloading login page') + + def is_logged(urlh): + return 'learning.oreilly.com/home/' in urlh.geturl() + + if is_logged(urlh): + self.LOGGED_IN = True + return + + redirect_url = urlh.geturl() + parsed_url = compat_urlparse.urlparse(redirect_url) + qs = compat_parse_qs(parsed_url.query) + next_uri = compat_urlparse.urljoin( + 'https://api.oreilly.com', qs['next'][0]) + + auth, urlh = self._download_json_handle( + 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', + data=json.dumps({ + 'email': username, + 'password': password, + 'redirect_uri': next_uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': redirect_url, + }, expected_status=400) + + credentials = auth.get('credentials') + if (not auth.get('logged_in') and not auth.get('redirect_uri') + and credentials): + raise ExtractorError( + 'Unable to login: %s' % credentials, expected=True) + + # oreilly serves two same instances of the following cookies + # in Set-Cookie header and expects first one to be actually set + for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): + self._apply_first_set_cookie_header(urlh, cookie) + + _, urlh = self._download_webpage_handle( + auth.get('redirect_uri') or next_uri, None, 'Completing login',) + + if is_logged(urlh): + self.LOGGED_IN = True + return + + raise ExtractorError('Unable to log in') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + 
library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+) + ) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': 'dcc5a425e79f2564148652616af1f2a3', + 'info_dict': { + 'id': '0_qbqx90ic', + 'ext': 'mp4', + 'title': 'Introduction to Hadoop Fundamentals LiveLessons', + 'timestamp': 1437758058, + 'upload_date': '20150724', + 'uploader_id': 'stork', + }, + }, { + # non-digits in course id + 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', + 'only_matching': True, + }] + + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') + + query = { + 'wid': '_%s' % partner_id, + 'uiconf_id': ui_id, + 'flashvars[referenceId]': reference_id, + } + + if self.LOGGED_IN: + kaltura_session = self._download_json( + '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), + video_id, 'Downloading kaltura session JSON', + 'Unable to download kaltura session JSON', fatal=False, + headers={'Accept': 'application/json'}) + if kaltura_session: + session = kaltura_session.get('session') + if session: + query['flashvars[ks]'] = session + + return self.url_result(update_url_query( + 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), + 'Kaltura') + + +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + part = self._download_json( + url, '%s/%s' % 
(mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + web_url = part['web_url'] + if 'library/view' in web_url: + web_url = web_url.replace('library/view', 'videos') + natural_keys = part['natural_key'] + web_url = f'{web_url.rsplit("/", 1)[0]}/{natural_keys[0]}-{natural_keys[1][:-5]}' + return self.url_result(web_url, SafariIE.ie_key()) + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| + techbus\.safaribooksonline\.com + ) + /(?P<id>[^/]+) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }, { + 'url': 'http://techbus.safaribooksonline.com/9780134426365', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, SafariApiIE.ie_key()) + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/yt_dlp/extractor/saitosan.py b/yt_dlp/extractor/saitosan.py new file mode 100644 index 000000000..621335ca0 --- /dev/null +++ b/yt_dlp/extractor/saitosan.py @@ -0,0 +1,78 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, try_get + + +class SaitosanIE(InfoExtractor): + IE_NAME = 'Saitosan' + _VALID_URL = r'https?://(?:www\.)?saitosan\.net/bview.html\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.saitosan.net/bview.html?id=10031846', + 'info_dict': { + 'id': '10031846', + 'ext': 'mp4', + 'title': '井下原 和弥', + 'uploader': '井下原 和弥', + 'thumbnail': 'http://111.171.196.85:8088/921f916f-7f55-4c97-b92e-5d9d0fef8f5f/thumb', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Broadcasts are ephemeral', + }, + { + 'url': 'http://www.saitosan.net/bview.html?id=10031795', + 'info_dict': { + 'id': '10031795', + 'ext': 'mp4', + 'title': '橋本', + 'uploader': '橋本', + 'thumbnail': 'http://111.171.196.85:8088/1a3933e1-a01a-483b-8931-af15f37f8082/thumb', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 
'Broadcasts are ephemeral', + }] + + def _real_extract(self, url): + b_id = self._match_id(url) + + base = 'http://hankachi.saitosan-api.net:8002/socket.io/?transport=polling&EIO=3' + sid = self._download_socket_json(base, b_id, note='Opening socket').get('sid') + base += '&sid=' + sid + + self._download_webpage(base, b_id, note='Polling socket') + payload = '420["room_start_join",{"room_id":"%s"}]' % b_id + payload = '%s:%s' % (len(payload), payload) + + self._download_webpage(base, b_id, data=payload, note='Polling socket with payload') + response = self._download_socket_json(base, b_id, note='Polling socket') + if not response.get('ok'): + err = response.get('error') or {} + raise ExtractorError( + '%s said: %s - %s' % (self.IE_NAME, err.get('code', '?'), err.get('msg', 'Unknown')) if err + else 'The socket reported that the broadcast could not be joined. Maybe it\'s offline or the URL is incorrect', + expected=True, video_id=b_id) + + self._download_webpage(base, b_id, data='26:421["room_finish_join",{}]', note='Polling socket') + b_data = self._download_socket_json(base, b_id, note='Getting broadcast metadata from socket') + m3u8_url = b_data.get('url') + + self._download_webpage(base, b_id, data='1:1', note='Closing socket', fatal=False) + + return { + 'id': b_id, + 'title': b_data.get('name'), + 'formats': self._extract_m3u8_formats(m3u8_url, b_id, 'mp4', live=True), + 'thumbnail': m3u8_url.replace('av.m3u8', 'thumb'), + 'uploader': try_get(b_data, lambda x: x['broadcast_user']['name']), # same as title + 'is_live': True + } diff --git a/youtube_dl/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 806c3c354..806c3c354 100644 --- a/youtube_dl/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py diff --git a/yt_dlp/extractor/sapo.py b/yt_dlp/extractor/sapo.py new file mode 100644 index 000000000..df202a3a4 --- /dev/null +++ b/yt_dlp/extractor/sapo.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class SapoIE(InfoExtractor): + IE_DESC = 'SAPO Vídeos' + _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})' + + _TESTS = [ + { + 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', + 'md5': '79ee523f6ecb9233ac25075dee0eda83', + 'note': 'SD video', + 'info_dict': { + 'id': 'UBz95kOtiWYUMTA5Ghfi', + 'ext': 'mp4', + 'title': 'Benfica - Marcas na Hitória', + 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', + 'duration': 264, + 'uploader': 'tiago_1988', + 'upload_date': '20080229', + 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], + }, + }, + { + 'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', + 'md5': '90a2f283cfb49193fe06e861613a72aa', + 'note': 'HD video', + 'info_dict': { + 'id': 'IyusNAZ791ZdoCY5H5IF', + 'ext': 'mp4', + 'title': 'Codebits VII - Report', + 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', + 'duration': 144, + 'uploader': 'codebits', + 'upload_date': '20140427', + 'categories': ['codebits', 'codebits2014'], + }, + }, + { + 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', + 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', + 'note': 'v2 video', + 'info_dict': { + 'id': 'yLqjzPtbTimsn2wWBKHz', + 'ext': 'mp4', + 'title': 'Hipnose Condicionativa 4', + 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', + 'duration': 692, + 'uploader': 'sapozen', + 'upload_date': '20090609', 
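+                # categories below are produced by splitting the RSS <tags> element
+                # on whitespace (see _real_extract)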
+ 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], + }, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + item = self._download_xml( + 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + + title = item.find('./title').text + description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text + thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') + duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) + uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text + upload_date = unified_strdate(item.find('./pubDate').text) + view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) + comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) + tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text + categories = tags.split() if tags else [] + age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + + video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text + video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'sd', + 'width': int(video_size[0]), + 'height': int(video_size[1]), + }] + + if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': + formats.append({ + 'url': re.sub(r'/mov/1$', '/mov/39', video_url), + 'ext': 'mp4', + 'format_id': 'hd', + 'width': 1280, + 'height': 720, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/yt_dlp/extractor/savefrom.py b/yt_dlp/extractor/savefrom.py new file mode 100644 index 000000000..98efdc2a4 --- /dev/null +++ b/yt_dlp/extractor/savefrom.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path + +from .common import InfoExtractor + + +class SaveFromIE(InfoExtractor): + IE_NAME = 'savefrom.net' + _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$' + + _TEST = { + 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', + 'info_dict': { + 'id': 'UlVRAPW2WJY', + 'ext': 'mp4', + 'title': 'About Team Radical MMA | MMA Fighting', + 'upload_date': '20120816', + 'uploader': 'Howcast', + 'uploader_id': 'Howcast', + 'description': r're:(?s).* Hi, my name is Rene Dreifuss\. 
And I\'m here to show you some MMA.*', + }, + 'params': { + 'skip_download': True + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = os.path.splitext(url.split('/')[-1])[0] + + return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/youtube_dl/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 0a806ee4e..0a806ee4e 100644 --- a/youtube_dl/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py diff --git a/youtube_dl/extractor/screencast.py b/yt_dlp/extractor/screencast.py index 69a0d01f3..69a0d01f3 100644 --- a/youtube_dl/extractor/screencast.py +++ b/yt_dlp/extractor/screencast.py diff --git a/youtube_dl/extractor/screencastomatic.py b/yt_dlp/extractor/screencastomatic.py index 0afdc1715..0afdc1715 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/yt_dlp/extractor/screencastomatic.py diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py new file mode 100644 index 000000000..84918b67f --- /dev/null +++ b/yt_dlp/extractor/scrippsnetworks.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import hashlib + +from .aws import AWSIE +from .anvato import AnvatoIE +from .common import InfoExtractor +from ..utils import ( + smuggle_url, + urlencode_postdata, + xpath_text, +) + + +class ScrippsNetworksWatchIE(AWSIE): + IE_NAME = 'scrippsnetworks:watch' + _VALID_URL = r'''(?x) + https?:// + watch\. + (?P<site>geniuskitchen)\.com/ + (?: + player\.[A-Z0-9]+\.html\#| + show/(?:[^/]+/){2}| + player/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', + 'info_dict': { + 'id': '4194875', + 'ext': 'mp4', + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', + 'uploader': 'ANV', + 'upload_date': '20171011', + 'timestamp': 1507698000, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [AnvatoIE.ie_key()], + }] + + _SNI_TABLE = { + 'geniuskitchen': 'genius', + } + + _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' + + _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + site_id, video_id = mobj.group('site', 'id') + + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') + token = self._download_json( + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, + headers={ + 'Accept': '*/*', + 'Content-Type': 'application/x-amz-json-1.1', + 'Referer': url, + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), + 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + })['Token'] + + sts = self._download_xml( + 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ + 'Action': 'AssumeRoleWithWebIdentity', + 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', + 'RoleSessionName': 'web-identity', + 'Version': '2011-06-15', + 'WebIdentityToken': token, + }), headers={ + 'Referer': url, + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) + + def get(key): + return xpath_text( + sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, + fatal=True) + + mcp_id = self._aws_execute_api({ + 'uri': 
'/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] + + return self.url_result( + smuggle_url( + 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, + {'geo_countries': ['US']}), + AnvatoIE.ie_key(), video_id=mcp_id) + + +class ScrippsNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', + 'info_dict': { + 'id': '0260338', + 'ext': 'mp4', + 'title': 'The Best of the Best', + 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', + 'timestamp': 1475678834, + 'upload_date': '20161005', + 'uploader': 'SCNI-SCND', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', + 'only_matching': True, + }, { + 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', + 'only_matching': True, + }, { + 'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', + 'only_matching': True, + }, { + 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', + 'only_matching': True, + }, { + 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', + 'only_matching': True, + }] + _ACCOUNT_MAP = { + 'cookingchanneltv': 2433005105, + 'discovery': 2706091867, + 'diynetwork': 2433004575, + 'foodnetwork': 2433005105, + 'hgtv': 2433004575, + 'travelchannel': 2433005739, + } + _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' + + def _real_extract(self, url): + site, guid = self._match_valid_url(url).groups() + return self.url_result(smuggle_url( + self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), + {'force_smil_url': True}), 'ThePlatform', guid) diff --git a/youtube_dl/extractor/scte.py b/yt_dlp/extractor/scte.py index ca1de63b6..ca1de63b6 100644 --- a/youtube_dl/extractor/scte.py +++ b/yt_dlp/extractor/scte.py diff --git a/yt_dlp/extractor/seeker.py b/yt_dlp/extractor/seeker.py new file mode 100644 index 000000000..e5c18c7a5 --- /dev/null +++ b/yt_dlp/extractor/seeker.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + get_element_by_class, + strip_or_none, +) + + +class SeekerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html' + _TESTS = [{ + 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', + 'md5': '897d44bbe0d8986a2ead96de565a92db', + 'info_dict': { + 'id': 'Elrn3gnY', + 'ext': 'mp4', + 'title': 'Should Trump Be Required To Release His Tax Returns?', + 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', + 'timestamp': 1490090165, + 'upload_date': '20170321', + } + }, { + 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', + 'playlist': [ + { + 'md5': '0497b9f20495174be73ae136949707d2', + 'info_dict': { + 'id': 'FihYQ8AE', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', + 'timestamp': 1490039133, + 'upload_date': '20170320', + }, + } + ], + 'info_dict': 
{ + 'id': '1834116536', + 'title': 'After Gorilla Killing, Changes Ahead for Zoos', + 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', + }, + }] + + def _real_extract(self, url): + display_id, article_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + entries = [] + for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): + entries.append(self.url_result( + 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) + return self.playlist_result( + entries, article_id, + self._og_search_title(webpage), + strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage)) diff --git a/yt_dlp/extractor/senateisvp.py b/yt_dlp/extractor/senateisvp.py new file mode 100644 index 000000000..8794d47ef --- /dev/null +++ b/yt_dlp/extractor/senateisvp.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unsmuggle_url, +) +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class SenateISVPIE(InfoExtractor): + _COMM_MAP = [ + ['ag', '76440', 'http://ag-f.akamaihd.net'], + ['aging', '76442', 'http://aging-f.akamaihd.net'], + ['approps', '76441', 'http://approps-f.akamaihd.net'], + ['armed', '76445', 'http://armed-f.akamaihd.net'], + ['banking', '76446', 'http://banking-f.akamaihd.net'], + ['budget', '76447', 'http://budget-f.akamaihd.net'], + ['cecc', '76486', 'http://srs-f.akamaihd.net'], + ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], + ['csce', '75229', 'http://srs-f.akamaihd.net'], + ['dpc', '76590', 'http://dpc-f.akamaihd.net'], + ['energy', '76448', 'http://energy-f.akamaihd.net'], + ['epw', '76478', 'http://epw-f.akamaihd.net'], + ['ethics', '76449', 'http://ethics-f.akamaihd.net'], + ['finance', '76450', 'http://finance-f.akamaihd.net'], + ['foreign', '76451', 'http://foreign-f.akamaihd.net'], + ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], + ['help', '76452', 'http://help-f.akamaihd.net'], + ['indian', '76455', 'http://indian-f.akamaihd.net'], + ['intel', '76456', 'http://intel-f.akamaihd.net'], + ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'], + ['jccic', '85180', 'http://jccic-f.akamaihd.net'], + ['jec', '76458', 'http://jec-f.akamaihd.net'], + ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'], + ['rpc', '76591', 'http://rpc-f.akamaihd.net'], + ['rules', '76460', 'http://rules-f.akamaihd.net'], + ['saa', '76489', 'http://srs-f.akamaihd.net'], + ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], + ['srs', '75229', 'http://srs-f.akamaihd.net'], + ['uscc', '76487', 'http://srs-f.akamaihd.net'], + ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], + ['arch', '', 'http://ussenate-f.akamaihd.net/'] + ] + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 
'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+        'info_dict': {
+            'id': 'commerce011514',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+        # checksum differs each time
+        'info_dict': {
+            'id': 'intel090613',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _search_iframe_url(webpage):
+        mobj = re.search(
+            r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _get_info_for_comm(self, committee):
+        for entry in self._COMM_MAP:
+            if entry[0] == committee:
+                return entry[1:]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        qs = compat_parse_qs(self._match_valid_url(url).group('qs'))
+        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        if smuggled_data.get('force_title'):
+            title = smuggled_data['force_title']
+        else:
+            title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+        poster = qs.get('poster')
+        thumbnail = poster[0] if poster else None
+
+        video_type = qs['type'][0]
+        committee = video_type if video_type == 'arch' else qs['comm'][0]
+        stream_num, domain = self._get_info_for_comm(committee)
+
+        formats = []
+        if video_type == 'arch':
+            filename = video_id if '.' in video_id else video_id + '.mp4'
+            formats = [{
+                # All parameters in the query string are necessary to prevent a 403 error
+                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
+            }]
+        else:
+            hdcore_sign = 'hdcore=3.1.0'
+            url_params = (domain, video_id, stream_num)
+            f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign
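+            # Illustrative expansion (values taken from _COMM_MAP and the judiciary
+            # test above; the hdcore parameter must ride along on the HDS manifest URL):
+            #   http://judiciary-f.akamaihd.net/z/judiciary031715_1@76459/manifest.f4m?hdcore=3.1.0
+            #   http://judiciary-f.akamaihd.net/i/judiciary031715_1@76459/master.m3u8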
+            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param induce a 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+                formats.append(entry)
+            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+                mobj = re.search(r'(?P<tag>-p|-b)\.m3u8', entry['url'])
+                if mobj:
+                    entry['format_id'] += mobj.group('tag')
+                formats.append(entry)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
diff --git a/yt_dlp/extractor/sendtonews.py b/yt_dlp/extractor/sendtonews.py
new file mode 100644
index 000000000..bc38a0f1e
--- /dev/null
+++ b/yt_dlp/extractor/sendtonews.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    parse_iso8601,
+    update_url_query,
+    int_or_none,
+    determine_protocol,
+    unescapeHTML,
+)
+
+
+class SendtoNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)'
+
+    _TEST = {
+        # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
+        'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES',
+        'info_dict': {
+            'id': 'GxfCe0Zo7D-175909-5588'
+        },
+        'playlist_count': 8,
+        # test the first video only to prevent lengthy tests
+        'playlist': [{
+            'info_dict': {
+                'id': '240385',
+                'ext': 'mp4',
+                'title': 'Indians introduce Encarnacion',
+                'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland',
+                'duration': 137.898,
+                'thumbnail': r're:https?://.*\.jpg$',
+                'upload_date': '20170105',
+                'timestamp': 1483649762,
+            },
+        }],
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s'
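+    # Used by the generic extractor to discover embedded SendtoNews players: find
+    # the responsive-embed <script> tag and rebuild the canonical player2 URL from
+    # its SC (playlist token) parameter, e.g. 'GxfCe0Zo7D-175909-5588' above.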
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
+            (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
+            .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
+            \1>''', webpage)
+        if mobj:
+            sc = mobj.group('SC')
+            return cls._URL_TEMPLATE % sc
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        data_url = update_url_query(
+            url.replace('embedplayer.php', 'data_read.php'),
+            {'cmd': 'loadInitial'})
+        playlist_data = self._download_json(data_url, playlist_id)
+
+        entries = []
+        for video in playlist_data['playlistData'][0]:
+            info_dict = self._parse_jwplayer_data(
+                video['jwconfiguration'],
+                require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True})
+
+            for f in info_dict['formats']:
+                if f.get('tbr'):
+                    continue
+                tbr = int_or_none(self._search_regex(
+                    r'/(\d+)k/', f['url'], 'bitrate', default=None))
+                if not tbr:
+                    continue
+                f.update({
+                    'format_id': '%s-%d' % (determine_protocol(f), tbr),
+                    'tbr': tbr,
+                })
+            # 'tbr' was explicitly set to be preferred over 'height' originally,
+            # so this is being kept unless someone can confirm it is unnecessary
+            self._sort_formats(info_dict['formats'], ('tbr', 'res'))
+
+            thumbnails = []
+            if video.get('thumbnailUrl'):
+                thumbnails.append({
+                    'id': 'normal',
+                    'url': video['thumbnailUrl'],
+                })
+            if video.get('smThumbnailUrl'):
+                thumbnails.append({
+                    'id': 'small',
+                    'url': video['smThumbnailUrl'],
+                })
+            info_dict.update({
+                'title': video['S_headLine'].strip(),
+                'description': unescapeHTML(video.get('S_fullStory')),
+                'thumbnails': thumbnails,
+                'duration': float_or_none(video.get('SM_length')),
+                'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '),
+            })
+            entries.append(info_dict)
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/servus.py b/yt_dlp/extractor/servus.py
index 1610ddc2c..1610ddc2c 100644
--- a/youtube_dl/extractor/servus.py
+++ b/yt_dlp/extractor/servus.py
diff --git a/yt_dlp/extractor/sevenplus.py b/yt_dlp/extractor/sevenplus.py
new file mode 100644
index 000000000..210c44ab2
--- /dev/null
+++ b/yt_dlp/extractor/sevenplus.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .brightcove import BrightcoveNewIE
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    try_get,
+    update_url_query,
+)
+
+
+class SevenPlusIE(BrightcoveNewIE):
+    IE_NAME = '7plus'
+    _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))'
+    _TESTS = [{
+        'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003',
+        'info_dict': {
+            'id': 'MTYS7-003',
+            'ext': 'mp4',
+            'title': 'S7 E3 - Wind Surf',
+            'description': 'md5:29c6a69f21accda7601278f81b46483d',
+            'uploader_id': '5303576322001',
+            'upload_date': '20171201',
+            'timestamp': 1512106377,
+            'series': 'Mighty Ships',
+            'season_number': 7,
+            'episode_number': 3,
+            'episode': 'Wind Surf',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
+        self.token = None
+
+        cookies = self._get_cookies('https://7plus.com.au')
+        api_key = next((x for x in cookies if x.startswith('glt_')), '')[4:]
+        if not api_key:  # Cookies are signed out, skip login
+            return
+
+        login_resp = self._download_json(
+            'https://login.7plus.com.au/accounts.getJWT', None, 'Logging in', fatal=False,
+            query={
+                'APIKey': api_key,
+                'sdk': 'js_latest',
+                'login_token': cookies[f'glt_{api_key}'].value,
+                'authMode': 'cookie',
+                'pageURL': 'https://7plus.com.au/',
+                'sdkBuild': '12471',
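+                # 'format': 'json' below asks the Gigya login endpoint for a plain
+                # JSON body, presumably to avoid its JSONP-style callback wrapper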
'format': 'json', + }) or {} + + if 'errorMessage' in login_resp: + self.report_warning(f'Unable to login: 7plus said: {login_resp["errorMessage"]}') + return + id_token = login_resp.get('id_token') + if not id_token: + self.report_warning('Unable to login: Could not extract id token') + return + + token_resp = self._download_json( + 'https://7plus.com.au/auth/token', None, 'Getting auth token', fatal=False, + headers={'Content-Type': 'application/json'}, data=json.dumps({ + 'idToken': id_token, + 'platformId': 'web', + 'regSource': '7plus', + }).encode('utf-8')) or {} + self.token = token_resp.get('token') + if not self.token: + self.report_warning('Unable to log in: Could not extract auth token') + + def _real_extract(self, url): + path, episode_id = self._match_valid_url(url).groups() + + headers = {} + if self.token: + headers['Authorization'] = f'Bearer {self.token}' + + try: + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + }, headers=headers)['media'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + raise + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + info['series'] = try_get( + item, lambda x: x['seriesLogo']['name'], compat_str) + mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) + if mobj: + info.update({ + 'season_number': int(mobj.group(1)), + 'episode_number': int(mobj.group(2)), + 'episode': mobj.group(3), + }) + + return info diff --git a/youtube_dl/extractor/sexu.py b/yt_dlp/extractor/sexu.py index 3df51520b..3df51520b 100644 --- a/youtube_dl/extractor/sexu.py +++ b/yt_dlp/extractor/sexu.py diff --git a/yt_dlp/extractor/seznamzpravy.py b/yt_dlp/extractor/seznamzpravy.py new file mode 100644 index 000000000..eef4975cb --- /dev/null +++ b/yt_dlp/extractor/seznamzpravy.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + urljoin, + int_or_none, + parse_codecs, + parse_qs, + try_get, +) + + +def _raw_id(src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 
'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'duration': 241, + 'series': 'Svět bez obalu', + }, + 'params': { + 'skip_download': True, + }, + }, { + # with Location key + 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5§ionPrefixPostroll=%2Fzpravy%2Fvyzva®ression=false', + 'info_dict': { + 'id': '185688', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'series': 'Výzva', + }, + 'params': { + 'skip_download': True, + }, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', + webpage)] + + def _extract_sdn_formats(self, sdn_url, video_id): + sdn_data = self._download_json(sdn_url, video_id) + + if sdn_data.get('Location'): + sdn_url = sdn_data['Location'] + sdn_data = self._download_json(sdn_url, video_id) + + formats = [] + mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} + for format_id, format_data in mp4_formats.items(): + relative_url = format_data.get('url') + if not relative_url: + continue + + try: + width, height = format_data.get('resolution') + except (TypeError, 
ValueError): + width, height = None, None + + f = { + 'url': urljoin(sdn_url, relative_url), + 'format_id': 'http-%s' % format_id, + 'tbr': int_or_none(format_data.get('bandwidth'), scale=1000), + 'width': int_or_none(width), + 'height': int_or_none(height), + } + f.update(parse_codecs(format_data.get('codec'))) + formats.append(f) + + pls = sdn_data.get('pls', {}) + + def get_url(format_id): + return try_get(pls, lambda x: x[format_id]['url'], compat_str) + + dash_rel_url = get_url('dash') + if dash_rel_url: + formats.extend(self._extract_mpd_formats( + urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', + fatal=False)) + + hls_rel_url = get_url('hls') + if hls_rel_url: + formats.extend(self._extract_m3u8_formats( + urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + params = parse_qs(url) + + src = params['src'][0] + title = params['title'][0] + video_id = params.get('contentId', [_raw_id(src)])[0] + formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id) + + duration = int_or_none(params.get('duration', [None])[0]) + series = params.get('series', [None])[0] + thumbnail = params.get('poster', [None])[0] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'series': series, + 'formats': formats, + } + + +class SeznamZpravyArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P<id>\d+)' + _API_URL = 'https://apizpravy.seznam.cz/' + + _TESTS = [{ + # two videos on one page, with SDN URL + 'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'info_dict': { + 'id': '35990', + 'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2', + 'description': 'md5:933f7b06fa337a814ba199d3596d27ba', + }, + 'playlist_count': 2, + }, { + # video with live stream URL + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'info_dict': { + 'id': '38489', + 'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60', + 'description': 'md5:428e7926a1a81986ec7eb23078004fb4', + }, + 'playlist_count': 1, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + info = self._search_json_ld(webpage, article_id, default={}) + + title = info.get('title') or self._og_search_title(webpage, fatal=False) + description = info.get('description') or self._og_search_description(webpage) + + return self.playlist_result([ + self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) + for entry_url in SeznamZpravyIE._extract_urls(webpage)], + article_id, title, description) diff --git a/yt_dlp/extractor/shahid.py b/yt_dlp/extractor/shahid.py new file mode 100644 index 000000000..42de41a11 --- /dev/null +++ b/yt_dlp/extractor/shahid.py @@ -0,0 +1,225 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import math +import re + +from .aws import AWSIE +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + InAdvancePagedList, + int_or_none, + parse_iso8601, + str_or_none, + urlencode_postdata, +) + + +class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/' + + def _handle_error(self, 
e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): + _NETRC_MACHINE = 'shahid' + _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', + 'info_dict': { + 'id': '816924', + 'ext': 'mp4', + 'title': 'متحف الدحيح الموسم 1 كليب 1', + 'timestamp': 1602806400, + 'upload_date': '20201016', + 'description': 'برومو', + 'duration': 22, + 'categories': ['كوميديا'], + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746', + 'only_matching': True + }, { + # shahid plus subscriber only + 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', + 'only_matching': True + }, { + 'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319', + 'only_matching': True + }] + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + self._download_webpage( + 'https://shahid.mbc.net/populateContext', + None, 'Populate Context', data=urlencode_postdata({ + 'firstName': user_data['firstName'], + 'lastName': user_data['lastName'], + 'userName': user_data['email'], + 'csg_user_name': user_data['email'], + 'subscriberId': user_data['id'], + 'sessionId': user_data['sessionId'], + })) + + def _real_extract(self, url): + page_type, video_id = self._match_valid_url(url).groups() + if page_type == 'clip': + page_type = 'episode' + + playout = self._call_api( + 'playout/new/url/' + video_id, video_id)['playout'] + + if not self.get_param('allow_unplayable_formats') and playout.get('drm'): + self.report_drm(video_id) + + formats = self._extract_m3u8_formats(re.sub( + # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html + r'aws\.manifestfilter=[\w:;,-]+&?', + '', playout['url']), video_id, 'mp4') + self._sort_formats(formats) + + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = 
self._download_json(
+            'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id),
+            video_id, 'Downloading video JSON', query={
+                'apiKey': 'sh@hid0nlin3',
+                'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+            })
+        data = response.get('data', {})
+        error = data.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
+                expected=True)
+
+        video = data[page_type]
+        title = video['title']
+        categories = [
+            category['name']
+            for category in video.get('genres', []) if 'name' in category]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('thumbnailUrl'),
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('referenceDate')),
+            'categories': categories,
+            'series': video.get('showTitle') or video.get('showName'),
+            'season': video.get('seasonTitle'),
+            'season_number': int_or_none(video.get('seasonNumber')),
+            'season_id': str_or_none(video.get('seasonId')),
+            'episode_number': int_or_none(video.get('number')),
+            'episode_id': video_id,
+            'formats': formats,
+        }
+
+
+class ShahidShowIE(ShahidBaseIE):
+    _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187',
+        'info_dict': {
+            'id': '79187',
+            'title': 'رامز قرش البحر',
+            'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff',
+        },
+        'playlist_mincount': 32,
+    }, {
+        'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861',
+        'only_matching': True
+    }]
+    _PAGE_SIZE = 30
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        product = self._call_api(
+            'playableAsset', show_id, {'showId': show_id})['productModel']
+        playlist = product['playlist']
+        playlist_id = playlist['id']
+        show = product.get('show', {})
+
+        def page_func(page_num):
+            playlist = self._call_api(
+                'product/playlist', show_id, {
+                    'playListId': playlist_id,
+                    'pageNumber': page_num,
+                    'pageSize': self._PAGE_SIZE,
+                    'sorts': [{
+                        'order': 'DESC',
+                        'type': 'SORTDATE'
+                    }],
+                })
+            for product in playlist.get('productList', {}).get('products', []):
+                # productUrl may be absent; guard before the nested .get()
+                product_url = (product.get('productUrl') or {}).get('url')
+                if not product_url:
+                    continue
+                yield self.url_result(
+                    product_url, 'Shahid',
+                    str_or_none(product.get('id')),
+                    product.get('title'))
+
+        entries = InAdvancePagedList(
+            page_func,
+            math.ceil(playlist['count'] / self._PAGE_SIZE),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, show_id, show.get('title'), show.get('description'))
diff --git a/youtube_dl/extractor/shared.py b/yt_dlp/extractor/shared.py
index 93ab2a167..93ab2a167 100644
--- a/youtube_dl/extractor/shared.py
+++ b/yt_dlp/extractor/shared.py
diff --git a/yt_dlp/extractor/shemaroome.py b/yt_dlp/extractor/shemaroome.py
new file mode 100644
index 000000000..142d5dc3a
--- /dev/null
+++ b/yt_dlp/extractor/shemaroome.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+    compat_b64decode,
+    compat_ord,
+)
+from ..utils import (
+    bytes_to_intlist,
+    ExtractorError,
+    intlist_to_bytes,
+    unified_strdate,
+)
+
+
+class ShemarooMeIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)(?:www\.)?shemaroome\.com/(?:movies|shows)/(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url':
'https://www.shemaroome.com/movies/dil-hai-tumhaara', + 'info_dict': { + 'id': 'dil-hai-tumhaara', + 'ext': 'mp4', + 'title': 'Dil Hai Tumhaara', + 'release_date': '20020906', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:2782c4127807103cf5a6ae2ca33645ce', + }, + 'params': { + 'skip_download': True + } + }, { + 'url': 'https://www.shemaroome.com/shows/jurm-aur-jazbaat/laalach', + 'info_dict': { + 'id': 'jurm-aur-jazbaat_laalach', + 'ext': 'mp4', + 'title': 'Laalach', + 'description': 'md5:92b79c2dcb539b0ab53f9fa5a048f53c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'release_date': '20210507', + }, + 'params': { + 'skip_download': True + }, + 'skip': 'Premium videos cannot be downloaded yet.' + }, { + 'url': 'https://www.shemaroome.com/shows/jai-jai-jai-bajrang-bali/jai-jai-jai-bajrang-bali-episode-99', + 'info_dict': { + 'id': 'jai-jai-jai-bajrang-bali_jai-jai-jai-bajrang-bali-episode-99', + 'ext': 'mp4', + 'title': 'Jai Jai Jai Bajrang Bali Episode 99', + 'description': 'md5:850d127a18ee3f9529d7fbde2f49910d', + 'thumbnail': r're:^https?://.*\.jpg$', + 'release_date': '20110101', + }, + 'params': { + 'skip_download': True + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url).replace('/', '_') + webpage = self._download_webpage(url, video_id) + title = self._search_regex(r'id=\"ma_title\" value=\"([^\"]+)', webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) + content_def = self._search_regex(r'id=\"content_definition\" value=\"([^\"]+)', webpage, 'content_def') + catalog_id = self._search_regex(r'id=\"catalog_id\" value=\"([^\"]+)', webpage, 'catalog_id') + item_category = self._search_regex(r'id=\"item_category\" value=\"([^\"]+)', webpage, 'item_category') + content_id = self._search_regex(r'id=\"content_id\" value=\"([^\"]+)', webpage, 'content_id') + + data = f'catalog_id={catalog_id}&content_id={content_id}&category={item_category}&content_def={content_def}' + data_json = self._download_json('https://www.shemaroome.com/users/user_all_lists', video_id, data=data.encode()) + if not data_json.get('status'): + raise ExtractorError('Premium videos cannot be downloaded yet.', expected=True) + url_data = bytes_to_intlist(compat_b64decode(data_json['new_play_url'])) + key = bytes_to_intlist(compat_b64decode(data_json['key'])) + iv = [0] * 16 + m3u8_url = intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv)) + m3u8_url = m3u8_url[:-compat_ord((m3u8_url[-1]))].decode('ascii') + formats = self._extract_m3u8_formats(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) + self._sort_formats(formats) + + release_date = self._html_search_regex( + (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'), + webpage, 'release date', fatal=False) + + subtitles = {} + sub_url = data_json.get('subtitle') + if sub_url: + subtitles.setdefault('EN', []).append({ + 'url': self._proto_relative_url(sub_url), + }) + description = self._html_search_regex(r'(?s)>Synopsis(</.+?)</', webpage, 'description', fatal=False) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': thumbnail, + 'release_date': unified_strdate(release_date), + 'description': description, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/showroomlive.py b/yt_dlp/extractor/showroomlive.py index efd9d561f..efd9d561f 100644 --- a/youtube_dl/extractor/showroomlive.py +++ b/yt_dlp/extractor/showroomlive.py diff --git a/yt_dlp/extractor/simplecast.py b/yt_dlp/extractor/simplecast.py new file mode 100644 
index 000000000..857e9414f --- /dev/null +++ b/yt_dlp/extractor/simplecast.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class SimplecastBaseIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _API_BASE = 'https://api.simplecast.com/' + + def _call_api(self, path_tmpl, video_id): + return self._download_json( + self._API_BASE + path_tmpl % video_id, video_id) + + def _call_search_api(self, resource, resource_id, resource_url): + return self._download_json( + 'https://api.simplecast.com/%ss/search' % resource, resource_id, + data=urlencode_postdata({'url': resource_url})) + + def _parse_episode(self, episode): + episode_id = episode['id'] + title = episode['title'].strip() + audio_file = episode.get('audio_file') or {} + audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] + + season = episode.get('season') or {} + season_href = season.get('href') + season_id = None + if season_href: + season_id = self._search_regex( + r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, + season_href, 'season id', default=None) + + webpage_url = episode.get('episode_url') + channel_url = None + if webpage_url: + channel_url = self._search_regex( + r'(https?://[^/]+\.simplecast\.com)', + webpage_url, 'channel url', default=None) + + return { + 'id': episode_id, + 'display_id': episode.get('slug'), + 'title': title, + 'url': clean_podcast_url(audio_file_url), + 'webpage_url': webpage_url, + 'channel_url': channel_url, + 'series': try_get(episode, lambda x: x['podcast']['title']), + 'season_number': int_or_none(season.get('number')), + 'season_id': season_id, + 'thumbnail': episode.get('image_url'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode.get('number')), + 'description': strip_or_none(episode.get('description')), + 'timestamp': parse_iso8601(episode.get('published_at')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), + } + + +class SimplecastIE(SimplecastBaseIE): + IE_NAME = 'simplecast' + _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX + _COMMON_TEST_INFO = { + 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', + 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'ext': 'mp3', + 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', + 'episode_number': 1, + 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', + 'season_number': 1, + 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', + 'series': 'The RE:BIND.io Podcast', + 'duration': 5343, + 'timestamp': 1580979475, + 'upload_date': '20200206', + 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', + } + _TESTS = [{ + 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': _COMMON_TEST_INFO, + }, { + 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'only_matching': True, + }] + + @staticmethod + def 
_extract_urls(webpage): + return re.findall( + r'''(?x)<iframe[^>]+src=["\'] + ( + https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/%s + ))''' % SimplecastBaseIE._UUID_REGEX, webpage) + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('episodes/%s', episode_id) + return self._parse_episode(episode) + + +class SimplecastEpisodeIE(SimplecastBaseIE): + IE_NAME = 'simplecast:episode' + _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': SimplecastIE._COMMON_TEST_INFO, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + episode = self._call_search_api( + 'episode', mobj.group(1), mobj.group(0)) + return self._parse_episode(episode) + + +class SimplecastPodcastIE(SimplecastBaseIE): + IE_NAME = 'simplecast:podcast' + _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' + _TESTS = [{ + 'url': 'https://the-re-bind-io-podcast.simplecast.com', + 'playlist_mincount': 33, + 'info_dict': { + 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', + 'title': 'The RE:BIND.io Podcast', + }, + }, { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + subdomain = self._match_id(url) + site = self._call_search_api('site', subdomain, url) + podcast = site['podcast'] + podcast_id = podcast['id'] + podcast_title = podcast.get('title') + + def entries(): + episodes = self._call_api('podcasts/%s/episodes', podcast_id) + for episode in (episodes.get('collection') or []): + info = self._parse_episode(episode) + info['series'] = podcast_title + yield info + + return self.playlist_result(entries(), podcast_id, podcast_title) diff --git a/yt_dlp/extractor/sina.py b/yt_dlp/extractor/sina.py new file mode 100644 index 000000000..b62b0c3e5 --- /dev/null +++ b/yt_dlp/extractor/sina.py @@ -0,0 +1,114 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + update_url_query, + qualities, + get_element_by_attribute, + clean_html, +) + + +class SinaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + (?: + (?:view/|.*\#)(?P<id>\d+)| + .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| + # This is used by external sites like Weibo + api/sinawebApi/outplay.php/(?P<token>.+?)\.swf + ) + ''' + + _TESTS = [ + { + 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', + 'md5': 'd38433e2fc886007729735650ae4b3e9', + 'info_dict': { + 'id': '250576622', + 'ext': 'mp4', + 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', + } + }, + { + 'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html', + 'info_dict': { + 'id': '101314253', + 'ext': 'flv', + 'title': '军方提高对朝情报监视级别', + }, + 'skip': 'the page does not exist or has been deleted', + }, + { + 'url': 'http://video.sina.com.cn/view/250587748.html', + 'md5': '3d1807a25c775092aab3bc157fff49b4', + 'info_dict': { + 'id': '250587748', + 'ext': 'mp4', + 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', + }, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + video_id = mobj.group('id') + if not video_id: + if mobj.group('token') is not None: + # The video id is in the redirected url + 
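# A HEAD request suffices here: we only need the final redirected URL, not the response body. +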
self.to_screen('Getting video id') + request = HEADRequest(url) + _, urlh = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + else: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + error = get_element_by_attribute('class', 'errtitle', webpage) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + video_id = self._search_regex( + r"video_id\s*:\s*'(\d+)'", webpage, 'video id') + + video_data = self._download_json( + 'http://s.video.sina.com.cn/video/h5play', + video_id, query={'video_id': video_id}) + if video_data['code'] != 1: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, video_data['message']), expected=True) + else: + video_data = video_data['data'] + title = video_data['title'] + description = video_data.get('description') + if description: + description = description.strip() + + preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) + formats = [] + for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): + file_api = quality.get('file_api') + file_id = quality.get('file_id') + if not file_api or not file_id: + continue + formats.append({ + 'format_id': quality_id, + 'url': update_url_query(file_api, {'vid': file_id}), + 'quality': preference(quality_id), + 'ext': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(video_data.get('length')), + 'timestamp': int_or_none(video_data.get('create_time')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/sixplay.py b/yt_dlp/extractor/sixplay.py new file mode 100644 index 000000000..fd747f59b --- /dev/null +++ b/yt_dlp/extractor/sixplay.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + determine_ext, + int_or_none, + parse_qs, + try_get, + qualities, +) + + +class SixPlayIE(InfoExtractor): + IE_NAME = '6play' + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr|rtlmost\.hu)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', + 'info_dict': { + 'id': '12041051', + 'ext': 'mp4', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', + }, + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }, { + 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', + 'only_matching': True, + }, { + 'url': 'https://www.rtlmost.hu/megtorve-p_14167/megtorve-6-resz-c_12397787', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), + 'rtlmost.hu': ('rtlhu_rtl_most', 'rtlhu'), + }.get(domain, ('6play', 'm6web')) + + data = self._download_json( + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ 
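+ # Query parameters mirror those sent by the site's own player; 'with': 'clips' asks the API to inline the clip objects consumed below (the exact meaning of 'csa' is unverified)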
+ 'csa': 5, + 'with': 'clips', + }) + + clip_data = data['clips'][0] + title = clip_data['title'] + + urls = [] + quality_key = qualities(['lq', 'sd', 'hq', 'hd']) + formats = [] + subtitles = {} + assets = clip_data.get('assets') or [] + for asset in assets: + asset_url = asset.get('full_physical_path') + protocol = asset.get('protocol') + if not asset_url or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264') and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url)) or asset_url in urls: + continue + urls.append(asset_url) + container = asset.get('video_container') + ext = determine_ext(asset_url) + if protocol == 'http_subtitle' or ext == 'vtt': + subtitles.setdefault('fr', []).append({'url': asset_url}) + continue + if container == 'm3u8' or ext == 'm3u8': + if protocol == 'usp': + if parse_qs(asset_url).get('token', [None])[0]: + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) + if not urlh: + continue + asset_url = urlh.geturl() + asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') + for i in range(3, 0, -1): + asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) + m3u8_formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + formats.extend(self._extract_mpd_formats( + asset_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + if m3u8_formats: + break + else: + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif container == 'mp4' or ext == 'mp4': + quality = asset.get('video_quality') + formats.append({ + 'url': asset_url, + 'format_id': quality, + 'quality': quality_key(quality), + 'ext': ext, + }) + self._sort_formats(formats) + + def get(getter): + for src in (data, clip_data): + v = try_get(src, getter, compat_str) + if v: + return v + + return { + 'id': video_id, + 'title': title, + 'description': get(lambda x: x['description']), + 'duration': int_or_none(clip_data.get('duration')), + 'series': get(lambda x: x['program']['title']), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/sky.py b/yt_dlp/extractor/sky.py index ff2c977a0..ff2c977a0 100644 --- a/youtube_dl/extractor/sky.py +++ b/yt_dlp/extractor/sky.py diff --git a/youtube_dl/extractor/skyit.py b/yt_dlp/extractor/skyit.py index 14a4d8d4c..14a4d8d4c 100644 --- a/youtube_dl/extractor/skyit.py +++ b/yt_dlp/extractor/skyit.py diff --git a/youtube_dl/extractor/skylinewebcams.py b/yt_dlp/extractor/skylinewebcams.py index b7f8ac736..b7f8ac736 100644 --- a/youtube_dl/extractor/skylinewebcams.py +++ b/yt_dlp/extractor/skylinewebcams.py diff --git a/youtube_dl/extractor/skynewsarabia.py b/yt_dlp/extractor/skynewsarabia.py index fffc9aa22..fffc9aa22 100644 --- a/youtube_dl/extractor/skynewsarabia.py +++ b/yt_dlp/extractor/skynewsarabia.py diff --git a/yt_dlp/extractor/skynewsau.py b/yt_dlp/extractor/skynewsau.py new file mode 100644 index 000000000..b1d77951e --- /dev/null +++ b/yt_dlp/extractor/skynewsau.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_strdate, +) + + +class SkyNewsAUIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?skynews\.com\.au/[^/]+/[^/]+/[^/]+/video/(?P<id>[a-z0-9]+)' + + _TESTS = [{ + 'url':
'https://www.skynews.com.au/world-news/united-states/incredible-vision-shows-lava-overflowing-from-spains-la-palma-volcano/video/0f4c6243d6903502c01251f228b91a71', + 'info_dict': { + 'id': '6277184925001', + 'ext': 'mp4', + 'title': 'md5:60594f1ea6d5ae93e292900f4d34e9ae', + 'description': 'md5:60594f1ea6d5ae93e292900f4d34e9ae', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 76.394, + 'timestamp': 1634271300, + 'uploader_id': '5348771529001', + 'tags': ['fblink', 'msn', 'usa', 'world', 'yt'], + 'upload_date': '20211015', + }, + 'params': {'skip_download': True, 'format': 'bv'} + }] + + _API_KEY = '6krsj3w249nk779d8fukqx9f' + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + embedcode = self._search_regex(r'embedcode\s?=\s?\"([^\"]+)\"', webpage, 'embedcode') + data_json = self._download_json( + f'https://content.api.news/v3/videos/brightcove/{embedcode}?api_key={self._API_KEY}', id)['content'] + return { + 'id': id, + '_type': 'url_transparent', + 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % tuple(embedcode.split('-')), + 'ie_key': 'BrightcoveNew', + 'title': data_json.get('caption'), + 'upload_date': unified_strdate(try_get(data_json, lambda x: x['date']['created'])), + } diff --git a/yt_dlp/extractor/slideshare.py b/yt_dlp/extractor/slideshare.py new file mode 100644 index 000000000..9b3ad0ad4 --- /dev/null +++ b/yt_dlp/extractor/slideshare.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, +) +from ..utils import ( + ExtractorError, + get_element_by_id, +) + + +class SlideshareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' + + _TEST = { + 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + 'info_dict': { + 'id': '25665706', + 'ext': 'mp4', + 'title': 'Managing Scale and Complexity', + 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.', + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + slideshare_obj = self._search_regex( + r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);', + webpage, 'slideshare object') + info = json.loads(slideshare_obj) + if info['slideshow']['type'] != 'video': + raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) + + doc = info['doc'] + bucket = info['jsplayer']['video_bucket'] + ext = info['jsplayer']['video_extension'] + video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' 
+ ext) + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( + r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, + 'description', fatal=False) + + return { + '_type': 'video', + 'id': info['slideshow']['id'], + 'title': info['slideshow']['title'], + 'ext': ext, + 'url': video_url, + 'thumbnail': info['slideshow']['pin_image_url'], + 'description': description.strip() if description else None, + } diff --git a/youtube_dl/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 9409a0100..9409a0100 100644 --- a/youtube_dl/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py diff --git a/youtube_dl/extractor/slutload.py b/yt_dlp/extractor/slutload.py index 661f9e59d..661f9e59d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/yt_dlp/extractor/slutload.py diff --git a/yt_dlp/extractor/snotr.py b/yt_dlp/extractor/snotr.py new file mode 100644 index 000000000..0bb548255 --- /dev/null +++ b/yt_dlp/extractor/snotr.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_filesize, + str_to_int, +) + + +class SnotrIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' + _TESTS = [{ + 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', + 'info_dict': { + 'id': '13708', + 'ext': 'mp4', + 'title': 'Drone flying through fireworks!', + 'duration': 248, + 'filesize_approx': 40700000, + 'description': 'A drone flying through Fourth of July Fireworks', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['description'], + }, { + 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', + 'info_dict': { + 'id': '530', + 'ext': 'mp4', + 'title': 'David Letteman - George W. Bush Top 10', + 'duration': 126, + 'filesize_approx': 8500000, + 'description': 'The top 10 George W.
Bush moments, brought to you by David Letterman!', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + + description = self._og_search_description(webpage) + info_dict = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] + + view_count = str_to_int(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', + webpage, 'view count', fatal=False)) + + duration = parse_duration(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)', + webpage, 'duration', fatal=False)) + + filesize_approx = parse_filesize(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)', + webpage, 'filesize', fatal=False)) + + info_dict.update({ + 'id': video_id, + 'description': description, + 'title': title, + 'view_count': view_count, + 'duration': duration, + 'filesize_approx': filesize_approx, + }) + + return info_dict diff --git a/yt_dlp/extractor/sohu.py b/yt_dlp/extractor/sohu.py new file mode 100644 index 000000000..3bff5c595 --- /dev/null +++ b/yt_dlp/extractor/sohu.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + + +class SohuIE(InfoExtractor): + _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' + + # Sohu videos give different MD5 sums on Travis CI and my machine + _TESTS = [{ + 'note': 'This video is available only in Mainland China', + 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', + 'info_dict': { + 'id': '382479172', + 'ext': 'mp4', + 'title': 'MV:Far East Movement《The Illest》', + }, + 'skip': 'Only available in China', + }, { + 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', + 'info_dict': { + 'id': '409385080', + 'ext': 'mp4', + 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', + } + }, { + 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Multipart video', + 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + }, + 'playlist': [{ + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] + }, { + 'note': 'Video with title containing dash', + 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', + 'info_dict': { + 'id': '78932792', + 'ext': 'mp4', + 'title': 'youtube-dl testing video', + }, + 'params': { + 'skip_download': True + } + }] + + def _real_extract(self, url): + + def _fetch_data(vid_id, mytv=False): + if mytv: + base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' + else: + base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + + return self._download_json( + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id, +
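# geo_verification_headers() injects a faked X-Forwarded-For header when geo bypass is enabled; Sohu streams are restricted to mainland China. +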
headers=self.geo_verification_headers()) + + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + mytv = mobj.group('mytv') is not None + + webpage = self._download_webpage(url, video_id) + + title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) + + vid = self._html_search_regex( + r'var vid ?= ?["\'](\d+)["\']', + webpage, 'video path') + vid_data = _fetch_data(vid, mytv) + if vid_data['play'] != 1: + if vid_data.get('status') == 12: + raise ExtractorError( + '%s said: There\'s something wrong in the video.' % self.IE_NAME, + expected=True) + else: + self.raise_geo_restricted( + '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME) + + formats_json = {} + for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): + vid_id = vid_data['data'].get('%sVid' % format_id) + if not vid_id: + continue + vid_id = compat_str(vid_id) + formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) + + part_count = vid_data['data']['totalBlocks'] + + playlist = [] + for i in range(part_count): + formats = [] + for format_id, format_data in formats_json.items(): + allot = format_data['allot'] + + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] + + video_url = 'newflv.sohu.ccgslb.net' + cdnId = None + retries = 0 + + while 'newflv.sohu.ccgslb.net' in video_url: + params = { + 'prot': 9, + 'file': clips_url[i], + 'new': su[i], + 'prod': 'flash', + 'rb': 1, + } + + if cdnId is not None: + params['idc'] = cdnId + + download_note = 'Downloading %s video URL part %d of %d' % ( + format_id, i + 1, part_count) + + if retries > 0: + download_note += ' (retry #%d)' % retries + part_info = self._parse_json(self._download_webpage( + 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), + video_id, download_note), video_id) + + video_url = part_info['url'] + cdnId = part_info.get('nid') + + retries += 1 + if retries > 5: + raise ExtractorError('Failed to get video URL') + + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'filesize': int_or_none( + try_get(data, lambda x: x['clipsBytes'][i])), + 'width': int_or_none(data.get('width')), + 'height': int_or_none(data.get('height')), + 'fps': int_or_none(data.get('fps')), + }) + self._sort_formats(formats) + + playlist.append({ + 'id': '%s_part%d' % (video_id, i + 1), + 'title': title, + 'duration': vid_data['data']['clipsDuration'][i], + 'formats': formats, + }) + + if len(playlist) == 1: + info = playlist[0] + info['id'] = video_id + else: + info = { + '_type': 'multi_video', + 'entries': playlist, + 'id': video_id, + 'title': title, + } + + return info diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py new file mode 100644 index 000000000..c3ed44275 --- /dev/null +++ b/yt_dlp/extractor/sonyliv.py @@ -0,0 +1,162 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time +import uuid + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + sonyliv:| + https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+- + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true', + 'info_dict': { + 'title': 'Achaari Cheese Toast', + 'id': '1000022678', + 'ext': 'mp4', + 'upload_date': '20200411', + 'description': 
'md5:3957fa31d9309bf336ceb3f37ad5b7cb', + 'timestamp': 1586632091, + 'duration': 185, + 'season_number': 1, + 'series': 'Bachelors Delight', + 'episode_number': 1, + 'release_year': 2016, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779', + 'only_matching': True, + }] + _GEO_COUNTRIES = ['IN'] + _TOKEN = None + + def _call_api(self, version, path, video_id): + headers = {} + if self._TOKEN: + headers['security_token'] = self._TOKEN + try: + return self._download_json( + 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), + video_id, headers=headers)['resultObj'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + message = self._parse_json( + e.cause.read().decode(), video_id)['message'] + if message == 'Geoblocked Country': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(message) + raise + + def _real_initialize(self): + self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None) + + def _real_extract(self, url): + video_id = self._match_id(url) + content = self._call_api( + '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id) + if not self.get_param('allow_unplayable_formats') and content.get('isEncrypted'): + self.report_drm(video_id) + dash_url = content['videoURL'] + headers = { + 'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000) + } + formats = self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', headers=headers, fatal=False) + formats.extend(self._extract_m3u8_formats( + dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'), + video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False)) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + self._sort_formats(formats) + + metadata = self._call_api( + '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] + title = metadata['episodeTitle'] + subtitles = {} + for sub in content.get('subtitle', []): + sub_url = sub.get('subtitleUrl') + if not sub_url: + continue + subtitles.setdefault(sub.get('subtitleLanguageName', 'ENG'), []).append({ + 'url': sub_url, + }) + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': content.get('posterURL'), + 'description': metadata.get('longDescription') or metadata.get('shortDescription'), + 'timestamp': int_or_none(metadata.get('creationDate'), 1000), + 'duration': int_or_none(metadata.get('duration')), + 'season_number': int_or_none(metadata.get('season')), + 'series': metadata.get('title'), + 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'release_year': int_or_none(metadata.get('year')), + 'subtitles': subtitles, + } + + +class SonyLIVSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/shows/[^/?#&]+-(?P<id>\d{10})$' + _TESTS = [{ + 'url': 'https://www.sonyliv.com/shows/adaalat-1700000091', + 'playlist_mincount': 456, + 'info_dict': { + 'id': '1700000091', + }, + }] + 
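# NOTE: undocumented endpoints used by the website's own player; the from/to query parameters appear to be pagination bounds. +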
_API_SHOW_URL = 'https://apiv2.sonyliv.com/AGL/1.9/R/ENG/WEB/IN/DL/DETAIL/{}?kids_safe=false&from=0&to=49' + _API_EPISODES_URL = 'https://apiv2.sonyliv.com/AGL/1.4/R/ENG/WEB/IN/CONTENT/DETAIL/BUNDLE/{}?from=0&to=1000&orderBy=episodeNumber&sortOrder=asc' + _API_SECURITY_URL = 'https://apiv2.sonyliv.com/AGL/1.4/A/ENG/WEB/ALL/GETTOKEN' + + def _entries(self, show_id): + headers = { + 'Accept': 'application/json, text/plain, */*', + 'Referer': 'https://www.sonyliv.com', + } + headers['security_token'] = self._download_json( + self._API_SECURITY_URL, video_id=show_id, headers=headers, + note='Downloading security token')['resultObj'] + seasons = try_get( + self._download_json(self._API_SHOW_URL.format(show_id), video_id=show_id, headers=headers), + lambda x: x['resultObj']['containers'][0]['containers'], list) + for season in seasons or []: + season_id = season['id'] + episodes = try_get( + self._download_json(self._API_EPISODES_URL.format(season_id), video_id=season_id, headers=headers), + lambda x: x['resultObj']['containers'][0]['containers'], list) + for episode in episodes or []: + video_id = episode.get('id') + yield self.url_result('sonyliv:%s' % video_id, ie=SonyLIVIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py new file mode 100644 index 000000000..e89383ff1 --- /dev/null +++ b/yt_dlp/extractor/soundcloud.py @@ -0,0 +1,898 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re +import json +# import random + +from .common import ( + InfoExtractor, + SearchInfoExtractor +) + +from ..compat import ( + compat_HTTPError, + compat_kwargs, + compat_str, +) + +from ..utils import ( + error_to_compat_str, + ExtractorError, + float_or_none, + HEADRequest, + int_or_none, + KNOWN_EXTENSIONS, + mimetype2ext, + remove_end, + parse_qs, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urlhandle_detect_ext, + sanitized_Request, +) + + +class SoundcloudEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' + _TEST = { + # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ + 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', + 'only_matching': True, + } + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + + def _real_extract(self, url): + query = parse_qs(url) + api_url = query['url'][0] + secret_token = query.get('secret_token') + if secret_token: + api_url = update_url_query(api_url, {'secret_token': secret_token[0]}) + return self.url_result(api_url) + + +class SoundcloudIE(InfoExtractor): + """Information extractor for soundcloud.com + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from a URL composed + of the stream token and uid + """ + + _VALID_URL = r'''(?x)^(?:https?://)?
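+ # Two URL shapes are matched: a track page on the website, or a direct api(-v2) track URL with an optional secret token.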
+ (?:(?:(?:www\.|m\.)?soundcloud\.com/ + (?!stations/track) + (?P<uploader>[\w\d-]+)/ + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?P<title>[\w\d-]+)/? + (?P<token>[^?]+?)?(?:[?].*)?$) + |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) + (?:/?\?secret_token=(?P<secret_token>[^&]+))?) + ) + ''' + IE_NAME = 'soundcloud' + _TESTS = [ + { + 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', + 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', + 'info_dict': { + 'id': '62986583', + 'ext': 'mp3', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', + 'uploader': 'E.T. ExTerrestrial Music', + 'uploader_id': '1571244', + 'timestamp': 1349920598, + 'upload_date': '20121011', + 'duration': 143.216, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + } + }, + # geo-restricted + { + 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', + 'info_dict': { + 'id': '47127627', + 'ext': 'mp3', + 'title': 'Goldrushed', + 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', + 'uploader': 'The Royal Concept', + 'uploader_id': '9615865', + 'timestamp': 1337635207, + 'upload_date': '20120521', + 'duration': 227.155, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link + { + 'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'uploader_id': '69767071', + 'timestamp': 1386604920, + 'upload_date': '20131209', + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link (alt format) + { + 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'uploader_id': '69767071', + 'timestamp': 1386604920, + 'upload_date': '20131209', + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # downloadable song + { + 'url': 'https://soundcloud.com/the80m/the-following', + 'md5': '9ffcddb08c87d74fb5808a3c183a1d04', + 'info_dict': { + 'id': '343609555', + 'ext': 'wav', + }, + }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. 
Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'uploader_id': '12563093', + 'timestamp': 1504206263, + 'upload_date': '20170831', + 'duration': 7449.096, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'uploader_id': '2366352', + 'timestamp': 1488152409, + 'upload_date': '20170226', + 'duration': 207.012, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', + 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'info_dict': { + 'id': '583011102', + 'ext': 'mp3', + 'title': 'Mezzo Valzer', + 'description': 'md5:4138d582f81866a530317bae316e8b61', + 'uploader': 'Micronie', + 'uploader_id': '3352531', + 'timestamp': 1551394171, + 'upload_date': '20190228', + 'duration': 180.157, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + # AAC HQ format available (account with active subscription needed) + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, + { + # Go+ (account with active subscription needed) + 'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do', + 'only_matching': True, + }, + ] + + _API_V2_BASE = 'https://api-v2.soundcloud.com/' + _BASE_URL = 'https://soundcloud.com/' + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } + + def _store_client_id(self, client_id): + self._downloader.cache.store('soundcloud', 'client_id', client_id) + + def _update_client_id(self): + webpage = self._download_webpage('https://soundcloud.com/', None) + for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): + script = self._download_webpage(src, None, fatal=False) + if script: + client_id = self._search_regex( + r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', + script, 'client id', default=None) + if client_id: + self._CLIENT_ID = client_id + self._store_client_id(client_id) + return + raise ExtractorError('Unable to extract client id') + + def _download_json(self, *args, **kwargs): + non_fatal = kwargs.get('fatal') is False + if non_fatal: + del kwargs['fatal'] + query = kwargs.get('query', {}).copy() + for _ in range(2): + query['client_id'] = self._CLIENT_ID + kwargs['query'] = query + try: + return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + self._store_client_id(None) + self._update_client_id() + continue + elif non_fatal: + self.report_warning(error_to_compat_str(e)) + return False + raise + + def _real_initialize(self): + self._CLIENT_ID = 
self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' + self._login() + + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' + _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' + _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' + _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' + _access_token = None + _HEADERS = {} + _NETRC_MACHINE = 'soundcloud' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + if username == 'oauth' and password is not None: + self._access_token = password + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + payload = {'session': {'access_token': self._access_token}} + token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False) + if response is not False: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + self.report_login() + else: + self.report_warning('Provided authorization token seems to be invalid. Continue as guest') + elif username is not None: + self.report_warning( + 'Login using username and password is not currently supported. ' + 'Use "--username oauth --password <oauth_token>" to login using an oauth token') + + r''' + def genDevId(): + def genNumBlock(): + return ''.join([str(random.randrange(10)) for i in range(6)]) + return '-'.join([genNumBlock() for i in range(4)]) + + payload = { + 'client_id': self._CLIENT_ID, + 'recaptcha_pubkey': 'null', + 'recaptcha_response': 'null', + 'credentials': { + 'identifier': username, + 'password': password + }, + 'signature': self.sign(username, password, self._CLIENT_ID), + 'device_id': genDevId(), + 'user_agent': self._USER_AGENT + } + + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(login, None) + self._access_token = response.get('session').get('access_token') + if not self._access_token: + self.report_warning('Unable to get access token, login may have failed') + else: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + ''' + + # signature generation + def sign(self, user, pw, clid): + a = 33 + i = 1 + s = 440123 + w = 117 + u = 1800000 + l = 1042 + b = 37 + k = 37 + c = 5 + n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY + y = '8' # _REV + r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT + e = user # _USERNAME + t = clid # _CLIENT_ID + + d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]]) + p = n + y + d + r + e + t + d + n + h = p + + m = 8011470 + f = 0 + + for f in range(f, len(h)): + m = (m >> 1) + ((1 & m) << 23) + m += ord(h[f]) + m &= 16777215 + + # c is not even needed + out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c) + + return out + + @classmethod + def _resolv_url(cls, url): + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + + def _extract_info_dict(self, info, full_title=None, secret_token=None): + track_id = compat_str(info['id']) + title = info['title'] + + format_urls = set() + formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token: + query['secret_token'] = secret_token + + if info.get('downloadable') and
info.get('has_downloads_left'): + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'quality': 10, + }) + + def invalid_url(url): + return not url or url in format_urls + + def add_format(f, protocol, is_preview=False): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': protocol, + 'preference': -10 if preview else None, + }) + formats.append(f) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): + continue + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = self._download_json( + format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(stream_format.get('mime_type')) + add_format({ + 'url': stream_url, + 'ext': ext, + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) + + for f in formats: + f['vcodec'] = 'none' + + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted(metadata_available=True) + self._sort_formats(formats) + + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 
'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + track_id = mobj.group('track_id') + + query = {} + if track_id: + info_json_url = self._API_V2_BASE + 'tracks/' + track_id + full_title = track_id + token = mobj.group('secret_token') + if token: + query['secret_token'] = token + else: + full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title') + token = mobj.group('token') + if token: + resolve_title += '/%s' % token + info_json_url = self._resolv_url(self._BASE_URL + resolve_title) + + info = self._download_json( + info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS) + + return self._extract_info_dict(info, full_title, token) + + +class SoundcloudPlaylistBaseIE(SoundcloudIE): + def _extract_set(self, playlist, token=None): + playlist_id = compat_str(playlist['id']) + tracks = playlist.get('tracks') or [] + if not all([t.get('permalink_url') for t in tracks]) and token: + tracks = self._download_json( + self._API_V2_BASE + 'tracks', playlist_id, + 'Downloading tracks', query={ + 'ids': ','.join([compat_str(t['id']) for t in tracks]), + 'playlistId': playlist_id, + 'playlistSecretToken': token, + }, headers=self._HEADERS) + entries = [] + for track in tracks: + track_id = str_or_none(track.get('id')) + url = track.get('permalink_url') + if not url: + if not track_id: + continue + url = self._API_V2_BASE + 'tracks/' + track_id + if token: + url += '?secret_token=' + token + entries.append(self.url_result( + url, SoundcloudIE.ie_key(), track_id)) + return self.playlist_result( + entries, playlist_id, + playlist.get('title'), + playlist.get('description')) + + +class SoundcloudSetIE(SoundcloudPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?' 
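+ # An optional trailing path segment is the share token of a private set; it is resolved with the URL and forwarded as playlistSecretToken when fetching tracks.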
+ IE_NAME = 'soundcloud:set' + _TESTS = [{ + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', + 'info_dict': { + 'id': '2284613', + 'title': 'The Royal Concept EP', + 'description': 'md5:71d07087c7a449e8941a70a29e34671e', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, + }, { + 'url': 'https://soundcloud.com/discover/sets/weekly::flacmatic', + 'only_matching': True, + }, { + 'url': 'https://soundcloud.com/discover/sets/charts-top:all-music:de', + 'only_matching': True, + }, { + 'url': 'https://soundcloud.com/discover/sets/charts-top:hiphoprap:kr', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title') + token = mobj.group('token') + if token: + full_title += '/' + token + + info = self._download_json(self._resolv_url( + self._BASE_URL + full_title), full_title, headers=self._HEADERS) + + if 'errors' in info: + msgs = (compat_str(err['error_message']) for err in info['errors']) + raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + + return self._extract_set(info, token) + + +class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): + def _extract_playlist(self, base_url, playlist_id, playlist_title): + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': self._entries(base_url, playlist_id), + } + + def _entries(self, url, playlist_id): + # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. + # https://developers.soundcloud.com/blog/offset-pagination-deprecated + query = { + 'limit': 200, + 'linked_partitioning': '1', + 'offset': 0, + } + + retries = self.get_param('extractor_retries', 3) + + for i in itertools.count(): + attempt, last_error = -1, None + while attempt < retries: + attempt += 1 + if last_error: + self.report_warning('%s. Retrying ...' % remove_end(last_error, '.'), playlist_id) + try: + response = self._download_json( + url, playlist_id, query=query, headers=self._HEADERS, + note='Downloading track page %s%s' % (i + 1, f' (retry #{attempt})' if attempt else '')) + break + except ExtractorError as e: + # Downloading page may result in intermittent 502 HTTP error + # See https://github.com/yt-dlp/yt-dlp/issues/872 + if attempt >= retries or not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502: + raise + last_error = str(e.cause or e.msg) + + def resolve_entry(*candidates): + for cand in candidates: + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if permalink_url: + return self.url_result( + permalink_url, + SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + str_or_none(cand.get('id')), cand.get('title')) + + for e in response['collection'] or []: + yield resolve_entry(e, e.get('track'), e.get('playlist')) + + url = response.get('next_href') + if not url: + break + query.pop('offset', None) + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?soundcloud\.com/ + (?P<user>[^/]+) + (?:/ + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) + )? 
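+ # when no resource suffix is present, the extractor falls back to the user's full stream ('all' in _BASE_URL_MAP below)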
+ /?(?:[?#].*)?$ + ''' + IE_NAME = 'soundcloud:user' + _TESTS = [{ + 'url': 'https://soundcloud.com/soft-cell-official', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (All)', + }, + 'playlist_mincount': 28, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/tracks', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', + }, + 'playlist_mincount': 27, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/albums', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/jcv246/sets', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Sets)', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://soundcloud.com/jcv246/reposts', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', + }, + 'playlist_mincount': 6, + }, { + 'url': 'https://soundcloud.com/clalberg/likes', + 'info_dict': { + 'id': '11817582', + 'title': 'clalberg (Likes)', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/grynpyret/spotlight', + 'info_dict': { + 'id': '7098329', + 'title': 'Grynpyret (Spotlight)', + }, + 'playlist_mincount': 1, + }] + + _BASE_URL_MAP = { + 'all': 'stream/users/%s', + 'tracks': 'users/%s/tracks', + 'albums': 'users/%s/albums', + 'sets': 'users/%s/playlists', + 'reposts': 'stream/users/%s/reposts', + 'likes': 'users/%s/likes', + 'spotlight': 'users/%s/spotlight', + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + uploader = mobj.group('user') + + user = self._download_json( + self._resolv_url(self._BASE_URL + uploader), + uploader, 'Downloading user info', headers=self._HEADERS) + + resource = mobj.group('rsrc') or 'all' + + return self._extract_playlist( + self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'], + str_or_none(user.get('id')), + '%s (%s)' % (user['username'], resource.capitalize())) + + +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' + IE_NAME = 'soundcloud:trackstation' + _TESTS = [{ + 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', + 'info_dict': { + 'id': '286017854', + 'title': 'Track station: your text', + }, + 'playlist_mincount': 47, + }] + + def _real_extract(self, url): + track_name = self._match_id(url) + + track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS) + track_id = self._search_regex( + r'soundcloud:track-stations:(\d+)', track['id'], 'track id') + + return self._extract_playlist( + self._API_V2_BASE + 'stations/%s/tracks' % track['id'], + track_id, 'Track station: %s' % track['title']) + + +class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): + _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' + IE_NAME = 'soundcloud:playlist' + _TESTS = [{ + 'url': 'https://api.soundcloud.com/playlists/4110309', + 'info_dict': { + 'id': '4110309', + 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', + 'description': 're:.*?TILT Brass - Bowery Poetry Club', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + playlist_id = mobj.group('id') + + query = {} + token = mobj.group('token') + if token: + query['secret_token'] = token + + data = self._download_json( + self._API_V2_BASE + 'playlists/' + playlist_id, + playlist_id, 'Downloading playlist', query=query, 
headers=self._HEADERS) + + return self._extract_set(data, token) + + +class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): + IE_NAME = 'soundcloud:search' + IE_DESC = 'Soundcloud search' + _MAX_RESULTS = float('inf') + _TESTS = [{ + 'url': 'scsearch15:post-avant jazzcore', + 'info_dict': { + 'title': 'post-avant jazzcore', + }, + 'playlist_count': 15, + }] + + _SEARCH_KEY = 'scsearch' + _MAX_RESULTS_PER_PAGE = 200 + _DEFAULT_RESULTS_PER_PAGE = 50 + + def _get_collection(self, endpoint, collection_id, **query): + limit = min( + query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), + self._MAX_RESULTS_PER_PAGE) + query.update({ + 'limit': limit, + 'linked_partitioning': 1, + 'offset': 0, + }) + next_url = update_url_query(self._API_V2_BASE + endpoint, query) + + for i in itertools.count(1): + response = self._download_json( + next_url, collection_id, f'Downloading page {i}', + 'Unable to download API page', headers=self._HEADERS) + + for item in response.get('collection') or []: + if item: + yield self.url_result(item['uri'], SoundcloudIE.ie_key()) + + next_url = response.get('next_href') + if not next_url: + break + + def _get_n_results(self, query, n): + tracks = self._get_collection('search/tracks', query, limit=n, q=query) + return self.playlist_result(tracks, query, query) diff --git a/yt_dlp/extractor/soundgasm.py b/yt_dlp/extractor/soundgasm.py new file mode 100644 index 000000000..d608eb7a7 --- /dev/null +++ b/yt_dlp/extractor/soundgasm.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', + 'md5': '010082a2c802c5275bb00030743e75ad', + 'info_dict': { + 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', + 'ext': 'm4a', + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + audio_url = self._html_search_regex( + r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'audio URL', group='url') + + title = self._search_regex( + r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', + webpage, 'title', default=display_id) + + description = self._html_search_regex( + (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', + r'(?s)<li>Description:\s(.*?)<\/li>'), + webpage, 'description', fatal=False) + + audio_id = self._search_regex( + r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) + + return { + 'id': audio_id, + 'display_id': display_id, + 'url': audio_url, + 'vcodec': 'none', + 'title': title, + 'description': description, + 'uploader': mobj.group('user'), + } + + +class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl', + 'info_dict': { + 'id': 'ytdl', + }, + 'playlist_count': 1, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + webpage = self._download_webpage(url, profile_id) + + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] + + return self.playlist_result(entries, profile_id) diff 
--git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py new file mode 100644 index 000000000..d49749467 --- /dev/null +++ b/yt_dlp/extractor/southpark.py @@ -0,0 +1,149 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class SouthParkIE(MTVServicesInfoExtractor): + IE_NAME = 'southpark.cc.com' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + + _TESTS = [{ + 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', + 'info_dict': { + 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'South Park|Bat Daded', + 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', + }, + }, { + 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', + 'only_matching': True, + }, { + 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', + 'only_matching': True, + }] + + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'shared.southpark.global', + 'ep': '90877963', + 'imageEp': 'shared.southpark.global', + 'mgid': uri, + } + + +class SouthParkEsIE(SouthParkIE): + IE_NAME = 'southpark.cc.com:español' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' + _LANG = 'es' + + _TESTS = [{ + 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, + 'playlist_count': 4, + 'skip': 'Geo-restricted', + }] + + +class SouthParkDeIE(SouthParkIE): + IE_NAME = 'southpark.de' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:(en/(videoclip|collections|episodes|video-clips))|(videoclip|collections|folgen))/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))' + _TESTS = [{ + 'url': 'https://www.southpark.de/videoclip/rsribv/south-park-rueckzug-zum-gummibonbon-wald', + 'only_matching': True, + }, { + 'url': 'https://www.southpark.de/folgen/jiru42/south-park-verkabelung-staffel-23-ep-9', + 'only_matching': True, + }, { + 'url': 'https://www.southpark.de/collections/zzno5a/south-park-good-eats/7q26gp', + 'only_matching': True, + }, { + # clip + 'url': 'https://www.southpark.de/en/video-clips/ct46op/south-park-tooth-fairy-cartman', + 'info_dict': { + 'id': 'e99d45ea-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tooth Fairy Cartman', + 'description': 'md5:db02e23818b4dc9cb5f0c5a7e8833a68', + }, + }, { + # episode + 'url': 'https://www.southpark.de/en/episodes/yy0vjs/south-park-the-pandemic-special-season-24-ep-1', + 'info_dict': { + 'id': 'f5fbd823-04bc-11eb-9b1b-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'South Park', + 'description': 'md5:ae0d875eff169dcbed16b21531857ac1', + }, + }, { + # clip + 'url': 'https://www.southpark.de/videoclip/ct46op/south-park-zahnfee-cartman', + 'info_dict': { + 'id': 'e99d45ea-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Zahnfee Cartman', + 'description': 'md5:b917eec991d388811d911fd1377671ac' + }, + }, { + # episode + 'url': 'https://www.southpark.de/folgen/242csn/south-park-her-mit-dem-hirn-staffel-1-ep-7', + 'info_dict': { + 'id': 
'607115f3-496f-40c3-8647-2b0bcff486c0', + 'ext': 'mp4', + 'title': 'md5:South Park | Pink Eye | E 0107 | HDSS0107X deu | Version: 634312 | Comedy Central S1', + }, + }] + + def _get_feed_url(self, uri, url=None): + video_id = self._id_from_uri(uri) + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge&ref=%s' % (uri, url), video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) + + def _get_feed_query(self, uri): + return + + +class SouthParkNlIE(SouthParkIE): + IE_NAME = 'southpark.nl' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, + }] + + +class SouthParkDkIE(SouthParkIE): + IE_NAME = 'southparkstudios.dk' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, + }, { + 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', + 'only_matching': True, + }, { + 'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1', + 'only_matching': True, + }] diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py new file mode 100644 index 000000000..7df23759a --- /dev/null +++ b/yt_dlp/extractor/sovietscloset.py @@ -0,0 +1,221 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + try_get, + unified_timestamp +) + + +class SovietsClosetBaseIE(InfoExtractor): + MEDIADELIVERY_REFERER = {'Referer': 'https://iframe.mediadelivery.net/'} + + def parse_nuxt_jsonp(self, nuxt_jsonp_url, video_id, name): + nuxt_jsonp = self._download_webpage(nuxt_jsonp_url, video_id, note=f'Downloading {name} __NUXT_JSONP__') + js, arg_keys, arg_vals = self._search_regex( + r'__NUXT_JSONP__\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)', + nuxt_jsonp, '__NUXT_JSONP__', group=['js', 'arg_keys', 'arg_vals']) + + args = dict(zip(arg_keys.split(','), arg_vals.split(','))) + + for key, val in args.items(): + if val in ('undefined', 'void 0'): + args[key] = 'null' + + return self._parse_json(js_to_json(js, args), video_id)['data'][0] + + def video_meta(self, video_id, game_name, category_name, episode_number, stream_date): + title = game_name + if category_name and category_name != 'Misc': + title += f' - {category_name}' + if episode_number: + title += f' #{episode_number}' + + timestamp = unified_timestamp(stream_date) + + return { + 'id': video_id, + 'title': title, + 'http_headers': self.MEDIADELIVERY_REFERER, + 'uploader': 'SovietWomble', + 'creator': 'SovietWomble', + 'release_timestamp': timestamp, + 'timestamp': timestamp, + 'uploader_id': 'SovietWomble', + 'uploader_url': 'https://www.twitch.tv/SovietWomble', + 'was_live': True, + 'availability': 'public', + 'series': 
game_name, + 'season': category_name, + 'episode_number': episode_number, + } + + +class SovietsClosetIE(SovietsClosetBaseIE): + _VALID_URL = r'https?://(?:www\.)?sovietscloset\.com/video/(?P<id>[0-9]+)/?' + _TESTS = [ + { + 'url': 'https://sovietscloset.com/video/1337', + 'md5': '11e58781c4ca5b283307aa54db5b3f93', + 'info_dict': { + 'id': '1337', + 'ext': 'mp4', + 'title': 'The Witcher #13', + 'thumbnail': r're:^https?://.*\.b-cdn\.net/2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67/thumbnail\.jpg$', + 'uploader': 'SovietWomble', + 'creator': 'SovietWomble', + 'release_timestamp': 1492091580, + 'release_date': '20170413', + 'timestamp': 1492091580, + 'upload_date': '20170413', + 'uploader_id': 'SovietWomble', + 'uploader_url': 'https://www.twitch.tv/SovietWomble', + 'duration': 7007, + 'was_live': True, + 'availability': 'public', + 'series': 'The Witcher', + 'season': 'Misc', + 'episode_number': 13, + }, + }, + { + 'url': 'https://sovietscloset.com/video/1105', + 'md5': '578b1958a379e7110ba38697042e9efb', + 'info_dict': { + 'id': '1105', + 'ext': 'mp4', + 'title': 'Arma 3 - Zeus Games #3', + 'uploader': 'SovietWomble', + 'thumbnail': r're:^https?://.*\.b-cdn\.net/c0e5e76f-3a93-40b4-bf01-12343c2eec5d/thumbnail\.jpg$', + 'uploader': 'SovietWomble', + 'creator': 'SovietWomble', + 'release_timestamp': 1461157200, + 'release_date': '20160420', + 'timestamp': 1461157200, + 'upload_date': '20160420', + 'uploader_id': 'SovietWomble', + 'uploader_url': 'https://www.twitch.tv/SovietWomble', + 'duration': 8804, + 'was_live': True, + 'availability': 'public', + 'series': 'Arma 3', + 'season': 'Zeus Games', + 'episode_number': 3, + }, + }, + ] + + def _extract_bunnycdn_iframe(self, video_id, bunnycdn_id): + iframe = self._download_webpage( + f'https://iframe.mediadelivery.net/embed/5105/{bunnycdn_id}', + video_id, note='Downloading BunnyCDN iframe', headers=self.MEDIADELIVERY_REFERER) + + m3u8_url = self._search_regex(r'(https?://.*?\.m3u8)', iframe, 'm3u8 url') + thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url') + + m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER) + self._sort_formats(m3u8_formats) + + if not m3u8_formats: + duration = None + else: + duration = self._extract_m3u8_vod_duration( + m3u8_formats[0]['url'], video_id, headers=self.MEDIADELIVERY_REFERER) + + return { + 'formats': m3u8_formats, + 'thumbnail': thumbnail_url, + 'duration': duration, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase') + static_assets_base = f'https://sovietscloset.com{static_assets_base}' + + stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream'] + + return { + **self.video_meta( + video_id=video_id, game_name=stream['game']['name'], + category_name=try_get(stream, lambda x: x['subcategory']['name'], str), + episode_number=stream.get('number'), stream_date=stream.get('date')), + **self._extract_bunnycdn_iframe(video_id, stream['bunnyId']), + } + + +class SovietsClosetPlaylistIE(SovietsClosetBaseIE): + _VALID_URL = r'https?://(?:www\.)?sovietscloset\.com/(?!video)(?P<id>[^#?]+)' + _TESTS = [ + + { + 'url': 'https://sovietscloset.com/The-Witcher', + 'info_dict': { + 'id': 'The-Witcher', + 'title': 'The Witcher', + }, + 'playlist_mincount': 31, + }, + { + 'url': 
'https://sovietscloset.com/Arma-3/Zeus-Games', + 'info_dict': { + 'id': 'Arma-3/Zeus-Games', + 'title': 'Arma 3 - Zeus Games', + }, + 'playlist_mincount': 3, + }, + { + 'url': 'https://sovietscloset.com/arma-3/zeus-games/', + 'info_dict': { + 'id': 'arma-3/zeus-games', + 'title': 'Arma 3 - Zeus Games', + }, + 'playlist_mincount': 3, + }, + { + 'url': 'https://sovietscloset.com/Total-War-Warhammer', + 'info_dict': { + 'id': 'Total-War-Warhammer', + 'title': 'Total War: Warhammer - Greenskins', + }, + 'playlist_mincount': 33, + }, + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + if playlist_id.endswith('/'): + playlist_id = playlist_id[:-1] + + webpage = self._download_webpage(url, playlist_id) + + static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase') + static_assets_base = f'https://sovietscloset.com{static_assets_base}' + + sovietscloset = self.parse_nuxt_jsonp(f'{static_assets_base}/payload.js', playlist_id, 'global')['games'] + + if '/' in playlist_id: + game_slug, category_slug = playlist_id.lower().split('/') + else: + game_slug = playlist_id.lower() + category_slug = 'misc' + + game = next(game for game in sovietscloset if game['slug'].lower() == game_slug) + category = next((cat for cat in game['subcategories'] if cat.get('slug', '').lower() == category_slug), + game['subcategories'][0]) + category_slug = category.get('slug', '').lower() or category_slug + playlist_title = game.get('name') or game_slug + if category_slug != 'misc': + playlist_title += f' - {category.get("name") or category_slug}' + entries = [{ + **self.url_result(f'https://sovietscloset.com/video/{stream["id"]}', ie=SovietsClosetIE.ie_key()), + **self.video_meta( + video_id=stream['id'], game_name=game['name'], category_name=category.get('name'), + episode_number=i + 1, stream_date=stream.get('date')), + } for i, stream in enumerate(category['streams'])] + + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/yt_dlp/extractor/spankbang.py b/yt_dlp/extractor/spankbang.py new file mode 100644 index 000000000..dd849ae13 --- /dev/null +++ b/yt_dlp/extractor/spankbang.py @@ -0,0 +1,200 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + merge_dicts, + parse_duration, + parse_resolution, + str_to_int, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class SpankBangIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?spankbang\.com/ + (?: + (?P<id>[\da-z]+)/(?:video|play|embed)\b| + [\da-z]+-(?P<id_2>[\da-z]+)/playlist/[^/?#&]+ + ) + ''' + _TESTS = [{ + 'url': 'https://spankbang.com/56b3d/video/the+slut+maker+hmv', + 'md5': '2D13903DE4ECC7895B5D55930741650A', + 'info_dict': { + 'id': '56b3d', + 'ext': 'mp4', + 'title': 'The Slut Maker HMV', + 'description': 'Girls getting converted into cock slaves.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mindself', + 'uploader_id': 'mindself', + 'timestamp': 1617109572, + 'upload_date': '20210330', + 'age_limit': 18, + } + }, { + # 480p only + 'url': 'http://spankbang.com/1vt0/video/solvane+gangbang', + 'only_matching': True, + }, { + # no uploader + 'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2', + 'only_matching': True, + }, { + # mobile page + 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', + 'only_matching': True, + }, { + # 4k + 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', + 
'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/', + 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play', + 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2y3td/embed/', + 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('id_2') + webpage = self._download_webpage( + url.replace('/%s/embed' % video_id, '/%s/video' % video_id), + video_id, headers={'Cookie': 'country=US'}) + + if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + + formats = [] + + def extract_format(format_id, format_url): + f_url = url_or_none(format_url) + if not f_url: + return + f = parse_resolution(format_id) + ext = determine_ext(f_url) + if format_id.startswith('m3u8') or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id.startswith('mpd') or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp4' or f.get('width') or f.get('height'): + f.update({ + 'url': f_url, + 'format_id': format_id, + }) + formats.append(f) + + STREAM_URL_PREFIX = 'stream_url_' + + for mobj in re.finditer( + r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2' + % STREAM_URL_PREFIX, webpage): + extract_format(*mobj.group('id', 'url')) + + if not formats: + stream_key = self._search_regex( + r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'stream key', group='value') + + stream = self._download_json( + 'https://spankbang.com/api/videos/stream', video_id, + 'Downloading stream JSON', data=urlencode_postdata({ + 'id': stream_key, + 'data': 0, + }), headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + for format_id, format_url in stream.items(): + if format_url and isinstance(format_url, list): + format_url = format_url[0] + extract_format(format_id, format_url) + + self._sort_formats(formats) + + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._html_search_regex( + r'(?s)<h1[^>]+\btitle=["\']([^"]+)["\']>', webpage, 'title', default=None) + description = self._search_regex( + r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', + webpage, 'description', default=None) + thumbnail = self._og_search_thumbnail(webpage, default=None) + uploader = self._html_search_regex( + r'<svg[^>]+\bclass="(?:[^"]*?user[^"]*?)">.*?</svg>([^<]+)', webpage, 'uploader', default=None) + uploader_id = self._html_search_regex( + r'<a[^>]+href="/profile/([^"]+)"', webpage, 'uploader_id', default=None) + duration = parse_duration(self._search_regex( + r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', + webpage, 'duration', default=None)) + view_count = str_to_int(self._search_regex( + r'([\d,.]+)\s+plays', webpage, 'view count', default=None)) + + age_limit = self._rta_search(webpage) + + return merge_dicts({ + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': age_limit, + }, info + ) + + +class SpankBangPlaylistIE(InfoExtractor): + 
_VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)' + _TEST = { + 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', + 'info_dict': { + 'id': 'ug0k', + 'title': 'Big Ass Titties', + }, + 'playlist_mincount': 40, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage( + url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) + + entries = [self.url_result( + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] + + title = self._html_search_regex( + r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title', + fatal=False) + + return self.playlist_result(entries, playlist_id, title) diff --git a/yt_dlp/extractor/spankwire.py b/yt_dlp/extractor/spankwire.py new file mode 100644 index 000000000..e97c1d23e --- /dev/null +++ b/yt_dlp/extractor/spankwire.py @@ -0,0 +1,182 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + merge_dicts, + str_or_none, + str_to_int, + url_or_none, +) + + +class SpankwireIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P<id>\d+) + ''' + _TESTS = [{ + # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 + 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', + 'info_dict': { + 'id': '103545', + 'ext': 'mp4', + 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', + 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, + 'uploader': 'oreusz', + 'uploader_id': '124697', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + # download URL pattern: */mp4_<format_id>_<video_id>.mp4 + 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', + 'md5': '09b3c20833308b736ae8902db2f8d7e6', + 'info_dict': { + 'id': '1921551', + 'ext': 'mp4', + 'title': 'Titcums Compiloation I', + 'description': 'cum on tits', + 'uploader': 'dannyh78999', + 'uploader_id': '3056053', + 'upload_date': '20150822', + 'age_limit': 18, + }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) + + title = video['title'] + + formats = [] + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not video_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) + 
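# The */<height>P_<tbr>K_* URL pattern (noted in _TESTS above) also encodes + # the total bitrate and a fallback height. + 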
if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, + }) + + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries + + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), + 'view_count': view_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, + 'formats': formats, + }, info) diff --git a/youtube_dl/extractor/spiegel.py b/yt_dlp/extractor/spiegel.py index 2da32b9b2..2da32b9b2 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/yt_dlp/extractor/spiegel.py diff --git a/yt_dlp/extractor/spiegeltv.py b/yt_dlp/extractor/spiegeltv.py new file mode 100644 index 000000000..6ccf4c342 --- /dev/null +++ b/yt_dlp/extractor/spiegeltv.py @@ -0,0 +1,17 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .nexx import NexxIE + + +class SpiegeltvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', + 'only_matching': True, + } + + def _real_extract(self, url): + return self.url_result( + 'https://api.nexx.cloud/v3/748/videos/byid/%s' + % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/spike.py b/yt_dlp/extractor/spike.py index 5805f3d44..5805f3d44 100644 --- a/youtube_dl/extractor/spike.py +++ b/yt_dlp/extractor/spike.py diff --git a/yt_dlp/extractor/sport5.py b/yt_dlp/extractor/sport5.py new file mode 100644 index 000000000..35c57d62a --- /dev/null 
+++ b/yt_dlp/extractor/sport5.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class Sport5IE(InfoExtractor): + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + media_id = mobj.group('id') + + webpage = self._download_webpage(url, media_id) + + video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') + + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) + + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) + + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) + + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': int(fmt.get('bitrate')), + 'width': int(fmt.get('width')), + 'height': int(fmt.get('height')), + } for fmt in metadata.findall('./PlaybackLinks/FileURL')] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'duration': duration, + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/sportbox.py b/yt_dlp/extractor/sportbox.py index b9017fd2a..b9017fd2a 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/yt_dlp/extractor/sportbox.py diff --git a/yt_dlp/extractor/sportdeutschland.py b/yt_dlp/extractor/sportdeutschland.py new file mode 100644 index 000000000..94bcaba44 --- /dev/null +++ b/yt_dlp/extractor/sportdeutschland.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + parse_iso8601, + parse_qs, + strip_or_none, + try_get, +) + + +class SportDeutschlandIE(InfoExtractor): + _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)' + _TESTS = [{ + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': '5318cac0275701382770543d7edaf0a0', + 'ext': 'mp4', + 'title': 'Re-live: Deutsche Meisterschaften 2020 
- Halbfinals - Teil 1', + 'duration': 16106.36, + }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 'c6e2fdd01f63013854c47054d2ab776f', + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', + 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', + 'duration': 31397, + }, + 'playlist_count': 2, + }, { + 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) + asset = data['asset'] + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), + } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = parse_qs(url).get('playlistId', [None])[0] + if playlist_id: + if self.get_param('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats and not self.get_param('ignore_no_formats'): + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) + else: + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info diff --git a/youtube_dl/extractor/spotify.py b/yt_dlp/extractor/spotify.py index 826f98cff..826f98cff 100644 --- a/youtube_dl/extractor/spotify.py +++ b/yt_dlp/extractor/spotify.py diff --git a/youtube_dl/extractor/spreaker.py b/yt_dlp/extractor/spreaker.py index 6c7e40ae4..6c7e40ae4 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/yt_dlp/extractor/spreaker.py diff --git a/yt_dlp/extractor/springboardplatform.py b/yt_dlp/extractor/springboardplatform.py new file mode 100644 index 000000000..49ac1f559 --- /dev/null +++ b/yt_dlp/extractor/springboardplatform.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, + xpath_element, + unescapeHTML, + unified_timestamp, +) 
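+ +# Each video's metadata lives in an MRSS feed at +# cms.springboardplatform.com/xml_feeds_advanced/index/<index>/rss3/<video_id>; +# its <media:content> element carries the url, duration, bitrate, fileSize, +# width and height attributes, and _real_extract below derives an HLS variant +# by swapping the cdn. host for hls. and appending .m3u8.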
+ + +class SpringboardPlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + cms\.springboardplatform\.com/ + (?: + (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)| + xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) + ) + ''' + _TESTS = [{ + 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', + 'md5': '5c3cb7b5c55740d482561099e920f192', + 'info_dict': { + 'id': '981017', + 'ext': 'mp4', + 'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1409132328, + 'upload_date': '20140827', + 'duration': 193, + }, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('id_2') + index = mobj.group('index') or mobj.group('index_2') + + video = self._download_xml( + 'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s' + % (index, video_id), video_id) + + item = xpath_element(video, './/item', 'item', fatal=True) + + content = xpath_element( + item, './{http://search.yahoo.com/mrss/}content', 'content', + fatal=True) + title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True)) + + video_url = content.attrib['url'] + + if 'error_video.mp4' in video_url: + raise ExtractorError( + 'Video %s no longer exists' % video_id, expected=True) + + duration = int_or_none(content.get('duration')) + tbr = int_or_none(content.get('bitrate')) + filesize = int_or_none(content.get('fileSize')) + width = int_or_none(content.get('width')) + height = int_or_none(content.get('height')) + + description = unescapeHTML(xpath_text( + item, './description', 'description')) + thumbnail = xpath_attr( + item, './{http://search.yahoo.com/mrss/}thumbnail', 'url', + 'thumbnail') + + timestamp = unified_timestamp(xpath_text( + item, './{http://cms.springboardplatform.com/namespaces.html}created', + 'timestamp')) + + formats = [{ + 'url': video_url, + 'format_id': 'http', + 'tbr': tbr, + 'filesize': filesize, + 'width': width, + 'height': height, + }] + + m3u8_format = formats[0].copy() + m3u8_format.update({ + 'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8', + 'ext': 'mp4', + 'format_id': 'hls', + 'protocol': 'm3u8_native', + }) + formats.append(m3u8_format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/sprout.py b/yt_dlp/extractor/sprout.py index e243732f2..e243732f2 100644 --- a/youtube_dl/extractor/sprout.py +++ b/yt_dlp/extractor/sprout.py diff --git a/yt_dlp/extractor/srgssr.py b/yt_dlp/extractor/srgssr.py new file mode 100644 index 000000000..cbc1c47d2 --- /dev/null +++ b/yt_dlp/extractor/srgssr.py @@ -0,0 +1,255 @@ 
+# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + qualities, + try_get, +) + + +class SRGSSRIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P<bu> + srf|rts|rsi|rtr|swi + ):(?:[^:]+:)? + (?P<type> + video|audio + ): + (?P<id> + [0-9a-f\-]{36}|\d+ + ) + ''' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['CH'] + + _ERRORS = { + 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', + 'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.', + # 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.', + 'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.', + 'LEGAL': 'The video cannot be transmitted for legal reasons.', + 'STARTDATE': 'This video is not yet available. Please try again later.', + } + _DEFAULT_LANGUAGE_CODES = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } + + def _get_tokenized_src(self, url, video_id, format_id): + token = self._download_json( + 'http://tp.srgssr.ch/akahd/token?acl=*', + video_id, 'Downloading %s token' % format_id, fatal=False) or {} + auth_params = try_get(token, lambda x: x['token']['authparams']) + if auth_params: + url += ('?' if '?' not in url else '&') + auth_params + return url + + def _get_media_data(self, bu, media_type, media_id): + query = {'onlyChapters': True} if media_type == 'video' else {} + full_media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' + % (bu, media_type, media_id), + media_id, query=query)['chapterList'] + try: + media_data = next( + x for x in full_media_data if x.get('id') == media_id) + except StopIteration: + raise ExtractorError('No media information found') + + block_reason = media_data.get('blockReason') + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + + return media_data + + def _real_extract(self, url): + bu, media_type, media_id = self._match_valid_url(url).groups() + media_data = self._get_media_data(bu, media_type, media_id) + title = media_data['title'] + + formats = [] + subtitles = {} + q = qualities(['SD', 'HD']) + for source in (media_data.get('resourceList') or []): + format_url = source.get('url') + if not format_url: + continue + protocol = source.get('protocol') + quality = source.get('quality') + format_id = [] + for e in (protocol, source.get('encoding'), quality): + if e: + format_id.append(e) + format_id = '-'.join(format_id) + + if protocol in ('HDS', 'HLS'): + if source.get('tokenType') == 'AKAMAI': + format_url = self._get_tokenized_src( + format_url, media_id, format_id) + fmts, subs = self._extract_akamai_formats_and_subtitles( + format_url, media_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif protocol == 'HLS': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif protocol in 
('HTTP', 'HTTPS'): + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'quality': q(quality), + }) + + # This is needed because for audio media the podcast URL is almost + # always included, even if it covers only an audio segment and not + # the whole episode. + if int_or_none(media_data.get('position')) == 0: + for p in ('S', 'H'): + podcast_url = media_data.get('podcast%sdUrl' % p) + if not podcast_url: + continue + quality = p + 'D' + formats.append({ + 'format_id': 'PODCAST-' + quality, + 'url': podcast_url, + 'quality': q(quality), + }) + self._sort_formats(formats) + + if media_type == 'video': + for sub in (media_data.get('subtitleList') or []): + sub_url = sub.get('url') + if not sub_url: + continue + lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + return { + 'id': media_id, + 'title': title, + 'description': media_data.get('description'), + 'timestamp': parse_iso8601(media_data.get('date')), + 'thumbnail': media_data.get('imageUrl'), + 'duration': float_or_none(media_data.get('duration'), 1000), + 'subtitles': subtitles, + 'formats': formats, + } + + +class SRGSSRPlayIE(InfoExtractor): + IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|play)\.)? + (?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/ + (?: + [^/]+/(?P<type>video|audio)/[^?]+| + popup(?P<type_2>video|audio)player + ) + \?.*?\b(?:id=|urn=urn:[^:]+:video:)(?P<id>[0-9a-f\-]{36}|\d+) + ''' + + _TESTS = [{ + 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'md5': '6db2226ba97f62ad42ce09783680046c', + 'info_dict': { + 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'ext': 'mp4', + 'upload_date': '20130701', + 'title': 'Snowden beantragt Asyl in Russland', + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'info_dict': { + 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'ext': 'mp3', + 'upload_date': '20151013', + 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', + 'timestamp': 1444709160, + 'duration': 336.816, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', + 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', + 'info_dict': { + 'id': '6348260', + 'display_id': '6348260', + 'ext': 'mp4', + 'duration': 1796.76, + 'title': 'Le 19h30', + 'upload_date': '20141201', + 'timestamp': 1417458600, + 'thumbnail': r're:^https?://.*\.image', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', + 
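# popup player URL: only the id query parameter identifies the media + 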
'only_matching': True, + }, { + 'url': 'https://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?urn=urn:srf:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }, { + 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', + 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + bu = mobj.group('bu') + media_type = mobj.group('type') or mobj.group('type_2') + media_id = mobj.group('id') + return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/youtube_dl/extractor/srmediathek.py b/yt_dlp/extractor/srmediathek.py index 359dadaa3..359dadaa3 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/yt_dlp/extractor/srmediathek.py diff --git a/yt_dlp/extractor/stanfordoc.py b/yt_dlp/extractor/stanfordoc.py new file mode 100644 index 000000000..0003075ac --- /dev/null +++ b/yt_dlp/extractor/stanfordoc.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, +) + + +class StanfordOpenClassroomIE(InfoExtractor): + IE_NAME = 'stanfordoc' + IE_DESC = 'Stanford Open ClassRoom' + _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + _TEST = { + 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', + 'md5': '544a9468546059d4e80d76265b0443b8', + 'info_dict': { + 'id': 'PracticalUnix_intro-environment', + 'ext': 'mp4', + 'title': 'Intro Environment', + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + if mobj.group('course') and mobj.group('video'): # A specific video + course = mobj.group('course') + video = mobj.group('video') + info = { + 'id': course + '_' + video, + 'uploader': None, + 'upload_date': None, + } + + baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' + xmlUrl = baseUrl + video + '.xml' + mdoc = self._download_xml(xmlUrl, info['id']) + try: + info['title'] = mdoc.findall('./title')[0].text + info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text + except IndexError: + raise ExtractorError('Invalid metadata XML file') + return info + elif mobj.group('course'): # A course page + course = mobj.group('course') + info = { + 'id': course, + '_type': 'playlist', + 'uploader': None, + 'upload_date': None, + } + + coursepage = self._download_webpage( + url, info['id'], + note='Downloading course info page', + errnote='Unable to download course info page') + + info['title'] = self._html_search_regex( + r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) + + info['description'] = self._html_search_regex( + r'(?s)<description>([^<]+)</description>', + coursepage, 'description', fatal=False) + + links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info + else: # Root page + info = { + 'id': 'Stanford OpenClassroom', + '_type': 'playlist', + 'uploader': None, + 'upload_date': None, + } + info['title'] = 
info['id'] + + rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' + rootpage = self._download_webpage(rootURL, info['id'], + errnote='Unable to download course info page') + + links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info diff --git a/yt_dlp/extractor/startv.py b/yt_dlp/extractor/startv.py new file mode 100644 index 000000000..411320ede --- /dev/null +++ b/yt_dlp/extractor/startv.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + traverse_obj, + int_or_none, +) + + +class StarTVIE(InfoExtractor): + _VALID_URL = r"""(?x) + https?://(?:www\.)?startv\.com\.tr/ + (?: + (?:dizi|program)/(?:[^/?#&]+)/(?:bolumler|fragmanlar|ekstralar)| + video/arsiv/(?:dizi|program)/(?:[^/?#&]+) + )/ + (?P<id>[^/?#&]+) + """ + IE_NAME = 'startv' + _TESTS = [ + { + 'url': 'https://www.startv.com.tr/dizi/cocuk/bolumler/3-bolum', + 'md5': '72381a32bcc2e2eb5841e8c8bf68f127', + 'info_dict': { + 'id': '904972', + 'display_id': '3-bolum', + 'ext': 'mp4', + 'title': '3. Bölüm', + 'description': 'md5:3a8049f05a75c2e8747116a673275de4', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', + 'timestamp': 1569281400, + 'upload_date': '20190923' + }, + }, + { + 'url': 'https://www.startv.com.tr/video/arsiv/dizi/avlu/44-bolum', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/dizi/cocuk/fragmanlar/5-bolum-fragmani', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/dizi/cocuk/ekstralar/5-bolumun-nefes-kesen-final-sahnesi', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/program/burcu-ile-haftasonu/bolumler/1-bolum', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/program/burcu-ile-haftasonu/fragmanlar/2-fragman', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/video/arsiv/program/buyukrisk/14-bolumde-hangi-unlu-ne-sordu-', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/video/arsiv/program/buyukrisk/buyuk-risk-334-bolum', + 'only_matching': True + }, + { + 'url': 'https://www.startv.com.tr/video/arsiv/program/dada/dada-58-bolum', + 'only_matching': True + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info_url = self._search_regex( + r'(["\'])videoUrl\1\s*:\s*\1(?P<url>(?:(?!\1).)+)\1\s*', + webpage, 'video info url', group='url') + + info = traverse_obj(self._download_json(info_url, display_id), 'data', expected_type=dict) + if not info: + raise ExtractorError('Failed to extract API data') + + video_id = compat_str(info.get('id')) + title = info.get('title') or self._og_search_title(webpage) + description = clean_html(info.get('description')) or self._og_search_description(webpage, default=None) + thumbnail = self._proto_relative_url( + self._og_search_thumbnail(webpage), scheme='http:') + + formats = self._extract_m3u8_formats( + traverse_obj(info, ('flavors', 'hls')), video_id, entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': int_or_none(info.get('release_date')), + 'formats': formats + } diff --git 
a/yt_dlp/extractor/steam.py b/yt_dlp/extractor/steam.py new file mode 100644 index 000000000..7f777c40b --- /dev/null +++ b/yt_dlp/extractor/steam.py @@ -0,0 +1,149 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + ExtractorError, + get_element_by_class, + js_to_json, +) + + +class SteamIE(InfoExtractor): + _VALID_URL = r"""(?x) + https?://store\.steampowered\.com/ + (agecheck/)? + (?P<urltype>video|app)/ #If the page is only for videos or for a game + (?P<gameID>\d+)/? + (?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID + | + https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+) + """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' + _TESTS = [{ + 'url': 'http://store.steampowered.com/video/105600/', + 'playlist': [ + { + 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', + 'info_dict': { + 'id': '2040428', + 'ext': 'mp4', + 'title': 'Terraria 1.3 Trailer', + 'playlist_index': 1, + } + }, + { + 'md5': '911672b20064ca3263fa89650ba5a7aa', + 'info_dict': { + 'id': '2029566', + 'ext': 'mp4', + 'title': 'Terraria 1.2 Trailer', + 'playlist_index': 2, + } + } + ], + 'info_dict': { + 'id': '105600', + 'title': 'Terraria', + }, + 'params': { + 'playlistend': 2, + } + }, { + 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', + 'info_dict': { + 'id': 'X8kpJBlzD2E', + 'ext': 'mp4', + 'upload_date': '20140617', + 'title': 'FRONTIERS - Trapping', + 'description': 'md5:bf6f7f773def614054089e5769c12a6e', + 'uploader': 'AAD Productions', + 'uploader_id': 'AtomicAgeDogGames', + } + }] + + def _real_extract(self, url): + m = self._match_valid_url(url) + fileID = m.group('fileID') + if fileID: + videourl = url + playlist_id = fileID + else: + gameID = m.group('gameID') + playlist_id = gameID + videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + + self._set_cookie('steampowered.com', 'mature_content', '1') + + webpage = self._download_webpage(videourl, playlist_id) + + if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % playlist_id + self.report_age_confirmation() + webpage = self._download_webpage(videourl, playlist_id) + + flash_vars = self._parse_json(self._search_regex( + r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, + 'flash vars'), playlist_id, js_to_json) + + playlist_title = None + entries = [] + if fileID: + playlist_title = get_element_by_class('workshopItemTitle', webpage) + for movie in flash_vars.values(): + if not movie: + continue + youtube_id = movie.get('YOUTUBE_VIDEO_ID') + if not youtube_id: + continue + entries.append({ + '_type': 'url', + 'url': youtube_id, + 'ie_key': 'Youtube', + }) + else: + playlist_title = get_element_by_class('apphub_AppName', webpage) + for movie_id, movie in flash_vars.items(): + if not movie: + continue + video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) + title = movie.get('MOVIE_NAME') + if not title or not video_id: + continue + entry = { + 'id': video_id, + 'title': title.replace('+', ' '), + } + formats = [] + flv_url = movie.get('FILENAME') + if flv_url: + formats.append({ + 'format_id': 'flv', + 'url': flv_url, + }) + highlight_element = self._search_regex( + r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, + 
webpage, 'highlight element', fatal=False) + if highlight_element: + highlight_attribs = extract_attributes(highlight_element) + if highlight_attribs: + entry['thumbnail'] = highlight_attribs.get('data-poster') + for quality in ('', '-hd'): + for ext in ('webm', 'mp4'): + video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) + if video_url: + formats.append({ + 'format_id': ext + quality, + 'url': video_url, + }) + if not formats and not self.get_param('ignore_no_formats'): + continue + entry['formats'] = formats + entries.append(entry) + if not entries: + raise ExtractorError('Could not find any videos') + + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/stitcher.py b/yt_dlp/extractor/stitcher.py index 822782507..822782507 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/yt_dlp/extractor/stitcher.py diff --git a/youtube_dl/extractor/storyfire.py b/yt_dlp/extractor/storyfire.py index 9c698626f..9c698626f 100644 --- a/youtube_dl/extractor/storyfire.py +++ b/yt_dlp/extractor/storyfire.py diff --git a/yt_dlp/extractor/streamable.py b/yt_dlp/extractor/streamable.py new file mode 100644 index 000000000..808129649 --- /dev/null +++ b/yt_dlp/extractor/streamable.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + try_get, + parse_codecs, +) + + +class StreamableIE(InfoExtractor): + _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://streamable.com/dnd1', + 'md5': '3e3bc5ca088b48c2d436529b64397fef', + 'info_dict': { + 'id': 'dnd1', + 'ext': 'mp4', + 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', + 'thumbnail': r're:https?://.*\.jpg$', + 'uploader': 'teabaker', + 'timestamp': 1454964157.35115, + 'upload_date': '20160208', + 'duration': 61.516, + 'view_count': int, + } + }, + # older video without bitrate, width/height, codecs, etc. info + { + 'url': 'https://streamable.com/moo', + 'md5': '2cf6923639b87fba3279ad0df3a64e73', + 'info_dict': { + 'id': 'moo', + 'ext': 'mp4', + 'title': '"Please don\'t eat me!"', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1426115495, + 'upload_date': '20150311', + 'duration': 12, + 'view_count': int, + } + }, + { + 'url': 'https://streamable.com/e/dnd1', + 'only_matching': True, + }, + { + 'url': 'https://streamable.com/s/okkqk/drxjds', + 'only_matching': True, + } + ] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', + webpage) + if mobj: + return mobj.group('src') + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Note: using the ajax API, as the public Streamable API doesn't always + # return video info such as the title and doesn't include the video + # duration + video = self._download_json( + 'https://ajax.streamable.com/videos/%s' % video_id, video_id) + + # Status values: + # 0 The video is being uploaded + # 1 The video is being processed + # 2 The video has at least one file ready + # 3 The video is unavailable due to an error + status = video.get('status') + if status != 2: + raise ExtractorError( + 'This video is currently unavailable. 
It may still be uploading or processing.', + expected=True) + + title = video.get('reddit_title') or video['title'] + + formats = [] + for key, info in video['files'].items(): + if not info.get('url'): + continue + formats.append({ + 'format_id': key, + 'url': self._proto_relative_url(info['url']), + 'width': int_or_none(info.get('width')), + 'height': int_or_none(info.get('height')), + 'filesize': int_or_none(info.get('size')), + 'fps': int_or_none(info.get('framerate')), + 'vbr': float_or_none(info.get('bitrate'), 1000), + 'vcodec': parse_codecs(try_get(info, lambda x: x['input_metadata']['video_codec_name'])).get('vcodec'), + 'acodec': parse_codecs(try_get(info, lambda x: x['input_metadata']['audio_codec_name'])).get('acodec'), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), + 'uploader': video.get('owner', {}).get('user_name'), + 'timestamp': float_or_none(video.get('date_added')), + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('plays')), + 'formats': formats + } diff --git a/yt_dlp/extractor/streamanity.py b/yt_dlp/extractor/streamanity.py new file mode 100644 index 000000000..2e2d5eedf --- /dev/null +++ b/yt_dlp/extractor/streamanity.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class StreamanityIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamanity\.com/video/(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'https://streamanity.com/video/9DFPTnuYi8f2', + 'md5': '6ab171e8d4a02ad5dcbff6bea44cf5a1', + 'info_dict': { + 'id': '9DFPTnuYi8f2', + 'ext': 'mp4', + 'title': 'Bitcoin vs The Lighting Network', + 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png', + 'description': '', + 'uploader': 'Tom Bombadil (Freddy78)', + } + }, { + 'url': 'https://streamanity.com/video/JktOUjSlfzTD', + 'md5': '31f131e28abd3377c38be586a59532dc', + 'info_dict': { + 'id': 'JktOUjSlfzTD', + 'ext': 'mp4', + 'title': 'Share data when you see it', + 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png', + 'description': 'Reposting as data should be public and stored on blockchain', + 'uploader': 'digitalcurrencydaily', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_json( + f'https://app.streamanity.com/api/video/{video_id}', video_id)['data']['video'] + + formats = self._extract_m3u8_formats( + f'https://stream.mux.com/{video_info["play_id"]}.m3u8?token={video_info["token"]}', + video_id, ext='mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'description': video_info.get('description'), + 'uploader': video_info.get('author_name'), + 'is_live': False, + 'thumbnail': video_info.get('thumb'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/streamcloud.py b/yt_dlp/extractor/streamcloud.py index b97bb4374..b97bb4374 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/yt_dlp/extractor/streamcloud.py diff --git a/youtube_dl/extractor/streamcz.py b/yt_dlp/extractor/streamcz.py index 58e0b4c80..58e0b4c80 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/yt_dlp/extractor/streamcz.py diff --git a/youtube_dl/extractor/streetvoice.py b/yt_dlp/extractor/streetvoice.py index f21681ae7..f21681ae7 100644 --- a/youtube_dl/extractor/streetvoice.py +++ b/yt_dlp/extractor/streetvoice.py diff --git 
a/youtube_dl/extractor/stretchinternet.py b/yt_dlp/extractor/stretchinternet.py index ec08eae55..ec08eae55 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/yt_dlp/extractor/stretchinternet.py diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py new file mode 100644 index 000000000..d36a4b6e9 --- /dev/null +++ b/yt_dlp/extractor/stv.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + compat_str, + float_or_none, + int_or_none, + smuggle_url, + str_or_none, + try_get, +) + + +class STVPlayerIE(InfoExtractor): + IE_NAME = 'stv:player' + _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' + _TESTS = [{ + # shortform + 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', + 'md5': '5adf9439c31d554f8be0707c7abe7e0a', + 'info_dict': { + 'id': '5333973339001', + 'ext': 'mp4', + 'upload_date': '20170301', + 'title': '60 seconds on set with Laura Norton', + 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!", + 'timestamp': 1488388054, + 'uploader_id': '1486976045', + }, + 'skip': 'this resource is unavailable outside of the UK', + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' + _PTYPE_MAP = { + 'episode': 'episodes', + 'video': 'shortform', + } + + def _real_extract(self, url): + ptype, video_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + + video = result['video'] + video_id = compat_str(video['id']) + + subtitles = {} + _subtitles = result.get('_subtitles') or {} + for ext, sub_url in _subtitles.items(): + subtitles.setdefault('en', []).append({ + 'ext': 'vtt' if ext == 'webvtt' else ext, + 'url': sub_url, + }) + + programme = result.get('programme') or {} + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), + 'description': result.get('summary'), + 'duration': float_or_none(video.get('length'), 1000), + 'subtitles': subtitles, + 'view_count': int_or_none(result.get('views')), + 'series': programme.get('name') or programme.get('shortName'), + 'ie_key': 'BrightcoveNew', + } diff --git a/youtube_dl/extractor/sunporno.py b/yt_dlp/extractor/sunporno.py index 68051169b..68051169b 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/yt_dlp/extractor/sunporno.py diff --git a/youtube_dl/extractor/sverigesradio.py b/yt_dlp/extractor/sverigesradio.py index 
aa0691f0d..aa0691f0d 100644 --- a/youtube_dl/extractor/sverigesradio.py +++ b/yt_dlp/extractor/sverigesradio.py diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py new file mode 100644 index 000000000..38e0086b3 --- /dev/null +++ b/yt_dlp/extractor/svt.py @@ -0,0 +1,425 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + dict_get, + int_or_none, + unified_timestamp, + str_or_none, + strip_or_none, + try_get, +) + + +class SVTBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['SE'] + + def _extract_video(self, video_info, video_id): + is_live = dict_get(video_info, ('live', 'simulcast'), default=False) + m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' + formats = [] + for vr in video_info['videoReferences']: + player_type = vr.get('playerType') or vr.get('format') + vurl = vr['url'] + ext = determine_ext(vurl) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + vurl, video_id, + ext='mp4', entry_protocol=m3u8_protocol, + m3u8_id=player_type, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + vurl + '?hdcore=3.3.0', video_id, + f4m_id=player_type, fatal=False)) + elif ext == 'mpd': + if player_type == 'dashhbbtv': + formats.extend(self._extract_mpd_formats( + vurl, video_id, mpd_id=player_type, fatal=False)) + else: + formats.append({ + 'format_id': player_type, + 'url': vurl, + }) + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): + self.raise_geo_restricted( + 'This video is only available in Sweden', + countries=self._GEO_COUNTRIES, metadata_available=True) + self._sort_formats(formats) + + subtitles = {} + subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) + if isinstance(subtitle_references, list): + for sr in subtitle_references: + subtitle_url = sr.get('url') + subtitle_lang = sr.get('language', 'sv') + if subtitle_url: + if determine_ext(subtitle_url) == 'm3u8': + # TODO(yan12125): handle WebVTT in m3u8 manifests + continue + + subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) + + title = video_info.get('title') + + series = video_info.get('programTitle') + season_number = int_or_none(video_info.get('season')) + episode = video_info.get('episodeTitle') + episode_number = int_or_none(video_info.get('episodeNumber')) + + timestamp = unified_timestamp(rights.get('validFrom')) + duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) + age_limit = None + adult = dict_get( + video_info, ('inappropriateForChildren', 'blockedForChildren'), + skip_false_values=False) + if adult is not None: + age_limit = 18 if adult else 0 + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'is_live': is_live, + } + + +class SVTIE(SVTBaseIE): + _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' + _TEST = { + 'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', + 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', + 'info_dict': { + 'id': '2900353', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 
'duration': 27, + 'age_limit': 0, + }, + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + widget_id = mobj.group('widget_id') + article_id = mobj.group('id') + + info = self._download_json( + 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), + article_id) + + info_dict = self._extract_video(info['video'], article_id) + info_dict['title'] = info['context']['title'] + return info_dict + + +class SVTPlayBaseIE(SVTBaseIE): + _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n' + + +class SVTPlayIE(SVTPlayBaseIE): + IE_DESC = 'SVT Play and Öppet arkiv' + _VALID_URL = r'''(?x) + (?: + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| + https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) + (?:.*?(?:modalId|id)=(?P<modal_id>[\da-zA-Z-]+))? + ) + ''' + _TESTS = [{ + 'url': 'https://www.svtplay.se/video/30479064', + 'md5': '2382036fd6f8c994856c323fe51c426e', + 'info_dict': { + 'id': '8zVbDPA', + 'ext': 'mp4', + 'title': 'Designdrömmar i Stenungsund', + 'timestamp': 1615770000, + 'upload_date': '20210315', + 'duration': 3519, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', + 'age_limit': 0, + 'subtitles': { + 'sv': [{ + 'ext': 'vtt', + }] + }, + }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, + }, { + 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', + 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, + }, { + # geo restricted to Sweden + 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', + 'only_matching': True, + }, { + 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', + 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/kanaler/svt1', + 'only_matching': True, + }, { + 'url': 'svt:1376446-003A', + 'only_matching': True, + }, { + 'url': 'svt:14278044', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, + }] + + def _adjust_title(self, info): + if info['is_live']: + info['title'] = self._live_title(info['title']) + + def _extract_by_video_id(self, video_id, webpage=None): + data = self._download_json( + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + title = dict_get(info_dict, ('episode', 'series')) + if not title and webpage: + title = re.sub( + r'\s*\|\s*.+?$', '', self._og_search_title(webpage)) + if not title: + title = video_id + info_dict['title'] = title + self._adjust_title(info_dict) + return info_dict + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + svt_id = mobj.group('svt_id') or mobj.group('modal_id') + + if svt_id: + 
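+ # A direct id (an svt:<id> URL or a modalId=/id= query parameter) resolves
+ # straight through the videoplayer API (https://api.svt.se/videoplayer-api/video/<id>),
+ # so the page itself never needs to be fetched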
return self._extract_by_video_id(svt_id) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + self._SVTPLAY_RE, webpage, 'embedded data', default='{}', + group='json'), + video_id, fatal=False) + + thumbnail = self._og_search_thumbnail(webpage) + + if data: + video_info = try_get( + data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], + dict) + if video_info: + info_dict = self._extract_video(video_info, video_id) + info_dict.update({ + 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], + 'thumbnail': thumbnail, + }) + self._adjust_title(info_dict) + return info_dict + + svt_id = try_get( + data, lambda x: x['statistics']['dataLake']['content']['id'], + compat_str) + + if not svt_id: + svt_id = self._search_regex( + (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), + r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', + r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), + webpage, 'video id') + + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict + + +class SVTSeriesIE(SVTPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?' + _TESTS = [{ + 'url': 'https://www.svtplay.se/rederiet', + 'info_dict': { + 'id': '14445680', + 'title': 'Rederiet', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', + }, + 'playlist_mincount': 318, + }, { + 'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680', + 'info_dict': { + 'id': 'season-2-14445680', + 'title': 'Rederiet - Säsong 2', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', + }, + 'playlist_mincount': 12, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) + + def _real_extract(self, url): + series_slug, season_id = self._match_valid_url(url).groups() + + series = self._download_json( + 'https://api.svt.se/contento/graphql', series_slug, + 'Downloading series page', query={ + 'query': '''{ + listablesBySlug(slugs: ["%s"]) { + associatedContent(include: [productionPeriod, season]) { + items { + item { + ... 
on Episode { + videoSvtId + } + } + } + id + name + } + id + longDescription + name + shortDescription + } +}''' % series_slug, + })['data']['listablesBySlug'][0] + + season_name = None + + entries = [] + for season in series['associatedContent']: + if not isinstance(season, dict): + continue + if season_id: + if season.get('id') != season_id: + continue + season_name = season.get('name') + items = season.get('items') + if not isinstance(items, list): + continue + for item in items: + video = item.get('item') or {} + content_id = video.get('videoSvtId') + if not content_id or not isinstance(content_id, compat_str): + continue + entries.append(self.url_result( + 'svt:' + content_id, SVTPlayIE.ie_key(), content_id)) + + title = series.get('name') + season_name = season_name or season_id + + if title and season_name: + title = '%s - %s' % (title, season_name) + elif season_id: + title = season_id + + return self.playlist_result( + entries, season_id or series.get('id'), title, + dict_get(series, ('longDescription', 'shortDescription'))) + + +class SVTPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))' + _TESTS = [{ + 'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa', + 'info_dict': { + 'id': '25298267', + 'title': 'Bakom masken – Lehners kamp mot mental ohälsa', + }, + 'playlist_count': 4, + }, { + 'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien', + 'info_dict': { + 'id': '24243746', + 'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien', + }, + 'playlist_count': 2, + }, { + # only programTitle + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '8439V2K', + 'ext': 'mp4', + 'title': 'Stjärnorna skojar till det - under SVT-intervjun', + 'duration': 27, + 'age_limit': 0, + }, + }, { + 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1', + 'only_matching': True, + }, { + 'url': 'https://www.svt.se/vader/manadskronikor/maj2018', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + + def _real_extract(self, url): + path, display_id = self._match_valid_url(url).groups() + + article = self._download_json( + 'https://api.svt.se/nss-api/page/' + path, display_id, + query={'q': 'articles'})['articles']['content'][0] + + entries = [] + + def _process_content(content): + if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'): + video_id = compat_str(content['image']['svtId']) + entries.append(self.url_result( + 'svt:' + video_id, SVTPlayIE.ie_key(), video_id)) + + for media in article.get('media', []): + _process_content(media) + + for obj in article.get('structuredBody', []): + _process_content(obj.get('content') or {}) + + return self.playlist_result( + entries, str_or_none(article.get('id')), + strip_or_none(article.get('title'))) diff --git a/youtube_dl/extractor/swrmediathek.py b/yt_dlp/extractor/swrmediathek.py index 0f615979e..0f615979e 100644 --- a/youtube_dl/extractor/swrmediathek.py +++ b/yt_dlp/extractor/swrmediathek.py diff --git a/youtube_dl/extractor/syfy.py b/yt_dlp/extractor/syfy.py index def7e5a2c..def7e5a2c 100644 --- a/youtube_dl/extractor/syfy.py +++ b/yt_dlp/extractor/syfy.py diff --git a/youtube_dl/extractor/sztvhu.py b/yt_dlp/extractor/sztvhu.py index cfad33146..cfad33146 100644 --- a/youtube_dl/extractor/sztvhu.py +++ 
b/yt_dlp/extractor/sztvhu.py diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py new file mode 100644 index 000000000..25c200455 --- /dev/null +++ b/yt_dlp/extractor/tagesschau.py @@ -0,0 +1,311 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + parse_iso8601, + parse_filesize, +) + + +class TagesschauPlayerIE(InfoExtractor): + IE_NAME = 'tagesschau:player' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' + + _TESTS = [{ + 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'md5': '8d09548d5c15debad38bee3a4d15ca21', + 'info_dict': { + 'id': '179517', + 'ext': 'mp4', + 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', + 'thumbnail': r're:^https?:.*\.jpg$', + 'formats': 'mincount:6', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'info_dict': { + 'id': '29417', + 'ext': 'mp3', + 'title': 'Trabi - Bye, bye Rennpappe', + 'thumbnail': r're:^https?:.*\.jpg$', + 'formats': 'mincount:2', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', + 'only_matching': True, + }] + + _FORMATS = { + 'xs': {'quality': 0}, + 's': {'width': 320, 'height': 180, 'quality': 1}, + 'm': {'width': 512, 'height': 288, 'quality': 2}, + 'l': {'width': 960, 'height': 540, 'quality': 3}, + 'xl': {'width': 1280, 'height': 720, 'quality': 4}, + 'xxl': {'quality': 5}, + } + + def _extract_via_api(self, kind, video_id): + info = self._download_json( + 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), + video_id) + title = info['headline'] + formats = [] + for media in info['mediadata']: + for format_id, format_url in media.items(): + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none' if kind == 'audio' else None, + }) + self._sort_formats(formats) + timestamp = parse_iso8601(info.get('date')) + return { + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'formats': formats, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + # kind = mobj.group('kind').lower() + # if kind == 'video': + # return self._extract_via_api(kind, video_id) + + # JSON api does not provide some audio formats (e.g. 
ogg) thus + # extracting audio via webpage + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + formats = [] + + for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): + media = self._parse_json(js_to_json(media_json), video_id, fatal=False) + if not media: + continue + src = media.get('src') + if not src: + continue + quality = media.get('quality') + kind = media.get('type', '').split('/')[0] + ext = determine_ext(src) + f = { + 'url': src, + 'format_id': '%s_%s' % (quality, ext) if quality else ext, + 'ext': ext, + 'vcodec': 'none' if kind == 'audio' else None, + } + f.update(self._FORMATS.get(quality, {})) + formats.append(f) + + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + + +class TagesschauIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' + + _TESTS = [{ + 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', + 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', + 'info_dict': { + 'id': 'video-102143', + 'ext': 'mp4', + 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'description': '18.07.2015 20:10 Uhr', + 'thumbnail': r're:^https?:.*\.jpg$', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', + 'md5': '3c54c1f6243d279b706bde660ceec633', + 'info_dict': { + 'id': 'ts-5727', + 'ext': 'mp4', + 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', + 'description': 'md5:695c01bfd98b7e313c501386327aea59', + 'thumbnail': r're:^https?:.*\.jpg$', + }, + }, { + # exclusive audio + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'info_dict': { + 'id': 'audio-29417', + 'ext': 'mp3', + 'title': 'Trabi - Bye, bye Rennpappe', + 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', + 'thumbnail': r're:^https?:.*\.jpg$', + }, + }, { + # audio in article + 'url': 'http://www.tagesschau.de/inland/bnd-303.html', + 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'info_dict': { + 'id': 'bnd-303', + 'ext': 'mp3', + 'title': 'Viele Baustellen für neuen BND-Chef', + 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', + 'thumbnail': r're:^https?:.*\.jpg$', + }, + }, { + 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', + 'info_dict': { + 'id': 'afd-parteitag-135', + 'title': 'Möchtegern-Underdog mit Machtanspruch', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/100sekunden/index.html', + 'only_matching': True, + }, { + # playlist article with collapsing sections + 'url': 
'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) + + def _extract_formats(self, download_text, media_kind): + links = re.finditer( + r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', + download_text) + formats = [] + for l in links: + link_url = l.group('url') + if not link_url: + continue + format_id = self._search_regex( + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) + format = { + 'format_id': format_id, + 'url': l.group('url'), + 'format_name': l.group('name'), + } + title = l.group('title') + if title: + if media_kind.lower() == 'video': + m = re.match( + r'''(?x) + Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; + (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; + (?P<vbr>[0-9]+)kbps&\#10; + Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', + title) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + else: + m = re.match( + r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', + title) + if m: + format.update({ + 'format_note': '%s, %s' % (m.group('format'), m.group('note')), + 'vcodec': 'none', + 'abr': int(m.group('abr')), + }) + formats.append(format) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('path') + display_id = video_id.lstrip('-') + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'<span[^>]*class="headline"[^>]*>(.+?)</span>', + webpage, 'title', default=None) or self._og_search_title(webpage) + + DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' + + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( + r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, + webpage), 1): + entries.append({ + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, + 'formats': self._extract_formats(download_text, media_kind), + }) + if len(entries) > 1: + return self.playlist_result(entries, display_id, title) + formats = entries[0]['formats'] + else: # Assume single video + download_text = self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') + formats = self._extract_formats(download_text, media_kind) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)<p class="teasertext">(.*?)</p>', + webpage, 'description', default=None) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + 'description': description, + } diff --git 
a/youtube_dl/extractor/tass.py b/yt_dlp/extractor/tass.py index 6d336da78..6d336da78 100644 --- a/youtube_dl/extractor/tass.py +++ b/yt_dlp/extractor/tass.py diff --git a/yt_dlp/extractor/tastytrade.py b/yt_dlp/extractor/tastytrade.py new file mode 100644 index 000000000..7fe96bd5f --- /dev/null +++ b/yt_dlp/extractor/tastytrade.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TastyTradeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', + 'info_dict': { + 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', + 'ext': 'mp4', + 'title': 'A History of Teaming', + 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + 'duration': 422.255, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex( + r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', + webpage, 'ooyala code', group='code') + + info = self._search_json_ld(webpage, display_id, fatal=False) + info.update({ + '_type': 'url_transparent', + 'ie_key': OoyalaIE.ie_key(), + 'url': 'ooyala:%s' % ooyala_code, + 'display_id': display_id, + }) + return info diff --git a/yt_dlp/extractor/tbs.py b/yt_dlp/extractor/tbs.py new file mode 100644 index 000000000..c7d62ff4e --- /dev/null +++ b/yt_dlp/extractor/tbs.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + float_or_none, + int_or_none, + strip_or_none, +) + + +class TBSIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|watchtnt|watchtbs|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))' + _TESTS = [{ + 'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster', + 'info_dict': { + 'id': '8d384cde33b89f3a43ce5329de42903ed5099887', + 'ext': 'mp4', + 'title': 'Monster', + 'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.', + 'timestamp': 1508175329, + 'upload_date': '20171016', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew', + 'only_matching': True, + }, { + 'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope', + 'only_matching': True, + }] + + def _real_extract(self, url): + site, path, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', + webpage, 'drupal setting'), display_id) + isLive = 'watchtnt' in path or 'watchtbs' in path + video_data = next(v for v in drupal_settings['turner_playlist'] if isLive or v.get('url') == path) + + media_id = video_data['mediaID'] + title = video_data['title'] + tokenizer_query = 
compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) + + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1' or isLive, + 'is_live': isLive + }) + + thumbnails = [] + for image_id, image in video_data.get('images', {}).items(): + image_url = image.get('url') + if not image_url or image.get('type') != 'video': + continue + i = { + 'id': image_id, + 'url': image_url, + } + mobj = re.search(r'(\d+)x(\d+)', image_url) + if mobj: + i.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + thumbnails.append(i) + + info.update({ + 'id': media_id, + 'title': title, + 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), + 'timestamp': int_or_none(video_data.get('created')), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'thumbnails': thumbnails, + 'is_live': isLive + }) + return info diff --git a/youtube_dl/extractor/tdslifeway.py b/yt_dlp/extractor/tdslifeway.py index 101c6ee31..101c6ee31 100644 --- a/youtube_dl/extractor/tdslifeway.py +++ b/yt_dlp/extractor/tdslifeway.py diff --git a/yt_dlp/extractor/teachable.py b/yt_dlp/extractor/teachable.py new file mode 100644 index 000000000..37eae82bc --- /dev/null +++ b/yt_dlp/extractor/teachable.py @@ -0,0 +1,298 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + get_element_by_class, + strip_or_none, + urlencode_postdata, + urljoin, +) + + +class TeachableBaseIE(InfoExtractor): + _NETRC_MACHINE = 'teachable' + _URL_PREFIX = 'teachable:' + + _SITES = { + # Only notable ones here + 'v1.upskillcourses.com': 'upskill', + 'gns3.teachable.com': 'gns3', + 'academyhacker.com': 'academyhacker', + 'stackskills.com': 'stackskills', + 'market.saleshacker.com': 'saleshacker', + 'learnability.org': 'learnability', + 'edurila.com': 'edurila', + 'courses.workitdaily.com': 'workitdaily', + } + + _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys())) + + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + if self._logged_in: + return + + username, password = self._get_login_info( + netrc_machine=self._SITES.get(site, site)) + if username is None: + return + + login_page, urlh = self._download_webpage_handle( + 'https://%s/sign_in' % site, None, + 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']user-signout', + r'<a[^>]+\bhref=["\']/sign_out', + r'Log\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_url = urlh.geturl() + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'user[email]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, + 'post url', default=login_url, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(login_url, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 
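+ # login_url is the sign-in page URL after any redirects (urlh.geturl());
+ # sending it as Referer presumably mimics the browser form submit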
'Referer': login_url, + }) + + if '>I accept the new Privacy Policy<' in response: + raise ExtractorError( + 'Unable to login: %s asks you to accept new Privacy Policy. ' + 'Go to https://%s/ and accept.' % (site, site), expected=True) + + # Successful login + if is_logged(response): + self._logged_in = True + return + + message = get_element_by_class('alert', response) + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % clean_html(message), expected=True) + + raise ExtractorError('Unable to log in') + + +class TeachableIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P<site_t>[^/]+)| + https?://(?:www\.)?(?P<site>%s) + ) + /courses/[^/]+/lectures/(?P<id>\d+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE + + _TESTS = [{ + 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', + 'info_dict': { + 'id': 'untlgzk1v7', + 'ext': 'bin', + 'title': 'Overview', + 'description': 'md5:071463ff08b86c208811130ea1c2464c', + 'duration': 736.4, + 'timestamp': 1542315762, + 'upload_date': '20181115', + 'chapter': 'Welcome', + 'chapter_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', + 'only_matching': True, + }, { + 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', + 'only_matching': True, + }, { + 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'only_matching': True, + }] + + @staticmethod + def _is_teachable(webpage): + return 'teachableTracker.linker:autoLink' in webpage and re.search( + r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', + webpage) + + @staticmethod + def _extract_url(webpage, source_url): + if not TeachableIE._is_teachable(webpage): + return + if re.match(r'https?://[^/]+/(?:courses|p)', source_url): + return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + site = mobj.group('site') or mobj.group('site_t') + video_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + url = url[len(self._URL_PREFIX):] + + webpage = self._download_webpage(url, video_id) + + wistia_urls = WistiaIE._extract_urls(webpage) + if not wistia_urls: + if any(re.search(p, webpage) for p in ( + r'class=["\']lecture-contents-locked', + r'>\s*Lecture contents locked', + r'id=["\']lecture-locked', + # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313 + r'class=["\'](?:inner-)?lesson-locked', + r'>LESSON LOCKED<')): + self.raise_login_required('Lecture contents locked') + raise ExtractorError('Unable to find video URL') + + title = self._og_search_title(webpage, default=None) + + chapter = None + chapter_number = None + section_item = self._search_regex( + r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id, + webpage, 'section item', default=None, group='li') + if section_item: + chapter_number = int_or_none(self._search_regex( + r'data-ss-position=["\'](\d+)', section_item, 'section id', + default=None)) + if chapter_number is not None: + sections = [] + for s in re.findall( + r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage): + section = strip_or_none(clean_html(s)) + if not section: + sections = [] + break + sections.append(section) + if chapter_number <= len(sections): + chapter = sections[chapter_number - 1] + + entries = [{ + '_type': 'url_transparent', + 'url': 
wistia_url, + 'ie_key': WistiaIE.ie_key(), + 'title': title, + 'chapter': chapter, + 'chapter_number': chapter_number, + } for wistia_url in wistia_urls] + + return self.playlist_result(entries, video_id, title) + + +class TeachableCourseIE(TeachableBaseIE): + _VALID_URL = r'''(?x) + (?: + %shttps?://(?P<site_t>[^/]+)| + https?://(?:www\.)?(?P<site>%s) + ) + /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) + ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE + _TESTS = [{ + 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', + 'info_dict': { + 'id': 'essential-web-developer-course', + 'title': 'The Essential Web Developer Course (Free)', + }, + 'playlist_count': 192, + }, { + 'url': 'http://v1.upskillcourses.com/courses/119763/', + 'only_matching': True, + }, { + 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', + 'only_matching': True, + }, { + 'url': 'https://gns3.teachable.com/courses/enrolled/423415', + 'only_matching': True, + }, { + 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', + 'only_matching': True, + }, { + 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TeachableIE.suitable(url) else super( + TeachableCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + site = mobj.group('site') or mobj.group('site_t') + course_id = mobj.group('id') + + self._login(site) + + prefixed = url.startswith(self._URL_PREFIX) + if prefixed: + prefix = self._URL_PREFIX + url = url[len(prefix):] + + webpage = self._download_webpage(url, course_id) + + url_base = 'https://%s/' % site + + entries = [] + + for mobj in re.finditer( + r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', + webpage): + li = mobj.group('li') + if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): + continue + lecture_url = self._search_regex( + r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, + 'lecture url', default=None, group='url') + if not lecture_url: + continue + lecture_id = self._search_regex( + r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) + title = self._html_search_regex( + r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, + 'title', default=None) + entry_url = urljoin(url_base, lecture_url) + if prefixed: + entry_url = self._URL_PREFIX + entry_url + entries.append( + self.url_result( + entry_url, + ie=TeachableIE.ie_key(), video_id=lecture_id, + video_title=clean_html(title))) + + course_title = self._html_search_regex( + (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', + r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), + webpage, 'course title', fatal=False) + + return self.playlist_result(entries, course_id, course_title) diff --git a/yt_dlp/extractor/teachertube.py b/yt_dlp/extractor/teachertube.py new file mode 100644 index 000000000..e22f0114c --- /dev/null +++ b/yt_dlp/extractor/teachertube.py @@ -0,0 +1,129 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + qualities, +) + + +class TeacherTubeIE(InfoExtractor): + IE_NAME = 'teachertube' + IE_DESC = 'teachertube.com videos' + + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)' + + _TESTS = [{ + # flowplayer + 'url': 
'http://www.teachertube.com/viewVideo.php?video_id=339997', + 'md5': 'f9434ef992fd65936d72999951ee254c', + 'info_dict': { + 'id': '339997', + 'ext': 'mp4', + 'title': 'Measures of dispersion from a frequency table', + 'description': 'Measures of dispersion from a frequency table', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + }, + }, { + # jwplayer + 'url': 'http://www.teachertube.com/music.php?music_id=8805', + 'md5': '01e8352006c65757caf7b961f6050e21', + 'info_dict': { + 'id': '8805', + 'ext': 'mp3', + 'title': 'PER ASPERA AD ASTRA', + 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', + }, + }, { + # unavailable video + 'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + error = self._search_regex( + r'<div\b[^>]+\bclass=["\']msgBox error[^>]+>([^<]+)', webpage, + 'error', default=None) + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + title = self._html_search_meta('title', webpage, 'title', fatal=True) + TITLE_SUFFIX = ' - TeacherTube' + if title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)].strip() + + description = self._html_search_meta('description', webpage, 'description') + if description: + description = description.strip() + + quality = qualities(['mp3', 'flv', 'mp4']) + + media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage) + media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage)) + media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage)) + + formats = [ + { + 'url': media_url, + 'quality': quality(determine_ext(media_url)) + } for media_url in set(media_urls) + ] + + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'thumbnail', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } + + +class TeacherTubeUserIE(InfoExtractor): + IE_NAME = 'teachertube:user:collection' + IE_DESC = 'teachertube.com user and collection videos' + + _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' 
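+ # _MEDIA_RE pulls the video/audio page links out of the sidebar thumbnail
+ # markup; additional result pages are fetched below via the site's
+ # /ajax-user/user-videos/<user>?page=N endpoint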
+ + _MEDIA_RE = r'''(?sx) + class="?sidebar_thumb_time"?>[0-9:]+</div> + \s* + <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)" + ''' + _TEST = { + 'url': 'http://www.teachertube.com/user/profile/rbhagwati2', + 'info_dict': { + 'id': 'rbhagwati2' + }, + 'playlist_mincount': 179, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user_id = mobj.group('user') + + urls = [] + webpage = self._download_webpage(url, user_id) + urls.extend(re.findall(self._MEDIA_RE, webpage)) + + pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1] + for p in pages: + more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) + webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages))) + video_urls = re.findall(self._MEDIA_RE, webpage) + urls.extend(video_urls) + + entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls] + return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/teachingchannel.py b/yt_dlp/extractor/teachingchannel.py index 624cdb3ad..624cdb3ad 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/yt_dlp/extractor/teachingchannel.py diff --git a/youtube_dl/extractor/teamcoco.py b/yt_dlp/extractor/teamcoco.py index 5793b711f..5793b711f 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/yt_dlp/extractor/teamcoco.py diff --git a/youtube_dl/extractor/teamtreehouse.py b/yt_dlp/extractor/teamtreehouse.py index d347e97ef..d347e97ef 100644 --- a/youtube_dl/extractor/teamtreehouse.py +++ b/yt_dlp/extractor/teamtreehouse.py diff --git a/yt_dlp/extractor/techtalks.py b/yt_dlp/extractor/techtalks.py new file mode 100644 index 000000000..78f07319b --- /dev/null +++ b/yt_dlp/extractor/techtalks.py @@ -0,0 +1,82 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + get_element_by_attribute, + clean_html, +) + + +class TechTalksIE(InfoExtractor): + _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', + 'info_dict': { + 'id': '57758', + 'title': 'Learning Topic Models --- Going beyond SVD', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '57758', + 'ext': 'flv', + 'title': 'Learning Topic Models --- Going beyond SVD', + }, + }, + { + 'info_dict': { + 'id': '57758-slides', + 'ext': 'flv', + 'title': 'Learning Topic Models --- Going beyond SVD', + }, + }, + ], + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://techtalks.tv/talks/57758', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + talk_id = mobj.group('id') + webpage = self._download_webpage(url, talk_id) + rtmp_url = self._search_regex( + r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url') + play_path = self._search_regex( + r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"', + webpage, 'presenter play path') + title = clean_html(get_element_by_attribute('class', 'title', webpage)) + video_info = { + 'id': talk_id, + 'title': title, + 'url': rtmp_url, + 'play_path': play_path, + 'ext': 'flv', + } + m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage) + if m_slides is None: + return video_info + else: + return { + '_type': 'playlist', + 'id': talk_id, + 'title': title, + 'entries': [ + video_info, + # The slides video + { + 'id': talk_id + '-slides', + 'title': title, + 'url': rtmp_url, + 'play_path': 
m_slides.group(1), + 'ext': 'flv', + }, + ], + } diff --git a/youtube_dl/extractor/ted.py b/yt_dlp/extractor/ted.py index f09f1a3f9..f09f1a3f9 100644 --- a/youtube_dl/extractor/ted.py +++ b/yt_dlp/extractor/ted.py diff --git a/yt_dlp/extractor/tele13.py b/yt_dlp/extractor/tele13.py new file mode 100644 index 000000000..f8a27550e --- /dev/null +++ b/yt_dlp/extractor/tele13.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + js_to_json, + qualities, + determine_ext, +) + + +class Tele13IE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', + }, + 'params': { + # HTTP Error 404: Not Found + 'skip_download': True, + }, + }, + { + 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', + 'md5': '867adf6a3b3fef932c68a71d70b70946', + 'info_dict': { + 'id': 'rOoKv2OMpOw', + 'ext': 'mp4', + 'title': 'Shooting star seen on 7-Sep-2015', + 'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', + 'uploader': 'Porjai Jaturongkhakun', + 'upload_date': '20150906', + 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', + }, + 'add_ie': ['Youtube'], + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + setup_js = self._search_regex( + r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", + webpage, 'setup code') + sources = self._parse_json(self._search_regex( + r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), + display_id, js_to_json) + + preference = qualities(['Móvil', 'SD', 'HD']) + formats = [] + urls = [] + for f in sources: + format_url = f['file'] + if format_url and format_url not in urls: + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif YoutubeIE.suitable(format_url): + return self.url_result(format_url, 'Youtube') + else: + formats.append({ + 'url': format_url, + 'format_id': f.get('label'), + 'quality': preference(f.get('label')), + 'ext': ext, + }) + urls.append(format_url) + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': self._search_regex( + r'title\s*:\s*"([^"]+)"', setup_js, 'title'), + 'description': self._html_search_meta( + 'description', webpage, 'description'), + 'thumbnail': self._search_regex( + r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), + 'formats': formats, + } diff --git a/yt_dlp/extractor/tele5.py b/yt_dlp/extractor/tele5.py new file mode 100644 index 000000000..0d9cf75ca --- /dev/null +++ b/yt_dlp/extractor/tele5.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from .nexx import NexxIE +from ..utils import ( + NO_DEFAULT, + parse_qs, + smuggle_url, +) + + +class Tele5IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _GEO_COUNTRIES = ['DE'] + _TESTS = [{ + 'url': 
'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', + 'info_dict': { + 'id': '1549416', + 'ext': 'mp4', + 'upload_date': '20180814', + 'timestamp': 1534290623, + 'title': 'Pandorum', + }, + 'params': { + 'skip_download': True, + }, + }, { + # jwplatform, nexx unavailable + 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', + 'info_dict': { + 'id': 'WJuiOlUp', + 'ext': 'mp4', + 'upload_date': '20200603', + 'timestamp': 1591214400, + 'title': 'Ghoul - Das Geheimnis des Friedhofmonsters', + 'description': 'md5:42002af1d887ff3d5b2b3ca1f8137d97', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, { + 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/video-clip/?ve_id=1609440', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/star-trek/raumschiff-voyager/ganze-folge/das-vinculum/', + 'only_matching': True, + }, { + 'url': 'https://www.tele5.de/anders-ist-sevda/', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = parse_qs(url) + video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] + + NEXX_ID_RE = r'\d{6,}' + JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' + + def nexx_result(nexx_id): + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, + ie=NexxIE.ie_key(), video_id=nexx_id) + + nexx_id = jwplatform_id = None + + if video_id: + if re.match(NEXX_ID_RE, video_id): + return nexx_result(video_id) + elif re.match(JWPLATFORM_ID_RE, video_id): + jwplatform_id = video_id + + if not nexx_id: + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + def extract_id(pattern, name, default=NO_DEFAULT): + return self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + default=default) + + nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) + if nexx_id: + return nexx_result(nexx_id) + + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + + return self.url_result( + smuggle_url( + 'jwplatform:%s' % jwplatform_id, + {'geo_countries': self._GEO_COUNTRIES}), + ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) diff --git a/youtube_dl/extractor/telebruxelles.py b/yt_dlp/extractor/telebruxelles.py index a0353fe3a..a0353fe3a 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/yt_dlp/extractor/telebruxelles.py diff --git a/youtube_dl/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py index eecd6a5c9..eecd6a5c9 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/yt_dlp/extractor/telecinco.py diff --git a/youtube_dl/extractor/telegraaf.py b/yt_dlp/extractor/telegraaf.py index 2dc020537..2dc020537 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/yt_dlp/extractor/telegraaf.py diff --git a/yt_dlp/extractor/telemb.py b/yt_dlp/extractor/telemb.py new file mode 100644 index 000000000..ac2d603b6 --- /dev/null +++ b/yt_dlp/extractor/telemb.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import remove_start + + +class TeleMBIE(InfoExtractor): + 
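+ # Formats come from the embedded JWPlayer config (file: "..." entries);
+ # rtmp:// sources are split into app/play_path and deprioritized, while
+ # plain HTTP(S) URLs are used as-is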
_VALID_URL = r'https?://(?:www\.)?telemb\.be/(?P<display_id>.+?)_d_(?P<id>\d+)\.html' + _TESTS = [ + { + 'url': 'http://www.telemb.be/mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-_d_13466.html', + 'md5': 'f45ea69878516ba039835794e0f8f783', + 'info_dict': { + 'id': '13466', + 'display_id': 'mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-', + 'ext': 'mp4', + 'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! - Les reportages', + 'description': 'md5:bc5225f47b17c309761c856ad4776265', + 'thumbnail': r're:^http://.*\.(?:jpg|png)$', + } + }, + { + # non-ASCII characters in download URL + 'url': 'http://telemb.be/les-reportages-havre-incendie-mortel_d_13514.html', + 'md5': '6e9682736e5ccd4eab7f21e855350733', + 'info_dict': { + 'id': '13514', + 'display_id': 'les-reportages-havre-incendie-mortel', + 'ext': 'mp4', + 'title': 'Havré - Incendie mortel - Les reportages', + 'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a', + 'thumbnail': r're:^http://.*\.(?:jpg|png)$', + } + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + formats = [] + for video_url in re.findall(r'file\s*:\s*"([^"]+)"', webpage): + fmt = { + 'url': video_url, + 'format_id': video_url.split(':')[0] + } + rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) + if rtmp: + fmt.update({ + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'player_url': 'http://p.jwpcdn.com/6/10/jwplayer.flash.swf', + 'page_url': 'http://www.telemb.be', + 'preference': -10, + }) + formats.append(fmt) + self._sort_formats(formats) + + title = remove_start(self._og_search_title(webpage), 'TéléMB : ') + description = self._html_search_regex( + r'<meta property="og:description" content="(.+?)" />', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py new file mode 100644 index 000000000..18552a0ef --- /dev/null +++ b/yt_dlp/extractor/telemundo.py @@ -0,0 +1,58 @@ +# coding=utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_timestamp, + HEADRequest, +) + + +class TelemundoIE(InfoExtractor): + + _VALID_URL = r'https?:\/\/(?:www\.)?telemundo\.com\/.+?video\/[^\/]+(?P<id>tmvo\d{7})' + _TESTS = [{ + 'url': 'https://www.telemundo.com/noticias/noticias-telemundo-en-la-noche/empleo/video/esta-aplicacion-gratuita-esta-ayudando-los-latinos-encontrar-trabajo-en-estados-unidos-tmvo9829325', + 'info_dict': { + 'id': 'tmvo9829325', + 'timestamp': 1621396800, + 'title': 'Esta aplicación gratuita está ayudando a los latinos a encontrar trabajo en Estados Unidos', + 'uploader': 'Telemundo', + 'uploader_id': 'NBCU_Telemundo', + 'ext': 'mp4', + 'upload_date': '20210519', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.telemundo.com/shows/al-rojo-vivo/empleo/video/personajes-de-times-square-piden-que-la-ciudad-de-nueva-york-los-deje-volver-trabajar-tmvo9816272', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + metadata = self._parse_json( + 
self._search_regex(r'<[^>]+id="__NEXT_DATA__"[^>]+>([^<]+)', webpage, 'JSON metadata'), video_id) + redirect_url = try_get( + metadata, + lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['videoAssets'][0]['publicUrl']) + + m3u8_url = self._request_webpage(HEADRequest( + redirect_url + '?format=redirect&manifest=m3u&format=redirect&Tracking=true&Embedded=true&formats=MPEG4'), + video_id, 'Processing m3u8').geturl() + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + date = unified_timestamp(try_get( + metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['datePublished'].split(' ', 1)[1])) + return { + 'url': url, + 'id': video_id, + 'title': self._search_regex(r'<h1[^>]+>([^<]+)', webpage, 'title', fatal=False), + 'formats': formats, + 'timestamp': date, + 'uploader': 'Telemundo', + 'uploader_id': self._search_regex(r'https?:\/\/(?:[^/]+\/){3}video\/(?P<id>[^\/]+)', m3u8_url, 'Akamai account', fatal=False) + } diff --git a/youtube_dl/extractor/telequebec.py b/yt_dlp/extractor/telequebec.py index 800d87b70..800d87b70 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/yt_dlp/extractor/telequebec.py diff --git a/youtube_dl/extractor/teletask.py b/yt_dlp/extractor/teletask.py index b9e2ef8ca..b9e2ef8ca 100644 --- a/youtube_dl/extractor/teletask.py +++ b/yt_dlp/extractor/teletask.py diff --git a/youtube_dl/extractor/telewebion.py b/yt_dlp/extractor/telewebion.py index 1207b1a1b..1207b1a1b 100644 --- a/youtube_dl/extractor/telewebion.py +++ b/yt_dlp/extractor/telewebion.py diff --git a/yt_dlp/extractor/tennistv.py b/yt_dlp/extractor/tennistv.py new file mode 100644 index 000000000..a39a2fc60 --- /dev/null +++ b/yt_dlp/extractor/tennistv.py @@ -0,0 +1,114 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + unified_timestamp, +) + + +class TennisTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' + _TEST = { + 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', + 'info_dict': { + 'id': 'indian-wells-2018-verdasco-fritz', + 'ext': 'mp4', + 'title': 'Fernando Verdasco v Taylor Fritz', + 'description': 're:^After his stunning victory.{174}$', + 'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0', + 'timestamp': 1521017381, + 'upload_date': '20180314', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires email and password of a subscribed account', + } + _NETRC_MACHINE = 'tennistv' + + def _login(self): + username, password = self._get_login_info() + if not username or not password: + raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) + + login_form = { + 'Email': username, + 'Password': password, + } + login_json = json.dumps(login_form).encode('utf-8') + headers = { + 'content-type': 'application/json', + 'Referer': 'https://www.tennistv.com/login', + 'Origin': 'https://www.tennistv.com', + } + + login_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/login', None, + note='Logging in', + errnote='Login failed (wrong password?)', + headers=headers, + data=login_json) + + if login_result['error']['errorCode']: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + + if login_result['entitlement'] != 'SUBSCRIBED': + self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME)) + + self._session_token = login_result['sessionToken'] + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + internal_id = self._search_regex(r'video=([\w-]+)', webpage, 'internal video id') + + headers = { + 'Origin': 'https://www.tennistv.com', + 'authorization': 'ATP %s' % self._session_token, + 'content-type': 'application/json', + 'Referer': url, + } + check_data = { + 'videoID': internal_id, + 'VideoUrlType': 'HLS', + } + check_json = json.dumps(check_data).encode('utf-8') + check_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', + video_id, note='Checking video authorization', headers=headers, data=check_json) + formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') + self._sort_formats(formats) + + vdata = self._download_json( + 'https://www.tennistv.com/api/en/v2/none/common/video/%s' % video_id, + video_id, headers=headers) + + timestamp = unified_timestamp(vdata['timestamp']) + thumbnail = vdata['video']['thumbnailUrl'] + description = vdata['displayText']['description'] + title = vdata['video']['title'] + + series = vdata['tour'] + venue = vdata['displayText']['venue'] + round_str = vdata['seo']['round'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'series': series, + 'season': venue, + 'episode': round_str, + } diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py new file mode 100644 index 000000000..c810cfd0d --- /dev/null +++ b/yt_dlp/extractor/tenplay.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from datetime import datetime +import base64 + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + urlencode_postdata, +) + + +class TenPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' + _NETRC_MACHINE = '10play' + _TESTS = [{ + 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', + 'info_dict': { + 'id': '6192880312001', + 'ext': 'mp4', + 'title': "Todd Sampson's Body Hack - S4 Ep. 
2", + 'description': 'md5:fa278820ad90f08ea187f9458316ac74', + 'age_limit': 15, + 'timestamp': 1600770600, + 'upload_date': '20200922', + 'uploader': 'Channel 10', + 'uploader_id': '2199827728001' + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', + 'only_matching': True, + }] + _GEO_BYPASS = False + + _AUS_AGES = { + 'G': 0, + 'PG': 15, + 'M': 15, + 'MA': 15, + 'MA15+': 15, + 'R': 18, + 'X': 18 + } + + def _get_bearer_token(self, video_id): + username, password = self._get_login_info() + if username is None or password is None: + self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') + _timestamp = datetime.now().strftime('%Y%m%d000000') + _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') + data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ + 'X-Network-Ten-Auth': _auth_header, + }, data=urlencode_postdata({ + 'email': username, + 'password': password, + })) + return "Bearer " + data['jwt']['accessToken'] + + def _real_extract(self, url): + content_id = self._match_id(url) + _token = self._get_bearer_token(content_id) + data = self._download_json( + 'https://10play.com.au/api/v1/videos/' + content_id, content_id) + _video_url = self._download_json( + data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', + headers={'Authorization': _token}).get('source') + m3u8_url = self._request_webpage(HEADRequest( + _video_url), content_id).geturl() + if '10play-not-in-oz' in m3u8_url: + self.raise_geo_restricted(countries=['AU']) + formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') + self._sort_formats(formats) + + return { + 'formats': formats, + 'id': data.get('altId') or content_id, + 'title': data.get('title'), + 'description': data.get('description'), + 'age_limit': self._AUS_AGES.get(data.get('classification')), + 'series': data.get('showName'), + 'season': data.get('showContentSeason'), + 'timestamp': data.get('published'), + 'thumbnail': data.get('imageUrl'), + 'uploader': 'Channel 10', + 'uploader_id': '2199827728001', + } diff --git a/yt_dlp/extractor/testurl.py b/yt_dlp/extractor/testurl.py new file mode 100644 index 000000000..8bc512a9c --- /dev/null +++ b/yt_dlp/extractor/testurl.py @@ -0,0 +1,64 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class TestURLIE(InfoExtractor): + """ Allows addressing of the test cases as test:yout.*be_1 """ + + IE_DESC = False # Do not list + _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$' + + def _real_extract(self, url): + from ..extractor import gen_extractors + + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + extractor_id = mobj.group('extractor') + all_extractors = gen_extractors() + + rex = re.compile(extractor_id, flags=re.IGNORECASE) + matching_extractors = [ + e for e in all_extractors if rex.search(e.IE_NAME)] + + if len(matching_extractors) == 0: + raise ExtractorError( + 'No extractors matching %r found' % extractor_id, + expected=True) + elif len(matching_extractors) > 1: + # Is it obvious which one to pick? 
+ try: + extractor = next( + ie for ie in matching_extractors + if ie.IE_NAME.lower() == extractor_id.lower()) + except StopIteration: + raise ExtractorError( + ('Found multiple matching extractors: %s' % + ' '.join(ie.IE_NAME for ie in matching_extractors)), + expected=True) + else: + extractor = matching_extractors[0] + + num_str = mobj.group('num') + num = int(num_str) if num_str else 0 + + testcases = [] + t = getattr(extractor, '_TEST', None) + if t: + testcases.append(t) + testcases.extend(getattr(extractor, '_TESTS', [])) + + try: + tc = testcases[num] + except IndexError: + raise ExtractorError( + ('Test case %d not found, got only %d tests' % + (num, len(testcases))), + expected=True) + + self.to_screen('Test URL: %s' % tc['url']) + + return self.url_result(tc['url'], video_id=video_id) diff --git a/yt_dlp/extractor/tf1.py b/yt_dlp/extractor/tf1.py new file mode 100644 index 000000000..669eb5015 --- /dev/null +++ b/yt_dlp/extractor/tf1.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class TF1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P<program_slug>[^/]+)/videos/(?P<id>[^/?&#]+)\.html' + _TESTS = [{ + 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', + 'info_dict': { + 'id': '13641379', + 'ext': 'mp4', + 'title': 'md5:f392bc52245dc5ad43771650c96fb620', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', + 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], + }, + 'params': { + # Sometimes wat serves the whole file with the --test option + 'skip_download': True, + 'format': 'bestvideo', + }, + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + program_slug, slug = self._match_valid_url(url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] + + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) + + decoration = video.get('decoration') or {} + + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) + + return { + '_type': 'url_transparent', + 'id': wat_id, + 'url': 'wat:' + wat_id, + 'title': video.get('title'), + 'thumbnails': thumbnails, + 'description': decoration.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), + 'tags': tags, + 'series': decoration.get('programLabel'), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + } diff --git a/youtube_dl/extractor/tfo.py b/yt_dlp/extractor/tfo.py index 0631cb7ab..0631cb7ab 
100644 --- a/youtube_dl/extractor/tfo.py +++ b/yt_dlp/extractor/tfo.py diff --git a/youtube_dl/extractor/theintercept.py b/yt_dlp/extractor/theintercept.py index f23b58713..f23b58713 100644 --- a/youtube_dl/extractor/theintercept.py +++ b/yt_dlp/extractor/theintercept.py diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py new file mode 100644 index 000000000..c2729f12d --- /dev/null +++ b/yt_dlp/extractor/theplatform.py @@ -0,0 +1,411 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time +import hmac +import binascii +import hashlib + + +from .once import OnceIE +from .adobepass import AdobePassIE +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + parse_qs, + sanitized_Request, + unsmuggle_url, + update_url_query, + xpath_with_ns, + mimetype2ext, + find_xpath_attr, +) + +default_ns = 'http://www.w3.org/2005/SMIL21/Language' +_x = lambda p: xpath_with_ns(p, {'smil': default_ns}) + + +class ThePlatformBaseIE(OnceIE): + _TP_TLD = 'com' + + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): + meta = self._download_xml( + smil_url, video_id, note=note, query={'format': 'SMIL'}, + headers=self.geo_verification_headers()) + error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') + if error_element is not None: + exception = find_xpath_attr( + error_element, _x('.//smil:param'), 'name', 'exception') + if exception is not None: + if exception.get('value') == 'GeoLocationBlocked': + self.raise_geo_restricted(error_element.attrib['abstract']) + elif error_element.attrib['src'].startswith( + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' + % self._TP_TLD): + raise ExtractorError( + error_element.attrib['abstract'], expected=True) + + smil_formats = self._parse_smil_formats( + meta, smil_url, video_id, namespace=default_ns, + # the parameters are from syfy.com, other sites may use others, + # they also work for nbc.com + f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, + transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) + + formats = [] + for _format in smil_formats: + if OnceIE.suitable(_format['url']): + formats.extend(self._extract_once_formats(_format['url'])) + else: + media_url = _format['url'] + if determine_ext(media_url) == 'm3u8': + hdnea2 = self._get_cookies(media_url).get('hdnea2') + if hdnea2: + _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) + + formats.append(_format) + + subtitles = self._parse_smil_subtitles(meta, default_ns) + + return formats, subtitles + + def _download_theplatform_metadata(self, path, video_id): + info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) + return self._download_json(info_url, video_id) + + def _parse_theplatform_metadata(self, info): + subtitles = {} + captions = info.get('captions') + if isinstance(captions, list): + for caption in captions: + lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') + subtitles.setdefault(lang, []).append({ + 'ext': mimetype2ext(mime), + 'url': src, + }) + + duration = info.get('duration') + tp_chapters = info.get('chapters', []) + chapters = [] + if tp_chapters: + def _add_chapter(start_time, end_time): + start_time = float_or_none(start_time, 1000) + end_time = float_or_none(end_time, 1000) + if start_time is None or end_time is None: + return + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + }) + + for chapter in tp_chapters[:-1]: + 
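# Intermediate chapters carry explicit end times; the final chapter is handled after the loop, falling back to the video duration. +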
_add_chapter(chapter.get('startTime'), chapter.get('endTime')) + _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + + return { + 'title': info['title'], + 'subtitles': subtitles, + 'description': info['description'], + 'thumbnail': info['defaultThumbnailUrl'], + 'duration': float_or_none(duration, 1000), + 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, + 'uploader': info.get('billingCode'), + 'chapters': chapters, + } + + def _extract_theplatform_metadata(self, path, video_id): + info = self._download_theplatform_metadata(path, video_id) + return self._parse_theplatform_metadata(info) + + +class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): + _VALID_URL = r'''(?x) + (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ + (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + |theplatform:)(?P<id>[^/\?&]+)''' + + _TESTS = [{ + # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ + 'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', + 'info_dict': { + 'id': 'e9I_cZgTgIPd', + 'ext': 'flv', + 'title': 'Blackberry\'s big, bold Z30', + 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', + 'duration': 247, + 'timestamp': 1383239700, + 'upload_date': '20131031', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': '404 Not Found', + }, { + # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/ + 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT', + 'info_dict': { + 'id': '22d_qsQ6MIRT', + 'ext': 'flv', + 'description': 'md5:ac330c9258c04f9d7512cf26b9595409', + 'title': 'Tesla Model S: A second step towards a cleaner motoring future', + 'timestamp': 1426176191, + 'upload_date': '20150312', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', + 'info_dict': { + 'id': 'yMBg9E8KFxZD', + 'ext': 'mp4', + 'description': 'md5:644ad9188d655b742f942bf2e06b002d', + 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', + 'uploader': 'EGSM', + } + }, { + 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', + 'only_matching': True, + }, { + 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', + 'md5': 'fb96bb3d85118930a5b055783a3bd992', + 'info_dict': { + 'id': 'tdy_or_siri_150701', + 'ext': 'mp4', + 'title': 'iPhone Siri’s sassy response to a math question has people talking', + 'description': 'md5:a565d1deadd5086f3331d57298ec6333', + 'duration': 83.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1435752600, + 'upload_date': '20150701', + 'uploader': 'NBCU-NEWS', + }, + }, { + # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 + # geo-restricted (US), HLS encrypted with AES-128 + 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', + 'only_matching': True, + }] + + @classmethod + def _extract_urls(cls, webpage): + m = re.search( + r'''(?x) + <meta\s+ + property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ + 
content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 + ''', webpage) + if m: + return [m.group('url')] + + # Are whitespaces ignored in URLs? + # https://github.com/ytdl-org/youtube-dl/issues/12044 + matches = re.findall( + r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) + if matches: + return [re.sub(r'\s', '', list(zip(*matches))[1][0])] + + @staticmethod + def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): + flags = '10' if include_qs else '00' + expiration_date = '%x' % (int(time.time()) + life) + + def str_to_hex(str): + return binascii.b2a_hex(str.encode('ascii')).decode('ascii') + + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) + + relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1) + clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) + checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() + sig = flags + expiration_date + checksum + str_to_hex(sig_secret) + return '%s&sig=%s' % (url, sig) + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + + mobj = self._match_valid_url(url) + provider_id = mobj.group('provider_id') + video_id = mobj.group('id') + + if not provider_id: + provider_id = 'dJ5BDC' + + path = provider_id + '/' + if mobj.group('media'): + path += mobj.group('media') + path += video_id + + qs_dict = parse_qs(url) + if 'guid' in qs_dict: + webpage = self._download_webpage(url, video_id) + scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage) + feed_id = None + # feed id usually locates in the last script. + # Seems there's no pattern for the interested script filename, so + # I try one by one + for script in reversed(scripts): + feed_script = self._download_webpage( + self._proto_relative_url(script, 'http:'), + video_id, 'Downloading feed script') + feed_id = self._search_regex( + r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, + 'default feed id', default=None) + if feed_id is not None: + break + if feed_id is None: + raise ExtractorError('Unable to find feed id') + return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % ( + provider_id, feed_id, qs_dict['guid'][0])) + + if smuggled_data.get('force_smil_url', False): + smil_url = url + # Explicitly specified SMIL (see https://github.com/ytdl-org/youtube-dl/issues/7385) + elif '/guid/' in url: + headers = {} + source_url = smuggled_data.get('source_url') + if source_url: + headers['Referer'] = source_url + request = sanitized_Request(url, headers=headers) + webpage = self._download_webpage(request, video_id) + smil_url = self._search_regex( + r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml', + webpage, 'smil url', group='url') + path = self._search_regex( + r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') + smil_url += '?' if '?' 
not in smil_url else '&') + 'formats=m3u,mpeg4' + elif mobj.group('config'): + config_url = url + '&form=json' + config_url = config_url.replace('swf/', 'config/') + config_url = config_url.replace('onsite/', 'onsite/config/') + config = self._download_json(config_url, video_id, 'Downloading config') + if 'releaseUrl' in config: + release_url = config['releaseUrl'] + else: + release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path + smil_url = release_url + '&formats=MPEG4&manifest=f4m' + else: + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path + + sig = smuggled_data.get('sig') + if sig: + smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) + + formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) + self._sort_formats(formats) + + ret = self._extract_theplatform_metadata(path, video_id) + combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) + ret.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': combined_subtitles, + }) + + return ret + + +class ThePlatformFeedIE(ThePlatformBaseIE): + _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))' + _TESTS = [{ + # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 + 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', + 'md5': '6e32495b5073ab414471b615c5ded394', + 'info_dict': { + 'id': 'n_hardball_5biden_140207', + 'ext': 'mp4', + 'title': 'The Biden factor: will Joe run in 2016?', + 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? 
Mark Halperin and Sam Stein weigh in.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140208', + 'timestamp': 1391824260, + 'duration': 467.0, + 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], + 'uploader': 'NBCU-NEWS', + }, + }, { + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01', + 'only_matching': True, + }] + + def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None): + real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) + entry = self._download_json(real_url, video_id)['entries'][0] + main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl') + + formats = [] + subtitles = {} + first_video_id = None + duration = None + asset_types = [] + for item in entry['media$content']: + smil_url = item['plfile$url'] + cur_video_id = ThePlatformIE._match_id(smil_url) + if first_video_id is None: + first_video_id = cur_video_id + duration = float_or_none(item.get('plfile$duration')) + file_asset_types = item.get('plfile$assetTypes') or parse_qs(smil_url)['assetTypes'] + for asset_type in file_asset_types: + if asset_type in asset_types: + continue + asset_types.append(asset_type) + query = { + 'mbr': 'true', + 'formats': item['plfile$format'], + 'assetTypes': asset_type, + } + if asset_type in asset_types_query: + query.update(asset_types_query[asset_type]) + cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query( + main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type) + formats.extend(cur_formats) + subtitles = self._merge_subtitles(subtitles, cur_subtitles) + + self._sort_formats(formats) + + thumbnails = [{ + 'url': thumbnail['plfile$url'], + 'width': int_or_none(thumbnail.get('plfile$width')), + 'height': int_or_none(thumbnail.get('plfile$height')), + } for thumbnail in entry.get('media$thumbnails', [])] + + timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) + categories = [item['media$name'] for item in entry.get('media$categories', [])] + + ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id) + subtitles = self._merge_subtitles(subtitles, ret['subtitles']) + ret.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, + 'categories': categories, + }) + if custom_fields: + ret.update(custom_fields(entry)) + + return ret + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + video_id = mobj.group('id') + provider_id = mobj.group('provider_id') + feed_id = mobj.group('feed_id') + filter_query = mobj.group('filter') + + return self._extract_feed_info(provider_id, feed_id, filter_query, video_id) diff --git a/youtube_dl/extractor/thescene.py b/yt_dlp/extractor/thescene.py index cd642355c..cd642355c 100644 --- a/youtube_dl/extractor/thescene.py +++ b/yt_dlp/extractor/thescene.py diff --git a/youtube_dl/extractor/thestar.py b/yt_dlp/extractor/thestar.py index c3f118894..c3f118894 100644 --- a/youtube_dl/extractor/thestar.py +++ b/yt_dlp/extractor/thestar.py diff --git a/youtube_dl/extractor/thesun.py b/yt_dlp/extractor/thesun.py index 15d4a6932..15d4a6932 100644 --- a/youtube_dl/extractor/thesun.py +++ b/yt_dlp/extractor/thesun.py diff --git a/yt_dlp/extractor/theta.py 
b/yt_dlp/extractor/theta.py new file mode 100644 index 000000000..3b6543629 --- /dev/null +++ b/yt_dlp/extractor/theta.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import try_get + + +class ThetaStreamIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://www.theta.tv/davirus', + 'skip': 'The live may have ended', + 'info_dict': { + 'id': 'DaVirus', + 'ext': 'mp4', + 'title': 'I choose you - My Community is King -👀 - YO HABLO ESPANOL - CODE DAVIRUS', + 'thumbnail': r're:https://live-thumbnails-prod-theta-tv\.imgix\.net/thumbnail/.+\.jpg', + } + }, { + 'url': 'https://www.theta.tv/mst3k', + 'note': 'This channel is live 24/7', + 'info_dict': { + 'id': 'MST3K', + 'ext': 'mp4', + 'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.', + 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', + } + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + info = self._download_json(f'https://api.theta.tv/v1/channel?alias={channel_id}', channel_id)['body'] + + m3u8_playlist = next( + data['url'] for data in info['live_stream']['video_urls'] + if data.get('type') != 'embed' and data.get('resolution') in ('master', 'source')) + + formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + channel = try_get(info, lambda x: x['user']['username']) # using this field instead of channel_id due to capitalization + + return { + 'id': channel, + 'title': try_get(info, lambda x: x['live_stream']['title']), + 'channel': channel, + 'view_count': try_get(info, lambda x: x['live_stream']['view_count']), + 'is_live': True, + 'formats': formats, + 'thumbnail': try_get(info, lambda x: x['live_stream']['thumbnail_url']), + } + + +class ThetaVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?theta\.tv/video/(?P<id>vid[a-z0-9]+)' + _TEST = { + 'url': 'https://www.theta.tv/video/vidiq6aaet3kzf799p0', + 'md5': '633d8c29eb276bb38a111dbd591c677f', + 'info_dict': { + 'id': 'vidiq6aaet3kzf799p0', + 'ext': 'mp4', + 'title': 'Theta EdgeCast Tutorial', + 'uploader': 'Pixiekittie', + 'description': 'md5:e316253f5bdced8b5a46bb50ae60a09f', + 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+/vod_thumb/.+.jpg', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json(f'https://api.theta.tv/v1/video/{video_id}/raw', video_id)['body'] + + m3u8_playlist = try_get(info, lambda x: x['video_urls'][0]['url']) + + formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info.get('title'), + 'uploader': try_get(info, lambda x: x['user']['username']), + 'description': info.get('description'), + 'view_count': info.get('view_count'), + 'like_count': info.get('like_count'), + 'formats': formats, + 'thumbnail': info.get('thumbnail_url'), + } diff --git a/yt_dlp/extractor/theweatherchannel.py b/yt_dlp/extractor/theweatherchannel.py new file mode 100644 index 000000000..9e506c9e0 --- /dev/null +++ b/yt_dlp/extractor/theweatherchannel.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .theplatform import ThePlatformIE +from ..utils import ( + determine_ext, + parse_duration, + parse_iso8601, +) + + +class TheWeatherChannelIE(ThePlatformIE): + _VALID_URL = 
r'https?://(?:www\.)?weather\.com(?P<asset_name>(?:/(?P<locale>[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P<id>[^/?#]+))' + _TESTS = [{ + 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', + 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', + 'info_dict': { + 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', + 'ext': 'mp4', + 'title': 'Ice Climber Is In For A Shock', + 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', + 'uploader': 'TWC - Digital (No Distro)', + 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + 'upload_date': '20160720', + 'timestamp': 1469018835, + } + }, { + 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', + 'only_matching': True, + }] + + def _real_extract(self, url): + asset_name, locale, display_id = self._match_valid_url(url).groups() + if not locale: + locale = 'en-US' + video_data = list(self._download_json( + 'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{ + 'name': 'getCMSAssetsUrlConfig', + 'params': { + 'language': locale.replace('-', '_'), + 'query': { + 'assetName': { + '$in': asset_name, + }, + }, + } + }]).encode(), headers={ + 'Content-Type': 'application/json', + })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0] + video_id = video_data['id'] + seo_meta = video_data.get('seometa', {}) + title = video_data.get('title') or seo_meta['title'] + + urls = [] + thumbnails = [] + formats = [] + for variant_id, variant_url in video_data.get('variants', []).items(): + variant_url = variant_url.strip() + if not variant_url or variant_url in urls: + continue + urls.append(variant_url) + ext = determine_ext(variant_url) + if ext == 'jpg': + thumbnails.append({ + 'url': variant_url, + 'id': variant_id, + }) + elif ThePlatformIE.suitable(variant_url): + tp_formats, _ = self._extract_theplatform_smil(variant_url, video_id) + formats.extend(tp_formats) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=variant_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + variant_url, video_id, f4m_id=variant_id, fatal=False)) + else: + formats.append({ + 'url': variant_url, + 'format_id': variant_id, + }) + self._sort_formats(formats) + + cc_url = video_data.get('cc_url') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description') or seo_meta.get('description') or seo_meta.get('og:description'), + 'duration': parse_duration(video_data.get('duration')), + 'uploader': video_data.get('providername'), + 'uploader_id': video_data.get('providerid'), + 'timestamp': parse_iso8601(video_data.get('publishdate')), + 'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/thisamericanlife.py b/yt_dlp/extractor/thisamericanlife.py index 91e45f2c3..91e45f2c3 100644 --- a/youtube_dl/extractor/thisamericanlife.py +++ b/yt_dlp/extractor/thisamericanlife.py diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py new file mode 100644 index 000000000..4af286e6d --- /dev/null +++ b/yt_dlp/extractor/thisav.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import remove_end + + +class ThisAVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' + _TESTS = [{ + # jwplayer + 'url': 
'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', + 'md5': '0480f1ef3932d901f0e0e719f188f19b', + 'info_dict': { + 'id': '47734', + 'ext': 'flv', + 'title': '高樹マリア - Just fit', + 'uploader': 'dj7970', + 'uploader_id': 'dj7970' + } + }, { + # html5 media + 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', + 'md5': 'ba90c076bd0f80203679e5b60bf523ee', + 'info_dict': { + 'id': '242352', + 'ext': 'mp4', + 'title': 'Nerdy 18yo Big Ass Tattoos and Glasses', + 'uploader': 'cybersluts', + 'uploader_id': 'cybersluts', + }, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + title = remove_end(self._html_search_regex( + r'<title>([^<]+)</title>', webpage, 'title'), + ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') + video_url = self._html_search_regex( + r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) + if video_url: + info_dict = { + 'formats': [{ + 'url': video_url, + }], + } + else: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info_dict = entries[0] + else: + info_dict = self._extract_jwplayer_data( + webpage, video_id, require_title=False) + uploader = self._html_search_regex( + r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', + webpage, 'uploader name', fatal=False) + uploader_id = self._html_search_regex( + r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', + webpage, 'uploader id', fatal=False) + + info_dict.update({ + 'id': video_id, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'title': title, + }) + + return info_dict diff --git a/youtube_dl/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py index a3d9b4017..a3d9b4017 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/yt_dlp/extractor/thisoldhouse.py diff --git a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py new file mode 100644 index 000000000..bb7610352 --- /dev/null +++ b/yt_dlp/extractor/threeqsdn.py @@ -0,0 +1,166 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, +) + + +class ThreeQSDNIE(InfoExtractor): + IE_NAME = '3qsdn' + IE_DESC = '3Q SDN' + _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + # https://player.3qsdn.com/demo.html + 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', + 'md5': '64a57396b16fa011b15e0ea60edce918', + 'info_dict': { + 'id': '7201c779-6b3c-11e7-a40e-002590c750be', + 'ext': 'mp4', + 'title': 'Video Ads', + 'is_live': False, + 'description': 'Video Ads Demo', + 'timestamp': 1500334803, + 'upload_date': '20170717', + 'duration': 888.032, + 'subtitles': { + 'eng': 'count:1', + }, + }, + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be', + 'info_dict': { + 'id': '66e68995-11ca-11e8-9273-002590c750be', + 'ext': 'mp4', + 'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # m3u8 downloads + }, + }, { + # live audio stream + 'url': 
'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', + 'only_matching': True, + }, { + # live audio stream with some 404 URLs + 'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48', + 'only_matching': True, + }, { + # geo restricted with 'This content is not available in your country' + 'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48', + 'only_matching': True, + }, { + # geo restricted with 'playout.3qsdn.com/forbidden' + 'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48', + 'only_matching': True, + }, { + # live video with rtmp link + 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', + 'only_matching': True, + }, { + # ondemand from http://www.philharmonie.tv/veranstaltung/26/ + 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', + 'only_matching': True, + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + try: + config = self._download_json( + url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_geo_restricted() + raise + + live = config.get('streamContent') == 'live' + aspect = float_or_none(config.get('aspect')) + + formats = [] + subtitles = {} + for source_type, source in (config.get('sources') or {}).items(): + if not source: + continue + if source_type == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles( + source, video_id, mpd_id='mpd', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif source_type == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif source_type == 'progressive': + for s in source: + src = s.get('src') + if not (src and self._is_valid_url(src, video_id)): + continue + width = None + format_id = ['http'] + ext = determine_ext(src) + if ext: + format_id.append(ext) + height = int_or_none(s.get('height')) + if height: + format_id.append('%dp' % height) + if aspect: + width = int(height * aspect) + formats.append({ + 'ext': ext, + 'format_id': '-'.join(format_id), + 'height': height, + 'source_preference': 0, + 'url': src, + 'vcodec': 'none' if height == 0 else None, + 'width': width, + }) + # It seems like this would be correctly handled by default + # However, unless someone can confirm this, the old + # behaviour is being kept as-is + self._sort_formats(formats, ('res', 'source_preference')) + + for subtitle in (config.get('subtitles') or []): + src = subtitle.get('src') + if not src: + continue + subtitles.setdefault(subtitle.get('label') or 'eng', []).append({ + 'url': src, + }) + + title = config.get('title') or video_id + + return { + 'id': video_id, + 'title': self._live_title(title) if live else title, + 'thumbnail': config.get('poster') or None, + 'description': config.get('description') or None, + 'timestamp': parse_iso8601(config.get('upload_date')), + 'duration': float_or_none(config.get('vlength')) 
or None, + 'is_live': live, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py new file mode 100644 index 000000000..1db6327e2 --- /dev/null +++ b/yt_dlp/extractor/tiktok.py @@ -0,0 +1,563 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import random +import string +import time +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + traverse_obj, + try_get, + url_or_none, + qualities, +) + + +class TikTokBaseIE(InfoExtractor): + _APP_VERSION = '20.9.3' + _MANIFEST_APP_VERSION = '291' + _APP_NAME = 'trill' + _AID = 1180 + _API_HOSTNAME = 'api-t2.tiktokv.com' + _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' + QUALITIES = ('360p', '540p', '720p') + + def _call_api(self, ep, query, video_id, fatal=True, + note='Downloading API JSON', errnote='Unable to download API page'): + real_query = { + **query, + 'version_name': self._APP_VERSION, + 'version_code': self._MANIFEST_APP_VERSION, + 'build_number': self._APP_VERSION, + 'manifest_version_code': self._MANIFEST_APP_VERSION, + 'update_version_code': self._MANIFEST_APP_VERSION, + 'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)), + 'uuid': ''.join([random.choice(string.digits) for num in range(16)]), + '_rticket': int(time.time() * 1000), + 'ts': int(time.time()), + 'device_brand': 'Google', + 'device_type': 'Pixel 4', + 'device_platform': 'android', + 'resolution': '1080*1920', + 'dpi': 420, + 'os_version': '10', + 'os_api': '29', + 'carrier_region': 'US', + 'sys_region': 'US', + 'region': 'US', + 'app_name': self._APP_NAME, + 'app_language': 'en', + 'language': 'en', + 'timezone_name': 'America/New_York', + 'timezone_offset': '-14400', + 'channel': 'googleplay', + 'ac': 'wifi', + 'mcc_mnc': '310260', + 'is_my_cn': 0, + 'aid': self._AID, + 'ssmix': 'a', + 'as': 'a1qwert123', + 'cp': 'cbfhckdckkde1', + } + self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160))) + return self._download_json( + 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, + fatal=fatal, note=note, errnote=errnote, headers={ + 'User-Agent': f'com.ss.android.ugc.trill/{self._MANIFEST_APP_VERSION} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', + 'Accept': 'application/json', + }, query=real_query) + + def _parse_aweme_video_app(self, aweme_detail): + aweme_id = aweme_detail['aweme_id'] + video_info = aweme_detail['video'] + + def parse_url_key(url_key): + format_id, codec, res, bitrate = self._search_regex( + r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key, + 'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate')) + if not format_id: + return {}, None + return { + 'format_id': format_id, + 'vcodec': 'h265' if codec == 'bytevc1' else codec, + 'tbr': int_or_none(bitrate, scale=1000) or None, + 'quality': qualities(self.QUALITIES)(res), + }, res + + known_resolutions = {} + + def extract_addr(addr, add_meta={}): + parsed_meta, res = parse_url_key(addr.get('url_key', '')) + if res: + known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height')) + known_resolutions[res].setdefault('width', add_meta.get('width')) + parsed_meta.update(known_resolutions.get(res, {})) + add_meta.setdefault('height', int_or_none(res[:-1])) + return [{ + 'url': url, + 
'filesize': int_or_none(addr.get('data_size')), + 'ext': 'mp4', + 'acodec': 'aac', + 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked + **add_meta, **parsed_meta, + 'format_note': ' '.join(filter(None, ( + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else ''))) + } for url in addr.get('url_list') or []] + + # Hack: Add direct video links first to prioritize them when removing duplicate formats + formats = [] + if video_info.get('play_addr'): + formats.extend(extract_addr(video_info['play_addr'], { + 'format_id': 'play_addr', + 'format_note': 'Direct video', + 'vcodec': 'h265' if traverse_obj( + video_info, 'is_bytevc1', 'is_h265') else 'h264', # Always h264? + 'width': video_info.get('width'), + 'height': video_info.get('height'), + })) + if video_info.get('download_addr'): + formats.extend(extract_addr(video_info['download_addr'], { + 'format_id': 'download_addr', + 'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''), + 'vcodec': 'h264', + 'width': video_info.get('width'), + 'height': video_info.get('height'), + 'preference': -2 if video_info.get('has_watermark') else -1, + })) + if video_info.get('play_addr_h264'): + formats.extend(extract_addr(video_info['play_addr_h264'], { + 'format_id': 'play_addr_h264', + 'format_note': 'Direct video', + 'vcodec': 'h264', + })) + if video_info.get('play_addr_bytevc1'): + formats.extend(extract_addr(video_info['play_addr_bytevc1'], { + 'format_id': 'play_addr_bytevc1', + 'format_note': 'Direct video', + 'vcodec': 'h265', + })) + + for bitrate in video_info.get('bit_rate', []): + if bitrate.get('play_addr'): + formats.extend(extract_addr(bitrate['play_addr'], { + 'format_id': bitrate.get('gear_name'), + 'format_note': 'Playback video', + 'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000), + 'vcodec': 'h265' if traverse_obj( + bitrate, 'is_bytevc1', 'is_h265') else 'h264', + 'fps': bitrate.get('FPS'), + })) + + self._remove_duplicate_formats(formats) + self._sort_formats(formats, ('quality', 'codec', 'size', 'br')) + + thumbnails = [] + for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak', + 'origin_cover', 'dynamic_cover'): + cover = video_info.get(cover_id) + if cover: + for cover_url in cover['url_list']: + thumbnails.append({ + 'id': cover_id, + 'url': cover_url, + }) + + stats_info = aweme_detail.get('statistics', {}) + author_info = aweme_detail.get('author', {}) + music_info = aweme_detail.get('music', {}) + user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, + 'sec_uid', 'id', 'uid', 'unique_id', + expected_type=str_or_none, get_all=False)) + + contained_music_track = traverse_obj( + music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str) + contained_music_author = traverse_obj( + music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str) + + is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - %s' % music_info.get('owner_handle') + if is_generic_og_trackname: + music_track, music_author = contained_music_track or 'original sound', contained_music_author + else: + music_track, music_author = music_info.get('title'), music_info.get('author') + + return { + 'id': aweme_id, + 'title': aweme_detail['desc'], + 'description': aweme_detail['desc'], + 'view_count': int_or_none(stats_info.get('play_count')), + 'like_count': int_or_none(stats_info.get('digg_count')), + 
'repost_count': int_or_none(stats_info.get('share_count')), + 'comment_count': int_or_none(stats_info.get('comment_count')), + 'uploader': str_or_none(author_info.get('unique_id')), + 'creator': str_or_none(author_info.get('nickname')), + 'uploader_id': str_or_none(author_info.get('uid')), + 'uploader_url': user_url, + 'track': music_track, + 'album': str_or_none(music_info.get('album')) or None, + 'artist': music_author, + 'timestamp': int_or_none(aweme_detail.get('create_time')), + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000) + } + + def _parse_aweme_video_web(self, aweme_detail, webpage_url): + video_info = aweme_detail['video'] + author_info = traverse_obj(aweme_detail, 'author', 'authorInfo', default={}) + music_info = aweme_detail.get('music') or {} + stats_info = aweme_detail.get('stats') or {} + user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, + 'secUid', 'id', 'uid', 'uniqueId', + expected_type=str_or_none, get_all=False)) + + formats = [] + play_url = video_info.get('playAddr') + width = video_info.get('width') + height = video_info.get('height') + if isinstance(play_url, str): + formats = [{ + 'url': self._proto_relative_url(play_url), + 'ext': 'mp4', + 'width': width, + 'height': height, + }] + elif isinstance(play_url, list): + formats = [{ + 'url': self._proto_relative_url(url), + 'ext': 'mp4', + 'width': width, + 'height': height, + } for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none, default=[]) if url] + + download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none) + if download_url: + formats.append({ + 'format_id': 'download', + 'url': self._proto_relative_url(download_url), + 'ext': 'mp4', + 'width': width, + 'height': height, + }) + self._remove_duplicate_formats(formats) + self._sort_formats(formats) + + thumbnails = [] + for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'): + if aweme_detail.get(thumbnail_name): + thumbnails = [{ + 'url': self._proto_relative_url(aweme_detail[thumbnail_name]), + 'width': width, + 'height': height + }] + + return { + 'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none), + 'title': aweme_detail.get('desc'), + 'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int), + 'view_count': int_or_none(stats_info.get('playCount')), + 'like_count': int_or_none(stats_info.get('diggCount')), + 'repost_count': int_or_none(stats_info.get('shareCount')), + 'comment_count': int_or_none(stats_info.get('commentCount')), + 'timestamp': int_or_none(aweme_detail.get('createTime')), + 'creator': str_or_none(author_info.get('nickname')), + 'uploader': str_or_none(author_info.get('uniqueId')), + 'uploader_id': str_or_none(author_info.get('id')), + 'uploader_url': user_url, + 'track': str_or_none(music_info.get('title')), + 'album': str_or_none(music_info.get('album')) or None, + 'artist': str_or_none(music_info.get('authorName')), + 'formats': formats, + 'thumbnails': thumbnails, + 'description': str_or_none(aweme_detail.get('desc')), + 'http_headers': { + 'Referer': webpage_url + } + } + + +class TikTokIE(TikTokBaseIE): + _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', + 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7', + 'info_dict': { + 'id': '6748451240264420610', + 
'ext': 'mp4', + 'title': '#jassmanak #lehanga #leenabhushan', + 'description': '#jassmanak #lehanga #leenabhushan', + 'duration': 13, + 'height': 1024, + 'width': 576, + 'uploader': 'leenabhushan', + 'uploader_id': '6691488002098119685', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy', + 'creator': 'facestoriesbyleenabh', + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20191016', + 'timestamp': 1571246252, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', + 'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b', + 'info_dict': { + 'id': '6742501081818877190', + 'ext': 'mp4', + 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94', + 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94', + 'duration': 27, + 'height': 960, + 'width': 540, + 'uploader': 'patrox', + 'uploader_id': '18702747', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', + 'creator': 'patroX', + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20190930', + 'timestamp': 1569860870, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + # Promoted content/ad + 'url': 'https://www.tiktok.com/@MS4wLjABAAAAAR29F6J2Ktu0Daw03BJyXPNoRQ-W7U5a0Mn3lVCq2rQhjOd_WNLclHUoFgwX8Eno/video/6932675057474981122', + 'only_matching': True, + }] + + def _extract_aweme_app(self, aweme_id): + aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, + note='Downloading video details', errnote='Unable to download video details')['aweme_detail'] + return self._parse_aweme_video_app(aweme_detail) + + def _real_extract(self, url): + video_id = self._match_id(url) + + try: + return self._extract_aweme_app(video_id) + except ExtractorError as e: + self.report_warning(f'{e}; Retrying with webpage') + + # If we only call once, we get a 403 when downloading the video. 
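+ # (Presumably the first request sets cookies that the second request needs.)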
+ self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id, note='Downloading video webpage') + json_string = self._search_regex( + r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)', + webpage, 'json_string', group='json_string_ld') + json_data = self._parse_json(json_string, video_id) + props_data = try_get(json_data, lambda x: x['props'], expected_type=dict) + + # Check statusCode for success + status = try_get(props_data, lambda x: x['pageProps']['statusCode']) + if status == 0: + return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], url) + elif status == 10216: + raise ExtractorError('This video is private', expected=True) + + raise ExtractorError('Video not available', video_id=video_id) + + +class TikTokUserIE(TikTokBaseIE): + IE_NAME = 'tiktok:user' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://tiktok.com/@corgibobaa?lang=en', + 'playlist_mincount': 45, + 'info_dict': { + 'id': '6935371178089399301', + 'title': 'corgibobaa', + }, + 'expected_warnings': ['Retrying'] + }, { + 'url': 'https://www.tiktok.com/@meme', + 'playlist_mincount': 593, + 'info_dict': { + 'id': '79005827461758976', + 'title': 'meme', + }, + 'expected_warnings': ['Retrying'] + }] + + r''' # TODO: Fix by adding _signature to api_url + def _entries(self, webpage, user_id, username): + secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, username) + verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id') + if not verifyfp_cookie: + raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True) + api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor=' + cursor = '0' + for page in itertools.count(): + data_json = self._download_json(api_url + cursor, username, note='Downloading Page %d' % page) + for video in data_json.get('itemList', []): + video_id = video['id'] + video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}' + yield self._url_result(video_url, 'TikTok', video_id, str_or_none(video.get('desc'))) + if not data_json.get('hasMore'): + break + cursor = data_json['cursor'] + ''' + + def _entries_api(self, webpage, user_id, username): + query = { + 'user_id': user_id, + 'count': 21, + 'max_cursor': 0, + 'min_cursor': 0, + 'retry_type': 'no_retry', + 'device_id': ''.join(random.choice(string.digits) for i in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + } + + max_retries = self.get_param('extractor_retries', 3) + for page in itertools.count(1): + for retries in itertools.count(): + try: + post_list = self._call_api('aweme/post', query, username, + note='Downloading user video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''), + errnote='Unable to download user video list') + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries: + self.report_warning('%s. Retrying...' 
% str(e.cause or e.msg)) + continue + raise + break + for video in post_list.get('aweme_list', []): + yield { + **self._parse_aweme_video_app(video), + 'ie_key': TikTokIE.ie_key(), + 'extractor': 'TikTok', + } + if not post_list.get('has_more'): + break + query['max_cursor'] = post_list['max_cursor'] + + def _real_extract(self, url): + user_name = self._match_id(url) + webpage = self._download_webpage(url, user_name, headers={ + 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' + }) + user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID') + return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name) + + +class DouyinIE(TikTokIE): + _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.douyin.com/video/6961737553342991651', + 'md5': '10523312c8b8100f353620ac9dc8f067', + 'info_dict': { + 'id': '6961737553342991651', + 'ext': 'mp4', + 'title': '#杨超越 小小水手带你去远航❤️', + 'uploader': '杨超越', + 'upload_date': '20210513', + 'timestamp': 1620905839, + 'uploader_id': '110403406559', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://www.douyin.com/video/6982497745948921092', + 'md5': 'd78408c984b9b5102904cf6b6bc2d712', + 'info_dict': { + 'id': '6982497745948921092', + 'ext': 'mp4', + 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想', + 'uploader': '杨超越工作室', + 'upload_date': '20210708', + 'timestamp': 1625739481, + 'uploader_id': '408654318141572', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://www.douyin.com/video/6953975910773099811', + 'md5': '72e882e24f75064c218b76c8b713c185', + 'info_dict': { + 'id': '6953975910773099811', + 'ext': 'mp4', + 'title': '#一起看海 出现在你的夏日里', + 'uploader': '杨超越', + 'upload_date': '20210422', + 'timestamp': 1619098692, + 'uploader_id': '110403406559', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://www.douyin.com/video/6950251282489675042', + 'md5': 'b4db86aec367ef810ddd38b1737d2fed', + 'info_dict': { + 'id': '6950251282489675042', + 'ext': 'mp4', + 'title': '哈哈哈,成功了哈哈哈哈哈哈', + 'uploader': '杨超越', + 'upload_date': '20210412', + 'timestamp': 1618231483, + 'uploader_id': '110403406559', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://www.douyin.com/video/6963263655114722595', + 'md5': '1abe1c477d05ee62efb40bf2329957cf', + 'info_dict': { + 'id': '6963263655114722595', + 'ext': 'mp4', + 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈', + 'uploader': '杨超越', + 'upload_date': '20210517', + 'timestamp': 1621261163, + 'uploader_id': '110403406559', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }] + _APP_VERSION = '9.6.0' + _MANIFEST_APP_VERSION = '960' + _APP_NAME = 'aweme' + _AID = 1128 + _API_HOSTNAME = 'aweme.snssdk.com' + _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + try: + return self._extract_aweme_app(video_id) + except ExtractorError as e: + self.report_warning(f'{e}; Retrying with webpage') + + webpage = self._download_webpage(url, video_id) + render_data_json = self._search_regex( + r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>(%7B.+%7D)</script>', + webpage, 'render data', default=None) + if not render_data_json: + # TODO: Run verification challenge 
code to generate signature cookies + raise ExtractorError('Fresh cookies (not necessarily logged in) are needed') + + render_data = self._parse_json( + render_data_json, video_id, transform_source=compat_urllib_parse_unquote) + return self._parse_aweme_video_web( + traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), url) diff --git a/yt_dlp/extractor/tinypic.py b/yt_dlp/extractor/tinypic.py new file mode 100644 index 000000000..39056e52e --- /dev/null +++ b/yt_dlp/extractor/tinypic.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class TinyPicIE(InfoExtractor): + IE_NAME = 'tinypic' + IE_DESC = 'tinypic.com videos' + _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' + + _TESTS = [ + { + 'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8', + 'md5': '609b74432465364e72727ebc6203f044', + 'info_dict': { + 'id': '6xw7tc', + 'ext': 'flv', + 'title': 'shadow phenomenon weird', + }, + }, + { + 'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id, 'Downloading page') + + mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n' + r'\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage) + if mobj is None: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + file_id = mobj.group('fileid') + server_id = mobj.group('serverid') + + KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting' + keywords = self._html_search_meta('keywords', webpage, 'title') + title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else '' + + video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id) + thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id) + + return { + 'id': file_id, + 'url': video_url, + 'thumbnail': thumbnail, + 'title': title + } diff --git a/yt_dlp/extractor/tmz.py b/yt_dlp/extractor/tmz.py new file mode 100644 index 000000000..aee2273b8 --- /dev/null +++ b/yt_dlp/extractor/tmz.py @@ -0,0 +1,157 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_attribute, +) + + +class TMZIE(InfoExtractor): + _VALID_URL = r"https?://(?:www\.)?tmz\.com/.*" + _TESTS = [ + { + "url": "http://www.tmz.com/videos/0-cegprt2p/", + "info_dict": { + "id": "http://www.tmz.com/videos/0-cegprt2p/", + "ext": "mp4", + "title": "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", + "description": "Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.", + "timestamp": 1467831837, + "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "upload_date": "20160706", + }, + }, + { + "url": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/", + "info_dict": { + "id": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/", + "ext": "mp4", + "title": "Angry Bagel Shop Guy Says He Doesn't Trust Women", + "description": "The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... 
he says it's women's fault in the first place.", + "timestamp": 1562889485, + "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "upload_date": "20190711", + }, + }, + { + "url": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert", + "md5": "5429c85db8bde39a473a56ca8c4c5602", + "info_dict": { + "id": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert", + "ext": "mp4", + "title": "Bobby Brown Tells Crowd ... Bobbi Kristina is Awake", + "description": 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', + "timestamp": 1429467813, + "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "upload_date": "20150419", + }, + }, + { + "url": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", + "info_dict": { + "id": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", + "ext": "mp4", + "title": "Patti LaBelle -- Goes Nuclear On Stripping Fan", + "description": "Patti LaBelle made it known loud and clear last night ... NO " + "ONE gets on her stage and strips down.", + "timestamp": 1442683746, + "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "upload_date": "20150919", + }, + }, + { + "url": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", + "info_dict": { + "id": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", + "ext": "mp4", + "title": "NBA's Adam Silver -- Blake Griffin's a Great Guy ... He'll Learn from This", + "description": "Two pretty parts of this video with NBA Commish Adam Silver.", + "timestamp": 1454010989, + "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "upload_date": "20160128", + }, + }, + { + "url": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", + "info_dict": { + "id": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", + "ext": "mp4", + "title": "Trump Star Vandal -- I'm Not Afraid of Donald or the Cops!", + "description": "James Otis is the the guy who took a pickaxe to Donald Trump's star on the Walk of Fame, and he tells TMZ .. 
he's ready and willing to go to jail for the crime.",
+                "timestamp": 1477500095,
+                "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+                "upload_date": "20161026",
+            },
+        },
+        {
+            "url": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/",
+            "info_dict": {
+                "id": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/",
+                "ext": "mp4",
+                "title": "Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist "
+                "Demonstrators",
+                "description": "Beverly Hills may be an omen of what's coming next week, "
+                "because things got crazy on the streets and cops started "
+                "swinging their billy clubs at both Anti-Fascist and Pro-Trump "
+                "demonstrators.",
+                "timestamp": 1604182772,
+                "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}",
+                "upload_date": "20201031",
+            },
+        },
+        {
+            "url": "https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/",
+            "info_dict": {
+                "id": "Dddb6IGe-ws",
+                "ext": "mp4",
+                "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing",
+                "uploader": "ESNEWS",
+                "description": "md5:49675bc58883ccf80474b8aa701e1064",
+                "upload_date": "20201101",
+                "uploader_id": "ESNEWS",
+            },
+        },
+        {
+            "url": "https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/",
+            "info_dict": {
+                "id": "1329450007125225473",
+                "ext": "mp4",
+                "title": "TheMacLife - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.",
+                "uploader": "TheMacLife",
+                "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69",
+                "upload_date": "20201119",
+                "uploader_id": "Maclifeofficial",
+                "timestamp": 1605800556,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, url)
+        jsonld = self._search_json_ld(webpage, url)
+        if not jsonld or "url" not in jsonld:
+            # try to extract from YouTube Player API
+            # see https://developers.google.com/youtube/iframe_api_reference#Video_Queueing_Functions
+            match_obj = re.search(r'\.cueVideoById\(\s*(?P<quote>[\'"])(?P<id>.*?)(?P=quote)', webpage)
+            if match_obj:
+                res = self.url_result(match_obj.group("id"))
+                return res
+            # try to extract from twitter
+            blockquote_el = get_element_by_attribute("class", "twitter-tweet", webpage)
+            if blockquote_el:
+                matches = re.findall(
+                    r'<a[^>]+href=\s*(?P<quote>[\'"])(?P<link>.*?)(?P=quote)',
+                    blockquote_el)
+                if matches:
+                    for _, match in matches:
+                        if "/status/" in match:
+                            res = self.url_result(match)
+                            return res
+            raise ExtractorError("No video found!")
+        # the string key "id" (not the id() builtin); fall back to the page URL
+        if "id" not in jsonld:
+            jsonld["id"] = url
+        return jsonld
diff --git a/yt_dlp/extractor/tnaflix.py b/yt_dlp/extractor/tnaflix.py
new file mode 100644
index 000000000..d7617f708
--- /dev/null
+++ b/yt_dlp/extractor/tnaflix.py
@@ -0,0 +1,327 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    int_or_none,
+    parse_duration,
+    str_to_int,
+    unescapeHTML,
+    xpath_text,
+)
+
+
+class TNAFlixNetworkBaseIE(InfoExtractor):
+    # May be overridden in descendants if necessary
+    _CONFIG_REGEX = [
+        r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"',
+        r'<input[^>]+name="config\d?"
value="(?P<url>[^"]+)"', + r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', + ] + _HOST = 'tna' + _VKEY_SUFFIX = '' + _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' + _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' + _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' + _VIEW_COUNT_REGEX = None + _COMMENT_COUNT_REGEX = None + _AVERAGE_RATING_REGEX = None + _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>' + + def _extract_thumbnails(self, flix_xml): + + def get_child(elem, names): + for name in names: + child = elem.find(name) + if child is not None: + return child + + timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage']) + if timeline is None: + return + + pattern_el = get_child(timeline, ['imagePattern', 'pattern']) + if pattern_el is None or not pattern_el.text: + return + + first_el = get_child(timeline, ['imageFirst', 'first']) + last_el = get_child(timeline, ['imageLast', 'last']) + if first_el is None or last_el is None: + return + + first_text = first_el.text + last_text = last_el.text + if not first_text.isdigit() or not last_text.isdigit(): + return + + first = int(first_text) + last = int(last_text) + if first > last: + return + + width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width')) + height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height')) + + return [{ + 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'), + 'width': width, + 'height': height, + } for i in range(first, last + 1)] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + for display_id_key in ('display_id', 'display_id_2'): + if display_id_key in mobj.groupdict(): + display_id = mobj.group(display_id_key) + if display_id: + break + else: + display_id = video_id + + webpage = self._download_webpage(url, display_id) + + cfg_url = self._proto_relative_url(self._html_search_regex( + self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, + group='url'), 'http:') + + if not cfg_url: + inputs = self._hidden_inputs(webpage) + cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' + % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) + + cfg_xml = self._download_xml( + cfg_url, display_id, 'Downloading metadata', + transform_source=fix_xml_ampersands, headers={'Referer': url}) + + formats = [] + + def extract_video_url(vl): + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) + + video_link = cfg_xml.find('./videoLink') + if video_link is not None: + formats.append({ + 'url': extract_video_url(video_link), + 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + }) + + for item in cfg_xml.findall('./quality/item'): + video_link = item.find('./videoLink') + if video_link is None: + continue + res = item.find('res') + format_id = None if res is None else res.text + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), + 'format_id': format_id, + 'height': height, + }) + + self._sort_formats(formats) + + thumbnail = self._proto_relative_url( + xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') + thumbnails = self._extract_thumbnails(cfg_xml) + + title = None + 
if self._TITLE_REGEX: + title = self._html_search_regex( + self._TITLE_REGEX, webpage, 'title', default=None) + if not title: + title = self._og_search_title(webpage) + + age_limit = self._rta_search(webpage) or 18 + + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) + + def extract_field(pattern, name): + return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None + + description = extract_field(self._DESCRIPTION_REGEX, 'description') + uploader = extract_field(self._UPLOADER_REGEX, 'uploader') + view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) + comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) + average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) + + categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') + categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'thumbnails': thumbnails, + 'duration': duration, + 'age_limit': age_limit, + 'uploader': uploader, + 'view_count': view_count, + 'comment_count': comment_count, + 'average_rating': average_rating, + 'categories': categories, + 'formats': formats, + } + + +class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' + + _TITLE_REGEX = r'<title>([^<]+)</title>' + + _TESTS = [{ + 'url': 'https://player.tnaflix.com/video/6538', + 'info_dict': { + 'id': '6538', + 'display_id': '6538', + 'ext': 'mp4', + 'title': 'Educational xxx video', + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://player.empflix.com/video/33051', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', + webpage)] + + +class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): + _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<' + _UPLOADER_REGEX = r'<span>by\s*<a[^>]+\bhref=["\']/profile/[^>]+>([^<]+)<' + _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>' + + +class TNAFlixIE(TNAEMPFlixBaseIE): + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' + + _TITLE_REGEX = r'<title>(.+?) 
- (?:TNAFlix Porn Videos|TNAFlix\.com)</title>' + + _TESTS = [{ + # anonymous uploader, no categories + 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', + 'md5': '7e569419fe6d69543d01e6be22f5f7c4', + 'info_dict': { + 'id': '553878', + 'display_id': 'Carmella-Decesare-striptease', + 'ext': 'mp4', + 'title': 'Carmella Decesare - striptease', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 91, + 'age_limit': 18, + 'categories': ['Porn Stars'], + } + }, { + # non-anonymous uploader, categories + 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', + 'md5': '0f5d4d490dbfd117b8607054248a07c0', + 'info_dict': { + 'id': '6538', + 'display_id': 'Educational-xxx-video', + 'ext': 'mp4', + 'title': 'Educational xxx video', + 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 164, + 'age_limit': 18, + 'uploader': 'bobwhite39', + 'categories': list, + } + }, { + 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', + 'only_matching': True, + }] + + +class EMPFlixIE(TNAEMPFlixBaseIE): + _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)' + + _HOST = 'emp' + _VKEY_SUFFIX = '-1' + + _TESTS = [{ + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'bc30d48b91a7179448a0bda465114676', + 'info_dict': { + 'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 83, + 'age_limit': 18, + 'uploader': 'cwbike', + 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], + } + }, { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'only_matching': True, + }, { + 'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'only_matching': True, + }] + + +class MovieFapIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' + + _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>' + _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' + _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>' + _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>' + + _TESTS = [{ + # normal, multi-format video + 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', + 'md5': '26624b4e2523051b550067d547615906', + 'info_dict': { + 'id': 'be9867c9416c19f54a4a', + 'display_id': 'experienced-milf-amazing-handjob', + 'ext': 'mp4', + 'title': 'Experienced MILF Amazing Handjob', + 'description': 'Experienced MILF giving an Amazing Handjob', + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + 'uploader': 'darvinfred06', + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'], + } + }, { + # quirky single-format case where the extension is given as fid, but the video is really an flv + 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', + 'md5': 'fa56683e291fc80635907168a743c9ad', + 'info_dict': { + 'id': 'e5da0d3edce5404418f5', + 'display_id': 'jeune-couple-russe', + 'ext': 'flv', + 'title': 'Jeune Couple Russe', + 'description': 'Amateur', + 'thumbnail': 
r're:https?://.*\.jpg$', + 'age_limit': 18, + 'uploader': 'whiskeyjar', + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['Amateur', 'Teen'], + } + }] diff --git a/yt_dlp/extractor/toggle.py b/yt_dlp/extractor/toggle.py new file mode 100644 index 000000000..eb873495f --- /dev/null +++ b/yt_dlp/extractor/toggle.py @@ -0,0 +1,232 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + parse_iso8601, + strip_or_none, +) + + +class ToggleIE(InfoExtractor): + IE_NAME = 'toggle' + _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'info_dict': { + 'id': '343115', + 'ext': 'mp4', + 'title': 'Lion Moms Premiere', + 'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b', + 'upload_date': '20150910', + 'timestamp': 1441858274, + }, + 'params': { + 'skip_download': 'm3u8 download', + } + }, { + 'note': 'DRM-protected video', + 'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413', + 'info_dict': { + 'id': '341413', + 'ext': 'wvm', + 'title': 'Dug\'s Special Mission', + 'description': 'md5:e86c6f4458214905c1772398fabc93e0', + 'upload_date': '20150827', + 'timestamp': 1440644006, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + } + }, { + # this also tests correct video id extraction + 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', + 'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', + 'info_dict': { + 'id': '332861', + 'ext': 'mp4', + 'title': '28th SEA Games (5 Show) - Episode 11', + 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', + 'upload_date': '20150605', + 'timestamp': 1433480166, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + }, + 'skip': 'm3u8 links are geo-restricted' + }, { + 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/en/movies/seven-days/321936', + 'only_matching': True, + }, { + 'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585', + 'only_matching': True, + }] + + _API_USER = 'tvpapi_147' + _API_PASS = '11111' + + def _real_extract(self, url): + video_id = self._match_id(url) + + params = { + 'initObj': { + 'Locale': { + 'LocaleLanguage': '', + 'LocaleCountry': '', + 'LocaleDevice': '', + 'LocaleUserState': 0 + }, + 'Platform': 0, + 'SiteGuid': 0, + 'DomainID': '0', + 'UDID': '', + 'ApiUser': self._API_USER, + 'ApiPass': self._API_PASS + }, + 'MediaID': video_id, + 'mediaType': 0, + } + + info = self._download_json( + 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', + video_id, 'Downloading video info 
json', data=json.dumps(params).encode('utf-8')) + + title = info['MediaName'] + + formats = [] + for video_file in info.get('Files', []): + video_url, vid_format = video_file.get('URL'), video_file.get('Format') + if not video_url or video_url == 'NA' or not vid_format: + continue + ext = determine_ext(video_url) + vid_format = vid_format.replace(' ', '') + # if geo-restricted, m3u8 is inaccessible, but mp4 is okay + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id=vid_format, + note='Downloading %s m3u8 information' % vid_format, + errnote='Failed to download %s m3u8 information' % vid_format, + fatal=False) + for f in m3u8_formats: + # Apple FairPlay Streaming + if '/fpshls/' in f['url']: + continue + formats.append(f) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id=vid_format, + note='Downloading %s MPD manifest' % vid_format, + errnote='Failed to download %s MPD manifest' % vid_format, + fatal=False)) + elif ext == 'ism': + formats.extend(self._extract_ism_formats( + video_url, video_id, ism_id=vid_format, + note='Downloading %s ISM manifest' % vid_format, + errnote='Failed to download %s ISM manifest' % vid_format, + fatal=False)) + elif ext == 'mp4': + formats.append({ + 'ext': ext, + 'url': video_url, + 'format_id': vid_format, + }) + if not formats: + for meta in (info.get('Metas') or []): + if (not self.get_param('allow_unplayable_formats') + and meta.get('Key') == 'Encryption' and meta.get('Value') == '1'): + self.report_drm(video_id) + # Most likely because geo-blocked if no formats and no DRM + self._sort_formats(formats) + + thumbnails = [] + for picture in info.get('Pictures', []): + if not isinstance(picture, dict): + continue + pic_url = picture.get('URL') + if not pic_url: + continue + thumbnail = { + 'url': pic_url, + } + pic_size = picture.get('PicSize', '') + m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size) + if m: + thumbnail.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + thumbnails.append(thumbnail) + + def counter(prefix): + return int_or_none( + info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(info.get('Description')), + 'duration': int_or_none(info.get('Duration')), + 'timestamp': parse_iso8601(info.get('CreationDate') or None), + 'average_rating': float_or_none(info.get('Rating')), + 'view_count': counter('View'), + 'like_count': counter('Like'), + 'thumbnails': thumbnails, + 'formats': formats, + } + + +class MeWatchIE(InfoExtractor): + IE_NAME = 'mewatch' + _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', + 'info_dict': { + 'id': '1008625', + 'ext': 'mp4', + 'title': 'Recipe Of Life 味之道', + 'timestamp': 1603306526, + 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', + 'upload_date': '20201021', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }, { + 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-搜密。打卡。小红点-S2-E1-176232', + 'only_matching': True, + }, { + 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232', + 'only_matching': True, + }, { + 'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759', + 'only_matching': True, + }] + + def 
_real_extract(self, url): + item_id = self._match_id(url) + custom_id = self._download_json( + 'https://cdn.mewatch.sg/api/items/' + item_id, + item_id, query={'segments': 'all'})['customId'] + return self.url_result( + 'toggle:' + custom_id, ToggleIE.ie_key(), custom_id) diff --git a/yt_dlp/extractor/tokentube.py b/yt_dlp/extractor/tokentube.py new file mode 100644 index 000000000..d6362117f --- /dev/null +++ b/yt_dlp/extractor/tokentube.py @@ -0,0 +1,152 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + parse_count, + unified_strdate, + js_to_json, + OnDemandPagedList, +) + + +class TokentubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tokentube\.net/(?:view\?[vl]=|[vl]/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tokentube.net/l/3236632011/Praise-A-Thon-Pastori-Chrisin-ja-Pastori-Bennyn-kanssa-27-8-2021', + 'info_dict': { + 'id': '3236632011', + 'ext': 'mp4', + 'title': 'Praise-A-Thon Pastori Chrisin ja Pastori Bennyn kanssa 27.8.2021', + 'description': '', + 'uploader': 'Pastori Chris - Rapsodia.fi', + 'upload_date': '20210827', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tokentube.net/v/3950239124/Linux-Ubuntu-Studio-perus-k%C3%A4ytt%C3%B6', + 'md5': '0e1f00421f501f5eada9890d38fcfb56', + 'info_dict': { + 'id': '3950239124', + 'ext': 'mp4', + 'title': 'Linux Ubuntu Studio perus käyttö', + 'description': 'md5:854ff1dc732ff708976de2880ea32050', + 'uploader': 'jyrilehtonen', + 'upload_date': '20210825', + }, + }, { + 'url': 'https://tokentube.net/view?v=3582463289', + 'info_dict': { + 'id': '3582463289', + 'ext': 'mp4', + 'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??', + 'description': 'md5:cd92e620d7f5fa162e8410d0fc9a08be', + 'uploader': 'Voitontie', + 'upload_date': '20210428', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h1\s*class=["\']title-text["\']>(.+?)</h1>', webpage, 'title') + + data_json = self._html_search_regex(r'({["\']html5["\'].+?}}}+)', webpage, 'data json') + data_json = self._parse_json(js_to_json(data_json), video_id, fatal=False) + + sources = data_json.get('sources') or self._parse_json( + self._html_search_regex(r'updateSrc\(([^\)]+)\)', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [{ + 'url': format.get('src'), + 'format_id': format.get('label'), + 'height': format.get('res'), + } for format in sources] + + view_count = parse_count(self._html_search_regex( + r'<p\s*class=["\']views_counter["\']>\s*([\d\.,]+)\s*<span>views?</span></p>', + webpage, 'view_count', fatal=False)) + + like_count = parse_count(self._html_search_regex( + r'<div\s*class="sh_button\s*likes_count">\s*(\d+)\s*</div>', + webpage, 'like count', fatal=False)) + + dislike_count = parse_count(self._html_search_regex( + r'<div\s*class="sh_button\s*dislikes_count">\s*(\d+)\s*</div>', + webpage, 'dislike count', fatal=False)) + + upload_date = unified_strdate(self._html_search_regex( + r'<span\s*class="p-date">Published\s*on\s+([^<]+)', + webpage, 'upload date', fatal=False)) + + uploader = self._html_search_regex( + r'<a\s*class="place-left"[^>]+>(.+?)</a>', + webpage, 'uploader', fatal=False) + + description = self._html_search_meta('description', webpage) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'view_count': view_count, + 
'like_count': like_count, + 'dislike_count': dislike_count, + 'upload_date': upload_date, + 'description': description, + 'uploader': uploader, + } + + +class TokentubeChannelIE(InfoExtractor): + _PAGE_SIZE = 20 + IE_NAME = 'Tokentube:channel' + _VALID_URL = r'https?://(?:www\.)?tokentube\.net/channel/(?P<id>\d+)/[^/]+(?:/videos)?' + _TESTS = [{ + 'url': 'https://tokentube.net/channel/3697658904/TokenTube', + 'info_dict': { + 'id': '3697658904', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://tokentube.net/channel/3353234420/Linux/videos', + 'info_dict': { + 'id': '3353234420', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://tokentube.net/channel/3475834195/Voitontie', + 'info_dict': { + 'id': '3475834195', + }, + 'playlist_mincount': 150, + }] + + def _fetch_page(self, channel_id, page): + page += 1 + videos_info = self._download_webpage( + f'https://tokentube.net/videos?p=0&m=1&sort=recent&u={channel_id}&page={page}', + channel_id, headers={'X-Requested-With': 'XMLHttpRequest'}, + note=f'Downloading page {page}', fatal=False) + if '</i> Sorry, no results were found.' not in videos_info: + for path, media_id in re.findall( + r'<a[^>]+\bhref=["\']([^"\']+/[lv]/(\d+)/\S+)["\'][^>]+>', + videos_info): + yield self.url_result(path, ie=TokentubeIE.ie_key(), video_id=media_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, channel_id), self._PAGE_SIZE) + + return self.playlist_result(entries, channel_id) diff --git a/youtube_dl/extractor/tonline.py b/yt_dlp/extractor/tonline.py index cc11eae2a..cc11eae2a 100644 --- a/youtube_dl/extractor/tonline.py +++ b/yt_dlp/extractor/tonline.py diff --git a/yt_dlp/extractor/toongoggles.py b/yt_dlp/extractor/toongoggles.py new file mode 100644 index 000000000..df13d64c0 --- /dev/null +++ b/yt_dlp/extractor/toongoggles.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class ToonGogglesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?' 
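+    # The optional episode suffix decides the result type: a bare show URL
+    # such as /shows/227759/om-nom-stories-around-the-world yields a playlist,
+    # while /shows/217143/bernard-season-2/episodes/217147/football yields a
+    # single Kaltura-backed episode (see _real_extract below).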
+ _TESTS = [{ + 'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football', + 'md5': '18289fc2b951eff6b953a9d8f01e6831', + 'info_dict': { + 'id': '217147', + 'ext': 'mp4', + 'title': 'Football', + 'uploader_id': '1', + 'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.', + 'upload_date': '20160718', + 'timestamp': 1468879330, + } + }, { + 'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world', + 'info_dict': { + 'id': '227759', + 'title': 'Om Nom Stories Around The World', + }, + 'playlist_mincount': 11, + }] + + def _call_api(self, action, page_id, query): + query.update({ + 'for_ng': 1, + 'for_web': 1, + 'show_meta': 1, + 'version': 7.0, + }) + return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query) + + def _parse_episode_data(self, episode_data): + title = episode_data['episode_name'] + + return { + '_type': 'url_transparent', + 'id': episode_data['episode_id'], + 'title': title, + 'url': 'kaltura:513551:' + episode_data['entry_id'], + 'thumbnail': episode_data.get('thumbnail_url'), + 'description': episode_data.get('description'), + 'duration': parse_duration(episode_data.get('hms')), + 'series': episode_data.get('show_name'), + 'season_number': int_or_none(episode_data.get('season_num')), + 'episode_id': episode_data.get('episode_id'), + 'episode': title, + 'episode_number': int_or_none(episode_data.get('episode_num')), + 'categories': episode_data.get('categories'), + 'ie_key': 'Kaltura', + } + + def _real_extract(self, url): + show_id, episode_id = self._match_valid_url(url).groups() + if episode_id: + episode_data = self._call_api('search', episode_id, { + 'filter': 'episode', + 'id': episode_id, + })['objects'][0] + return self._parse_episode_data(episode_data) + else: + show_data = self._call_api('getepisodesbyshow', show_id, { + 'max': 1000000000, + 'showid': show_id, + }) + entries = [] + for episode_data in show_data.get('objects', []): + entries.append(self._parse_episode_data(episode_data)) + return self.playlist_result(entries, show_id, show_data.get('show_name')) diff --git a/yt_dlp/extractor/toutv.py b/yt_dlp/extractor/toutv.py new file mode 100644 index 000000000..6c84c211c --- /dev/null +++ b/yt_dlp/extractor/toutv.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .radiocanada import RadioCanadaIE +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + merge_dicts, +) + + +class TouTvIE(RadioCanadaIE): + _NETRC_MACHINE = 'toutv' + IE_NAME = 'tou.tv' + _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' + + _TESTS = [{ + 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', + 'info_dict': { + 'id': '122017', + 'ext': 'mp4', + 'title': 'Saison 2015 Épisode 17', + 'description': 'La photo de famille 2', + 'upload_date': '20100717', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': '404 Not Found', + }, { + 'url': 'http://ici.tou.tv/hackers', + 'only_matching': True, + }, { + 'url': 'https://ici.tou.tv/l-age-adulte/S01C501', + 'only_matching': True, + }] + _CLIENT_KEY = '90505c8d-9c34-4f34-8da1-3a85bdc6d4f4' + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + try: + self._access_token = self._download_json( + 'https://services.radio-canada.ca/toutv/profiling/accounts/login', + None, 'Logging in', 
data=json.dumps({ + 'ClientId': self._CLIENT_KEY, + 'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20', + 'Email': email, + 'Password': password, + 'Scope': 'id.write media-validation.read', + }).encode(), headers={ + 'Authorization': 'client-key ' + self._CLIENT_KEY, + 'Content-Type': 'application/json;charset=utf-8', + })['access_token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['Message'] + raise ExtractorError(error, expected=True) + raise + self._claims = self._call_api('validation/v2/getClaims')['claims'] + + def _real_extract(self, url): + path = self._match_id(url) + metadata = self._download_json( + 'https://services.radio-canada.ca/toutv/presentation/%s' % path, path, query={ + 'client_key': self._CLIENT_KEY, + 'device': 'web', + 'version': 4, + }) + # IsDrm does not necessarily mean the video is DRM protected (see + # https://github.com/ytdl-org/youtube-dl/issues/13994). + if not self.get_param('allow_unplayable_formats') and metadata.get('IsDrm'): + self.report_warning('This video is probably DRM protected.', path) + video_id = metadata['IdMedia'] + details = metadata['Details'] + + return merge_dicts({ + 'id': video_id, + 'title': details.get('OriginalTitle'), + 'description': details.get('Description'), + 'thumbnail': details.get('ImageUrl'), + 'duration': int_or_none(details.get('LengthInSeconds')), + 'series': metadata.get('ProgramTitle'), + 'season_number': int_or_none(metadata.get('SeasonNumber')), + 'season': metadata.get('SeasonTitle'), + 'episode_number': int_or_none(metadata.get('EpisodeNumber')), + 'episode': metadata.get('EpisodeTitle'), + }, self._extract_info(metadata.get('AppCode', 'toutv'), video_id)) diff --git a/youtube_dl/extractor/toypics.py b/yt_dlp/extractor/toypics.py index f705a06c9..f705a06c9 100644 --- a/youtube_dl/extractor/toypics.py +++ b/yt_dlp/extractor/toypics.py diff --git a/yt_dlp/extractor/traileraddict.py b/yt_dlp/extractor/traileraddict.py new file mode 100644 index 000000000..10100fbcf --- /dev/null +++ b/yt_dlp/extractor/traileraddict.py @@ -0,0 +1,64 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class TrailerAddictIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' + _TEST = { + 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', + 'md5': '41365557f3c8c397d091da510e73ceb4', + 'info_dict': { + 'id': '76184', + 'ext': 'mp4', + 'title': 'Prince Avalanche Trailer', + 'description': 'Trailer for Prince Avalanche.\n\nTwo highway road workers spend the summer of 1988 away from their city lives. 
The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind.', + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + name = mobj.group('movie') + '/' + mobj.group('trailer_name') + webpage = self._download_webpage(url, name) + + title = self._search_regex(r'<title>(.+?)</title>', + webpage, 'video title').replace(' - Trailer Addict', '') + view_count_str = self._search_regex( + r'<span class="views_n">([0-9,.]+)</span>', + webpage, 'view count', fatal=False) + view_count = ( + None if view_count_str is None + else int(view_count_str.replace(',', ''))) + video_id = self._search_regex( + r'<param\s+name="movie"\s+value="/emb/([0-9]+)"\s*/>', + webpage, 'video id') + + # Presence of (no)watchplus function indicates HD quality is available + if re.search(r'function (no)?watchplus()', webpage): + fvar = 'fvarhd' + else: + fvar = 'fvar' + + info_url = 'http://www.traileraddict.com/%s.php?tid=%s' % (fvar, str(video_id)) + info_webpage = self._download_webpage(info_url, video_id, 'Downloading the info webpage') + + final_url = self._search_regex(r'&fileurl=(.+)', + info_webpage, 'Download url').replace('%3F', '?') + thumbnail_url = self._search_regex(r'&image=(.+?)&', + info_webpage, 'thumbnail url') + + description = self._html_search_regex( + r'(?s)<div class="synopsis">.*?<div class="movie_label_info"[^>]*>(.*?)</div>', + webpage, 'description', fatal=False) + + return { + 'id': video_id, + 'url': final_url, + 'title': title, + 'thumbnail': thumbnail_url, + 'description': description, + 'view_count': view_count, + } diff --git a/youtube_dl/extractor/trilulilu.py b/yt_dlp/extractor/trilulilu.py index a800449e9..a800449e9 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/yt_dlp/extractor/trilulilu.py diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py new file mode 100644 index 000000000..ec55f41f2 --- /dev/null +++ b/yt_dlp/extractor/trovo.py @@ -0,0 +1,263 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + _HEADERS = {'Origin': 'https://trovo.live'} + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P<id>[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not 
play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + 'http_headers': self._HEADERS, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! - Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + 'http_headers': self._HEADERS, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + 
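# Uploader metadata (nickName / uid / userName) comes from the
+        # streamerInfo node of the VOD detail response; it is merged in via
+        # the shared TrovoBaseIE._extract_streamer_info helper.
+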
info.update(self._extract_streamer_info(vod_detail_info)) + return info + + +class TrovoChannelBaseIE(InfoExtractor): + def _get_vod_json(self, page, uid): + raise NotImplementedError('This method must be implemented by subclasses') + + def _entries(self, uid): + for page in itertools.count(1): + vod_json = self._get_vod_json(page, uid) + vods = vod_json.get('vodInfos', []) + for vod in vods: + yield self.url_result( + 'https://trovo.live/%s/%s' % (self._TYPE, vod.get('vid')), + ie=TrovoVodIE.ie_key()) + has_more = vod_json['hasMore'] + if not has_more: + break + + def _real_extract(self, url): + id = self._match_id(url) + uid = str(self._download_json('https://gql.trovo.live/', id, query={ + 'query': '{getLiveInfo(params:{userName:"%s"}){streamerInfo{uid}}}' % id + })['data']['getLiveInfo']['streamerInfo']['uid']) + return self.playlist_result(self._entries(uid), playlist_id=uid) + + +class TrovoChannelVodIE(TrovoChannelBaseIE): + _VALID_URL = r'trovovod:(?P<id>[^\s]+)' + IE_DESC = 'All VODs of a trovo.live channel, "trovovod" keyword' + + _TESTS = [{ + 'url': 'trovovod:OneTappedYou', + 'playlist_mincount': 24, + 'info_dict': { + 'id': '100719456', + }, + }] + + _QUERY = '{getChannelLtvVideoInfos(params:{pageSize:99,currPage:%d,channelID:%s}){hasMore,vodInfos{vid}}}' + _TYPE = 'video' + + def _get_vod_json(self, page, uid): + return self._download_json('https://gql.trovo.live/', uid, query={ + 'query': self._QUERY % (page, uid) + })['data']['getChannelLtvVideoInfos'] + + +class TrovoChannelClipIE(TrovoChannelBaseIE): + _VALID_URL = r'trovoclip:(?P<id>[^\s]+)' + IE_DESC = 'All Clips of a trovo.live channel, "trovoclip" keyword' + + _TESTS = [{ + 'url': 'trovoclip:OneTappedYou', + 'playlist_mincount': 29, + 'info_dict': { + 'id': '100719456', + }, + }] + + _QUERY = '{getChannelClipVideoInfos(params:{pageSize:99,currPage:%d,channelID:%s,albumType:VOD_CLIP_ALBUM_TYPE_LATEST}){hasMore,vodInfos{vid}}}' + _TYPE = 'clip' + + def _get_vod_json(self, page, uid): + return self._download_json('https://gql.trovo.live/', uid, query={ + 'query': self._QUERY % (page, uid) + })['data']['getChannelClipVideoInfos'] diff --git a/youtube_dl/extractor/trunews.py b/yt_dlp/extractor/trunews.py index cca5b5ceb..cca5b5ceb 100644 --- a/youtube_dl/extractor/trunews.py +++ b/yt_dlp/extractor/trunews.py diff --git a/yt_dlp/extractor/trutv.py b/yt_dlp/extractor/trutv.py new file mode 100644 index 000000000..c09ff897c --- /dev/null +++ b/yt_dlp/extractor/trutv.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .turner import TurnerBaseIE +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class TruTVIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))' + _TEST = { + 'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html', + 'info_dict': { + 'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1', + 'ext': 'mp4', + 'title': 'Sunlight-Activated Flower', + 'description': "A customer is stunned when he sees Michael's sunlight-activated flower.", + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + series_slug, clip_slug, video_id = self._match_valid_url(url).groups() + + if video_id: + path = 'episode' + display_id = video_id + else: + path = 'series/clip' + display_id = clip_slug + + data = self._download_json( + 'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, 
display_id), + display_id) + video_data = data['episode'] if video_id else data['info'] + media_id = video_data['mediaId'] + title = video_data['title'].strip() + + info = self._extract_ngtv_info( + media_id, {}, { + 'url': url, + 'site_name': 'truTV', + 'auth_required': video_data.get('isAuthRequired'), + }) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('srcUrl') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video_data.get('publicationDate')), + 'series': video_data.get('showTitle'), + 'season_number': int_or_none(video_data.get('seasonNum')), + 'episode_number': int_or_none(video_data.get('episodeNum')), + }) + return info diff --git a/youtube_dl/extractor/tube8.py b/yt_dlp/extractor/tube8.py index db93b0182..db93b0182 100644 --- a/youtube_dl/extractor/tube8.py +++ b/yt_dlp/extractor/tube8.py diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py new file mode 100644 index 000000000..2e9b325ba --- /dev/null +++ b/yt_dlp/extractor/tubitv.py @@ -0,0 +1,145 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + sanitized_Request, + urlencode_postdata, +) + + +class TubiTvIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + tubitv:| + https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/ + ) + (?P<id>[0-9]+)''' + _LOGIN_URL = 'http://tubitv.com/login' + _NETRC_MACHINE = 'tubitv' + _GEO_COUNTRIES = ['US'] + _TESTS = [{ + 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', + 'md5': '43ac06be9326f41912dc64ccf7a80320', + 'info_dict': { + 'id': '283829', + 'ext': 'mp4', + 'title': 'The Comedian at The Friday', + 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', + 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', + }, + }, { + 'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories', + 'only_matching': True, + }, { + 'url': 'http://tubitv.com/movies/383676/tracker', + 'only_matching': True, + }, { + 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', + 'info_dict': { + 'id': '560057', + 'ext': 'mp4', + 'title': 'Penitentiary', + 'description': 'md5:8d2fc793a93cc1575ff426fdcb8dd3f9', + 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', + 'release_year': 1979, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + self.report_login() + form_data = { + 'username': username, + 'password': password, + } + payload = urlencode_postdata(form_data) + request = sanitized_Request(self._LOGIN_URL, payload) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + login_page = self._download_webpage( + request, None, False, 'Wrong login info') + if not re.search(r'id="tubi-logout"', login_page): + raise ExtractorError( + 'Login failed (invalid username/password)', expected=True) + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://tubitv.com/oz/videos/%s/content' % video_id, 
video_id) + title = video_data['title'] + + formats = [] + url = video_data['url'] + # URL can be sometimes empty. Does this only happen when there is DRM? + if url: + formats = self._extract_m3u8_formats( + self._proto_relative_url(url), + video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + thumbnails = [] + for thumbnail_url in video_data.get('thumbnails', []): + if not thumbnail_url: + continue + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail_url), + }) + + subtitles = {} + for sub in video_data.get('subtitles', []): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('lang', 'English'), []).append({ + 'url': self._proto_relative_url(sub_url), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': video_data.get('publisher_id'), + 'release_year': int_or_none(video_data.get('year')), + } + + +class TubiTvShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P<show_name>[^/?#]+)' + _TESTS = [{ + 'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true', + 'playlist_mincount': 390, + 'info_dict': { + 'id': 'the-joy-of-painting-with-bob-ross', + } + }] + + def _entries(self, show_url, show_name): + show_webpage = self._download_webpage(show_url, show_name) + show_json = self._parse_json(self._search_regex( + r"window\.__data\s*=\s*({.+?});\s*</script>", + show_webpage, 'data',), show_name, transform_source=js_to_json)['video'] + for episode_id in show_json['fullContentById'].keys(): + yield self.url_result( + 'tubitv:%s' % episode_id, + ie=TubiTvIE.ie_key(), video_id=episode_id) + + def _real_extract(self, url): + show_name = self._match_valid_url(url).group('show_name') + return self.playlist_result(self._entries(url, show_name), playlist_id=show_name) diff --git a/youtube_dl/extractor/tudou.py b/yt_dlp/extractor/tudou.py index 7421378a8..7421378a8 100644 --- a/youtube_dl/extractor/tudou.py +++ b/yt_dlp/extractor/tudou.py diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py new file mode 100644 index 000000000..adc370127 --- /dev/null +++ b/yt_dlp/extractor/tumblr.py @@ -0,0 +1,212 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata +) + + +class TumblrIE(InfoExtractor): + _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' + _NETRC_MACHINE = 'tumblr' + _LOGIN_URL = 'https://www.tumblr.com/login' + _TESTS = [{ + 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', + 'md5': '479bb068e5b16462f5176a6828829767', + 'info_dict': { + 'id': '54196191430', + 'ext': 'mp4', + 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', + 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', + 'thumbnail': r're:http://.*\.jpg', + } + }, { + 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', + 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359', + 'info_dict': { + 'id': '90208453769', + 'ext': 'mp4', + 'title': '5SOS STRUM ;]', + 'description': 'md5:dba62ac8639482759c8eb10ce474586a', + 'thumbnail': r're:http://.*\.jpg', + } + }, { + 'url': 
'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', + 'md5': '7ae503065ad150122dc3089f8cf1546c', + 'info_dict': { + 'id': '130323439814', + 'ext': 'mp4', + 'title': 'HD Video Testing \u2014 Test description for my HD video', + 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', + 'thumbnail': r're:http://.*\.jpg', + }, + 'params': { + 'format': 'hd', + }, + }, { + 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', + 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', + 'info_dict': { + 'id': 'Wmur', + 'ext': 'mp4', + 'title': 'naked smoking & stretching', + 'upload_date': '20150506', + 'timestamp': 1430931613, + 'age_limit': 18, + 'uploader_id': '1638622', + 'uploader': 'naked-yogi', + }, + 'add_ie': ['Vidme'], + }, { + 'url': 'http://camdamage.tumblr.com/post/98846056295/', + 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6', + 'info_dict': { + 'id': '105463834', + 'ext': 'mp4', + 'title': 'Cam Damage-HD 720p', + 'uploader': 'John Moyer', + 'uploader_id': 'user32021558', + }, + 'add_ie': ['Vimeo'], + }, { + 'url': 'http://sutiblr.tumblr.com/post/139638707273', + 'md5': '2dd184b3669e049ba40563a7d423f95c', + 'info_dict': { + 'id': 'ir7qBEIKqvq', + 'ext': 'mp4', + 'title': 'Vine by sutiblr', + 'alt_title': 'Vine by sutiblr', + 'uploader': 'sutiblr', + 'uploader_id': '1198993975374495744', + 'upload_date': '20160220', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'add_ie': ['Vine'], + }, { + 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', + 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'info_dict': { + 'id': '-7LnUPGlSo', + 'ext': 'mp4', + 'title': 'Video by victoriassecret', + 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', + 'uploader_id': 'victoriassecret', + 'thumbnail': r're:^https?://.*\.jpg' + }, + 'add_ie': ['Instagram'], + }] + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ + 'user[email]': username, + 'user[password]': password + }) + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + + # Successful login + if '/dashboard' in urlh.geturl(): + return + + login_errors = self._parse_json( + self._search_regex( + r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, + 'login errors', default='[]'), + None, fatal=False) + if login_errors: + raise ExtractorError( + 'Unable to login: %s' % login_errors[0], expected=True) + + self.report_warning('Login has probably failed') + + def _real_extract(self, url): + m_url = self._match_valid_url(url) + video_id = m_url.group('id') + blog = m_url.group('blog_name') + + url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + + redirect_url = urlh.geturl() + if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): + raise ExtractorError( + 'This Tumblr may contain sensitive media. 
' + 'Disable safe mode in your account settings ' + 'at https://www.tumblr.com/settings/account#safe_mode', + expected=True) + + iframe_url = self._search_regex( + r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', + webpage, 'iframe url', default=None) + if iframe_url is None: + return self.url_result(redirect_url, 'Generic') + + iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') + + duration = None + sources = [] + + sd_url = self._search_regex( + r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, + 'sd video url', default=None, group='url') + if sd_url: + sources.append((sd_url, 'sd')) + + options = self._parse_json( + self._search_regex( + r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, + 'hd video url', default='', group='options'), + video_id, fatal=False) + if options: + duration = int_or_none(options.get('duration')) + hd_url = options.get('hdUrl') + if hd_url: + sources.append((hd_url, 'hd')) + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'/(\d{3,4})$', video_url, 'height', default=None)), + 'quality': quality, + } for quality, (video_url, format_id) in enumerate(sources)] + + self._sort_formats(formats) + + # The only place where you can get a title, it's not complete, + # but searching in other places doesn't work for all videos + video_title = self._html_search_regex( + r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', + webpage, 'title') + + return { + 'id': video_id, + 'title': video_title, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tunein.py b/yt_dlp/extractor/tunein.py index c7a5f5a63..c7a5f5a63 100644 --- a/youtube_dl/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py diff --git a/youtube_dl/extractor/tunepk.py b/yt_dlp/extractor/tunepk.py index 9d42651ce..9d42651ce 100644 --- a/youtube_dl/extractor/tunepk.py +++ b/yt_dlp/extractor/tunepk.py diff --git a/yt_dlp/extractor/turbo.py b/yt_dlp/extractor/turbo.py new file mode 100644 index 000000000..f6bbf2529 --- /dev/null +++ b/yt_dlp/extractor/turbo.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + xpath_text, +) + + +class TurboIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P<id>[0-9]+)-' + _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}' + _TEST = { + 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', + 'md5': '33f4b91099b36b5d5a91f84b5bcba600', + 'info_dict': { + 'id': '454443', + 'ext': 'mp4', + 'duration': 3715, + 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... 
', + 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + playlist = self._download_xml(self._API_URL.format(video_id), video_id) + item = playlist.find('./channel/item') + if item is None: + raise ExtractorError('Playlist item was not found', expected=True) + + title = xpath_text(item, './title', 'title') + duration = int_or_none(xpath_text(item, './durate', 'duration')) + thumbnail = xpath_text(item, './visuel_clip', 'thumbnail') + description = self._html_search_meta('description', webpage) + + formats = [] + get_quality = qualities(['3g', 'sd', 'hq']) + for child in item: + m = re.search(r'url_video_(?P<quality>.+)', child.tag) + if m: + quality = compat_str(m.group('quality')) + formats.append({ + 'format_id': quality, + 'url': child.text, + 'quality': get_quality(quality), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'description': description, + 'formats': formats, + } diff --git a/yt_dlp/extractor/turner.py b/yt_dlp/extractor/turner.py new file mode 100644 index 000000000..32125bc79 --- /dev/null +++ b/yt_dlp/extractor/turner.py @@ -0,0 +1,261 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..compat import compat_str +from ..utils import ( + fix_xml_ampersands, + xpath_text, + int_or_none, + determine_ext, + float_or_none, + parse_duration, + xpath_attr, + update_url_query, + ExtractorError, + strip_or_none, + url_or_none, +) + + +class TurnerBaseIE(AdobePassIE): + _AKAMAI_SPE_TOKEN_CACHE = {} + + def _extract_timestamp(self, video_data): + return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) + if not token: + query = { + 'path': secure_path, + } + if custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id + if ap_data.get('auth_required'): + query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) + auth = self._download_xml( + tokenizer_src, content_id, query=query) + error_msg = xpath_text(auth, 'error/msg') + if error_msg: + raise ExtractorError(error_msg, expected=True) + token = xpath_text(auth, 'token') + if not token: + return video_url + self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token + return video_url + '?hdnea=' + token + + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): + video_data = self._download_xml( + data_src, video_id, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=fatal) + if not video_data: + return {} + video_id = video_data.attrib['id'] + title = xpath_text(video_data, 'headline', fatal=True) + content_id = xpath_text(video_data, 'contentId') or video_id + # rtmp_src = xpath_text(video_data, 'akamai/src') + # if rtmp_src: + # split_rtmp_src = rtmp_src.split(',') + # if len(split_rtmp_src) == 2: + # rtmp_src = split_rtmp_src[1] + # aifp = xpath_text(video_data, 'akamai/aifp', default='') + + urls = [] + 
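# Note on the collection loop below: the CVP playlist may list the same rendition once per CDN, so `urls` deduplicates candidate URLs before any format extraction. + # Hypothetical illustration of the secure-path tokenizer handled above (hostnames are made up, not from this API): + #   _add_akamai_spe_token('https://token.example/spe', 'https://media.example/secure/video.m3u8', content_id, ap_data) + #   -> 'https://media.example/secure/video.m3u8?hdnea=<token>'  # token is cached per secure path + 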
formats = [] + thumbnails = [] + subtitles = {} + rex = re.compile( + r'(?P<width>[0-9]+)x(?P<height>[0-9]+)(?:_(?P<bitrate>[0-9]+))?') + # Possible formats locations: files/file, files/groupFiles/files + # and maybe others + for video_file in video_data.findall('.//file'): + video_url = url_or_none(video_file.text.strip()) + if not video_url: + continue + ext = determine_ext(video_url) + if video_url.startswith('/mp4:protected/'): + continue + # TODO Correct extraction for these files + # protected_path_data = path_data.get('protected') + # if not protected_path_data or not rtmp_src: + # continue + # protected_path = self._search_regex( + # r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path') + # auth = self._download_webpage( + # protected_path_data['tokenizer_src'], query={ + # 'path': protected_path, + # 'videoId': content_id, + # 'aifp': aifp, + # }) + # token = xpath_text(auth, 'token') + # if not token: + # continue + # video_url = rtmp_src + video_url + '?' + token + elif video_url.startswith('/secure/'): + secure_path_data = path_data.get('secure') + if not secure_path_data: + continue + video_url = self._add_akamai_spe_token( + secure_path_data['tokenizer_src'], + secure_path_data['media_src'] + video_url, + content_id, ap_data) + elif not re.match('https?://', video_url): + base_path_data = path_data.get(ext, path_data.get('default', {})) + media_src = base_path_data.get('media_src') + if not media_src: + continue + video_url = media_src + video_url + if video_url in urls: + continue + urls.append(video_url) + format_id = video_file.get('bitrate') + if ext in ('scc', 'srt', 'vtt'): + subtitles.setdefault('en', []).append({ + 'ext': ext, + 'url': video_url, + }) + elif ext == 'png': + thumbnails.append({ + 'id': format_id, + 'url': video_url, + }) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url): + formats.extend(self._extract_akamai_formats( + video_url, video_id, { + 'hds': path_data.get('f4m', {}).get('host'), + # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com + # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com + # ssl.cdn.turner.com + 'http': 'pmd.cdn.turner.com', + })) + elif ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', + m3u8_id=format_id or 'hls', fatal=False) + if '/secure/' in video_url and '?hdnea=' in video_url: + for f in m3u8_formats: + f['_ffmpeg_args'] = ['-seekable', '0'] + formats.extend(m3u8_formats) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(video_url, {'hdcore': '3.7.0'}), + video_id, f4m_id=format_id or 'hds', fatal=False)) + else: + f = { + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + } + mobj = rex.search(video_url) + if mobj: + f.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + elif isinstance(format_id, compat_str): + if format_id.isdigit(): + f['tbr'] = int(format_id) + else: + mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) + if mobj: + if mobj.group(1) == 'audio': + f.update({ + 'vcodec': 'none', + 'ext': 'm4a', + }) + else: + f['tbr'] = int(mobj.group(1)) + formats.append(f) + self._sort_formats(formats) + + for source in video_data.findall('closedCaptions/source'): + for track in source.findall('track'): + track_url = url_or_none(track.get('url')) + if not track_url or track_url.endswith('/big'): + continue + lang = 
track.get('lang') or track.get('label') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_url, + 'ext': { + 'scc': 'scc', + 'webvtt': 'vtt', + 'smptett': 'tt', + }.get(source.get('format')) + }) + + thumbnails.extend({ + 'id': image.get('cut') or image.get('name'), + 'url': image.text, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in video_data.findall('images/image')) + + is_live = xpath_text(video_data, 'isLive') == 'true' + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'thumbnail': xpath_text(video_data, 'poster'), + 'description': strip_or_none(xpath_text(video_data, 'description')), + 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), + 'timestamp': self._extract_timestamp(video_data), + 'upload_date': xpath_attr(video_data, 'metas', 'version'), + 'series': xpath_text(video_data, 'showTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'is_live': is_live, + } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + is_live = ap_data.get('is_live') + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', live=is_live, fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters and not is_live: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } diff --git a/yt_dlp/extractor/tv2.py b/yt_dlp/extractor/tv2.py new file mode 100644 index 000000000..e0851531c --- /dev/null +++ b/yt_dlp/extractor/tv2.py @@ -0,0 +1,324 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + float_or_none, + js_to_json, + parse_iso8601, + remove_end, + strip_or_none, + try_get, +) + + +class TV2IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.tv2.no/v/916509/', + 'info_dict': { + 'id': '916509', + 'ext': 'mp4', + 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', + 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', + 'timestamp': 1431715610, + 'upload_date': '20150515', + 'duration': 157, + 'view_count': int, + 'categories': list, + }, + }] + _PROTOCOLS = ('HLS', 'DASH') + 
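# A minimal sketch of the playback request issued once per protocol in _real_extract below; values are illustrative (the video id is taken from the test above, and the device payload is hard-coded in the request body): + #   POST https://api.sumo.tv2.no/play/916509?stream=HLS + #   {"device":{"id":"1-1-1","name":"Nettleser (HTML)"}} + 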
_GEO_COUNTRIES = ['NO'] + + def _real_extract(self, url): + video_id = self._match_id(url) + asset = self._download_json('https://sumo.tv2.no/rest/assets/' + video_id, video_id, + 'Downloading metadata JSON') + title = asset['title'] + is_live = asset.get('live') is True + + formats = [] + format_urls = [] + for protocol in self._PROTOCOLS: + try: + data = self._download_json('https://api.sumo.tv2.no/play/%s?stream=%s' % (video_id, protocol), + video_id, 'Downloading playback JSON', + headers={'content-type': 'application/json'}, + data='{"device":{"id":"1-1-1","name":"Nettleser (HTML)"}}'.encode())['playback'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), video_id)['error'] + error_code = error.get('code') + if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif error_code == 'SESSION_NOT_AUTHENTICATED': + self.raise_login_required() + raise ExtractorError(error['description']) + raise + items = data.get('streams', []) + for item in items: + video_url = item.get('url') + if not video_url or video_url in format_urls: + continue + format_id = '%s-%s' % (protocol.lower(), item.get('type')) + if not self._is_valid_url(video_url, video_id, format_id): + continue + format_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'm3u8': + if not data.get('drmProtected'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, format_id, fatal=False)) + elif ext == 'ism' or video_url.endswith('.ism/Manifest'): + pass + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + if not formats and data.get('drmProtected'): + self.report_drm(video_id) + self._sort_formats(formats) + + thumbnails = [{ + 'id': type, + 'url': thumb_url, + } for type, thumb_url in (asset.get('images') or {}).items()] + + return { + 'id': video_id, + 'url': video_url, + 'title': self._live_title(title) if is_live else title, + 'description': strip_or_none(asset.get('description')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(asset.get('live_broadcast_time') or asset.get('update_time')), + 'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')), + 'view_count': int_or_none(asset.get('views')), + 'categories': asset.get('tags', '').split(','), + 'formats': formats, + 'is_live': is_live, + } + + +class TV2ArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', + 'info_dict': { + 'id': '6930542', + 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', + 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.tv2.no/a/6930542', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + # Old embed pattern (appears unused nowadays) + assets = re.findall(r'data-assetid=["\'](\d+)',
webpage) + + if not assets: + # New embed pattern + for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage): + video = self._parse_json( + v, playlist_id, transform_source=js_to_json, fatal=False) + if not video: + continue + asset = video.get('assetId') + if asset: + assets.append(asset) + + entries = [ + self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2') + for asset_id in assets] + + title = remove_end(self._og_search_title(webpage), ' - TV2.no') + description = remove_end(self._og_search_description(webpage), ' - TV2.no') + + return self.playlist_result(entries, playlist_id, title, description) + + +class KatsomoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', + 'info_dict': { + 'id': '1181321', + 'ext': 'mp4', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', + 'description': 'Päätöksen teki Pelicansin hallitus.', + 'timestamp': 1575116484, + 'upload_date': '20191130', + 'duration': 37.12, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, + }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] + _API_DOMAIN = 'api.katsomo.fi' + _PROTOCOLS = ('HLS', 'MPD') + _GEO_COUNTRIES = ['FI'] + + def _real_extract(self, url): + video_id = self._match_id(url) + api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + + asset = self._download_json( + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + + formats = [] + format_urls = [] + for protocol in self._PROTOCOLS: + try: + data = self._download_json( + api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol, + video_id, 'Downloading play JSON')['playback'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), video_id)['error'] + error_code = error.get('code') + if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif error_code == 'SESSION_NOT_AUTHENTICATED': + self.raise_login_required() + raise ExtractorError(error['description']) + raise + items = try_get(data, lambda x: x['items']['item']) + if not items: + continue + if not isinstance(items, list): + items = [items] + for item in items: + if not isinstance(item, dict): + continue + video_url = item.get('url') + if not video_url or video_url in format_urls: + continue + format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat')) + if not self._is_valid_url(video_url, video_id, format_id): + continue + format_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'm3u8': + if not data.get('drmProtected'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + 'm3u8' if is_live else 
'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, format_id, fatal=False)) + elif ext == 'ism' or video_url.endswith('.ism/Manifest'): + pass + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'tbr': int_or_none(item.get('bitrate')), + 'filesize': int_or_none(item.get('fileSize')), + }) + if not formats and data.get('drmProtected'): + self.report_drm(video_id) + self._sort_formats(formats) + + thumbnails = [{ + 'id': thumbnail.get('@type'), + 'url': thumbnail.get('url'), + } for _, thumbnail in (asset.get('imageVersions') or {}).items()] + + return { + 'id': video_id, + 'url': video_url, + 'title': self._live_title(title) if is_live else title, + 'description': strip_or_none(asset.get('description')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(asset.get('createTime')), + 'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')), + 'view_count': int_or_none(asset.get('views')), + 'categories': asset.get('keywords', '').split(','), + 'formats': formats, + 'is_live': is_live, + } + + +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 'mp4', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 'only_matching': True, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) diff --git a/youtube_dl/extractor/tv2dk.py b/yt_dlp/extractor/tv2dk.py index 8bd5fd640..8bd5fd640 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/yt_dlp/extractor/tv2dk.py diff --git a/yt_dlp/extractor/tv2hu.py b/yt_dlp/extractor/tv2hu.py new file mode 100644 index 000000000..f2104358b --- /dev/null +++ b/yt_dlp/extractor/tv2hu.py @@ -0,0 +1,110 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + UnsupportedError, +) + + +class TV2HuIE(InfoExtractor): + IE_NAME = 'tv2play.hu' + _VALID_URL = r'https?://(?:www\.)?tv2play\.hu/(?!szalag/)(?P<id>[^#&?]+)' + _TESTS = [{ + 'url': 'https://tv2play.hu/mintaapak/mintaapak_213_epizod_resz', + 'info_dict': { + 'id': '249240', + 'ext': 'mp4', + 'title': 'Mintaapák - 213. 
epizód', + 'series': 'Mintaapák', + 'duration': 2164, + 'description': 'md5:7350147e75485a59598e806c47967b07', + 'thumbnail': r're:^https?://.*\.jpg$', + 'release_date': '20210825', + 'season_number': None, + 'episode_number': 213, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv2play.hu/taxi_2', + 'md5': '585e58e2e090f34603804bb2c48e98d8', + 'info_dict': { + 'id': '199363', + 'ext': 'mp4', + 'title': 'Taxi 2', + 'series': 'Taxi 2', + 'duration': 5087, + 'description': 'md5:47762155dc9a50241797ded101b1b08c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'release_date': '20210118', + 'season_number': None, + 'episode_number': None, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + json_data = self._download_json(f'https://tv2play.hu/api/search/{id}', id) + + if json_data['contentType'] == 'showpage': + ribbon_ids = traverse_obj(json_data, ('pages', ..., 'tabs', ..., 'ribbonIds'), get_all=False, expected_type=list) + entries = [self.url_result(f'https://tv2play.hu/szalag/{ribbon_id}', + ie=TV2HuSeriesIE.ie_key(), video_id=ribbon_id) for ribbon_id in ribbon_ids] + return self.playlist_result(entries, playlist_id=id) + elif json_data['contentType'] != 'video': + raise UnsupportedError(url) + + video_id = str(json_data['id']) + player_id = json_data.get('playerId') + series_json = json_data.get('seriesInfo', {}) + + video_json_url = self._download_json(f'https://tv2play.hu/api/streaming-url?playerId={player_id}', video_id)['url'] + video_json = self._download_json(video_json_url, video_id) + m3u8_url = self._proto_relative_url(traverse_obj(video_json, ('bitrates', 'hls'))) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': json_data['title'], + 'series': json_data.get('seriesTitle'), + 'duration': json_data.get('length'), + 'description': json_data.get('description'), + 'thumbnail': 'https://tv2play.hu' + json_data.get('thumbnailUrl'), + 'release_date': json_data.get('uploadedAt').replace('.', ''), + 'season_number': series_json.get('seasonNr'), + 'episode_number': series_json.get('episodeNr'), + 'formats': formats, + 'subtitles': subtitles, + } + + +class TV2HuSeriesIE(InfoExtractor): + IE_NAME = 'tv2playseries.hu' + _VALID_URL = r'https?://(?:www\.)?tv2play\.hu/szalag/(?P<id>[^#&?]+)' + + _TESTS = [{ + 'url': 'https://tv2play.hu/szalag/59?rendezes=nepszeruseg', + 'playlist_mincount': 284, + 'info_dict': { + 'id': '59', + } + }] + + def _real_extract(self, url): + id = self._match_id(url) + json_data = self._download_json(f'https://tv2play.hu/api/ribbons/{id}/0?size=100000', id) + entries = [] + for card in json_data.get('cards', []): + video_id = card.get('slug') + if video_id: + entries.append(self.url_result(f'https://tv2play.hu/{video_id}', + ie=TV2HuIE.ie_key(), video_id=video_id)) + + return self.playlist_result(entries, playlist_id=id) diff --git a/yt_dlp/extractor/tv4.py b/yt_dlp/extractor/tv4.py new file mode 100644 index 000000000..4043e6366 --- /dev/null +++ b/yt_dlp/extractor/tv4.py @@ -0,0 +1,141 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class TV4IE(InfoExtractor): + IE_DESC = 'tv4.se and tv4play.se' + _VALID_URL = r'''(?x)https?://(?:www\.)? 
+ (?: + tv4\.se/(?:[^/]+)/klipp/(?:.*)-| + tv4play\.se/ + (?: + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| + iframe/video/| + film/| + sport/| + ) + )(?P<id>[0-9]+)''' + _GEO_COUNTRIES = ['SE'] + _TESTS = [ + { + 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', + 'md5': 'cb837212f342d77cec06e6dad190e96d', + 'info_dict': { + 'id': '2491650', + 'ext': 'mp4', + 'title': 'Kalla Fakta 5 (english subtitles)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': int, + 'upload_date': '20131125', + }, + }, + { + 'url': 'http://www.tv4play.se/iframe/video/3054113', + 'md5': 'cb837212f342d77cec06e6dad190e96d', + 'info_dict': { + 'id': '3054113', + 'ext': 'mp4', + 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.', + 'timestamp': int, + 'upload_date': '20150130', + }, + }, + { + 'url': 'http://www.tv4play.se/sport/3060959', + 'only_matching': True, + }, + { + 'url': 'http://www.tv4play.se/film/2378136', + 'only_matching': True, + }, + { + 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', + 'only_matching': True, + }, + { + 'url': 'http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_json( + 'https://playback-api.b17g.net/asset/%s' % video_id, + video_id, 'Downloading video info JSON', query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls,dash', + 'drm': 'widevine', + })['metadata'] + + title = info['title'] + + manifest_url = self._download_json( + 'https://playback-api.b17g.net/media/' + video_id, + video_id, query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls', + })['playbackItem']['manifestUrl'] + formats = [] + subtitles = {} + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifest_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + fmts = self._extract_f4m_formats( + manifest_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False) + formats.extend(fmts) + + fmts, subs = self._extract_ism_formats_and_subtitles( + re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), + video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + if not formats and info.get('is_geo_restricted'): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'description': info.get('description'), + 'timestamp': parse_iso8601(info.get('broadcast_date_time')), + 'duration': int_or_none(info.get('duration')), + 'thumbnail': info.get('image'), + 'is_live': info.get('isLive') is True, + 'series': info.get('seriesTitle'), + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode': info.get('episodeTitle'), + 'episode_number': 
int_or_none(info.get('episodeNumber')), + } diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py new file mode 100644 index 000000000..a0832d28f --- /dev/null +++ b/yt_dlp/extractor/tv5mondeplus.py @@ -0,0 +1,123 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + parse_duration, + try_get, +) + + +class TV5MondePlusIE(InfoExtractor): + IE_DESC = 'TV5MONDE+' + _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' + _TESTS = [{ + # movie + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent', + 'md5': '32fa0cde16a4480d1251502a66856d5f', + 'info_dict': { + 'id': 'dc57a011-ec4b-4648-2a9a-4f03f8352ed3', + 'display_id': 'ceux-qui-travaillent', + 'ext': 'mp4', + 'title': 'Ceux qui travaillent', + 'description': 'md5:570e8bb688036ace873b2d50d24c026d', + 'upload_date': '20210819', + }, + }, { + # series episode + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice', + 'info_dict': { + 'id': '9e9d599e-23af-6915-843e-ecbf62e97925', + 'display_id': 'vestiaires-caro-actrice', + 'ext': 'mp4', + 'title': "Vestiaires - Caro actrice", + 'description': 'md5:db15d2e1976641e08377f942778058ea', + 'upload_date': '20210819', + 'series': "Vestiaires", + 'episode': 'Caro actrice', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', + 'only_matching': True, + }, { + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', + 'only_matching': True, + }] + _GEO_BYPASS = False + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: + self.raise_geo_restricted(countries=['FR']) + + title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') + vpl_data = extract_attributes(self._search_regex( + r'(<[^>]+class="video_player_loader"[^>]+>)', + webpage, 'video player loader')) + + video_files = self._parse_json( + vpl_data['data-broadcast'], display_id) + formats = [] + for video_file in video_files: + v_url = video_file.get('url') + if not v_url: + continue + video_format = video_file.get('format') or determine_ext(v_url) + if video_format == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': v_url, + 'format_id': video_format, + }) + self._sort_formats(formats) + + metadata = self._parse_json( + vpl_data['data-metadata'], display_id) + duration = (int_or_none(try_get(metadata, lambda x: x['content']['duration'])) + or parse_duration(self._html_search_meta('duration', webpage))) + + description = self._html_search_regex( + r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage, + 'description', fatal=False) + + series = self._html_search_regex( + r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage, + 'series', default=None) + + if series and series != title: + title = '%s - %s' % (series, title) + + upload_date = self._search_regex( + r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', + webpage, 'upload date', default=None) + if 
upload_date: + upload_date = upload_date.replace('_', '') + + video_id = self._search_regex( + (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', + default=display_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': vpl_data.get('data-image'), + 'duration': duration, + 'upload_date': upload_date, + 'formats': formats, + 'series': series, + 'episode': episode, + } diff --git a/yt_dlp/extractor/tv5unis.py b/yt_dlp/extractor/tv5unis.py new file mode 100644 index 000000000..398b85db5 --- /dev/null +++ b/yt_dlp/extractor/tv5unis.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + try_get, +) + + +class TV5UnisBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CA'] + + def _real_extract(self, url): + groups = self._match_valid_url(url).groups() + product = self._download_json( + 'https://api.tv5unis.ca/graphql', groups[0], query={ + 'query': '''{ + %s(%s) { + collection { + title + } + episodeNumber + rating { + name + } + seasonNumber + tags + title + videoElement { + ... on Video { + mediaId + } + } + } +}''' % (self._GQL_QUERY_NAME, self._gql_args(groups)), + })['data'][self._GQL_QUERY_NAME] + media_id = product['videoElement']['mediaId'] + + return { + '_type': 'url_transparent', + 'id': media_id, + 'title': product.get('title'), + 'url': smuggle_url('limelight:media:' + media_id, {'geo_countries': self._GEO_COUNTRIES}), + 'age_limit': parse_age_limit(try_get(product, lambda x: x['rating']['name'])), + 'tags': product.get('tags'), + 'series': try_get(product, lambda x: x['collection']['title']), + 'season_number': int_or_none(product.get('seasonNumber')), + 'episode_number': int_or_none(product.get('episodeNumber')), + 'ie_key': 'LimelightMedia', + } + + +class TV5UnisVideoIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis:video' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.tv5unis.ca/videos/bande-annonces/71843', + 'md5': '3d794164928bda97fb87a17e89923d9b', + 'info_dict': { + 'id': 'a883684aecb2486cad9bdc7bbe17f861', + 'ext': 'mp4', + 'title': 'Watatatow', + 'duration': 10.01, + } + } + _GQL_QUERY_NAME = 'productById' + + @staticmethod + def _gql_args(groups): + return 'id: %s' % groups + + +class TV5UnisIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.tv5unis.ca/videos/watatatow/saisons/6/episodes/1', + 'md5': 'a479907d2e531a73e1f8dc48d6388d02', + 'info_dict': { + 'id': 'e5ee23a586c44612a56aad61accf16ef', + 'ext': 'mp4', + 'title': 'Je ne peux pas lui résister', + 'description': "Atys, le nouveau concierge de l'école, a réussi à ébranler la confiance de Mado en affirmant qu\'une médaille, ce n'est que du métal. 
Comme Mado essaie de lui prouver que ses valeurs sont solides, il veut la mettre à l'épreuve...", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 1370, + 'age_limit': 8, + 'tags': 'count:3', + 'series': 'Watatatow', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + 'url': 'https://www.tv5unis.ca/videos/le-voyage-de-fanny', + 'md5': '9ca80ebb575c681d10cae1adff3d4774', + 'info_dict': { + 'id': '726188eefe094d8faefb13381d42bc06', + 'ext': 'mp4', + 'title': 'Le voyage de Fanny', + 'description': "Fanny, 12 ans, cachée dans un foyer loin de ses parents, s'occupe de ses deux soeurs. Devant fuir, Fanny prend la tête d'un groupe de huit enfants et s'engage dans un dangereux périple à travers la France occupée pour rejoindre la frontière suisse.", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 5587.034, + 'tags': 'count:4', + }, + }] + _GQL_QUERY_NAME = 'productByRootProductSlug' + + @staticmethod + def _gql_args(groups): + args = 'rootProductSlug: "%s"' % groups[0] + if groups[1]: + args += ', seasonNumber: %s, episodeNumber: %s' % groups[1:] + return args diff --git a/youtube_dl/extractor/tva.py b/yt_dlp/extractor/tva.py index 52a4ddf32..52a4ddf32 100644 --- a/youtube_dl/extractor/tva.py +++ b/yt_dlp/extractor/tva.py diff --git a/youtube_dl/extractor/tvanouvelles.py b/yt_dlp/extractor/tvanouvelles.py index 1086176a2..1086176a2 100644 --- a/youtube_dl/extractor/tvanouvelles.py +++ b/yt_dlp/extractor/tvanouvelles.py diff --git a/youtube_dl/extractor/tvc.py b/yt_dlp/extractor/tvc.py index 008f64cc2..008f64cc2 100644 --- a/youtube_dl/extractor/tvc.py +++ b/yt_dlp/extractor/tvc.py diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py new file mode 100644 index 000000000..943b3ebdd --- /dev/null +++ b/yt_dlp/extractor/tver.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + remove_start, + smuggle_url, + try_get, +) + + +class TVerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))' + # videos are only available for 7 days + _TESTS = [{ + 'url': 'https://tver.jp/corner/f0062178', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/feature/f0062413', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/episode/79622438', + 'only_matching': True, + }, { + # subtitle = ' ' + 'url': 'https://tver.jp/corner/f0068870', + 'only_matching': True, + }] + _TOKEN = None + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_initialize(self): + self._TOKEN = self._download_json( + 'https://tver.jp/api/access_token.php', None)['token'] + + def _real_extract(self, url): + path, video_id = self._match_valid_url(url).groups() + main = self._download_json( + 'https://api.tver.jp/v4/' + path, video_id, + query={'token': self._TOKEN})['main'] + p_id = main['publisher_id'] + service = remove_start(main['service'], 'ts_') + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { + '_type': 'url_transparent', + 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), + 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + } diff --git 
a/yt_dlp/extractor/tvigle.py b/yt_dlp/extractor/tvigle.py new file mode 100644 index 000000000..aa25ba0dc --- /dev/null +++ b/yt_dlp/extractor/tvigle.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_age_limit, + try_get, + url_or_none, +) + + +class TvigleIE(InfoExtractor): + IE_NAME = 'tvigle' + IE_DESC = 'Интернет-телевидение Tvigle.ru' + _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))' + + _GEO_BYPASS = False + _GEO_COUNTRIES = ['RU'] + + _TESTS = [ + { + 'url': 'http://www.tvigle.ru/video/sokrat/', + 'info_dict': { + 'id': '1848932', + 'display_id': 'sokrat', + 'ext': 'mp4', + 'title': 'Сократ', + 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17', + 'duration': 6586, + 'age_limit': 12, + }, + 'skip': 'georestricted', + }, + { + 'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/', + 'info_dict': { + 'id': '5142516', + 'ext': 'flv', + 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком', + 'description': 'md5:027f7dc872948f14c96d19b4178428a4', + 'duration': 186.080, + 'age_limit': 0, + }, + 'skip': 'georestricted', + }, { + 'url': 'https://cloud.tvigle.ru/video/5267604/', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + if not video_id: + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + (r'<div[^>]+class=["\']player["\'][^>]+id=["\'](\d+)', + r'cloudId\s*=\s*["\'](\d+)', + r'class="video-preview current_playing" id="(\d+)"'), + webpage, 'video id') + + video_data = self._download_json( + 'http://cloud.tvigle.ru/api/play/video/%s/' % video_id, display_id) + + item = video_data['playlist']['items'][0] + + videos = item.get('videos') + + error_message = item.get('errorMessage') + if not videos and error_message: + if item.get('isGeoBlocked') is True: + self.raise_geo_restricted( + msg=error_message, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), + expected=True) + + title = item['title'] + description = item.get('description') + thumbnail = item.get('thumbnail') + duration = float_or_none(item.get('durationMilliseconds'), 1000) + age_limit = parse_age_limit(item.get('ageRestrictions')) + + formats = [] + for vcodec, url_or_fmts in item['videos'].items(): + if vcodec == 'hls': + m3u8_url = url_or_none(url_or_fmts) + if not m3u8_url: + continue + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif vcodec == 'dash': + mpd_url = url_or_none(url_or_fmts) + if not mpd_url: + continue + formats.extend(self._extract_mpd_formats( + mpd_url, video_id, mpd_id='dash', fatal=False)) + else: + if not isinstance(url_or_fmts, dict): + continue + for format_id, video_url in url_or_fmts.items(): + if format_id == 'm3u8': + continue + video_url = url_or_none(video_url) + if not video_url: + continue + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + filesize = int_or_none(try_get( + item, lambda x: x['video_files_size'][vcodec][format_id])) + formats.append({ + 'url': video_url, + 'format_id': '%s-%s' % (vcodec, format_id), + 'vcodec': 
vcodec, + 'height': int_or_none(height), + 'filesize': filesize, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/yt_dlp/extractor/tvland.py b/yt_dlp/extractor/tvland.py new file mode 100644 index 000000000..9ebf57f74 --- /dev/null +++ b/yt_dlp/extractor/tvland.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + +# TODO: Remove - Reason not used anymore - Service moved to youtube + + +class TVLandIE(MTVServicesInfoExtractor): + IE_NAME = 'tvland.com' + _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)' + _FEED_URL = 'http://www.tvland.com/feeds/mrss/' + _TESTS = [{ + # Geo-restricted. Without a proxy metadata are still there. With a + # proxy it redirects to http://m.tvland.com/app/ + 'url': 'https://www.tvland.com/episodes/s04pzf/everybody-loves-raymond-the-dog-season-1-ep-19', + 'info_dict': { + 'description': 'md5:84928e7a8ad6649371fbf5da5e1ad75a', + 'title': 'The Dog', + }, + 'playlist_mincount': 5, + 'skip': '404 Not found', + }, { + 'url': 'https://www.tvland.com/video-clips/4n87f2/younger-a-first-look-at-younger-season-6', + 'md5': 'e2c6389401cf485df26c79c247b08713', + 'info_dict': { + 'id': '891f7d3c-5b5b-4753-b879-b7ba1a601757', + 'ext': 'mp4', + 'title': 'Younger|April 30, 2019|6|NO-EPISODE#|A First Look at Younger Season 6', + 'description': 'md5:595ea74578d3a888ae878dfd1c7d4ab2', + 'upload_date': '20190430', + 'timestamp': 1556658000, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.tvland.com/full-episodes/iu0hz6/younger-a-kiss-is-just-a-kiss-season-3-ep-301', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/tvn24.py b/yt_dlp/extractor/tvn24.py index de0fb5063..de0fb5063 100644 --- a/youtube_dl/extractor/tvn24.py +++ b/yt_dlp/extractor/tvn24.py diff --git a/youtube_dl/extractor/tvnet.py b/yt_dlp/extractor/tvnet.py index 4222ff9ee..4222ff9ee 100644 --- a/youtube_dl/extractor/tvnet.py +++ b/yt_dlp/extractor/tvnet.py diff --git a/youtube_dl/extractor/tvnoe.py b/yt_dlp/extractor/tvnoe.py index 26a5aeae4..26a5aeae4 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/yt_dlp/extractor/tvnoe.py diff --git a/yt_dlp/extractor/tvnow.py b/yt_dlp/extractor/tvnow.py new file mode 100644 index 000000000..b31818477 --- /dev/null +++ b/yt_dlp/extractor/tvnow.py @@ -0,0 +1,644 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + get_element_by_id, + int_or_none, + parse_iso8601, + parse_duration, + str_or_none, + try_get, + update_url_query, + urljoin, +) + + +class TVNowBaseIE(InfoExtractor): + _VIDEO_FIELDS = ( + 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', + 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', + 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', + 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') + + def _call_api(self, path, video_id, query): + return self._download_json( + 'https://api.tvnow.de/v3/' + path, video_id, query=query) + + def _extract_video(self, info, display_id): + video_id = compat_str(info['id']) + title = info['title'] + + paths = [] + for manifest_url in (info.get('manifest') or 
{}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + def make_urls(proto, suffix): + urls = [url_repl(proto, suffix)] + hd_url = urls[0].replace('/manifest/', '/ngvod/') + if hd_url != urls[0]: + urls.append(hd_url) + return urls + + for man_url in make_urls('dash', '.mpd'): + formats = self._extract_mpd_formats( + man_url, video_id, mpd_id='dash', fatal=False) + for man_url in make_urls('hss', 'Manifest'): + formats.extend(self._extract_ism_formats( + man_url, video_id, ism_id='mss', fatal=False)) + for man_url in make_urls('hls', '.m3u8'): + formats.extend(self._extract_m3u8_formats( + man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', + fatal=False)) + if formats: + break + else: + if not self.get_param('allow_unplayable_formats') and info.get('isDrm'): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if info.get('geoblocked'): + raise self.raise_geo_restricted() + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + self._sort_formats(formats) + + description = info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) + + thumbnails = [{ + 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, + }] + thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': duration, + 'series': f.get('title'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), + 'episode': title, + 'formats': formats, + } + + +class TVNowIE(TVNowBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ + (?P<show_id>[^/]+)/ + (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) + ''' + + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url) + else super(TVNowIE, cls).suitable(url)) + + _TESTS = [{ + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2 + 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', + 'only_matching': True, + }, { + # superrtl + 'url': 
'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', + 'only_matching': True, + }, { + # rtlplus + 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = '%s/%s' % mobj.group(2, 3) + + info = self._call_api( + 'movies/' + display_id, display_id, query={ + 'fields': ','.join(self._VIDEO_FIELDS), + }) + + return self._extract_video(info, display_id) + + +class TVNowNewIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?P<base_url>https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien))/ + (?P<show>[^/]+)-\d+/ + [^/]+/ + episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url')) + show, episode = mobj.group('show', 'episode') + return self.url_result( + # Rewrite new URLs to the old format and use extraction via old API + # at api.tvnow.de as a loophole for bypassing premium content checks + '%s/%s/%s' % (base_url, show, episode), + ie=TVNowIE.ie_key(), video_id=mobj.group('id')) + + +class TVNowFilmIE(TVNowBaseIE): + _VALID_URL = r'''(?x) + (?P<base_url>https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:filme))/ + (?P<title>[^/?$&]+)-(?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://www.tvnow.de/filme/lord-of-war-haendler-des-todes-7959', + 'info_dict': { + 'id': '1426690', + 'display_id': 'lord-of-war-haendler-des-todes', + 'ext': 'mp4', + 'title': 'Lord of War', + 'description': 'md5:5eda15c0d5b8cb70dac724c8a0ff89a9', + 'timestamp': 1550010000, + 'upload_date': '20190212', + 'duration': 7016, + }, + }, { + 'url': 'https://www.tvnow.de/filme/the-machinist-12157', + 'info_dict': { + 'id': '328160', + 'display_id': 'the-machinist', + 'ext': 'mp4', + 'title': 'The Machinist', + 'description': 'md5:9a0e363fdd74b3a9e1cdd9e21d0ecc28', + 'timestamp': 1496469720, + 'upload_date': '20170603', + 'duration': 5836, + }, + }, { + 'url': 'https://www.tvnow.de/filme/horst-schlaemmer-isch-kandidiere-17777', + 'only_matching': True, # DRM protected + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('title') + + webpage = self._download_webpage(url, display_id, fatal=False) + if not webpage: + raise ExtractorError('Cannot download "%s"' % url, expected=True) + + json_text = get_element_by_id('now-web-state', webpage) + if not json_text: + raise ExtractorError('Cannot read video data', expected=True) + + json_data = self._parse_json( + json_text, + display_id, + transform_source=lambda x: x.replace('&q;', '"'), + fatal=False) + if not json_data: + raise ExtractorError('Cannot read video data', expected=True) + + player_key = next( + (key for key in json_data.keys() if 'module/player' in key), + None) + page_key = next( + (key for key in json_data.keys() if 'page/filme' in key), + None) + movie_id = try_get( + json_data, + 
[ + lambda x: x[player_key]['body']['id'], + lambda x: x[page_key]['body']['modules'][0]['id'], + lambda x: x[page_key]['body']['modules'][1]['id']], + int) + if not movie_id: + raise ExtractorError('Cannot extract movie ID', expected=True) + + info = self._call_api( + 'movies/%d' % movie_id, + display_id, + query={'fields': ','.join(self._VIDEO_FIELDS)}) + + return self._extract_video(info, display_id) + + +class TVNowNewBaseIE(InfoExtractor): + def _call_api(self, path, video_id, query={}): + result = self._download_json( + 'https://apigw.tvnow.de/module/' + path, video_id, query=query) + error = result.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + return result + + +r""" +TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it +when api.tvnow.de is shut down. This version can't bypass premium checks though. +class TVNowIE(TVNowNewBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:shows|serien)/[^/]+/ + (?:[^/]+/)+ + (?P<display_id>[^/?$&]+)-(?P<id>\d+) + ''' + + _TESTS = [{ + # episode with annual navigation + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'info_dict': { + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', + 'ext': 'mp4', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', + }, + }, { + # rtl2, episode with season navigation + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124', + 'only_matching': True, + }, { + # rtlnitro + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822', + 'only_matching': True, + }, { + # superrtl + 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120', + 'only_matching': True, + }, { + # ntv + 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630', + 'only_matching': True, + }, { + # vox + 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072', + 'only_matching': True, + }, { + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', + 'only_matching': True, + }] + + def _extract_video(self, info, url, display_id): + config = info['config'] + source = config['source'] + + video_id = compat_str(info.get('id') or source['videoId']) + title = source['title'].strip() + + paths = [] + for manifest_url in (info.get('manifest') or {}).values(): + if not manifest_url: + continue + manifest_url = update_url_query(manifest_url, {'filter': ''}) + path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') + if path in paths: + continue + paths.append(path) + + def url_repl(proto, suffix): + return re.sub( + r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( + r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', + '.ism/' + suffix, manifest_url)) + + formats = self._extract_mpd_formats( + url_repl('dash', '.mpd'), video_id, + 
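A note on url_repl above (inside this commented-out draft): given any one manifest URL returned by the API, it derives the sibling DASH/Smooth/HLS manifests by swapping the protocol token in the hostname and the suffix after `.ism/`. Standalone sketch with a hypothetical CDN URL:

import re

def url_repl(manifest_url, proto, suffix):
    # Swap the protocol token (hls/dash/hss) and the manifest suffix.
    return re.sub(
        r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
            r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
            '.ism/' + suffix, manifest_url))

hls = 'https://cdn-hls.example.invalid/v/clip.ism/clip.m3u8'  # hypothetical
assert url_repl(hls, 'dash', '.mpd') == (
    'https://cdn-dash.example.invalid/v/clip.ism/.mpd')
assert url_repl(hls, 'hss', 'Manifest') == (
    'https://cdn-hss.example.invalid/v/clip.ism/Manifest')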
mpd_id='dash', fatal=False) + formats.extend(self._extract_ism_formats( + url_repl('hss', 'Manifest'), + video_id, ism_id='mss', fatal=False)) + formats.extend(self._extract_m3u8_formats( + url_repl('hls', '.m3u8'), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + if formats: + break + else: + if try_get(info, lambda x: x['rights']['isDrm']): + raise ExtractorError( + 'Video %s is DRM protected' % video_id, expected=True) + if try_get(config, lambda x: x['boards']['geoBlocking']['block']): + raise self.raise_geo_restricted() + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) + self._sort_formats(formats) + + description = source.get('description') + thumbnail = url_or_none(source.get('poster')) + timestamp = unified_timestamp(source.get('previewStart')) + duration = parse_duration(source.get('length')) + + series = source.get('format') + season_number = int_or_none(self._search_regex( + r'staffel-(\d+)', url, 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'episode-(\d+)', url, 'episode number', default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'episode': title, + 'formats': formats, + } + + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).groups() + info = self._call_api('player/' + video_id, video_id) + return self._extract_video(info, video_id, display_id) + + +class TVNowFilmIE(TVNowIE): + _VALID_URL = r'''(?x) + (?P<base_url>https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/ + (?:filme))/ + (?P<title>[^/?$&]+)-(?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://www.tvnow.de/filme/lord-of-war-haendler-des-todes-7959', + 'info_dict': { + 'id': '1426690', + 'display_id': 'lord-of-war-haendler-des-todes', + 'ext': 'mp4', + 'title': 'Lord of War', + 'description': 'md5:5eda15c0d5b8cb70dac724c8a0ff89a9', + 'timestamp': 1550010000, + 'upload_date': '20190212', + 'duration': 7016, + }, + }, { + 'url': 'https://www.tvnow.de/filme/the-machinist-12157', + 'info_dict': { + 'id': '328160', + 'display_id': 'the-machinist', + 'ext': 'mp4', + 'title': 'The Machinist', + 'description': 'md5:9a0e363fdd74b3a9e1cdd9e21d0ecc28', + 'timestamp': 1496469720, + 'upload_date': '20170603', + 'duration': 5836, + }, + }, { + 'url': 'https://www.tvnow.de/filme/horst-schlaemmer-isch-kandidiere-17777', + 'only_matching': True, # DRM protected + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('title') + + webpage = self._download_webpage(url, display_id, fatal=False) + if not webpage: + raise ExtractorError('Cannot download "%s"' % url, expected=True) + + json_text = get_element_by_id('now-web-state', webpage) + if not json_text: + raise ExtractorError('Cannot read video data', expected=True) + + json_data = self._parse_json( + json_text, + display_id, + transform_source=lambda x: x.replace('&q;', '"'), + fatal=False) + if not json_data: + raise ExtractorError('Cannot read video data', expected=True) + + player_key = next( + (key for key in json_data.keys() if 'module/player' in key), + None) + page_key = next( + (key for key in json_data.keys() if 'page/filme' in key), + None) + movie_id = try_get( + json_data, + [ + lambda x: x[player_key]['body']['id'], + lambda x: 
x[page_key]['body']['modules'][0]['id'], + lambda x: x[page_key]['body']['modules'][1]['id']], + int) + if not movie_id: + raise ExtractorError('Cannot extract movie ID', expected=True) + + info = self._call_api('player/%d' % movie_id, display_id) + return self._extract_video(info, url, display_id) +""" + + +class TVNowListBaseIE(TVNowNewBaseIE): + _SHOW_VALID_URL = r'''(?x) + (?P<base_url> + https?:// + (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/ + [^/?#&]+-(?P<show_id>\d+) + ) + ''' + + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) + else super(TVNowListBaseIE, cls).suitable(url)) + + def _extract_items(self, url, show_id, list_id, query): + items = self._call_api( + 'teaserrow/format/episode/' + show_id, list_id, + query=query)['items'] + + entries = [] + for item in items: + if not isinstance(item, dict): + continue + item_url = urljoin(url, item.get('url')) + if not item_url: + continue + video_id = str_or_none(item.get('id') or item.get('videoId')) + item_title = item.get('subheadline') or item.get('text') + entries.append(self.url_result( + item_url, ie=TVNowNewIE.ie_key(), video_id=video_id, + video_title=item_title)) + + return self.playlist_result(entries, '%s/%s' % (show_id, list_id)) + + +class TVNowSeasonIE(TVNowListBaseIE): + _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13', + 'info_dict': { + 'id': '1815/13', + }, + 'playlist_mincount': 22, + }] + + def _real_extract(self, url): + _, show_id, season_id = self._match_valid_url(url).groups() + return self._extract_items( + url, show_id, season_id, {'season': season_id}) + + +class TVNowAnnualIE(TVNowListBaseIE): + _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05', + 'info_dict': { + 'id': '1669/2017-05', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + _, show_id, year, month = self._match_valid_url(url).groups() + return self._extract_items( + url, show_id, '%s-%s' % (year, month), { + 'year': int(year), + 'month': int(month), + }) + + +class TVNowShowIE(TVNowListBaseIE): + _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL + _TESTS = [{ + # annual navigationType + 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669', + 'info_dict': { + 'id': '1669', + }, + 'playlist_mincount': 73, + }, { + # season navigationType + 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471', + 'info_dict': { + 'id': '11471', + }, + 'playlist_mincount': 3, + }] + + @classmethod + def suitable(cls, url): + return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) + else super(TVNowShowIE, cls).suitable(url)) + + def _real_extract(self, url): + base_url, show_id = self._match_valid_url(url).groups() + + result = self._call_api( + 'teaserrow/format/navigation/' + show_id, show_id) + + items = result['items'] + + entries = [] + navigation = result.get('navigationType') + if navigation == 'annual': + for item in items: + if not isinstance(item, dict): + continue + year = int_or_none(item.get('year')) + if year is None: + continue + months = item.get('months') + if not isinstance(months, list): + continue + for month_dict in months: + if not isinstance(month_dict, dict) or not month_dict: + continue + month_number = int_or_none(list(month_dict.keys())[0]) + if month_number is None: + 
continue + entries.append(self.url_result( + '%s/%04d-%02d' % (base_url, year, month_number), + ie=TVNowAnnualIE.ie_key())) + elif navigation == 'season': + for item in items: + if not isinstance(item, dict): + continue + season_number = int_or_none(item.get('season')) + if season_number is None: + continue + entries.append(self.url_result( + '%s/staffel-%d' % (base_url, season_number), + ie=TVNowSeasonIE.ie_key())) + else: + raise ExtractorError('Unknown navigationType') + + return self.playlist_result(entries, show_id) diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py new file mode 100644 index 000000000..1e42b33a4 --- /dev/null +++ b/yt_dlp/extractor/tvp.py @@ -0,0 +1,252 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + get_element_by_attribute, + orderedSet, +) + + +class TVPIE(InfoExtractor): + IE_NAME = 'tvp' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:437f48b93558370b031740546b696e24', + }, + }, { + 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', + 'md5': 'b0005b542e5b4de643a9690326ab1257', + 'info_dict': { + 'id': '17916176', + 'ext': 'mp4', + 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + }, + }, { + # page id is not the same as video id(#7799) + 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', + 'md5': '84cd3c8aec4840046e5ab712416b73d0', + 'info_dict': { + 'id': '33908820', + 'ext': 'mp4', + 'title': 'Wiadomości, 28.09.2017, 19:30', + 'description': 'Wydanie główne codziennego serwisu informacyjnego.' 
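An illustrative aside between these test cases: the _VALID_URL above does double duty, matching both the comma-separated vod.tvp.pl slugs and bare numeric page ids on regional subdomains; as the #7799 note above points out, the captured id may be a page id that _real_extract later swaps for the real video id. A quick check of the pattern against two of the test URLs:

import re

_VALID_URL = (r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*'
              r'|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)')

for test_url, expected_id in [
    ('https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', '194536'),
    ('https://wiadomosci.tvp.pl/33908820/28092017-1930', '33908820'),
]:
    assert re.match(_VALID_URL, test_url).group('id') == expected_id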
+ }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', + 'only_matching': True, + }, { + 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', + 'only_matching': True, + }, { + 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', + 'only_matching': True, + }, { + 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', + 'only_matching': True, + }, { + 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', + 'only_matching': True, + }, { + 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', + 'only_matching': True, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + video_id = self._search_regex([ + r'<iframe[^>]+src="[^"]*?object_id=(\d+)', + r"object_id\s*:\s*'(\d+)'", + r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) + return { + '_type': 'url_transparent', + 'url': 'tvp:' + video_id, + 'description': self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'ie_key': 'TVPEmbed', + } + + +class TVPEmbedIE(InfoExtractor): + IE_NAME = 'tvp:embed' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'tvp:194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 13 – Władek', + }, + }, { + # not available + 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', + 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'info_dict': { + 'id': '22670268', + 'ext': 'mp4', + 'title': 'Panorama, 07.12.2015, 15:40', + }, + 'skip': 'Transmisja została zakończona lub materiał niedostępny', + }, { + 'url': 'tvp:22670268', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) + + error = self._html_search_regex( + r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', + webpage, 'error', default=None) or clean_html( + get_element_by_attribute('class', 'msg error', webpage)) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + + title = self._search_regex( + r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', + webpage, 'title', group='title') + series_title = self._search_regex( + r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', + webpage, 'series', group='series', default=None) + if series_title: + title = '%s, %s' % (series_title, title) + + thumbnail = self._search_regex( + r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) + + video_url = self._search_regex( + r'0:{src:([\'"])(?P<url>.*?)\1', webpage, + 'formats', group='url', default=None) + if not video_url or 'material_niedostepny.mp4' in video_url: + video_url = self._download_json( + 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, + video_id)['video_url'] + + formats = [] + video_url_base = self._search_regex( + r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', + video_url, 'video base url', default=None) + if 
video_url_base: + # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. + # It's not mentioned in MPEG-DASH standard. Figure that out. + # formats.extend(self._extract_mpd_formats( + # video_url_base + '.ism/video.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_ism_formats( + video_url_base + '.ism/Manifest', + video_id, 'mss', fatal=False)) + formats.extend(self._extract_f4m_formats( + video_url_base + '.ism/video.f4m', + video_id, f4m_id='hds', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + video_url_base + '.ism/video.m3u8', video_id, + 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', m3u8_formats)) + formats.extend(m3u8_formats) + for i, m3u8_format in enumerate(m3u8_formats, 2): + http_url = '%s-%d.mp4' % (video_url_base, i) + if self._is_valid_url(http_url, video_id): + f = m3u8_format.copy() + f.update({ + 'url': http_url, + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + formats = [{ + 'format_id': 'direct', + 'url': video_url, + 'ext': determine_ext(video_url, 'mp4'), + }] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + + +class TVPWebsiteIE(InfoExtractor): + IE_NAME = 'tvp:series' + _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' + + _TESTS = [{ + # series + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', + 'info_dict': { + 'id': '38678312', + }, + 'playlist_count': 115, + }, { + # film + 'url': 'https://vod.tvp.pl/website/gloria,35139666', + 'info_dict': { + 'id': '36637049', + 'ext': 'mp4', + 'title': 'Gloria, Gloria', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', + 'only_matching': True, + }] + + def _entries(self, display_id, playlist_id): + url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) + for page_num in itertools.count(1): + page = self._download_webpage( + url, display_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + + video_ids = orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, + page)) + + if not video_ids: + break + + for video_id in video_ids: + yield self.url_result( + 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), + video_id=video_id) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id, playlist_id = mobj.group('display_id', 'id') + return self.playlist_result( + self._entries(display_id, playlist_id), playlist_id) diff --git a/yt_dlp/extractor/tvplay.py b/yt_dlp/extractor/tvplay.py new file mode 100644 index 000000000..fbafb41f8 --- /dev/null +++ b/yt_dlp/extractor/tvplay.py @@ -0,0 +1,508 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urlparse, +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_duration, + parse_iso8601, + qualities, + try_get, + update_url_query, + url_or_none, + urljoin, +) + + +class TVPlayIE(InfoExtractor): + IE_NAME = 'mtg' + IE_DESC = 'MTG services' + _VALID_URL = r'''(?x) + (?: + mtg:| + https?:// + (?:www\.)? 
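Before moving further into tvplay.py, a generic restatement of the pagination idiom in TVPWebsiteIE._entries above: request ?page=N for N = 1, 2, ... and stop at the first page that contributes no video ids. A self-contained sketch (fetch_page stands in for the _download_webpage + re.findall step; the extractor dedupes per page with orderedSet, simplified here to a global seen-set):

import itertools

def paged_ids(fetch_page):
    seen = set()
    for page_num in itertools.count(1):
        new_ids = [i for i in fetch_page(page_num) if i not in seen]
        if not new_ids:
            return
        seen.update(new_ids)
        for video_id in new_ids:
            yield video_id

pages = {1: ['101', '102'], 2: ['102', '103']}
assert list(paged_ids(lambda n: pages.get(n, []))) == ['101', '102', '103']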
+ (?: + tvplay(?:\.skaties)?\.lv(?:/parraides)?| + (?:tv3play|play\.tv3)\.lt(?:/programos)?| + tv3play(?:\.tv3)?\.ee/sisu| + (?:tv(?:3|6|8|10)play)\.se/program| + (?:(?:tv3play|viasat4play|tv6play)\.no|(?:tv3play)\.dk)/programmer| + play\.nova(?:tv)?\.bg/programi + ) + /(?:[^/]+/)+ + ) + (?P<id>\d+) + ''' + _TESTS = [ + { + 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', + 'md5': 'a1612fe0849455423ad8718fe049be21', + 'info_dict': { + 'id': '418113', + 'ext': 'mp4', + 'title': 'Kādi ir īri? - Viņas melo labāk', + 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.', + 'series': 'Viņas melo labāk', + 'season': '2.sezona', + 'season_number': 2, + 'duration': 25, + 'timestamp': 1406097056, + 'upload_date': '20140723', + }, + }, + { + 'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true', + 'info_dict': { + 'id': '409229', + 'ext': 'flv', + 'title': 'Moterys meluoja geriau', + 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e', + 'series': 'Moterys meluoja geriau', + 'episode_number': 47, + 'season': '1 sezonas', + 'season_number': 1, + 'duration': 1330, + 'timestamp': 1403769181, + 'upload_date': '20140626', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.tv3play.ee/sisu/kodu-keset-linna/238551?autostart=true', + 'info_dict': { + 'id': '238551', + 'ext': 'flv', + 'title': 'Kodu keset linna 398537', + 'description': 'md5:7df175e3c94db9e47c0d81ffa5d68701', + 'duration': 1257, + 'timestamp': 1292449761, + 'upload_date': '20101215', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true', + 'info_dict': { + 'id': '395385', + 'ext': 'mp4', + 'title': 'Husräddarna S02E07', + 'description': 'md5:f210c6c89f42d4fc39faa551be813777', + 'duration': 2574, + 'timestamp': 1400596321, + 'upload_date': '20140520', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true', + 'info_dict': { + 'id': '266636', + 'ext': 'mp4', + 'title': 'Den sista dokusåpan S01E08', + 'description': 'md5:295be39c872520221b933830f660b110', + 'duration': 1492, + 'timestamp': 1330522854, + 'upload_date': '20120229', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true', + 'info_dict': { + 'id': '282756', + 'ext': 'mp4', + 'title': 'Antikjakten S01E10', + 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8', + 'duration': 2646, + 'timestamp': 1348575868, + 'upload_date': '20120925', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true', + 'info_dict': { + 'id': '230898', + 'ext': 'mp4', + 'title': 'Anna Anka søker assistent - Ep. 
8', + 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474', + 'duration': 2656, + 'timestamp': 1277720005, + 'upload_date': '20100628', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true', + 'info_dict': { + 'id': '21873', + 'ext': 'mp4', + 'title': 'Budbringerne program 10', + 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d', + 'duration': 1297, + 'timestamp': 1254205102, + 'upload_date': '20090929', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true', + 'info_dict': { + 'id': '361883', + 'ext': 'mp4', + 'title': 'Hotelinspektør Alex Polizzi - Ep. 10', + 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81', + 'duration': 2594, + 'timestamp': 1393236292, + 'upload_date': '20140224', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true', + 'info_dict': { + 'id': '624952', + 'ext': 'flv', + 'title': 'Здравей, България (12.06.2015 г.) ', + 'description': 'md5:99f3700451ac5bb71a260268b8daefd7', + 'duration': 8838, + 'timestamp': 1434100372, + 'upload_date': '20150612', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', + 'only_matching': True, + }, + { + 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', + 'only_matching': True, + }, + { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true', + 'only_matching': True, + }, + { + # views is null + 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', + 'only_matching': True, + }, + { + 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', + 'only_matching': True, + }, + { + 'url': 'mtg:418113', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + geo_country = self._search_regex( + r'https?://[^/]+\.([a-z]{2})', url, + 'geo country', default=None) + if geo_country: + self._initialize_geo_bypass({'countries': [geo_country.upper()]}) + video = self._download_json( + 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') + + title = video['title'] + + try: + streams = self._download_json( + 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, + video_id, 'Downloading streams JSON') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json(e.cause.read().decode('utf-8'), video_id) + raise ExtractorError(msg['msg'], expected=True) + raise + + quality = qualities(['hls', 'medium', 'high']) + formats = [] + for format_id, video_url in streams.get('streams', {}).items(): + video_url = url_or_none(video_url) + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(video_url, { + 'hdcore': '3.5.0', + 'plugin': 'aasp-3.5.0.151.81' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + fmt = { + 'format_id': format_id, + 'quality': quality(format_id), + 'ext': ext, + } + if video_url.startswith('rtmp'): + m = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) + if not m: + continue + 
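To make the rtmp branch concrete: the regex applied just above splits a monolithic RTMP URL into the connection URL, app and play path that rtmpdump-style downloaders expect. Illustrative run on a made-up stream URL:

import re

rtmp_url = 'rtmp://media.example.invalid/vod/mp4:videos/409229.mp4'  # hypothetical
m = re.search(
    r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', rtmp_url)
assert m.group('url') == 'rtmp://media.example.invalid/vod'
assert m.group('app') == 'vod'
assert m.group('playpath') == 'mp4:videos/409229.mp4'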
fmt.update({ + 'ext': 'flv', + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'preference': -1, + }) + else: + fmt.update({ + 'url': video_url, + }) + formats.append(fmt) + + if not formats and video.get('is_geo_blocked'): + self.raise_geo_restricted( + 'This content might not be available in your country due to copyright reasons', + metadata_available=True) + + self._sort_formats(formats) + + # TODO: webvtt in m3u8 + subtitles = {} + sami_path = video.get('sami_path') + if sami_path: + lang = self._search_regex( + r'_([a-z]{2})\.xml', sami_path, 'lang', + default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]) + subtitles[lang] = [{ + 'url': sami_path, + }] + + series = video.get('format_title') + episode_number = int_or_none(video.get('format_position', {}).get('episode')) + season = video.get('_embedded', {}).get('season', {}).get('title') + season_number = int_or_none(video.get('format_position', {}).get('season')) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'series': series, + 'episode_number': episode_number, + 'season': season, + 'season_number': season_number, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'view_count': try_get(video, lambda x: x['views']['total'], int), + 'age_limit': int_or_none(video.get('age_limit', 0)), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ViafreeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + viafree\.(?P<country>dk|no|se|fi) + /(?P<id>(?:program(?:mer)?|ohjelmat)?/(?:[^/]+/)+[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'info_dict': { + 'id': '757786', + 'ext': 'mp4', + 'title': 'Det beste vorspielet - Sesong 2 - Episode 1', + 'description': 'md5:b632cb848331404ccacd8cd03e83b4c3', + 'series': 'Det beste vorspielet', + 'season_number': 2, + 'duration': 1116, + 'timestamp': 1471200600, + 'upload_date': '20160814', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.viafree.dk/programmer/humor/comedy-central-roast-of-charlie-sheen/film/1047660', + 'info_dict': { + 'id': '1047660', + 'ext': 'mp4', + 'title': 'Comedy Central Roast of Charlie Sheen - Comedy Central Roast of Charlie Sheen', + 'description': 'md5:ec956d941ae9fd7c65a48fd64951dc6d', + 'series': 'Comedy Central Roast of Charlie Sheen', + 'season_number': 1, + 'duration': 3747, + 'timestamp': 1608246060, + 'upload_date': '20201217' + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True + } + }, { + # with relatedClips + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', + 'only_matching': True, + }, { + # Different og:image URL schema + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', + 'only_matching': True, + }, { + 'url': 'https://www.viafree.fi/ohjelmat/entertainment/amazing-makeovers/kausi-7/jakso-2', + 'only_matching': True, + }] + _GEO_BYPASS = False + + def _real_extract(self, url): + country, path = 
self._match_valid_url(url).groups() + content = self._download_json( + 'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path) + program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program'] + guid = program['guid'] + meta = content['meta'] + title = meta['title'] + + try: + stream_href = self._download_json( + program['_links']['streamLink']['href'], guid, + headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted(countries=[country]) + raise + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4') + self._sort_formats(formats) + episode = program.get('episode') or {} + return { + 'id': guid, + 'title': title, + 'thumbnail': meta.get('image'), + 'description': meta.get('description'), + 'series': episode.get('seriesTitle'), + 'subtitles': subtitles, + 'episode_number': int_or_none(episode.get('episodeNumber')), + 'season_number': int_or_none(episode.get('seasonNumber')), + 'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000), + 'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])), + 'formats': formats, + } + + +class TVPlayHomeIE(InfoExtractor): + _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'info_dict': { + 'id': '366367', + 'ext': 'mp4', + 'title': 'Aferistai', + 'description': 'Aferistai. Kalėdinė pasaka.', + 'series': 'Aferistai [N-7]', + 'season': '1 sezonas', + 'season_number': 1, + 'duration': 464, + 'timestamp': 1394209658, + 'upload_date': '20140307', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', + 'only_matching': True, + }, { + 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'only_matching': True, + }, { + 'url': 'https://play.tv3.lt/aferistai-10047125', + 'only_matching': True, + }, { + 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'only_matching': True, + }, { + 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + asset = self._download_json( + urljoin(url, '/sb/public/asset/' + video_id), video_id) + + m3u8_url = asset['movie']['contentUrl'] + video_id = asset['assetId'] + asset_title = asset['title'] + title = asset_title['title'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + thumbnails = None + image_url = asset.get('imageUrl') + if image_url: + thumbnails = [{ + 'url': urljoin(url, image_url), + 'ext': 'jpg', + }] + + metadata = asset.get('metadata') or {} + + return { + 'id': video_id, + 'title': title, + 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), + 'thumbnails': thumbnails, + 'duration': parse_duration(asset_title.get('runTime')), + 'series': asset.get('tvSeriesTitle'), + 'season': asset.get('tvSeasonTitle'), + 'season_number': int_or_none(metadata.get('seasonNumber')), + 'episode': asset_title.get('titleBrief'), + 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'formats': formats, + } diff 
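With tvplay.py's extractors complete, one detail from TVPlayIE above is worth spelling out: the geo-bypass country is guessed from the two-letter ccTLD of the page URL (every host in its _VALID_URL uses a country TLD, so the loose match is good enough). Minimal sketch of the heuristic:

import re

def geo_country(page_url):
    # Same regex as TVPlayIE: backtracks to the last '.xx' of the host part.
    m = re.search(r'https?://[^/]+\.([a-z]{2})', page_url)
    return m.group(1).upper() if m else None

assert geo_country('http://www.tvplay.lv/parraides/418113') == 'LV'
assert geo_country('https://play.nova.bg/programi/764300') == 'BG'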
--git a/youtube_dl/extractor/tvplayer.py b/yt_dlp/extractor/tvplayer.py index 8f8686a65..8f8686a65 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/yt_dlp/extractor/tvplayer.py diff --git a/youtube_dl/extractor/tweakers.py b/yt_dlp/extractor/tweakers.py index 2b10d9bca..2b10d9bca 100644 --- a/youtube_dl/extractor/tweakers.py +++ b/yt_dlp/extractor/tweakers.py diff --git a/yt_dlp/extractor/twentyfourvideo.py b/yt_dlp/extractor/twentyfourvideo.py new file mode 100644 index 000000000..ae19e11e1 --- /dev/null +++ b/yt_dlp/extractor/twentyfourvideo.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + int_or_none, + xpath_attr, + xpath_element, +) + + +class TwentyFourVideoIE(InfoExtractor): + IE_NAME = '24video' + _VALID_URL = r'''(?x) + https?:// + (?P<host> + (?:(?:www|porno?)\.)?24video\. + (?:net|me|xxx|sexy?|tube|adult|site|vip) + )/ + (?: + video/(?:(?:view|xml)/)?| + player/new24_play\.swf\?id= + ) + (?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'http://www.24video.net/video/view/1044982', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', + 'info_dict': { + 'id': '1044982', + 'ext': 'mp4', + 'title': 'Эротика каменного века', + 'description': 'Как смотрели порно в каменном веке.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'SUPERTELO', + 'duration': 31, + 'timestamp': 1275937857, + 'upload_date': '20100607', + 'age_limit': 18, + 'like_count': int, + 'dislike_count': int, + }, + }, { + 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.me/video/view/1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.tube/video/view/2363750', + 'only_matching': True, + }, { + 'url': 'https://www.24video.site/video/view/2640421', + 'only_matching': True, + }, { + 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', + 'only_matching': True, + }, { + 'url': 'https://www.24video.vip/video/view/1044982', + 'only_matching': True, + }, { + 'url': 'https://porn.24video.net/video/2640421-vsya-takay', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + host = mobj.group('host') + + webpage = self._download_webpage( + 'http://%s/video/view/%s' % (host, video_id), video_id) + + title = self._og_search_title(webpage) + description = self._html_search_regex( + r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>', + webpage, 'description', fatal=False, group='description') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(self._og_search_property( + 'duration', webpage, 'duration', fatal=False)) + timestamp = parse_iso8601(self._search_regex( + r'<time[^>]+\bdatetime="([^"]+)"[^>]+itemprop="uploadDate"', + webpage, 'upload date', fatal=False)) + + uploader = self._html_search_regex( + r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>', + webpage, 'uploader', fatal=False) + + view_count = int_or_none(self._html_search_regex( + r'<span class="video-views">(\d+) просмотр', + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._html_search_regex( + r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари', + webpage, 'comment count', default=None)) + + # Sets some cookies + self._download_xml( + r'http://%s/video/xml/%s?mode=init' % (host, video_id), + video_id, 'Downloading init XML') + + video_xml = self._download_xml( + 
'http://%s/video/xml/%s?mode=play' % (host, video_id), + video_id, 'Downloading video XML') + + video = xpath_element(video_xml, './/video', 'video', fatal=True) + + formats = [{ + 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), + }] + + like_count = int_or_none(video.get('ratingPlus')) + dislike_count = int_or_none(video.get('ratingMinus')) + age_limit = 18 if video.get('adult') == 'true' else 0 + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twentymin.py b/yt_dlp/extractor/twentymin.py index a42977f39..a42977f39 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/yt_dlp/extractor/twentymin.py diff --git a/yt_dlp/extractor/twentythreevideo.py b/yt_dlp/extractor/twentythreevideo.py new file mode 100644 index 000000000..e8cf5a1e9 --- /dev/null +++ b/yt_dlp/extractor/twentythreevideo.py @@ -0,0 +1,79 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TwentyThreeVideoIE(InfoExtractor): + IE_NAME = '23video' + _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' + _TESTS = [{ + 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', + 'md5': '75fcf216303eb1dae9920d651f85ced4', + 'info_dict': { + 'id': '20448876', + 'ext': 'mp4', + 'title': 'Video Marketing Minute: Personalized Video', + 'timestamp': 1513855354, + 'upload_date': '20171221', + 'uploader_id': '12258964', + 'uploader': 'Rasmus Bysted', + } + }, { + 'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, query, photo_id = self._match_valid_url(url).groups() + base_url = 'https://%s' % domain + photo_data = self._download_json( + base_url + '/api/photo/list?' 
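A usage note on TwentyFourVideoIE above: the mode=init XML exists only to plant session cookies that mode=play requires (hence the 'Sets some cookies' comment), so the two requests must share a cookie jar. A sketch of the handshake using the requests library (an assumption standing in for the extractor's downloader, which shares cookies implicitly):

import requests

def fetch_play_xml(host, video_id):
    session = requests.Session()  # one cookie jar across both requests
    base = 'http://%s/video/xml/%s' % (host, video_id)
    session.get(base, params={'mode': 'init'})  # response body is unused
    return session.get(base, params={'mode': 'play'}).text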
+ query, photo_id, query={ + 'format': 'json', + }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo'] + title = photo_data['title'] + + formats = [] + + audio_path = photo_data.get('audio_download') + if audio_path: + formats.append({ + 'format_id': 'audio', + 'url': base_url + audio_path, + 'filesize': int_or_none(photo_data.get('audio_size')), + 'vcodec': 'none', + }) + + def add_common_info_to_list(l, template, id_field, id_value): + f_base = template % id_value + f_path = photo_data.get(f_base + 'download') + if not f_path: + return + l.append({ + id_field: id_value, + 'url': base_url + f_path, + 'width': int_or_none(photo_data.get(f_base + 'width')), + 'height': int_or_none(photo_data.get(f_base + 'height')), + 'filesize': int_or_none(photo_data.get(f_base + 'size')), + }) + + for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'): + add_common_info_to_list(formats, 'video_%s_', 'format_id', f) + + thumbnails = [] + for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'): + add_common_info_to_list(thumbnails, '%s_', 'id', t) + + return { + 'id': photo_id, + 'title': title, + 'timestamp': int_or_none(photo_data.get('creation_date_epoch')), + 'duration': int_or_none(photo_data.get('video_length')), + 'view_count': int_or_none(photo_data.get('view_count')), + 'comment_count': int_or_none(photo_data.get('number_of_comments')), + 'uploader_id': photo_data.get('user_id'), + 'uploader': photo_data.get('display_name'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py new file mode 100644 index 000000000..3acf1b118 --- /dev/null +++ b/yt_dlp/extractor/twitcasting.py @@ -0,0 +1,196 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..downloader.websocket import has_websockets +from ..utils import ( + clean_html, + float_or_none, + get_element_by_class, + get_element_by_id, + parse_duration, + qualities, + str_to_int, + try_get, + unified_timestamp, + urlencode_postdata, + urljoin, + ExtractorError, +) + + +class TwitCastingIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', + 'md5': '745243cad58c4681dc752490f7540d7f', + 'info_dict': { + 'id': '2357609', + 'ext': 'mp4', + 'title': 'Live #2357609', + 'uploader_id': 'ivetesangalo', + 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20110822', + 'timestamp': 1314010824, + 'duration': 32, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://twitcasting.tv/mttbernardini/movie/3689740', + 'info_dict': { + 'id': '3689740', + 'ext': 'mp4', + 'title': 'Live playing something #3689740', + 'uploader_id': 'mttbernardini', + 'description': 'Salve, io sono Matto (ma con la e). 
Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20120212', + 'timestamp': 1329028024, + 'duration': 681, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + 'videopassword': 'abc', + }, + }] + + def _real_extract(self, url): + uploader_id, video_id = self._match_valid_url(url).groups() + + video_password = self.get_param('videopassword') + request_data = None + if video_password: + request_data = urlencode_postdata({ + 'password': video_password, + }) + webpage = self._download_webpage( + url, video_id, data=request_data, + headers={'Origin': 'https://twitcasting.tv'}) + + title = (clean_html(get_element_by_id('movietitle', webpage)) + or self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True)) + + video_js_data = {} + m3u8_url = self._search_regex( + r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'm3u8 url', group='url', default=None) + if not m3u8_url: + video_js_data = self._parse_json(self._search_regex( + r'data-movie-playlist=(["\'])(?P<url>(?:(?!\1).)+)', + webpage, 'movie playlist', group='url', default='[{}]'), video_id) + if isinstance(video_js_data, dict): + video_js_data = list(video_js_data.values())[0] + video_js_data = video_js_data[0] + m3u8_url = try_get(video_js_data, lambda x: x['source']['url']) + + stream_server_data = self._download_json( + 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id, + 'Downloading live info', fatal=False) + + is_live = 'data-status="online"' in webpage + formats = [] + if is_live and not m3u8_url: + m3u8_url = 'https://twitcasting.tv/%s/metastream.m3u8' % uploader_id + if is_live and has_websockets and stream_server_data: + qq = qualities(['base', 'mobilesource', 'main']) + for mode, ws_url in stream_server_data['llfmp4']['streams'].items(): + formats.append({ + 'url': ws_url, + 'format_id': 'ws-%s' % mode, + 'ext': 'mp4', + 'quality': qq(mode), + 'protocol': 'websocket_frag', # TwitCasting simply sends moof atom directly over WS + }) + + thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage) + description = clean_html(get_element_by_id( + 'authorcomment', webpage)) or self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage) + duration = float_or_none(video_js_data.get( + 'duration'), 1000) or parse_duration(clean_html( + get_element_by_class('tw-player-duration-time', webpage))) + view_count = str_to_int(self._search_regex( + r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None)) + timestamp = unified_timestamp(self._search_regex( + r'data-toggle="true"[^>]+datetime="([^"]+)"', + webpage, 'datetime', None)) + + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'is_live': is_live, + } + + +class TwitCastingLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://twitcasting.tv/ivetesangalo', + 'only_matching': True, + }] + + def _real_extract(self, url): + uploader_id = self._match_id(url) + self.to_screen( + 'Downloading live video of user {0}. 
' + 'Pass "https://twitcasting.tv/{0}/show" to download the history'.format(uploader_id)) + + webpage = self._download_webpage(url, uploader_id) + current_live = self._search_regex( + (r'data-type="movie" data-id="(\d+)">', + r'tw-sound-flag-open-link" data-id="(\d+)" style=',), + webpage, 'current live ID', default=None) + if not current_live: + raise ExtractorError('The user is not currently live') + return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live)) + + +class TwitCastingUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/show/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://twitcasting.tv/noriyukicas/show', + 'only_matching': True, + }] + + def _entries(self, uploader_id): + base_url = next_url = 'https://twitcasting.tv/%s/show' % uploader_id + for page_num in itertools.count(1): + webpage = self._download_webpage( + next_url, uploader_id, query={'filter': 'watchable'}, note='Downloading page %d' % page_num) + matches = re.finditer( + r'''(?isx)<a\s+class="tw-movie-thumbnail"\s*href="(?P<url>/[^/]+/movie/\d+)"\s*>.+?</a>''', + webpage) + for mobj in matches: + yield self.url_result(urljoin(base_url, mobj.group('url'))) + + next_url = self._search_regex( + r'<a href="(/%s/show/%d-\d+)[?"]' % (re.escape(uploader_id), page_num), + webpage, 'next url', default=None) + next_url = urljoin(base_url, next_url) + if not next_url: + return + + def _real_extract(self, url): + uploader_id = self._match_id(url) + return self.playlist_result( + self._entries(uploader_id), uploader_id, '%s - Live History' % uploader_id) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py new file mode 100644 index 000000000..be70beed4 --- /dev/null +++ b/yt_dlp/extractor/twitch.py @@ -0,0 +1,990 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import collections +import itertools +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlencode, + compat_urllib_parse_urlparse, +) +from ..utils import ( + clean_html, + dict_get, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + parse_iso8601, + parse_qs, + qualities, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urljoin, +) + + +class TwitchBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:(?:www|go|m)\.)?twitch\.tv' + + _API_BASE = 'https://api.twitch.tv' + _USHER_BASE = 'https://usher.ttvnw.net' + _LOGIN_FORM_URL = 'https://www.twitch.tv/login' + _LOGIN_POST_URL = 'https://passport.twitch.tv/login' + _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' + _NETRC_MACHINE = 'twitch' + + _OPERATION_HASHES = { + 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', + 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', + 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', + 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', + 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', + 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', + 'VideoMetadata': 
'226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
+    }
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        def fail(message):
+            raise ExtractorError(
+                'Unable to login. Twitch said: %s' % message, expected=True)
+
+        def login_step(page, urlh, note, data):
+            form = self._hidden_inputs(page)
+            form.update(data)
+
+            page_url = urlh.geturl()
+            post_url = self._search_regex(
+                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
+                'post url', default=self._LOGIN_POST_URL, group='url')
+            post_url = urljoin(page_url, post_url)
+
+            headers = {
+                'Referer': page_url,
+                'Origin': 'https://www.twitch.tv',
+                'Content-Type': 'text/plain;charset=UTF-8',
+            }
+
+            response = self._download_json(
+                post_url, None, note, data=json.dumps(form).encode(),
+                headers=headers, expected_status=400)
+            error = dict_get(response, ('error', 'error_description', 'error_code'))
+            if error:
+                fail(error)
+
+            if 'Authenticated successfully' in response.get('message', ''):
+                return None, None
+
+            redirect_url = urljoin(
+                post_url,
+                response.get('redirect') or response['redirect_path'])
+            return self._download_webpage_handle(
+                redirect_url, None, 'Downloading login redirect page',
+                headers=headers)
+
+        login_page, handle = self._download_webpage_handle(
+            self._LOGIN_FORM_URL, None, 'Downloading login page')
+
+        # Some TOR nodes and public proxies are blocked completely
+        if 'blacklist_message' in login_page:
+            fail(clean_html(login_page))
+
+        redirect_page, handle = login_step(
+            login_page, handle, 'Logging in', {
+                'username': username,
+                'password': password,
+                'client_id': self._CLIENT_ID,
+            })
+
+        # Successful login
+        if not redirect_page:
+            return
+
+        if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None:
+            # TODO: Add mechanism to request an SMS or phone call
+            tfa_token = self._get_tfa_info('two-factor authentication token')
+            login_step(redirect_page, handle, 'Submitting TFA token', {
+                'authy_token': tfa_token,
+                'remember_2fa': 'true',
+            })
+
+    def _prefer_source(self, formats):
+        # 'Source' (or /chunked/) is the original, non-transcoded rendition;
+        # rank it above the transcoded qualities.
+        try:
+            source = next(f for f in formats if f['format_id'] == 'Source')
+            source['quality'] = 10
+        except StopIteration:
+            for f in formats:
+                if '/chunked/' in f['url']:
+                    f.update({
+                        'quality': 10,
+                        'format_note': 'Source',
+                    })
+        self._sort_formats(formats)
+
+    def _download_base_gql(self, video_id, ops, note, fatal=True):
+        headers = {
+            'Content-Type': 'text/plain;charset=UTF-8',
+            'Client-ID': self._CLIENT_ID,
+        }
+        gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token')
+        if gql_auth:
+            headers['Authorization'] = 'OAuth ' + gql_auth.value
+        return self._download_json(
+            'https://gql.twitch.tv/gql', video_id, note,
+            data=json.dumps(ops).encode(),
+            headers=headers, fatal=fatal)
+
+    def _download_gql(self, video_id, ops, note, fatal=True):
+        # Twitch's GraphQL endpoint accepts "persisted queries": instead of
+        # the query text, each operation carries the sha256 hash under which
+        # that query is registered server-side (see _OPERATION_HASHES).
+        for op in ops:
+            op['extensions'] = {
+                'persistedQuery': {
+                    'version': 1,
+                    'sha256Hash': self._OPERATION_HASHES[op['operationName']],
+                }
+            }
+        return self._download_base_gql(video_id, ops, note, fatal=fatal)
+
+    def _download_access_token(self, video_id, token_kind, param_name):
+        method = '%sPlaybackAccessToken' % token_kind
+        ops = {
+            'query': '''{
+              %s(
+                %s: "%s",
+                params: {
+                  platform: "web",
+                  playerBackend: "mediaplayer",
+                  playerType: "site"
+                }
+              )
+              {
+                value
+                signature
+              }
+            }''' % (method, param_name, video_id),
+        }
+        return self._download_base_gql(
+            video_id, ops,
+            'Downloading %s access token GraphQL' % 
token_kind)['data'][method] + + +class TwitchVodIE(TwitchBaseIE): + IE_NAME = 'twitch:vod' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| + player\.twitch\.tv/\?.*?\bvideo=v? + ) + (?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', + 'info_dict': { + 'id': 'v6528877', + 'ext': 'mp4', + 'title': 'LCK Summer Split - Week 6 Day 1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 17208, + 'timestamp': 1435131734, + 'upload_date': '20150624', + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', + 'view_count': int, + 'start_time': 310, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Untitled broadcast (title is None) + 'url': 'http://www.twitch.tv/belkao_o/v/11230755', + 'info_dict': { + 'id': 'v11230755', + 'ext': 'mp4', + 'title': 'Untitled Broadcast', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1638, + 'timestamp': 1439746708, + 'upload_date': '20150816', + 'uploader': 'BelkAO_o', + 'uploader_id': 'belkao_o', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/6528877', + 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/beagsandjam/v/247478721', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/northernlion/video/291940395', + 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?video=480452374', + 'only_matching': True, + }] + + def _download_info(self, item_id): + data = self._download_gql( + item_id, [{ + 'operationName': 'VideoMetadata', + 'variables': { + 'channelLogin': '', + 'videoID': item_id, + }, + }], + 'Downloading stream metadata GraphQL')[0]['data'] + video = data.get('video') + if video is None: + raise ExtractorError( + 'Video %s does not exist' % item_id, expected=True) + return self._extract_info_gql(video, item_id) + + @staticmethod + def _extract_info(info): + status = info.get('status') + if status == 'recording': + is_live = True + elif status == 'recorded': + is_live = False + else: + is_live = None + _QUALITIES = ('small', 'medium', 'large') + quality_key = qualities(_QUALITIES) + thumbnails = [] + preview = info.get('preview') + if isinstance(preview, dict): + for thumbnail_id, thumbnail_url in preview.items(): + thumbnail_url = url_or_none(thumbnail_url) + if not thumbnail_url: + continue + if thumbnail_id not in _QUALITIES: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': quality_key(thumbnail_id), + }) + return { + 'id': info['_id'], + 'title': info.get('title') or 'Untitled Broadcast', + 'description': info.get('description'), + 'duration': int_or_none(info.get('length')), + 'thumbnails': thumbnails, + 'uploader': info.get('channel', {}).get('display_name'), + 'uploader_id': info.get('channel', {}).get('name'), + 'timestamp': parse_iso8601(info.get('recorded_at')), + 'view_count': int_or_none(info.get('views')), + 'is_live': is_live, + } + + @staticmethod + def _extract_info_gql(info, item_id): + vod_id = info.get('id') or item_id + # id backward compatibility for download archives + if vod_id[0] != 'v': + vod_id = 'v%s' % vod_id + thumbnail = url_or_none(info.get('previewThumbnailURL')) + if thumbnail: + for p in ('width', 'height'): + thumbnail = thumbnail.replace('{%s}' % p, '0') + return { + 'id': vod_id, + 'title': info.get('title') or 
'Untitled Broadcast', + 'description': info.get('description'), + 'duration': int_or_none(info.get('lengthSeconds')), + 'thumbnail': thumbnail, + 'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str), + 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), + 'timestamp': unified_timestamp(info.get('publishedAt')), + 'view_count': int_or_none(info.get('viewCount')), + } + + def _real_extract(self, url): + vod_id = self._match_id(url) + + info = self._download_info(vod_id) + access_token = self._download_access_token(vod_id, 'video', 'id') + + formats = self._extract_m3u8_formats( + '%s/vod/%s.m3u8?%s' % ( + self._USHER_BASE, vod_id, + compat_urllib_parse_urlencode({ + 'allow_source': 'true', + 'allow_audio_only': 'true', + 'allow_spectre': 'true', + 'player': 'twitchweb', + 'playlist_include_framerate': 'true', + 'nauth': access_token['value'], + 'nauthsig': access_token['signature'], + })), + vod_id, 'mp4', entry_protocol='m3u8_native') + + self._prefer_source(formats) + info['formats'] = formats + + parsed_url = compat_urllib_parse_urlparse(url) + query = compat_parse_qs(parsed_url.query) + if 't' in query: + info['start_time'] = parse_duration(query['t'][0]) + + if info.get('timestamp') is not None: + info['subtitles'] = { + 'rechat': [{ + 'url': update_url_query( + 'https://api.twitch.tv/v5/videos/%s/comments' % vod_id, { + 'client_id': self._CLIENT_ID, + }), + 'ext': 'json', + }], + } + + return info + + +def _make_video_result(node): + assert isinstance(node, dict) + video_id = node.get('id') + if not video_id: + return + return { + '_type': 'url_transparent', + 'ie_key': TwitchVodIE.ie_key(), + 'id': 'v' + video_id, + 'url': 'https://www.twitch.tv/videos/%s' % video_id, + 'title': node.get('title'), + 'thumbnail': node.get('previewThumbnailURL'), + 'duration': float_or_none(node.get('lengthSeconds')), + 'view_count': int_or_none(node.get('viewCount')), + } + + +class TwitchCollectionIE(TwitchBaseIE): + _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.twitch.tv/collections/wlDCoH0zEBZZbQ', + 'info_dict': { + 'id': 'wlDCoH0zEBZZbQ', + 'title': 'Overthrow Nook, capitalism for children', + }, + 'playlist_mincount': 13, + }] + + _OPERATION_NAME = 'CollectionSideBar' + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._download_gql( + collection_id, [{ + 'operationName': self._OPERATION_NAME, + 'variables': {'collectionID': collection_id}, + }], + 'Downloading collection GraphQL')[0]['data']['collection'] + title = collection.get('title') + entries = [] + for edge in collection['items']['edges']: + if not isinstance(edge, dict): + continue + node = edge.get('node') + if not isinstance(node, dict): + continue + video = _make_video_result(node) + if video: + entries.append(video) + return self.playlist_result( + entries, playlist_id=collection_id, playlist_title=title) + + +class TwitchPlaylistBaseIE(TwitchBaseIE): + _PAGE_LIMIT = 100 + + def _entries(self, channel_name, *args): + cursor = None + variables_common = self._make_variables(channel_name, *args) + entries_key = '%ss' % self._ENTRY_KIND + for page_num in itertools.count(1): + variables = variables_common.copy() + variables['limit'] = self._PAGE_LIMIT + if cursor: + variables['cursor'] = cursor + page = self._download_gql( + channel_name, [{ + 'operationName': self._OPERATION_NAME, + 'variables': variables, + }], + 'Downloading %ss GraphQL page %s' % (self._NODE_KIND, page_num), + 
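To restate in isolation the pagination contract that the _entries generator here implements: every edge carries an opaque cursor, the last cursor seen is fed back as a GraphQL variable, and iteration stops when a page yields no edges or no usable cursor. Self-contained sketch (fetch is a stand-in for the _download_gql round trip; the real code also type-checks edges and nodes):

def cursor_paged(fetch, limit=100):
    cursor = None
    while True:
        edges = fetch(limit=limit, cursor=cursor)
        if not edges:
            return
        for edge in edges:
            cursor = edge.get('cursor')
            yield edge['node']
        if not cursor:
            return

edges = [{'node': 1, 'cursor': 'a'}, {'node': 2, 'cursor': 'b'},
         {'node': 3, 'cursor': None}]
pages = {None: edges[:2], 'b': edges[2:]}
fetch = lambda limit, cursor: pages.get(cursor, [])
assert list(cursor_paged(fetch)) == [1, 2, 3]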
fatal=False) + if not page: + break + edges = try_get( + page, lambda x: x[0]['data']['user'][entries_key]['edges'], list) + if not edges: + break + for edge in edges: + if not isinstance(edge, dict): + continue + if edge.get('__typename') != self._EDGE_KIND: + continue + node = edge.get('node') + if not isinstance(node, dict): + continue + if node.get('__typename') != self._NODE_KIND: + continue + entry = self._extract_entry(node) + if entry: + cursor = edge.get('cursor') + yield entry + if not cursor or not isinstance(cursor, compat_str): + break + + +class TwitchVideosIE(TwitchPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)' + + _TESTS = [{ + # All Videos sorted by Date + 'url': 'https://www.twitch.tv/spamfish/videos?filter=all', + 'info_dict': { + 'id': 'spamfish', + 'title': 'spamfish - All Videos sorted by Date', + }, + 'playlist_mincount': 924, + }, { + # All Videos sorted by Popular + 'url': 'https://www.twitch.tv/spamfish/videos?filter=all&sort=views', + 'info_dict': { + 'id': 'spamfish', + 'title': 'spamfish - All Videos sorted by Popular', + }, + 'playlist_mincount': 931, + }, { + # Past Broadcasts sorted by Date + 'url': 'https://www.twitch.tv/spamfish/videos?filter=archives', + 'info_dict': { + 'id': 'spamfish', + 'title': 'spamfish - Past Broadcasts sorted by Date', + }, + 'playlist_mincount': 27, + }, { + # Highlights sorted by Date + 'url': 'https://www.twitch.tv/spamfish/videos?filter=highlights', + 'info_dict': { + 'id': 'spamfish', + 'title': 'spamfish - Highlights sorted by Date', + }, + 'playlist_mincount': 901, + }, { + # Uploads sorted by Date + 'url': 'https://www.twitch.tv/esl_csgo/videos?filter=uploads&sort=time', + 'info_dict': { + 'id': 'esl_csgo', + 'title': 'esl_csgo - Uploads sorted by Date', + }, + 'playlist_mincount': 5, + }, { + # Past Premieres sorted by Date + 'url': 'https://www.twitch.tv/spamfish/videos?filter=past_premieres', + 'info_dict': { + 'id': 'spamfish', + 'title': 'spamfish - Past Premieres sorted by Date', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.twitch.tv/spamfish/videos/all', + 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/spamfish/videos/all', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/spamfish/videos', + 'only_matching': True, + }] + + Broadcast = collections.namedtuple('Broadcast', ['type', 'label']) + + _DEFAULT_BROADCAST = Broadcast(None, 'All Videos') + _BROADCASTS = { + 'archives': Broadcast('ARCHIVE', 'Past Broadcasts'), + 'highlights': Broadcast('HIGHLIGHT', 'Highlights'), + 'uploads': Broadcast('UPLOAD', 'Uploads'), + 'past_premieres': Broadcast('PAST_PREMIERE', 'Past Premieres'), + 'all': _DEFAULT_BROADCAST, + } + + _DEFAULT_SORTED_BY = 'Date' + _SORTED_BY = { + 'time': _DEFAULT_SORTED_BY, + 'views': 'Popular', + } + + _OPERATION_NAME = 'FilterableVideoTower_Videos' + _ENTRY_KIND = 'video' + _EDGE_KIND = 'VideoEdge' + _NODE_KIND = 'Video' + + @classmethod + def suitable(cls, url): + return (False + if any(ie.suitable(url) for ie in ( + TwitchVideosClipsIE, + TwitchVideosCollectionsIE)) + else super(TwitchVideosIE, cls).suitable(url)) + + @staticmethod + def _make_variables(channel_name, broadcast_type, sort): + return { + 'channelOwnerLogin': channel_name, + 'broadcastType': broadcast_type, + 'videoSort': sort.upper(), + } + + @staticmethod + def _extract_entry(node): + return _make_video_result(node) + + def _real_extract(self, url): + channel_name = self._match_id(url) + qs = parse_qs(url) + filter = qs.get('filter', 
['all'])[0] + sort = qs.get('sort', ['time'])[0] + broadcast = self._BROADCASTS.get(filter, self._DEFAULT_BROADCAST) + return self.playlist_result( + self._entries(channel_name, broadcast.type, sort), + playlist_id=channel_name, + playlist_title='%s - %s sorted by %s' + % (channel_name, broadcast.label, + self._SORTED_BY.get(sort, self._DEFAULT_SORTED_BY))) + + +class TwitchVideosClipsIE(TwitchPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:clips|videos/*?\?.*?\bfilter=clips)' + + _TESTS = [{ + # Clips + 'url': 'https://www.twitch.tv/vanillatv/clips?filter=clips&range=all', + 'info_dict': { + 'id': 'vanillatv', + 'title': 'vanillatv - Clips Top All', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.twitch.tv/dota2ruhub/videos?filter=clips&range=7d', + 'only_matching': True, + }] + + Clip = collections.namedtuple('Clip', ['filter', 'label']) + + _DEFAULT_CLIP = Clip('LAST_WEEK', 'Top 7D') + _RANGE = { + '24hr': Clip('LAST_DAY', 'Top 24H'), + '7d': _DEFAULT_CLIP, + '30d': Clip('LAST_MONTH', 'Top 30D'), + 'all': Clip('ALL_TIME', 'Top All'), + } + + # NB: values other than 20 result in skipped videos + _PAGE_LIMIT = 20 + + _OPERATION_NAME = 'ClipsCards__User' + _ENTRY_KIND = 'clip' + _EDGE_KIND = 'ClipEdge' + _NODE_KIND = 'Clip' + + @staticmethod + def _make_variables(channel_name, filter): + return { + 'login': channel_name, + 'criteria': { + 'filter': filter, + }, + } + + @staticmethod + def _extract_entry(node): + assert isinstance(node, dict) + clip_url = url_or_none(node.get('url')) + if not clip_url: + return + return { + '_type': 'url_transparent', + 'ie_key': TwitchClipsIE.ie_key(), + 'id': node.get('id'), + 'url': clip_url, + 'title': node.get('title'), + 'thumbnail': node.get('thumbnailURL'), + 'duration': float_or_none(node.get('durationSeconds')), + 'timestamp': unified_timestamp(node.get('createdAt')), + 'view_count': int_or_none(node.get('viewCount')), + 'language': node.get('language'), + } + + def _real_extract(self, url): + channel_name = self._match_id(url) + qs = parse_qs(url) + range = qs.get('range', ['7d'])[0] + clip = self._RANGE.get(range, self._DEFAULT_CLIP) + return self.playlist_result( + self._entries(channel_name, clip.filter), + playlist_id=channel_name, + playlist_title='%s - Clips %s' % (channel_name, clip.label)) + + +class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/videos/*?\?.*?\bfilter=collections' + + _TESTS = [{ + # Collections + 'url': 'https://www.twitch.tv/spamfish/videos?filter=collections', + 'info_dict': { + 'id': 'spamfish', + 'title': 'spamfish - Collections', + }, + 'playlist_mincount': 3, + }] + + _OPERATION_NAME = 'ChannelCollectionsContent' + _ENTRY_KIND = 'collection' + _EDGE_KIND = 'CollectionsItemEdge' + _NODE_KIND = 'Collection' + + @staticmethod + def _make_variables(channel_name): + return { + 'ownerLogin': channel_name, + } + + @staticmethod + def _extract_entry(node): + assert isinstance(node, dict) + collection_id = node.get('id') + if not collection_id: + return + return { + '_type': 'url_transparent', + 'ie_key': TwitchCollectionIE.ie_key(), + 'id': collection_id, + 'url': 'https://www.twitch.tv/collections/%s' % collection_id, + 'title': node.get('title'), + 'thumbnail': node.get('thumbnailURL'), + 'duration': float_or_none(node.get('lengthSeconds')), + 'timestamp': unified_timestamp(node.get('updatedAt')), + 'view_count': int_or_none(node.get('viewCount')), + } + + def _real_extract(self, url): + 
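+        # the ownerLogin variable is all collections need; limit/cursor pagination is appended by TwitchPlaylistBaseIE._entries()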
channel_name = self._match_id(url) + return self.playlist_result( + self._entries(channel_name), playlist_id=channel_name, + playlist_title='%s - Collections' % channel_name) + + +class TwitchStreamIE(TwitchBaseIE): + IE_NAME = 'twitch:stream' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:www|go|m)\.)?twitch\.tv/| + player\.twitch\.tv/\?.*?\bchannel= + ) + (?P<id>[^/#?]+) + ''' + + _TESTS = [{ + 'url': 'http://www.twitch.tv/shroomztv', + 'info_dict': { + 'id': '12772022048', + 'display_id': 'shroomztv', + 'ext': 'mp4', + 'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV', + 'is_live': True, + 'timestamp': 1421928037, + 'upload_date': '20150122', + 'uploader': 'ShroomzTV', + 'uploader_id': 'shroomztv', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.twitch.tv/miracle_doto#profile-0', + 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?channel=lotsofs', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/food', + 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/food', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False + if any(ie.suitable(url) for ie in ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchClipsIE)) + else super(TwitchStreamIE, cls).suitable(url)) + + def _real_extract(self, url): + channel_name = self._match_id(url).lower() + + gql = self._download_gql( + channel_name, [{ + 'operationName': 'StreamMetadata', + 'variables': {'channelLogin': channel_name}, + }, { + 'operationName': 'ComscoreStreamingQuery', + 'variables': { + 'channel': channel_name, + 'clipSlug': '', + 'isClip': False, + 'isLive': True, + 'isVodOrCollection': False, + 'vodID': '', + }, + }, { + 'operationName': 'VideoPreviewOverlay', + 'variables': {'login': channel_name}, + }], + 'Downloading stream GraphQL') + + user = gql[0]['data']['user'] + + if not user: + raise ExtractorError( + '%s does not exist' % channel_name, expected=True) + + stream = user['stream'] + + if not stream: + raise ExtractorError('%s is offline' % channel_name, expected=True) + + access_token = self._download_access_token( + channel_name, 'stream', 'channelName') + token = access_token['value'] + + stream_id = stream.get('id') or channel_name + query = { + 'allow_source': 'true', + 'allow_audio_only': 'true', + 'allow_spectre': 'true', + 'p': random.randint(1000000, 10000000), + 'player': 'twitchweb', + 'playlist_include_framerate': 'true', + 'segment_preference': '4', + 'sig': access_token['signature'].encode('utf-8'), + 'token': token.encode('utf-8'), + } + formats = self._extract_m3u8_formats( + '%s/api/channel/hls/%s.m3u8' % (self._USHER_BASE, channel_name), + stream_id, 'mp4', query=query) + self._prefer_source(formats) + + view_count = stream.get('viewers') + timestamp = unified_timestamp(stream.get('createdAt')) + + sq_user = try_get(gql, lambda x: x[1]['data']['user'], dict) or {} + uploader = sq_user.get('displayName') + description = try_get( + sq_user, lambda x: x['broadcastSettings']['title'], compat_str) + + thumbnail = url_or_none(try_get( + gql, lambda x: x[2]['data']['user']['stream']['previewImageURL'], + compat_str)) + + title = uploader or channel_name + stream_type = stream.get('type') + if stream_type in ['rerun', 'live']: + title += ' (%s)' % stream_type + + return { + 'id': stream_id, + 
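+            # NB: stream_id falls back to the channel name when the GraphQL response carries no id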
'display_id': channel_name, + 'title': self._live_title(title), + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': channel_name, + 'timestamp': timestamp, + 'view_count': view_count, + 'formats': formats, + 'is_live': stream_type == 'live', + } + + +class TwitchClipsIE(TwitchBaseIE): + IE_NAME = 'twitch:clips' + _VALID_URL = r'''(?x) + https?:// + (?: + clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| + (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + ) + (?P<id>[^/?#&]+) + ''' + + _TESTS = [{ + 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', + 'md5': '761769e1eafce0ffebfb4089cb3847cd', + 'info_dict': { + 'id': '42850523', + 'display_id': 'FaintLightGullWholeWheat', + 'ext': 'mp4', + 'title': 'EA Play 2016 Live from the Novo Theatre', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1465767393, + 'upload_date': '20160612', + 'creator': 'EA', + 'uploader': 'stereotype_', + 'uploader_id': '43566419', + }, + }, { + # multiple formats + 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', + 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', + 'only_matching': True, + }, { + 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', + 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + clip = self._download_gql( + video_id, [{ + 'operationName': 'VideoAccessToken_Clip', + 'variables': { + 'slug': video_id, + }, + }], + 'Downloading clip access token GraphQL')[0]['data']['clip'] + + if not clip: + raise ExtractorError( + 'This clip is no longer available', expected=True) + + access_query = { + 'sig': clip['playbackAccessToken']['signature'], + 'token': clip['playbackAccessToken']['value'], + } + + data = self._download_base_gql( + video_id, { + 'query': '''{ + clip(slug: "%s") { + broadcaster { + displayName + } + createdAt + curator { + displayName + id + } + durationSeconds + id + tiny: thumbnailURL(width: 86, height: 45) + small: thumbnailURL(width: 260, height: 147) + medium: thumbnailURL(width: 480, height: 272) + title + videoQualities { + frameRate + quality + sourceURL + } + viewCount + } +}''' % video_id}, 'Downloading clip GraphQL', fatal=False) + + if data: + clip = try_get(data, lambda x: x['data']['clip'], dict) or clip + + formats = [] + for option in clip.get('videoQualities', []): + if not isinstance(option, dict): + continue + source = url_or_none(option.get('sourceURL')) + if not source: + continue + formats.append({ + 'url': update_url_query(source, access_query), + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + 'fps': int_or_none(option.get('frameRate')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumbnail_id in ('tiny', 'small', 'medium'): + thumbnail_url = clip.get(thumbnail_id) + if not thumbnail_url: + continue + thumb = { + 'id': thumbnail_id, + 'url': thumbnail_url, + } + mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url) + if mobj: + thumb.update({ + 'height': int(mobj.group(2)), + 'width': int(mobj.group(1)), + }) + thumbnails.append(thumb) + + return { + 'id': clip.get('id') or video_id, + 'display_id': video_id, + 'title': clip.get('title') or video_id, + 'formats': formats, + 
'duration': int_or_none(clip.get('durationSeconds')), + 'view_count': int_or_none(clip.get('viewCount')), + 'timestamp': unified_timestamp(clip.get('createdAt')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + } diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py new file mode 100644 index 000000000..485b781ca --- /dev/null +++ b/yt_dlp/extractor/twitter.py @@ -0,0 +1,697 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, + try_get, + strip_or_none, + unified_timestamp, + update_url_query, + url_or_none, + xpath_text, +) + +from .periscope import ( + PeriscopeBaseIE, + PeriscopeIE, +) + + +class TwitterBaseIE(InfoExtractor): + _API_BASE = 'https://api.twitter.com/1.1/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' + _GUEST_TOKEN = None + + def _extract_variant_formats(self, variant, video_id): + variant_url = variant.get('url') + if not variant_url: + return [], {} + elif '.m3u8' in variant_url: + return self._extract_m3u8_formats_and_subtitles( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + else: + tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None + f = { + 'url': variant_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + } + self._search_dimensions_in_video_url(f, variant_url) + return [f], {} + + def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [], {} + vmap_data = self._download_xml(vmap_url, video_id) + formats = [] + subtitles = {} + urls = [] + for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): + video_variant.attrib['url'] = compat_urllib_parse_unquote( + video_variant.attrib['url']) + urls.append(video_variant.attrib['url']) + fmts, subs = self._extract_variant_formats( + video_variant.attrib, video_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) + if video_url not in urls: + fmts, subs = self._extract_variant_formats({'url': video_url}, video_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + return formats, subtitles + + @staticmethod + def _search_dimensions_in_video_url(a_format, video_url): + m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) + if m: + a_format.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + + def _call_api(self, path, video_id, query={}): + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + } + if not self._GUEST_TOKEN: + self._GUEST_TOKEN = self._download_json( + self._API_BASE + 'guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = self._GUEST_TOKEN + try: + return self._download_json( + self._API_BASE + path, video_id, headers=headers, query=query) + except ExtractorError as 
e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), + video_id)['errors'][0]['message'], expected=True) + raise + + +class TwitterCardIE(InfoExtractor): + IE_NAME = 'twitter:card' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + # MD5 checksums are different in different places + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.", + 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 30.033, + 'timestamp': 1422366112, + 'upload_date': '20150127', + }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', + 'md5': '7137eca597f72b9abbe61e5ae0161399', + 'info_dict': { + 'id': '623160978427936768', + 'ext': 'mp4', + 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", + 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA", + 'uploader': 'NASA', + 'uploader_id': 'NASA', + 'timestamp': 1437408129, + 'upload_date': '20150720', + }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'b6d9683dd3f48e340ded81c0e917ad46', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', + 'upload_date': '20111013', + 'uploader': 'OMG! UBUNTU!', + 'uploader_id': 'omgubuntu', + }, + 'add_ie': ['Youtube'], + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', + 'md5': '6dabeaca9e68cbb71c99c322a4b42a11', + 'info_dict': { + 'id': 'iBb2x00UVlv', + 'ext': 'mp4', + 'upload_date': '20151113', + 'uploader_id': '1189339351084113920', + 'uploader': 'ArsenalTerje', + 'title': 'Vine by ArsenalTerje', + 'timestamp': 1447451307, + }, + 'add_ie': ['Vine'], + }, { + 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', + 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. 
https://t.co/OrxcJ28Bns", + 'uploader': 'Brent Yarina', + 'uploader_id': 'BTNBrentYarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', + }, + 'skip': 'This content is no longer available.', + }, { + 'url': 'https://twitter.com/i/videos/752274308186120192', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + status_id = self._match_id(url) + return self.url_result( + 'https://twitter.com/statuses/' + status_id, + TwitterIE.ie_key(), status_id) + + +class TwitterIE(TwitterBaseIE): + IE_NAME = 'twitter' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://twitter.com/freethenipple/status/643211948184596480', + 'info_dict': { + 'id': '643211948184596480', + 'ext': 'mp4', + 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ', + 'uploader': 'FREE THE NIPPLE', + 'uploader_id': 'freethenipple', + 'duration': 12.922, + 'timestamp': 1442188653, + 'upload_date': '20150913', + 'age_limit': 18, + }, + }, { + 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', + 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', + 'info_dict': { + 'id': '657991469417025536', + 'ext': 'mp4', + 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', + 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', + 'thumbnail': r're:^https?://.*\.png', + 'uploader': 'Gifs', + 'uploader_id': 'giphz', + }, + 'expected_warnings': ['height', 'width'], + 'skip': 'Account suspended', + }, { + 'url': 'https://twitter.com/starwars/status/665052190608723968', + 'info_dict': { + 'id': '665052190608723968', + 'ext': 'mp4', + 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', + 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', + 'uploader_id': 'starwars', + 'uploader': 'Star Wars', + 'timestamp': 1447395772, + 'upload_date': '20151113', + }, + }, { + 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. 
https://t.co/OrxcJ28Bns", + 'uploader_id': 'BTNBrentYarina', + 'uploader': 'Brent Yarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', + }, + 'params': { + # The same video as https://twitter.com/i/videos/tweet/705235433198714880 + # Test case of TwitterCardIE + 'skip_download': True, + }, + }, { + 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', + 'info_dict': { + 'id': '700207533655363584', + 'ext': 'mp4', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'simon vertugo', + 'uploader_id': 'simonvertugo', + 'duration': 30.0, + 'timestamp': 1455777459, + 'upload_date': '20160218', + }, + }, { + 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', + 'md5': '89a15ed345d13b86e9a5a5e051fa308a', + 'info_dict': { + 'id': 'MIOxnrUteUd', + 'ext': 'mp4', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', + 'timestamp': 1402826626, + 'upload_date': '20140615', + }, + 'add_ie': ['Vine'], + }, { + 'url': 'https://twitter.com/captainamerica/status/719944021058060289', + 'info_dict': { + 'id': '719944021058060289', + 'ext': 'mp4', + 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', + 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI', + 'uploader_id': 'CaptainAmerica', + 'uploader': 'Captain America', + 'duration': 3.17, + 'timestamp': 1460483005, + 'upload_date': '20160412', + }, + }, { + 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', + 'info_dict': { + 'id': '1zqKVVlkqLaKB', + 'ext': 'mp4', + 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', + 'upload_date': '20160923', + 'uploader_id': '1PmKqpJdOJQoY', + 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', + 'timestamp': 1474613214, + }, + 'add_ie': ['Periscope'], + }, { + # has mp4 formats via mobile API + 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', + 'info_dict': { + 'id': '852138619213144067', + 'ext': 'mp4', + 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', + 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', + 'uploader': 'عالم الأخبار', + 'uploader_id': 'news_al3alm', + 'duration': 277.4, + 'timestamp': 1492000653, + 'upload_date': '20170412', + }, + 'skip': 'Account suspended', + }, { + 'url': 'https://twitter.com/i/web/status/910031516746514432', + 'info_dict': { + 'id': '910031516746514432', + 'ext': 'mp4', + 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. 
https://t.co/mwx01Rs4lo', + 'uploader': 'Préfet de Guadeloupe', + 'uploader_id': 'Prefet971', + 'duration': 47.48, + 'timestamp': 1505803395, + 'upload_date': '20170919', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, { + # card via api.twitter.com/1.1/videos/tweet/config + 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', + 'info_dict': { + 'id': '1001551623938805763', + 'ext': 'mp4', + 'title': 're:.*?Shep is on a roll today.*?', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09', + 'uploader': 'Lis Power', + 'uploader_id': 'LisPower1', + 'duration': 111.278, + 'timestamp': 1527623489, + 'upload_date': '20180529', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, { + 'url': 'https://twitter.com/foobar/status/1087791357756956680', + 'info_dict': { + 'id': '1087791357756956680', + 'ext': 'mp4', + 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'duration': 61.567, + 'timestamp': 1548184644, + 'upload_date': '20190122', + }, + }, { + # not available in Periscope + 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', + 'info_dict': { + 'id': '1vOGwqejwoWxB', + 'ext': 'mp4', + 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', + 'uploader': 'Vivi', + 'uploader_id': '1eVjYOLGkGrQL', + }, + 'add_ie': ['TwitterBroadcast'], + }, { + # unified card + 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', + 'info_dict': { + 'id': '1349794411333394432', + 'ext': 'mp4', + 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', + 'uploader': 'Brooklyn Nets', + 'uploader_id': 'BrooklynNets', + 'duration': 324.484, + 'timestamp': 1610651040, + 'upload_date': '20210114', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Twitch Clip Embed + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, + }, { + # promo_video_website card + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, + }, { + # video_direct_message card + 'url': 'https://twitter.com/qarev001/status/1348948114569269251', + 'only_matching': True, + }, { + # poll2choice_video card + 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', + 'only_matching': True, + }, { + # poll3choice_video card + 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', + 'only_matching': True, + }, { + # poll4choice_video card + 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', + 'only_matching': True, + }] + + def _real_extract(self, url): + twid = self._match_id(url) + status = self._call_api( + 'statuses/show/%s.json' % twid, twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', + }) + + 
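+        # tweet_mode=extended makes the v1.1 API return the untruncated tweet text in full_text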
title = description = status['full_text'].replace('\n', ' ') + # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + title = re.sub(r'\s+(https?://[^ ]+)', '', title) + user = status.get('user') or {} + uploader = user.get('name') + if uploader: + title = '%s - %s' % (uploader, title) + uploader_id = user.get('screen_name') + + tags = [] + for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): + hashtag_text = hashtag.get('text') + if not hashtag_text: + continue + tags.append(hashtag_text) + + info = { + 'id': twid, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, + 'like_count': int_or_none(status.get('favorite_count')), + 'repost_count': int_or_none(status.get('retweet_count')), + 'comment_count': int_or_none(status.get('reply_count')), + 'age_limit': 18 if status.get('possibly_sensitive') else 0, + 'tags': tags, + } + + def extract_from_video_info(media): + video_info = media.get('video_info') or {} + + formats = [] + subtitles = {} + for variant in video_info.get('variants', []): + fmts, subs = self._extract_variant_formats(variant, twid) + subtitles = self._merge_subtitles(subtitles, subs) + formats.extend(fmts) + self._sort_formats(formats) + + thumbnails = [] + media_url = media.get('media_url_https') or media.get('media_url') + if media_url: + def add_thumbnail(name, size): + thumbnails.append({ + 'id': name, + 'url': update_url_query(media_url, {'name': name}), + 'width': int_or_none(size.get('w') or size.get('width')), + 'height': int_or_none(size.get('h') or size.get('height')), + }) + for name, size in media.get('sizes', {}).items(): + add_thumbnail(name, size) + add_thumbnail('orig', media.get('original_info') or {}) + + info.update({ + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'duration': float_or_none(video_info.get('duration_millis'), 1000), + }) + + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + extract_from_video_info(media) + else: + card = status.get('card') + if card: + binding_values = card['binding_values'] + + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) + + card_name = card['name'].split(':')[-1] + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) + elif card_name == 'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + extract_from_video_info(next(iter(media_entities.values()))) + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... 
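+                # (each of the above exposes an amplify-style vmap or player stream URL, so they share the generic branch below)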
+ else: + is_amplify = card_name == 'amplify' + vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') + content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) + formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) + self._sort_formats(formats) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + }) + else: + expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) + if not expanded_url: + raise ExtractorError("There's no video in this tweet.") + info.update({ + '_type': 'url', + 'url': expanded_url, + }) + return info + + +class TwitterAmplifyIE(TwitterBaseIE): + IE_NAME = 'twitter:amplify' + _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' + + _TEST = { + 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', + 'md5': '7df102d0b9fd7066b86f3159f8e81bf6', + 'info_dict': { + 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', + 'ext': 'mp4', + 'title': 'Twitter Video', + 'thumbnail': 're:^https?://.*', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + vmap_url = self._html_search_meta( + 'twitter:amplify:vmap', webpage, 'vmap url') + formats, _ = self._extract_formats_from_vmap_url(vmap_url, video_id) + + thumbnails = [] + thumbnail = self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail', fatal=False) + + def _find_dimension(target): + w = int_or_none(self._html_search_meta( + 'twitter:%s:width' % target, webpage, fatal=False)) + h = int_or_none(self._html_search_meta( + 'twitter:%s:height' % target, webpage, fatal=False)) + return w, h + + if thumbnail: + thumbnail_w, thumbnail_h = _find_dimension('image') + thumbnails.append({ + 'url': thumbnail, + 'width': thumbnail_w, + 'height': thumbnail_h, + }) + + video_w, video_h = _find_dimension('player') + formats[0].update({ + 'width': video_w, + 'height': video_h, + }) + + return { + 'id': video_id, + 'title': 'Twitter Video', + 'formats': formats, + 'thumbnails': thumbnails, + } + + +class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): + IE_NAME = 'twitter:broadcast' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})' + + _TEST = { + # untitled Periscope video + 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', + 'info_dict': { + 'id': '1yNGaQLWpejGj', + 'ext': 'mp4', + 'title': 'Andrea May Sahouri - Periscope Broadcast', + 'uploader': 'Andrea May Sahouri', + 'uploader_id': '1PXEdBZWpGwKe', + }, + } + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + broadcast = self._call_api( + 'broadcasts/show.json', broadcast_id, + {'ids': broadcast_id})['broadcasts'][broadcast_id] + info = self._parse_broadcast_data(broadcast, broadcast_id) + media_key = broadcast['media_key'] + source = self._call_api( + 'live_video_stream/status/' + media_key, media_key)['source'] + m3u8_url = 
source.get('noRedirectPlaybackUrl') or source['location'] + if '/live_video_stream/geoblocked/' in m3u8_url: + self.raise_geo_restricted() + m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse( + m3u8_url).query).get('type', [None])[0] + state, width, height = self._extract_common_format_info(broadcast) + info['formats'] = self._extract_pscp_m3u8_formats( + m3u8_url, broadcast_id, m3u8_id, state, width, height) + return info + + +class TwitterShortenerIE(TwitterBaseIE): + IE_NAME = 'twitter:shortener' + _VALID_URL = r'https?://t.co/(?P<id>[^?]+)|tco:(?P<eid>[^?]+)' + _BASE_URL = 'https://t.co/' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + eid, id = mobj.group('eid', 'id') + if eid: + id = eid + url = self._BASE_URL + id + new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).geturl() + __UNSAFE_LINK = "https://twitter.com/safety/unsafe_link_warning?unsafe_link=" + if new_url.startswith(__UNSAFE_LINK): + new_url = new_url.replace(__UNSAFE_LINK, "") + return self.url_result(new_url) diff --git a/yt_dlp/extractor/udemy.py b/yt_dlp/extractor/udemy.py new file mode 100644 index 000000000..25b28e98e --- /dev/null +++ b/yt_dlp/extractor/udemy.py @@ -0,0 +1,481 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_kwargs, + compat_str, + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + float_or_none, + int_or_none, + js_to_json, + sanitized_Request, + try_get, + unescapeHTML, + url_or_none, + urlencode_postdata, +) + + +class UdemyIE(InfoExtractor): + IE_NAME = 'udemy' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P<id>\d+) + ''' + _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' + _ORIGIN_URL = 'https://www.udemy.com' + _NETRC_MACHINE = 'udemy' + + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757', + 'md5': '98eda5b657e752cf945d8445e261b5c5', + 'info_dict': { + 'id': '160614', + 'ext': 'mp4', + 'title': 'Introduction and Installation', + 'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876', + 'duration': 579.29, + }, + 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, + }, { + # no url in outputs format entry + 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', + 'only_matching': True, + }, { + # only outputs rendition + 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', + 'only_matching': True, + }] + + def _extract_course_info(self, webpage, video_id): + course = self._parse_json( + unescapeHTML(self._search_regex( + r'ng-init=["\'].*\bcourse=({.+?})[;"\']', + webpage, 'course', default='{}')), + video_id, fatal=False) or {} + course_id = course.get('id') or self._search_regex( + [ + r'data-course-id=["\'](\d+)', + r'"courseId"\s*:\s*(\d+)' + ], webpage, 'course id') + return course_id, course.get('title') + + def _enroll_course(self, base_url, webpage, course_id): + def combine_url(base_url, url): + return 
compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url + + checkout_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1', + webpage, 'checkout url', group='url', default=None)) + if checkout_url: + raise ExtractorError( + 'Course %s is not free. You have to pay for it before you can download. ' + 'Use this URL to confirm purchase: %s' + % (course_id, combine_url(base_url, checkout_url)), + expected=True) + + enroll_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', + webpage, 'enroll url', group='url', default=None)) + if enroll_url: + webpage = self._download_webpage( + combine_url(base_url, enroll_url), + course_id, 'Enrolling in the course', + headers={'Referer': base_url}) + if '>You have enrolled in' in webpage: + self.to_screen('%s: Successfully enrolled in the course' % course_id) + + def _download_lecture(self, course_id, lecture_id): + return self._download_json( + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' + % (course_id, lecture_id), + lecture_id, 'Downloading lecture JSON', query={ + 'fields[lecture]': 'title,description,view_html,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', + }) + + def _handle_error(self, response): + if not isinstance(response, dict): + return + error = response.get('error') + if error: + error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message')) + error_data = error.get('data') + if error_data: + error_str += ' - %s' % error_data.get('formErrors') + raise ExtractorError(error_str, expected=True) + + def _download_webpage_handle(self, *args, **kwargs): + headers = kwargs.get('headers', {}).copy() + headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' + kwargs['headers'] = headers + ret = super(UdemyIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + if not ret: + return ret + webpage, _ = ret + if any(p in webpage for p in ( + '>Please verify you are a human', + 'Access to this page has been denied because we believe you are using automation tools to browse the website', + '"_pxCaptcha"')): + raise ExtractorError( + 'Udemy asks you to solve a CAPTCHA. 
Login with browser, ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'yt-dlp with --cookies.', expected=True) + return ret + + def _download_json(self, url_or_request, *args, **kwargs): + headers = { + 'X-Udemy-Snail-Case': 'true', + 'X-Requested-With': 'XMLHttpRequest', + } + for cookie in self._downloader.cookiejar: + if cookie.name == 'client_id': + headers['X-Udemy-Client-Id'] = cookie.value + elif cookie.name == 'access_token': + headers['X-Udemy-Bearer-Token'] = cookie.value + headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value + + if isinstance(url_or_request, compat_urllib_request.Request): + for header, value in headers.items(): + url_or_request.add_header(header, value) + else: + url_or_request = sanitized_Request(url_or_request, headers=headers) + + response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) + self._handle_error(response) + return response + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_popup = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'href=["\'](?:https://www\.udemy\.com)?/user/logout/', + r'>Logout<')) + + # already logged in + if is_logged(login_popup): + return + + login_form = self._form_hidden_inputs('login-form', login_popup) + + login_form.update({ + 'email': username, + 'password': password, + }) + + response = self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._ORIGIN_URL, + 'Origin': self._ORIGIN_URL, + }) + + if not is_logged(response): + error = self._html_search_regex( + r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + course_id, _ = self._extract_course_info(webpage, lecture_id) + + try: + lecture = self._download_lecture(course_id, lecture_id) + except ExtractorError as e: + # Error could possibly mean we are not enrolled in the course + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._enroll_course(url, webpage, course_id) + lecture = self._download_lecture(course_id, lecture_id) + else: + raise + + title = lecture['title'] + description = lecture.get('description') + + asset = lecture['asset'] + + asset_type = asset.get('asset_type') or asset.get('assetType') + if asset_type != 'Video': + raise ExtractorError( + 'Lecture %s is not a video' % lecture_id, expected=True) + + stream_url = asset.get('stream_url') or asset.get('streamUrl') + if stream_url: + youtube_url = self._search_regex( + r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + + video_id = compat_str(asset['id']) + thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') + duration = float_or_none(asset.get('data', {}).get('duration')) + + subtitles = {} + automatic_captions = {} + + formats = [] + + def extract_output_format(src, f_id): + return { + 'url': src.get('url'), + 'format_id': '%sp' % (src.get('height') or f_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), 
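+                # the remaining keys are optional stream metadata from Udemy's outputs data; int_or_none yields None when absent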
+ 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), + } + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + def add_output_format_meta(f, key): + output = outputs.get(key) + if isinstance(output, dict): + output_format = extract_output_format(output, key) + output_format.update(f) + return output_format + return f + + def extract_formats(source_list): + if not isinstance(source_list, list): + return + for source in source_list: + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: + continue + if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + format_id = source.get('label') + f = { + 'url': video_url, + 'format_id': '%sp' % format_id, + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + f = add_output_format_meta(f, format_id) + formats.append(f) + + def extract_subtitles(track_list): + if not isinstance(track_list, list): + return + for track in track_list: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + src = url_or_none(track.get('src')) + if not src: + continue + lang = track.get('language') or track.get( + 'srclang') or track.get('label') + sub_dict = automatic_captions if track.get( + 'autogenerated') is True else subtitles + sub_dict.setdefault(lang, []).append({ + 'url': src, + }) + + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) + + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = url_or_none(cc.get('url')) + if not cc_url: + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) + + view_html = lecture.get('view_html') + if view_html: + view_html_urls = set() + for source in re.findall(r'<source[^>]+>', view_html): + attributes = extract_attributes(source) + src = attributes.get('src') + if not src: + continue + res = attributes.get('data-res') + height = int_or_none(res) + if src in view_html_urls: + continue + view_html_urls.add(src) + if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url']) + if m: + if not f.get('height'): + f['height'] = int(m.group('height')) + if not f.get('tbr'): + f['tbr'] = int(m.group('tbr')) + formats.extend(m3u8_formats) + else: + formats.append(add_output_format_meta({ + 'url': src, + 'format_id': '%dp' % height if height else None, + 'height': height, + }, res)) + + # react 
rendition since 2017.04.15 (see + # https://github.com/ytdl-org/youtube-dl/issues/12744) + data = self._parse_json( + self._search_regex( + r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html, + 'setup data', default='{}', group='data'), video_id, + transform_source=unescapeHTML, fatal=False) + if data and isinstance(data, dict): + extract_formats(data.get('sources')) + if not duration: + duration = int_or_none(data.get('duration')) + extract_subtitles(data.get('tracks')) + + if not subtitles and not automatic_captions: + text_tracks = self._parse_json( + self._search_regex( + r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html, + 'text tracks', default='{}', group='data'), video_id, + transform_source=lambda s: js_to_json(unescapeHTML(s)), + fatal=False) + extract_subtitles(text_tracks) + + if not formats and outputs: + for format_id, output in outputs.items(): + f = extract_output_format(output, format_id) + if f.get('url'): + formats.append(f) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class UdemyCourseIE(UdemyIE): + IE_NAME = 'udemy:course' + _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_path = self._match_id(url) + + webpage = self._download_webpage(url, course_path) + + course_id, title = self._extract_course_info(webpage, course_path) + + self._enroll_course(url, webpage, course_id) + + response = self._download_json( + 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, + course_id, 'Downloading course curriculum', query={ + 'fields[chapter]': 'title,object_index', + 'fields[lecture]': 'title,asset', + 'page_size': '1000', + }) + + entries = [] + chapter, chapter_number = [None] * 2 + for entry in response['results']: + clazz = entry.get('_class') + if clazz == 'lecture': + asset = entry.get('asset') + if isinstance(asset, dict): + asset_type = asset.get('asset_type') or asset.get('assetType') + if asset_type != 'Video': + continue + lecture_id = entry.get('id') + if lecture_id: + entry = { + '_type': 'url_transparent', + 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), + 'title': entry.get('title'), + 'ie_key': UdemyIE.ie_key(), + } + if chapter_number: + entry['chapter_number'] = chapter_number + if chapter: + entry['chapter'] = chapter + entries.append(entry) + elif clazz == 'chapter': + chapter_number = entry.get('object_index') + chapter = entry.get('title') + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/udn.py b/yt_dlp/extractor/udn.py index 2c8e5c7b4..2c8e5c7b4 100644 --- a/youtube_dl/extractor/udn.py +++ b/yt_dlp/extractor/udn.py diff --git a/youtube_dl/extractor/ufctv.py b/yt_dlp/extractor/ufctv.py index 3d74ba071..3d74ba071 100644 --- a/youtube_dl/extractor/ufctv.py +++ b/yt_dlp/extractor/ufctv.py diff --git a/yt_dlp/extractor/ukcolumn.py b/yt_dlp/extractor/ukcolumn.py new file mode 100644 index 000000000..d2626f0d3 --- /dev/null +++ b/yt_dlp/extractor/ukcolumn.py @@ -0,0 
+1,72 @@
+from __future__ import unicode_literals
+
+from ..utils import (
+    unescapeHTML,
+    urljoin,
+    ExtractorError,
+)
+from .common import InfoExtractor
+from .vimeo import VimeoIE
+from .youtube import YoutubeIE
+
+
+class UkColumnIE(InfoExtractor):
+    IE_NAME = 'ukcolumn'
+    _VALID_URL = r'(?i)https?://(?:www\.)?ukcolumn\.org(/index\.php)?/(?:video|ukcolumn-news)/(?P<id>[-a-z0-9]+)'
+
+    _TESTS = [{
+        'url': 'https://www.ukcolumn.org/ukcolumn-news/uk-column-news-28th-april-2021',
+        'info_dict': {
+            'id': '541632443',
+            'ext': 'mp4',
+            'title': 'UK Column News - 28th April 2021',
+            'uploader_id': 'ukcolumn',
+            'uploader': 'UK Column',
+        },
+        'add_ie': [VimeoIE.ie_key()],
+        'expected_warnings': ['Unable to download JSON metadata'],
+        'params': {
+            'skip_download': 'Handled by Vimeo',
+        },
+    }, {
+        'url': 'https://www.ukcolumn.org/video/insight-eu-military-unification',
+        'info_dict': {
+            'id': 'Fzbnb9t7XAw',
+            'ext': 'mp4',
+            'title': 'Insight: EU Military Unification',
+            'uploader_id': 'ukcolumn',
+            'description': 'md5:29a207965271af89baa0bc191f5de576',
+            'uploader': 'UK Column',
+            'upload_date': '20170514',
+        },
+        'add_ie': [YoutubeIE.ie_key()],
+        'params': {
+            'skip_download': 'Handled by Youtube',
+        },
+    }, {
+        'url': 'https://www.ukcolumn.org/index.php/ukcolumn-news/uk-column-news-30th-april-2021',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        oembed_url = urljoin(url, unescapeHTML(self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>/media/oembed\?url=.+?)\1',
+            webpage, 'OEmbed URL', group='url')))
+        oembed_webpage = self._download_webpage(
+            oembed_url, display_id, note='Downloading OEmbed page')
+
+        ie, video_url = YoutubeIE, YoutubeIE._extract_url(oembed_webpage)
+        if not video_url:
+            ie, video_url = VimeoIE, VimeoIE._extract_url(url, oembed_webpage)
+        if not video_url:
+            raise ExtractorError('No embedded video found')
+
+        return {
+            '_type': 'url_transparent',
+            'title': self._og_search_title(webpage),
+            'url': video_url,
+            'ie_key': ie.ie_key(),
+        }
diff --git a/youtube_dl/extractor/uktvplay.py b/yt_dlp/extractor/uktvplay.py
index f28fd514d..f28fd514d 100644
--- a/youtube_dl/extractor/uktvplay.py
+++ b/yt_dlp/extractor/uktvplay.py
diff --git a/yt_dlp/extractor/umg.py b/yt_dlp/extractor/umg.py
new file mode 100644
index 000000000..c1b65d189
--- /dev/null
+++ b/yt_dlp/extractor/umg.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+    parse_iso8601,
+)
+
+
+class UMGDeIE(InfoExtractor):
+    IE_NAME = 'umg:de'
+    IE_DESC = 'Universal Music Deutschland'
+    _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803',
+        'md5': 'ebd90f48c80dcc82f77251eb1902634f',
+        'info_dict': {
+            'id': '457803',
+            'ext': 'mp4',
+            'title': 'Jedes Wort ist Gold wert',
+            'timestamp': 1513591800,
+            'upload_date': '20171218',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://graphql.universal-music.de/',
+            video_id, query={
+                'query': '''{
+  universalMusic(channel:16) {
+    video(id:%s) {
+      headline
+      formats {
+        formatId
+        url
+        type
+        width
+        height
+        mimeType
+        fileSize
+      }
+      duration
+      createdDate
+    }
+  }
+}''' % video_id})['data']['universalMusic']['video']
+        title = video_data['headline']
+        hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8'
+
+        thumbnails = []
+        formats = []
+
+        def add_m3u8_format(format_id):
+            formats.extend(self._extract_m3u8_formats(
+                hls_url_template % format_id, video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+
+        for f in video_data.get('formats', []):
+            f_url = f.get('url')
+            mime_type = f.get('mimeType')
+            if not f_url or mime_type == 'application/mxf':
+                continue
+            fmt = {
+                'url': f_url,
+                'width': int_or_none(f.get('width')),
+                'height': int_or_none(f.get('height')),
+                'filesize': parse_filesize(f.get('fileSize')),
+            }
+            f_type = f.get('type')
+            if f_type == 'Image':
+                thumbnails.append(fmt)
+            elif f_type == 'Video':
+                format_id = f.get('formatId')
+                if format_id:
+                    fmt['format_id'] = format_id
+                    if mime_type == 'video/mp4':
+                        add_m3u8_format(format_id)
+                urlh = self._request_webpage(f_url, video_id, fatal=False)
+                if urlh:
+                    first_byte = urlh.read(1)
+                    if first_byte not in (b'F', b'\x00'):
+                        continue
+                formats.append(fmt)
+        if not formats:
+            for format_id in (867, 836, 940):
+                add_m3u8_format(format_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': int_or_none(video_data.get('duration')),
+            'timestamp': parse_iso8601(video_data.get('createdDate'), ' '),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/yt_dlp/extractor/unistra.py b/yt_dlp/extractor/unistra.py
new file mode 100644
index 000000000..685d74f35
--- /dev/null
+++ b/yt_dlp/extractor/unistra.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import qualities
+
+
+class UnistraIE(InfoExtractor):
+    _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)'
+
+    _TESTS = [
+        {
+            'url': 'http://utv.unistra.fr/video.php?id_video=154',
+            'md5': '736f605cfdc96724d55bb543ab3ced24',
+            'info_dict': {
+                'id': '154',
+                'ext': 'mp4',
+                'title': 'M!ss Yella',
+                'description': 'md5:104892c71bd48e55d70b902736b81bbf',
+            },
+        },
+        {
+            'url': 'http://utv.unistra.fr/index.php?id_video=437',
+            'md5': '1ddddd6cccaae76f622ce29b8779636d',
+            'info_dict': {
+                'id': '437',
+                'ext': 'mp4',
+                'title': 'Prix Louise Weiss 2014',
+                'description': 'md5:cc3a8735f079f4fb6b0b570fc10c135a',
+            },
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        files = set(re.findall(r'file\s*:\s*"(/[^"]+)"', webpage))
+
+        quality = qualities(['SD', 'HD'])
+        formats = []
+        for file_path in files:
+            format_id = 'HD' if file_path.endswith('-HD.mp4') else 'SD'
+            formats.append({
+                'url': 'http://vod-flash.u-strasbg.fr:8080%s' % file_path,
+                'format_id': format_id,
+                'quality': quality(format_id)
+            })
+        self._sort_formats(formats)
+
+        title = self._html_search_regex(
+            r'<title>UTV - (.*?)</', webpage, 'title')
+        description = self._html_search_regex(
+            r'<meta name="Description" content="(.*?)"', webpage, 'description', flags=re.DOTALL)
+        thumbnail = self._search_regex(
+            r'image: "(.*?)"', webpage, 'thumbnail')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/unity.py b/yt_dlp/extractor/unity.py
index 73daacf29..73daacf29 100644
--- a/youtube_dl/extractor/unity.py
+++ b/yt_dlp/extractor/unity.py
diff --git a/yt_dlp/extractor/uol.py b/yt_dlp/extractor/uol.py
new file mode 100644
index 000000000..4a2a97fa4
--- /dev/null
+++ b/yt_dlp/extractor/uol.py
@@ -0,0 +1,142 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    clean_html,
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    qualities,
+    update_url_query,
+)
+
+
+class UOLIE(InfoExtractor):
+    IE_NAME = 'uol.com.br'
+    _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)'
+    _TESTS = [{
+        'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931',
+        'md5': '4f1e26683979715ff64e4e29099cf020',
+        'info_dict': {
+            'id': '15951931',
+            'ext': 'mp4',
+            'title': 'Miss simpatia é encontrada morta',
+            'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2',
+            'timestamp': 1470421860,
+            'upload_date': '20160805',
+        }
+    }, {
+        'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326',
+        'md5': '2850a0e8dfa0a7307e04a96c5bdc5bc2',
+        'info_dict': {
+            'id': '15954259',
+            'ext': 'mp4',
+            'title': 'Incêndio destrói uma das maiores casas noturnas de Londres',
+            'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. Não há informações sobre vítimas.',
+            'timestamp': 1470674520,
+            'upload_date': '20160808',
+        }
+    }, {
+        'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/15954259',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_data = self._download_json(
+            # https://api.mais.uol.com.br/apiuol/v4/player/data/[MEDIA_ID]
+            'https://api.mais.uol.com.br/apiuol/v3/media/detail/' + video_id,
+            video_id)['item']
+        media_id = compat_str(video_data['mediaId'])
+        title = video_data['title']
+        ver = video_data.get('revision', 2)
+
+        uol_formats = self._download_json(
+            'https://croupier.mais.uol.com.br/v3/formats/%s/jsonp' % media_id,
+            media_id)
+        quality = qualities(['mobile', 'WEBM', '360p', '720p', '1080p'])
+        formats = []
+        for format_id, f in uol_formats.items():
+            if not isinstance(f, dict):
+                continue
+            f_url = f.get('url') or f.get('secureUrl')
+            if not f_url:
+                continue
+            query = {
+                'ver': ver,
+                'r': 'http://mais.uol.com.br',
+            }
+            for k in ('token', 'sign'):
+                v = f.get(k)
+                if v:
+                    query[k] = v
+            f_url = update_url_query(f_url, query)
+            if format_id == 'HLS':
+                m3u8_formats = self._extract_m3u8_formats(
+                    f_url, media_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False)
+                encoded_query = compat_urllib_parse_urlencode(query)
+                for m3u8_f in m3u8_formats:
+                    m3u8_f['extra_param_to_segment_url'] = encoded_query
+                    m3u8_f['url'] = update_url_query(m3u8_f['url'], query)
+                formats.extend(m3u8_formats)
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': f_url,
+                'quality': quality(format_id),
+            })
+        self._sort_formats(formats)
+
+        tags = []
+        for tag in video_data.get('tags', []):
+            tag_description = tag.get('description')
+            if not tag_description:
+                continue
+            tags.append(tag_description)
+
+        thumbnails = []
+        for q in ('Small', 'Medium', 'Wmedium', 'Large', 'Wlarge', 'Xlarge'):
+            q_url = video_data.get('thumb' + q)
+            if not q_url:
+                continue
+            thumbnails.append({
+                'id': q,
+                'url': q_url,
+            })
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': clean_html(video_data.get('description')),
+            'thumbnails': thumbnails,
+            'duration': parse_duration(video_data.get('duration')),
+            'tags': tags,
+            'formats': formats,
+            'timestamp': parse_iso8601(video_data.get('publishDate'), ' '),
+            'view_count': int_or_none(video_data.get('viewsQtty')),
+        }
diff --git a/yt_dlp/extractor/uplynk.py b/yt_dlp/extractor/uplynk.py
new file mode 100644
index 000000000..9adb96943
--- /dev/null
+++ b/yt_dlp/extractor/uplynk.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    ExtractorError,
+)
+
+
+class UplynkIE(InfoExtractor):
+    IE_NAME = 'uplynk'
+    _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?'
+    _TEST = {
+        'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8',
+        'info_dict': {
+            'id': 'e89eaf2ce9054aa89d92ddb2d817a52e',
+            'ext': 'mp4',
+            'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4',
+            'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _extract_uplynk_info(self, uplynk_content_url):
+        path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
+        display_id = video_id or external_id
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            'http://content.uplynk.com/%s.m3u8' % path,
+            display_id, 'mp4', 'm3u8_native')
+        if session_id:
+            for f in formats:
+                f['extra_param_to_segment_url'] = 'pbs=' + session_id
+        self._sort_formats(formats)
+        asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id)
+        if asset.get('error') == 1:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, asset['msg']), expected=True)
+
+        return {
+            'id': asset['asset'],
+            'title': asset['desc'],
+            'thumbnail': asset.get('default_poster_url'),
+            'duration': float_or_none(asset.get('duration')),
+            'uploader_id': asset.get('owner'),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_uplynk_info(url)
+
+
+class UplynkPreplayIE(UplynkIE):
+    IE_NAME = 'uplynk:preplay'
+    _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json'
+    _TEST = None
+
+    def _real_extract(self, url):
+        path, external_id, video_id = self._match_valid_url(url).groups()
+        display_id = video_id or external_id
+        preplay = self._download_json(url, display_id)
+        content_url = 'http://content.uplynk.com/%s.m3u8' % path
+        session_id = preplay.get('sid')
+        if session_id:
+            content_url += '?pbs=' + session_id
+        return self._extract_uplynk_info(content_url)
diff --git a/yt_dlp/extractor/urort.py b/yt_dlp/extractor/urort.py
new file mode 100644
index 000000000..020425fc7
--- /dev/null
+++ b/yt_dlp/extractor/urort.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+)
+from ..utils import (
+    unified_strdate,
+)
+
+
+class UrortIE(InfoExtractor):
+    IE_DESC = 'NRK P3 Urørt'
+    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
+
+    _TEST = {
+        'url': 'https://urort.p3.no/#!/Band/Gerilja',
+        'md5': '5ed31a924be8a05e47812678a86e127b',
+        'info_dict': {
+            'id': '33124-24',
+            'ext': 'mp3',
+            'title': 'The Bomb',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'uploader': 'Gerilja',
+            'uploader_id': 'Gerilja',
+            'upload_date': '20100323',
+        },
+        'params': {
+            'matchtitle': '^The Bomb$',  # To test, we want just one video
+        }
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
+        json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr
+        songs = self._download_json(json_url, playlist_id)
+        entries = []
+        for s in songs:
+            formats = [{
+                'tbr': f.get('Quality'),
+                'ext': f['FileType'],
+                'format_id': '%s-%s' % (f['FileType'], f.get('Quality', '')),
+                'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'],
+                'quality': 3 if f['FileType'] == 'mp3' else 2,
+            } for f in s['Files']]
+            self._sort_formats(formats)
+            e = {
+                'id': '%d-%s' % (s['BandId'], s['$id']),
+                'title': s['Title'],
+                'uploader_id': playlist_id,
+                'uploader': s.get('BandName', playlist_id),
+                'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+                'upload_date': unified_strdate(s.get('Released')),
+                'formats': formats,
+            }
+            entries.append(e)
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_id,
+            'entries': entries,
+        }
diff --git a/yt_dlp/extractor/urplay.py b/yt_dlp/extractor/urplay.py
new file mode 100644
index 000000000..753ffa49c
--- /dev/null
+++ b/yt_dlp/extractor/urplay.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    dict_get,
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class URPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand',
+        'md5': 'ff5b0c89928f8083c74bbd5099c9292d',
+        'info_dict': {
+            'id': '203704',
+            'ext': 'mp4',
+            'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
+            'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
+            'timestamp': 1513292400,
+            'upload_date': '20171214',
+            'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik',
+            'duration': 2269,
+            'categories': ['Kultur & historia'],
+            'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'],
+            'episode': 'Om vetenskap, kritiskt tänkande och motstånd',
+        },
+    }, {
+        'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
+        'info_dict': {
+            'id': '190031',
+            'ext': 'mp4',
+            'title': 'Tripp, Trapp, Träd : Sovkudde',
+            'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
+            'timestamp': 
1440086400, + 'upload_date': '20150820', + 'series': 'Tripp, Trapp, Träd', + 'duration': 865, + 'tags': ['Sova'], + 'episode': 'Sovkudde', + }, + }, { + 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url = url.replace('skola.se/Produkter', 'play.se/program') + webpage = self._download_webpage(url, video_id) + vid = int(video_id) + accessible_episodes = self._parse_json(self._html_search_regex( + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) + episode = urplayer_data['title'] + + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + formats = [] + urplayer_streams = urplayer_data.get('streamingInfo', {}) + + for k, v in urplayer_streams.get('raw', {}).items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') + if file_http: + formats.extend(self._extract_wowza_formats( + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) + self._sort_formats(formats) + + subtitles = {} + subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location") + if subs: + subtitles.setdefault('Svenska', []).append({ + 'url': subs, + }) + + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) + + return { + 'id': video_id, + 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, + 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), + } diff --git a/yt_dlp/extractor/usanetwork.py b/yt_dlp/extractor/usanetwork.py new file mode 100644 index 000000000..d953e460b --- /dev/null +++ b/yt_dlp/extractor/usanetwork.py @@ -0,0 +1,24 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .nbc import NBCIE + + +class USANetworkIE(NBCIE): + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', + 'info_dict': { + 'id': '4185302', + 'ext': 'mp4', + 'title': 'Intelligence (Trailer)', + 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', + 'upload_date': '20200715', + 'timestamp': 1594785600, + 'uploader': 'NBCU-MPAT', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] diff --git a/youtube_dl/extractor/usatoday.py b/yt_dlp/extractor/usatoday.py index 
b2103448d..b2103448d 100644 --- a/youtube_dl/extractor/usatoday.py +++ b/yt_dlp/extractor/usatoday.py diff --git a/yt_dlp/extractor/ustream.py b/yt_dlp/extractor/ustream.py new file mode 100644 index 000000000..8b758795f --- /dev/null +++ b/yt_dlp/extractor/ustream.py @@ -0,0 +1,284 @@ +from __future__ import unicode_literals + +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + encode_data_uri, + ExtractorError, + int_or_none, + float_or_none, + mimetype2ext, + str_or_none, +) + + +class UstreamIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' + IE_NAME = 'ustream' + _TESTS = [{ + 'url': 'http://www.ustream.tv/recorded/20274954', + 'md5': '088f151799e8f572f84eb62f17d73e5c', + 'info_dict': { + 'id': '20274954', + 'ext': 'flv', + 'title': 'Young Americans for Liberty February 7, 2012 2:28 AM', + 'description': 'Young Americans for Liberty February 7, 2012 2:28 AM', + 'timestamp': 1328577035, + 'upload_date': '20120207', + 'uploader': 'yaliberty', + 'uploader_id': '6780869', + }, + }, { + # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444 + # Title and uploader available only from params JSON + 'url': 'http://www.ustream.tv/embed/recorded/59307601?ub=ff0000&lc=ff0000&oc=ffffff&uc=ffffff&v=3&wmode=direct', + 'md5': '5a2abf40babeac9812ed20ae12d34e10', + 'info_dict': { + 'id': '59307601', + 'ext': 'flv', + 'title': '-CG11- Canada Games Figure Skating', + 'uploader': 'sportscanadatv', + }, + 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', + }, { + 'url': 'http://www.ustream.tv/embed/10299409', + 'info_dict': { + 'id': '10299409', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.ustream.tv/recorded/91343263', + 'info_dict': { + 'id': '91343263', + 'ext': 'mp4', + 'title': 'GitHub Universe - General Session - Day 1', + 'upload_date': '20160914', + 'description': 'GitHub Universe - General Session - Day 1', + 'timestamp': 1473872730, + 'uploader': 'wa0dnskeqkr', + 'uploader_id': '38977840', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + + def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): + def num_to_hex(n): + return hex(n)[2:] + + rnd = random.randrange + + if not extra_note: + extra_note = '' + + conn_info = self._download_json( + 'http://r%d-1-%s-recorded-lp-live.ums.ustream.tv/1/ustream' % (rnd(1e8), video_id), + video_id, note='Downloading connection info' + extra_note, + query={ + 'type': 'viewer', + 'appId': app_id_ver[0], + 'appVersion': app_id_ver[1], + 'rsid': '%s:%s' % (num_to_hex(rnd(1e8)), num_to_hex(rnd(1e8))), + 'rpin': '_rpin.%d' % rnd(1e15), + 'referrer': url, + 'media': video_id, + 'application': 'recorded', + }) + host = conn_info[0]['args'][0]['host'] + connection_id = conn_info[0]['args'][0]['connectionId'] + + return self._download_json( + 'http://%s/1/ustream?connectionId=%s' % (host, connection_id), + video_id, note='Downloading stream info' + extra_note) + + def _get_streams(self, url, video_id, app_id_ver): + # 
Sometimes the return dict does not have 'stream'
+        for trial_count in range(3):
+            stream_info = self._get_stream_info(
+                url, video_id, app_id_ver,
+                extra_note=' (try %d)' % (trial_count + 1) if trial_count > 0 else '')
+            if 'stream' in stream_info[0]['args'][0]:
+                return stream_info[0]['args'][0]['stream']
+        return []
+
+    def _parse_segmented_mp4(self, dash_stream_info):
+        def resolve_dash_template(template, idx, chunk_hash):
+            return template.replace('%', compat_str(idx), 1).replace('%', chunk_hash)
+
+        formats = []
+        for stream in dash_stream_info['streams']:
+            # Use only one provider to avoid too many formats
+            provider = dash_stream_info['providers'][0]
+            fragments = [{
+                'url': resolve_dash_template(
+                    provider['url'] + stream['initUrl'], 0, dash_stream_info['hashes']['0'])
+            }]
+            for idx in range(dash_stream_info['videoLength'] // dash_stream_info['chunkTime']):
+                fragments.append({
+                    'url': resolve_dash_template(
+                        provider['url'] + stream['segmentUrl'], idx,
+                        dash_stream_info['hashes'][compat_str(idx // 10 * 10)])
+                })
+            content_type = stream['contentType']
+            kind = content_type.split('/')[0]
+            f = {
+                'format_id': '-'.join(filter(None, [
+                    'dash', kind, str_or_none(stream.get('bitrate'))])),
+                'protocol': 'http_dash_segments',
+                # TODO: generate a MPD doc for external players?
+                'url': encode_data_uri(b'<MPD/>', 'text/xml'),
+                'ext': mimetype2ext(content_type),
+                'height': stream.get('height'),
+                'width': stream.get('width'),
+                'fragments': fragments,
+            }
+            if kind == 'video':
+                f.update({
+                    'vcodec': stream.get('codec'),
+                    'acodec': 'none',
+                    'vbr': stream.get('bitrate'),
+                })
+            else:
+                f.update({
+                    'vcodec': 'none',
+                    'acodec': stream.get('codec'),
+                    'abr': stream.get('bitrate'),
+                })
+            formats.append(f)
+        return formats
+
+    def _real_extract(self, url):
+        m = self._match_valid_url(url)
+        video_id = m.group('id')
+
+        # some sites use this embed format (see: https://github.com/ytdl-org/youtube-dl/issues/2990)
+        if m.group('type') == 'embed/recorded':
+            video_id = m.group('id')
+            desktop_url = 'http://www.ustream.tv/recorded/' + video_id
+            return self.url_result(desktop_url, 'Ustream')
+        if m.group('type') == 'embed':
+            video_id = m.group('id')
+            webpage = self._download_webpage(url, video_id)
+            content_video_ids = self._parse_json(self._search_regex(
+                r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage,
+                'content video IDs'), video_id)
+            return self.playlist_result(
+                map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids),
+                video_id)
+
+        params = self._download_json(
+            'https://api.ustream.tv/videos/%s.json' % video_id, video_id)
+
+        error = params.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+        video = params['video']
+
+        title = video['title']
+        filesize = float_or_none(video.get('file_size'))
+
+        formats = [{
+            'id': video_id,
+            'url': video_url,
+            'ext': format_id,
+            'filesize': filesize,
+        } for format_id, video_url in video['media_urls'].items() if video_url]
+
+        if not formats:
+            hls_streams = self._get_streams(url, video_id, app_id_ver=(11, 2))
+            if hls_streams:
+                # m3u8_native leads to intermittent ContentTooShortError
+                formats.extend(self._extract_m3u8_formats(
+                    hls_streams[0]['url'], video_id, ext='mp4', m3u8_id='hls'))
+
+        '''
+        # DASH streams handling is incomplete as 'url' is missing
+        dash_streams = self._get_streams(url, video_id, app_id_ver=(3, 1))
+        if dash_streams:
+            formats.extend(self._parse_segmented_mp4(dash_streams))
+        '''
+
+        
self._sort_formats(formats) + + description = video.get('description') + timestamp = int_or_none(video.get('created_at')) + duration = float_or_none(video.get('length')) + view_count = int_or_none(video.get('views')) + + uploader = video.get('owner', {}).get('username') + uploader_id = video.get('owner', {}).get('id') + + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail_url, + } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + } + + +class UstreamChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/channel/(?P<slug>.+)' + IE_NAME = 'ustream:channel' + _TEST = { + 'url': 'http://www.ustream.tv/channel/channeljapan', + 'info_dict': { + 'id': '10874166', + }, + 'playlist_mincount': 17, + } + + def _real_extract(self, url): + m = self._match_valid_url(url) + display_id = m.group('slug') + webpage = self._download_webpage(url, display_id) + channel_id = self._html_search_meta('ustream:channel_id', webpage) + + BASE = 'http://www.ustream.tv' + next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id + video_ids = [] + while next_url: + reply = self._download_json( + compat_urlparse.urljoin(BASE, next_url), display_id, + note='Downloading video information (next: %d)' % (len(video_ids) + 1)) + video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data'])) + next_url = reply['nextUrl'] + + entries = [ + self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream') + for vid in video_ids] + return { + '_type': 'playlist', + 'id': channel_id, + 'display_id': display_id, + 'entries': entries, + } diff --git a/yt_dlp/extractor/ustudio.py b/yt_dlp/extractor/ustudio.py new file mode 100644 index 000000000..92509d1bf --- /dev/null +++ b/yt_dlp/extractor/ustudio.py @@ -0,0 +1,124 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + unescapeHTML, +) + + +class UstudioIE(InfoExtractor): + IE_NAME = 'ustudio' + _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', + 'md5': '58bbfca62125378742df01fc2abbdef6', + 'info_dict': { + 'id': 'Uxu2my9bgSph', + 'display_id': 'san_francisco_golden_gate_bridge', + 'ext': 'mp4', + 'title': 'San Francisco: Golden Gate Bridge', + 'description': 'md5:23925500697f2c6d4830e387ba51a9be', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20111107', + 'uploader': 'Tony Farley', + } + } + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + + config = self._download_xml( + 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, + display_id) + + def extract(kind): + return [{ + 'url': unescapeHTML(item.attrib['url']), + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] + + formats = extract('video') + self._sort_formats(formats) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + upload_date = unified_strdate(self._search_regex( + r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>', + webpage, 'upload 
date', fatal=False)) + uploader = self._search_regex( + r'Uploaded by\s*<a[^>]*>([^<]+)<', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnails': extract('image'), + 'upload_date': upload_date, + 'uploader': uploader, + 'formats': formats, + } + + +class UstudioEmbedIE(InfoExtractor): + IE_NAME = 'ustudio:embed' + _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', + 'md5': '47c0be52a09b23a7f40de9469cec58f4', + 'info_dict': { + 'id': 'Uw7G1kMCe65T', + 'ext': 'mp4', + 'title': '5 Things IT Should Know About Video', + 'description': 'md5:93d32650884b500115e158c5677d25ad', + 'uploader_id': 'DeN7VdYRDKhP', + } + } + + def _real_extract(self, url): + uploader_id, video_id = self._match_valid_url(url).groups() + video_data = self._download_json( + 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), + video_id)['videos'][0] + title = video_data['name'] + + formats = [] + for ext, qualities in video_data.get('transcodes', {}).items(): + for quality in qualities: + quality_url = quality.get('url') + if not quality_url: + continue + height = int_or_none(quality.get('height')) + formats.append({ + 'format_id': '%s-%dp' % (ext, height) if height else ext, + 'url': quality_url, + 'width': int_or_none(quality.get('width')), + 'height': height, + }) + self._sort_formats(formats) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': uploader_id, + 'tags': video_data.get('keywords'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/yt_dlp/extractor/utreon.py b/yt_dlp/extractor/utreon.py new file mode 100644 index 000000000..4a25f0c55 --- /dev/null +++ b/yt_dlp/extractor/utreon.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + dict_get, + int_or_none, + str_or_none, + try_get, + unified_strdate, + url_or_none, +) + + +class UtreonIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?utreon.com/v/(?P<id>[a-zA-Z0-9_-]+)' + _TESTS = [{ + 'url': 'https://utreon.com/v/z_I7ikQbuDw', + 'info_dict': { + 'id': 'z_I7ikQbuDw', + 'ext': 'mp4', + 'title': 'Freedom Friday meditation - Rising in the wind', + 'description': 'md5:a9bf15a42434a062fe313b938343ad1b', + 'uploader': 'Heather Dawn Elemental Health', + 'thumbnail': 'https://data-1.utreon.com/v/MG/M2/NT/z_I7ikQbuDw/z_I7ikQbuDw_preview.jpg', + 'release_date': '20210723', + } + }, { + 'url': 'https://utreon.com/v/jerJw5EOOVU', + 'info_dict': { + 'id': 'jerJw5EOOVU', + 'ext': 'mp4', + 'title': 'When I\'m alone, I love to reflect in peace, to make my dreams come true... 
[Quotes and Poems]', + 'description': 'md5:61ee6c2da98be51b04b969ca80273aaa', + 'uploader': 'Frases e Poemas Quotes and Poems', + 'thumbnail': 'https://data-1.utreon.com/v/Mz/Zh/ND/jerJw5EOOVU/jerJw5EOOVU_89af85470a4b16eededde7f8674c96d9_cover.jpg', + 'release_date': '20210723', + } + }, { + 'url': 'https://utreon.com/v/C4ZxXhYBBmE', + 'info_dict': { + 'id': 'C4ZxXhYBBmE', + 'ext': 'mp4', + 'title': 'Biden’s Capital Gains Tax Rate to Test World’s Highest', + 'description': 'md5:fb5a6c2e506f013cc76f133f673bc5c8', + 'uploader': 'Nomad Capitalist', + 'thumbnail': 'https://data-1.utreon.com/v/ZD/k1/Mj/C4ZxXhYBBmE/C4ZxXhYBBmE_628342076198c9c06dd6b2c665978584_cover.jpg', + 'release_date': '20210723', + } + }, { + 'url': 'https://utreon.com/v/Y-stEH-FBm8', + 'info_dict': { + 'id': 'Y-stEH-FBm8', + 'ext': 'mp4', + 'title': 'Creeper-Chan Pranks Steve! 💚 [MINECRAFT ANIME]', + 'description': 'md5:7a48450b0d761b96dec194be0c5ecb5f', + 'uploader': 'Merryweather Comics', + 'thumbnail': 'https://data-1.utreon.com/v/MT/E4/Zj/Y-stEH-FBm8/Y-stEH-FBm8_5290676a41a4a1096db133b09f54f77b_cover.jpg', + 'release_date': '20210718', + }}, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + 'https://api.utreon.com/v1/videos/' + video_id, + video_id) + videos_json = json_data['videos'] + formats = [{ + 'url': format_url, + 'format_id': format_key.split('_')[1], + 'height': int(format_key.split('_')[1][:-1]), + } for format_key, format_url in videos_json.items() if url_or_none(format_url)] + self._sort_formats(formats) + thumbnail = url_or_none(dict_get(json_data, ('cover_image_url', 'preview_image_url'))) + return { + 'id': video_id, + 'title': json_data['title'], + 'formats': formats, + 'description': str_or_none(json_data.get('description')), + 'duration': int_or_none(json_data.get('duration')), + 'uploader': str_or_none(try_get(json_data, lambda x: x['channel']['title'])), + 'thumbnail': thumbnail, + 'release_date': unified_strdate(json_data.get('published_datetime')), + } diff --git a/yt_dlp/extractor/varzesh3.py b/yt_dlp/extractor/varzesh3.py new file mode 100644 index 000000000..81313dc9d --- /dev/null +++ b/yt_dlp/extractor/varzesh3.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + parse_qs, + remove_start, +) + + +class Varzesh3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' 
+    _TESTS = [{
+        'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/',
+        'md5': '2a933874cb7dce4366075281eb49e855',
+        'info_dict': {
+            'id': '76337',
+            'ext': 'mp4',
+            'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا',
+            'description': 'فصل ۲۰۱۵-۲۰۱۴',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'skip': 'HTTP 404 Error',
+    }, {
+        'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87',
+        'md5': '841b7cd3afbc76e61708d94e53a4a4e7',
+        'info_dict': {
+            'id': '112785',
+            'ext': 'mp4',
+            'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره',
+            'description': 'فوتبال 120',
+        },
+        'expected_warnings': ['description'],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._search_regex(
+            r'<source[^>]+src="([^"]+)"', webpage, 'video url')
+
+        title = remove_start(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
+
+        description = self._html_search_regex(
+            r'(?s)<div class="matn">(.+?)</div>',
+            webpage, 'description', default=None)
+        if description is None:
+            description = clean_html(self._html_search_meta('description', webpage))
+
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            fb_sharer_url = self._search_regex(
+                r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"',
+                webpage, 'facebook sharer URL', fatal=False)
+            sharer_params = parse_qs(fb_sharer_url or '')
+            thumbnail = sharer_params.get('p[images][0]', [None])[0]
+
+        video_id = self._search_regex(
+            r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'",
+            webpage, display_id, default=None)
+        if video_id is None:
+            video_id = self._search_regex(
+                r'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id',
+                default=display_id)
+
+        return {
+            'url': video_url,
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/vbox7.py b/yt_dlp/extractor/vbox7.py
index 8152acefd..8152acefd 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/yt_dlp/extractor/vbox7.py
diff --git a/youtube_dl/extractor/veehd.py b/yt_dlp/extractor/veehd.py
index a6dc3c8d8..a6dc3c8d8 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/yt_dlp/extractor/veehd.py
diff --git a/yt_dlp/extractor/veo.py b/yt_dlp/extractor/veo.py
new file mode 100644
index 000000000..4e57a52d1
--- /dev/null
+++ b/yt_dlp/extractor/veo.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    int_or_none,
+    mimetype2ext,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class VeoIE(InfoExtractor):
+    _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-]+)'
+
+    _TESTS = [{
+        'url': 'https://app.veo.co/matches/20201027-last-period/',
+        'info_dict': {
+            'id': '20201027-last-period',
+            'ext': 'mp4',
+            'title': 'Akidemy u11s v Bradford Boys u11s (Game 3)',
+            'thumbnail': 're:https://c.veocdn.com/.+/thumbnail.jpg',
+            'upload_date': '20201028',
+            'timestamp': 1603847208,
+            'duration': 1916,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        metadata = 
self._download_json( + 'https://app.veo.co/api/app/matches/%s' % video_id, video_id) + + video_data = self._download_json( + 'https://app.veo.co/api/app/matches/%s/videos' % video_id, video_id, 'Downloading video data') + + title = metadata.get('title') + thumbnail = url_or_none(metadata.get('thumbnail')) + + timestamp = unified_timestamp(metadata.get('created')) + duration = int_or_none(metadata.get('duration')) + view_count = int_or_none(metadata.get('view_count')) + + formats = [] + for fmt in video_data: + mimetype = fmt.get('mime_type') + # skip configuration file for panoramic video + if mimetype == 'video/mp2t': + continue + height = int_or_none(fmt.get('height')) + bitrate = int_or_none(fmt.get('bit_rate'), scale=1000) + render_type = fmt.get('render_type') + formats.append({ + 'url': url_or_none(fmt.get('url')), + 'format_id': '%s-%sp' % (render_type, height), + 'ext': mimetype2ext(mimetype), + 'width': int_or_none(fmt.get('width')), + 'height': height, + 'vbr': bitrate + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'view_count': view_count, + 'duration': duration + } diff --git a/youtube_dl/extractor/veoh.py b/yt_dlp/extractor/veoh.py index 1c44c145c..1c44c145c 100644 --- a/youtube_dl/extractor/veoh.py +++ b/yt_dlp/extractor/veoh.py diff --git a/yt_dlp/extractor/vesti.py b/yt_dlp/extractor/vesti.py new file mode 100644 index 000000000..002047dbf --- /dev/null +++ b/yt_dlp/extractor/vesti.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError +from .rutv import RUTVIE + + +class VestiIE(InfoExtractor): + IE_DESC = 'Вести.Ru' + _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)' + + _TESTS = [ + { + 'url': 'http://www.vesti.ru/videos?vid=575582&cid=1', + 'info_dict': { + 'id': '765035', + 'ext': 'mp4', + 'title': 'Вести.net: биткоины в России не являются законными', + 'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b', + 'duration': 302, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.vesti.ru/doc.html?id=1349233', + 'info_dict': { + 'id': '773865', + 'ext': 'mp4', + 'title': 'Участники митинга штурмуют Донецкую областную администрацию', + 'description': 'md5:1a160e98b3195379b4c849f2f4958009', + 'duration': 210, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.vesti.ru/only_video.html?vid=576180', + 'info_dict': { + 'id': '766048', + 'ext': 'mp4', + 'title': 'США заморозило, Британию затопило', + 'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1', + 'duration': 87, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://hitech.vesti.ru/news/view/id/4000', + 'info_dict': { + 'id': '766888', + 'ext': 'mp4', + 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', + 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', + 'duration': 279, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403', + 'info_dict': { + 'id': '766403', + 'ext': 'mp4', + 'title': 'XXII зимние Олимпийские игры. 
Российские хоккеисты стартовали на Олимпиаде с победы', + 'description': 'md5:55805dfd35763a890ff50fa9e35e31b3', + 'duration': 271, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Blocked outside Russia', + }, + { + 'url': 'http://sochi2014.vesti.ru/live/play/live_id/301', + 'info_dict': { + 'id': '51499', + 'ext': 'flv', + 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ', + 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Translation has finished' + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + page = self._download_webpage(url, video_id, 'Downloading page') + + mobj = re.search( + r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)', + page) + if mobj: + video_id = mobj.group('id') + page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id, + 'Downloading video page') + + rutv_url = RUTVIE._extract_url(page) + if rutv_url: + return self.url_result(rutv_url, 'RUTV') + + raise ExtractorError('No video found', expected=True) diff --git a/yt_dlp/extractor/vevo.py b/yt_dlp/extractor/vevo.py new file mode 100644 index 000000000..8a0f29259 --- /dev/null +++ b/yt_dlp/extractor/vevo.py @@ -0,0 +1,242 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + parse_qs, +) + + +class VevoBaseIE(InfoExtractor): + def _extract_json(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>', + webpage, 'initial store'), + video_id) + + +class VevoIE(VevoBaseIE): + ''' + Accepts urls from vevo.com or in the format 'vevo:{id}' + (currently used by MTVIE and MySpaceIE) + ''' + _VALID_URL = r'''(?x) + (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| + https?://cache\.vevo\.com/m/html/embed\.html\?video=| + https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| + https?://embed\.vevo\.com/.*?[?&]isrc=| + vevo:) + (?P<id>[^&?#]+)''' + + _TESTS = [] + _VERSIONS = { + 0: 'youtube', # only in AuthenticateVideo videoVersions + 1: 'level3', + 2: 'akamai', + 3: 'level3', + 4: 'amazon', + } + + def _initialize_api(self, video_id): + webpage = self._download_webpage( + 'https://accounts.vevo.com/token', None, + note='Retrieving oauth token', + errnote='Unable to retrieve oauth token', + data=json.dumps({ + 'client_id': 'SPupX1tvqFEopQ1YS6SS', + 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + + if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): + self.raise_geo_restricted( + '%s said: This page is currently unavailable in your region' % self.IE_NAME) + + auth_info = self._parse_json(webpage, video_id) + self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token'] + + def _call_api(self, path, *args, **kwargs): + try: + data = self._download_json(self._api_url_template % path, *args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errors = self._parse_json(e.cause.read().decode(), None)['errors'] + error_message = ', '.join([error['message'] for 
error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + raise + return data + + def _real_extract(self, url): + video_id = self._match_id(url) + + self._initialize_api(video_id) + + video_info = self._call_api( + 'video/%s' % video_id, video_id, 'Downloading api video info', + 'Failed to download video info') + + video_versions = self._call_api( + 'video/%s/streams' % video_id, video_id, + 'Downloading video versions info', + 'Failed to download video versions info', + fatal=False) + + # Some videos are only available via webpage (e.g. + # https://github.com/ytdl-org/youtube-dl/issues/9366) + if not video_versions: + webpage = self._download_webpage(url, video_id) + json_data = self._extract_json(webpage, video_id) + if 'streams' in json_data.get('default', {}): + video_versions = json_data['default']['streams'][video_id][0] + else: + video_versions = [ + value + for key, value in json_data['apollo']['data'].items() + if key.startswith('%s.streams' % video_id)] + + uploader = None + artist = None + featured_artist = None + artists = video_info.get('artists') + for curr_artist in artists: + if curr_artist.get('role') == 'Featured': + featured_artist = curr_artist['name'] + else: + artist = uploader = curr_artist['name'] + + formats = [] + for video_version in video_versions: + version = self._VERSIONS.get(video_version.get('version'), 'generic') + version_url = video_version.get('url') + if not version_url: + continue + + if '.ism' in version_url: + continue + elif '.mpd' in version_url: + formats.extend(self._extract_mpd_formats( + version_url, video_id, mpd_id='dash-%s' % version, + note='Downloading %s MPD information' % version, + errnote='Failed to download %s MPD information' % version, + fatal=False)) + elif '.m3u8' in version_url: + formats.extend(self._extract_m3u8_formats( + version_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % version, + note='Downloading %s m3u8 information' % version, + errnote='Failed to download %s m3u8 information' % version, + fatal=False)) + else: + m = re.search(r'''(?xi) + _(?P<width>[0-9]+)x(?P<height>[0-9]+) + _(?P<vcodec>[a-z0-9]+) + _(?P<vbr>[0-9]+) + _(?P<acodec>[a-z0-9]+) + _(?P<abr>[0-9]+) + \.(?P<ext>[a-z0-9]+)''', version_url) + if not m: + continue + + formats.append({ + 'url': version_url, + 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'vcodec': m.group('vcodec'), + 'acodec': m.group('acodec'), + 'vbr': int(m.group('vbr')), + 'abr': int(m.group('abr')), + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + self._sort_formats(formats) + + track = video_info['title'] + if featured_artist: + artist = '%s ft. 
%s' % (artist, featured_artist) + title = '%s - %s' % (artist, track) if artist else track + + genres = video_info.get('genres') + genre = ( + genres[0] if genres and isinstance(genres, list) + and isinstance(genres[0], compat_str) else None) + + is_explicit = video_info.get('isExplicit') + if is_explicit is True: + age_limit = 18 + elif is_explicit is False: + age_limit = 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_info.get('releaseDate')), + 'uploader': uploader, + 'duration': int_or_none(video_info.get('duration')), + 'view_count': int_or_none(video_info.get('views', {}).get('total')), + 'age_limit': age_limit, + 'track': track, + 'artist': uploader, + 'genre': genre, + } + + +class VevoPlaylistIE(VevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://www.vevo.com/watch/genre/rock', + 'info_dict': { + 'id': 'rock', + 'title': 'Rock', + }, + 'playlist_count': 20, + }, { + 'url': 'http://www.vevo.com/watch/genre/rock?index=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + playlist_id = mobj.group('id') + playlist_kind = mobj.group('kind') + + webpage = self._download_webpage(url, playlist_id) + + qs = parse_qs(url) + index = qs.get('index', [None])[0] + + if index: + video_id = self._search_regex( + r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>', + webpage, 'video id', default=None, group='id') + if video_id: + return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) + + playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind] + + playlist = (list(playlists.values())[0] + if playlist_kind == 'playlist' else playlists[playlist_id]) + + entries = [ + self.url_result('vevo:%s' % src, VevoIE.ie_key()) + for src in playlist['isrcs']] + + return self.playlist_result( + entries, playlist.get('playlistId') or playlist_id, + playlist.get('name'), playlist.get('description')) diff --git a/yt_dlp/extractor/vgtv.py b/yt_dlp/extractor/vgtv.py new file mode 100644 index 000000000..b6131ff82 --- /dev/null +++ b/yt_dlp/extractor/vgtv.py @@ -0,0 +1,313 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .xstream import XstreamIE +from ..utils import ( + ExtractorError, + float_or_none, + try_get, +) + + +class VGTVIE(XstreamIE): + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + _GEO_BYPASS = False + + _HOST_TO_APPNAME = { + 'vgtv.no': 'vgtv', + 'bt.no/tv': 'bttv', + 'aftenbladet.no/tv': 'satv', + 'fvn.no/fvntv': 'fvntv', + 'aftenposten.no/webtv': 'aptv', + 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se': 'abtv', + # obsolete URL schemas, kept in order to save one HTTP redirect + 'tv.aftonbladet.se/abtv': 'abtv', + 'www.aftonbladet.se/tv': 'abtv', + } + + _APP_NAME_TO_VENDOR = { + 'vgtv': 'vgtv', + 'bttv': 'bt', + 'satv': 'sa', + 'fvntv': 'fvn', + 'aptv': 'ap', + 'abtv': 'ab', + } + + _VALID_URL = r'''(?x) + (?:https?://(?:www\.)? + (?P<host> + %s + ) + /? 
+ (?: + (?:\#!/)?(?:video|live)/| + embed?.*id=| + a(?:rticles)?/ + )| + (?P<appname> + %s + ):) + (?P<id>\d+) + ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) + + _TESTS = [ + { + # streamType: vod + 'url': 'http://www.vgtv.no/#!/video/84196/hevnen-er-soet-episode-10-abu', + 'md5': 'b8be7a234cebb840c0d512c78013e02f', + 'info_dict': { + 'id': '84196', + 'ext': 'mp4', + 'title': 'Hevnen er søt: Episode 10 - Abu', + 'description': 'md5:e25e4badb5f544b04341e14abdc72234', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 648.000, + 'timestamp': 1404626400, + 'upload_date': '20140706', + 'view_count': int, + }, + }, + { + # streamType: wasLive + 'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen', + 'info_dict': { + 'id': '100764', + 'ext': 'flv', + 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen', + 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 9103.0, + 'timestamp': 1410113864, + 'upload_date': '20140907', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Video is no longer available', + }, + { + # streamType: wasLive + 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', + 'info_dict': { + 'id': '113063', + 'ext': 'mp4', + 'title': 'V75 fra Solvalla 30.05.15', + 'description': 'md5:b3743425765355855f88e096acc93231', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 25966, + 'timestamp': 1432975582, + 'upload_date': '20150530', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', + 'md5': 'fd828cd29774a729bf4d4425fe192972', + 'info_dict': { + 'id': '21039', + 'ext': 'mp4', + 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', + 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', + 'duration': 66, + 'timestamp': 1417002452, + 'upload_date': '20141126', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', + 'only_matching': True, + }, + { + 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', + 'only_matching': True, + }, + { + # geoblocked + 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', + 'only_matching': True, + }, + { + 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', + 'only_matching': True, + }, + { + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', + 'only_matching': True, + }, + { + 'url': 'https://www.aftonbladet.se/tv/a/36015', + 'only_matching': True, + }, + { + 'url': 'abtv:140026', + 'only_matching': True, + }, + { + 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + host = mobj.group('host') + appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') + vendor = self._APP_NAME_TO_VENDOR[appname] + + data = self._download_json( + 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' + % (vendor, video_id, appname), + video_id, 'Downloading media JSON') + + if data.get('status') == 'inactive': + raise ExtractorError( + 'Video %s is no longer available' % video_id, 
expected=True) + + info = { + 'formats': [], + } + if len(video_id) == 5: + if appname == 'bttv': + info = self._extract_video_info('btno', video_id) + + streams = data['streamUrls'] + stream_type = data.get('streamType') + is_live = stream_type == 'live' + formats = [] + + hls_url = streams.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + + hds_url = streams.get('hds') + if hds_url: + hdcore_sign = 'hdcore=3.7.0' + f4m_formats = self._extract_f4m_formats( + hds_url + '?%s' % hdcore_sign, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + for entry in f4m_formats: + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + + mp4_urls = streams.get('pseudostreaming') or [] + mp4_url = streams.get('mp4') + if mp4_url: + mp4_urls.append(mp4_url) + for mp4_url in mp4_urls: + format_info = { + 'url': mp4_url, + } + mobj = re.search(r'(\d+)_(\d+)_(\d+)', mp4_url) + if mobj: + tbr = int(mobj.group(3)) + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + 'tbr': tbr, + 'format_id': 'mp4-%s' % tbr, + }) + formats.append(format_info) + + info['formats'].extend(formats) + + if not info['formats']: + properties = try_get( + data, lambda x: x['streamConfiguration']['properties'], list) + if properties and 'geoblocked' in properties: + raise self.raise_geo_restricted( + countries=[host.rpartition('.')[-1].partition('/')[0].upper()]) + + self._sort_formats(info['formats']) + + info.update({ + 'id': video_id, + 'title': self._live_title(data['title']) if is_live else data['title'], + 'description': data['description'], + 'thumbnail': data['images']['main'] + '?t[]=900x506q80', + 'timestamp': data['published'], + 'duration': float_or_none(data['duration'], 1000), + 'view_count': data['displays'], + 'is_live': is_live, + }) + return info + + +class BTArticleIE(InfoExtractor): + IE_NAME = 'bt:article' + IE_DESC = 'Bergens Tidende Articles' + _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' + _TEST = { + 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', + 'md5': '2acbe8ad129b3469d5ae51b1158878df', + 'info_dict': { + 'id': '23199', + 'ext': 'mp4', + 'title': 'Alrekstad internat', + 'description': 'md5:dc81a9056c874fedb62fc48a300dac58', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 191, + 'timestamp': 1289991323, + 'upload_date': '20101117', + 'view_count': int, + }, + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, self._match_id(url)) + video_id = self._search_regex( + r'<video[^>]+data-id="(\d+)"', webpage, 'video id') + return self.url_result('bttv:%s' % video_id, 'VGTV') + + +class BTVestlendingenIE(InfoExtractor): + IE_NAME = 'bt:vestlendingen' + IE_DESC = 'Bergens Tidende - Vestlendingen' + _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', + 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', + 'info_dict': { + 'id': '86588', + 'ext': 'mov', + 'title': 'Otto Wollertsen', + 'description': 'Vestlendingen Otto Fredrik Wollertsen', + 'timestamp': 1430473209, + 'upload_date': '20150501', + }, + 'skip': '404 Error', + }, { + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', + 'md5': 'a2893f8632e96389f4bdf36aa9463ceb', + 'info_dict': { + 'id': '86255', + 'ext': 'mov', + 
'title': 'Du må tåle å fryse og være sulten', + 'description': 'md5:b8046f4d022d5830ddab04865791d063', + 'upload_date': '20150321', + 'timestamp': 1426942023, + }, + }] + + def _real_extract(self, url): + return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/yt_dlp/extractor/vh1.py b/yt_dlp/extractor/vh1.py new file mode 100644 index 000000000..862c5c7dc --- /dev/null +++ b/yt_dlp/extractor/vh1.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + +# TODO Remove - Reason: Outdated Site + + +class VH1IE(MTVServicesInfoExtractor): + IE_NAME = 'vh1.com' + _FEED_URL = 'http://www.vh1.com/feeds/mrss/' + _TESTS = [{ + 'url': 'https://www.vh1.com/episodes/0aqivv/nick-cannon-presents-wild-n-out-foushee-season-16-ep-12', + 'info_dict': { + 'title': 'Fousheé', + 'description': 'Fousheé joins Team Evolutions fight against Nick and Team Revolution in Baby Daddy, Baby Mama; Kick Em Out the Classroom; Backseat of My Ride and Wildstyle; and Fousheé performs.', + }, + 'playlist_mincount': 4, + 'skip': '404 Not found', + }, { + # Clip + 'url': 'https://www.vh1.com/video-clips/e0sja0/nick-cannon-presents-wild-n-out-foushee-clap-for-him', + 'info_dict': { + 'id': 'a07563f7-a37b-4e7f-af68-85855c2c7cc3', + 'ext': 'mp4', + 'title': 'Fousheé - "clap for him"', + 'description': 'Singer Fousheé hits the Wild N Out: In the Dark stage with a performance of the tongue-in-cheek track "clap for him" from her 2021 album "time machine."', + 'upload_date': '20210826', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + _VALID_URL = r'https?://(?:www\.)?vh1\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py new file mode 100644 index 000000000..ca4d3edbd --- /dev/null +++ b/yt_dlp/extractor/vice.py @@ -0,0 +1,337 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import hashlib +import json +import random +import re +import time + +from .adobepass import AdobePassIE +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + OnDemandPagedList, + parse_age_limit, + str_or_none, + try_get, +) + + +class ViceBaseIE(InfoExtractor): + def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''): + return self._download_json( + 'https://video.vice.com/api/v1/graphql', resource_id, query={ + 'query': '''{ + %s(locale: "%s", %s: "%s"%s) { + %s + } +}''' % (resource, locale, resource_key, resource_id, args, fields), + })['data'][resource] + + +class ViceIE(ViceBaseIE, AdobePassIE): + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' + _TESTS = [{ + 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', + 'info_dict': { + 'id': '58c69e38a55424f1227dc3f7', + 'ext': 'mp4', + 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', + 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1489664942, + 'upload_date': '20170316', + 'age_limit': 14, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # geo restricted to US + 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', + 
'info_dict': { + 'id': '5816510690b70e6c5fd39a56', + 'ext': 'mp4', + 'uploader': 'vice', + 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1477941983, + 'upload_date': '20161031', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + }, { + 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', + 'only_matching': True, + }, { + 'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060', + 'only_matching': True, + }, { + 'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7', + 'only_matching': True, + }, { + 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', + webpage) + + @staticmethod + def _extract_url(webpage): + urls = ViceIE._extract_urls(webpage) + return urls[0] if urls else None + + def _real_extract(self, url): + locale, video_id = self._match_valid_url(url).groups() + + video = self._call_api('videos', 'id', video_id, locale, '''body + locked + rating + thumbnail_url + title''')[0] + title = video['title'].strip() + rating = video.get('rating') + + query = {} + if video.get('locked'): + resource = self._get_mvpd_resource( + 'VICELAND', title, video_id, rating) + query['tvetoken'] = self._extract_mvpd_auth( + url, video_id, 'VICELAND', resource) + + # signature generation algorithm is reverse engineered from signatureGenerator in + # webpack:///../shared/~/vice-player/dist/js/vice-player.js in + # https://www.viceland.com/assets/common/js/web.vendor.bundle.js + # new JS is located here https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js + exp = int(time.time()) + 1440 + + query.update({ + 'exp': exp, + 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + 'skipadstitching': 1, + 'platform': 'desktop', + 'rn': random.randint(10000, 100000), + }) + + try: + preplay = self._download_json( + 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), + video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): + error = json.loads(e.cause.read().decode()) + error_message = error.get('error_description') or error['details'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + video_data = preplay['video'] + formats = self._extract_m3u8_formats( + preplay['playURL'], video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + episode = video_data.get('episode') or {} + channel = video_data.get('channel') or {} + season = video_data.get('season') or {} + + subtitles = {} + for subtitle in preplay.get('subtitleURLs', 
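A condensed sketch of the query-signing scheme described in the reverse-engineering comment above; the helper name vice_sign is illustrative, not part of the extractor, and it reproduces the exp/sign pair the code computes inline:

import hashlib
import time

def vice_sign(video_id, ttl=1440):
    # SHA-512 over "<id>:GET:<expiry>", exactly as the inline code does
    exp = int(time.time()) + ttl
    sign = hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest()
    return exp, sign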
[]): + cc_url = subtitle.get('url') + if not cc_url: + continue + language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en' + subtitles.setdefault(language_code, []).append({ + 'url': cc_url, + }) + + return { + 'formats': formats, + 'id': video_id, + 'title': title, + 'description': clean_html(video.get('body')), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video_data.get('video_duration')), + 'timestamp': int_or_none(video_data.get('created_at'), 1000), + 'age_limit': parse_age_limit(video_data.get('video_rating') or rating), + 'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str), + 'episode_number': int_or_none(episode.get('episode_number')), + 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': str_or_none(season.get('id') or video_data.get('season_id')), + 'uploader': channel.get('name'), + 'uploader_id': str_or_none(channel.get('id')), + 'subtitles': subtitles, + } + + +class ViceShowIE(ViceBaseIE): + IE_NAME = 'vice:show' + _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)' + _PAGE_SIZE = 25 + _TESTS = [{ + 'url': 'https://video.vice.com/en_us/show/fck-thats-delicious', + 'info_dict': { + 'id': '57a2040c8cb727dec794c901', + 'title': 'F*ck, That’s Delicious', + 'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.', + }, + 'playlist_mincount': 64, + }, { + 'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious', + 'only_matching': True, + }] + + def _fetch_page(self, locale, show_id, page): + videos = self._call_api('videos', 'show_id', show_id, locale, '''body + id + url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE)) + for video in videos: + yield self.url_result( + video['url'], ViceIE.ie_key(), video.get('id')) + + def _real_extract(self, url): + locale, display_id = self._match_valid_url(url).groups() + show = self._call_api('shows', 'slug', display_id, locale, '''dek + id + title''')[0] + show_id = show['id'] + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, locale, show_id), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, show.get('title'), show.get('dek')) + + +class ViceArticleIE(ViceBaseIE): + IE_NAME = 'vice:article' + _VALID_URL = r'https://(?:www\.)?vice\.com/(?P<locale>[^/]+)/article/(?:[0-9a-z]{6}/)?(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', + 'info_dict': { + 'id': '58dc0a3dee202d2a0ccfcbd8', + 'ext': 'mp4', + 'title': 'Mormon War on Porn', + 'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1491883129, + 'upload_date': '20170411', + 'age_limit': 17, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': [ViceIE.ie_key()], + }, { + 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', + 'md5': '13010ee0bc694ea87ec40724397c2349', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader': 'Motherboard', + 'uploader_id': 'MotherboardTV', + 'upload_date': '20140529', + }, + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 
'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': '57f41d3556a0a80f54726060', + 'ext': 'mp4', + 'title': "Making The World's First Male Sex Doll", + 'description': 'md5:19b00b215b99961cf869c40fbe9df755', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1476919911, + 'upload_date': '20161019', + 'age_limit': 17, + }, + 'params': { + 'skip_download': True, + 'format': 'bestvideo', + }, + 'add_ie': [ViceIE.ie_key()], + }, { + 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', + 'only_matching': True, + }, { + 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }] + + def _real_extract(self, url): + locale, display_id = self._match_valid_url(url).groups() + + article = self._call_api('articles', 'slug', display_id, locale, '''body + embed_code''')[0] + body = article['body'] + + def _url_res(video_url, ie_key): + return { + '_type': 'url_transparent', + 'url': video_url, + 'display_id': display_id, + 'ie_key': ie_key, + } + + vice_url = ViceIE._extract_url(body) + if vice_url: + return _url_res(vice_url, ViceIE.ie_key()) + + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', body, + 'ooyala embed code', default=None) + if embed_code: + return _url_res('ooyala:%s' % embed_code, 'Ooyala') + + youtube_url = YoutubeIE._extract_url(body) + if youtube_url: + return _url_res(youtube_url, YoutubeIE.ie_key()) + + video_url = self._html_search_regex( + r'data-video-url="([^"]+)"', + article['embed_code'], 'video URL') + + return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/vidbit.py b/yt_dlp/extractor/vidbit.py index 91f45b7cc..91f45b7cc 100644 --- a/youtube_dl/extractor/vidbit.py +++ b/yt_dlp/extractor/vidbit.py diff --git a/yt_dlp/extractor/viddler.py b/yt_dlp/extractor/viddler.py new file mode 100644 index 000000000..ecc48246f --- /dev/null +++ b/yt_dlp/extractor/viddler.py @@ -0,0 +1,137 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, +) + + +class ViddlerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?' 
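Worth noting about the _VALID_URL just above: the second, unnamed capture group grabs an optional secret=<digits> token, which _real_extract below forwards to the playback-details API. A quick illustration using the secret-protected test URL that appears later:

import re

_VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?'
mobj = re.match(_VALID_URL, 'http://www.viddler.com/v/890c0985?secret=34051570')
video_id, secret = mobj.group('id'), mobj.group(2)  # -> '890c0985', '34051570'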
+ _TESTS = [{ + 'url': 'http://www.viddler.com/v/43903784', + 'md5': '9eee21161d2c7f5b39690c3e325fab2f', + 'info_dict': { + 'id': '43903784', + 'ext': 'mov', + 'title': 'Video Made Easy', + 'description': 'md5:6a697ebd844ff3093bd2e82c37b409cd', + 'uploader': 'viddler', + 'timestamp': 1335371429, + 'upload_date': '20120425', + 'duration': 100.89, + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, + 'comment_count': int, + 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], + } + }, { + 'url': 'http://www.viddler.com/v/4d03aad9/', + 'md5': 'f12c5a7fa839c47a79363bfdf69404fb', + 'info_dict': { + 'id': '4d03aad9', + 'ext': 'ts', + 'title': 'WALL-TO-GORTAT', + 'upload_date': '20150126', + 'uploader': 'deadspin', + 'timestamp': 1422285291, + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://www.viddler.com/player/221ebbbd/0/', + 'md5': '740511f61d3d1bb71dc14a0fe01a1c10', + 'info_dict': { + 'id': '221ebbbd', + 'ext': 'mov', + 'title': 'LETeens-Grammar-snack-third-conditional', + 'description': ' ', + 'upload_date': '20140929', + 'uploader': 'BCLETeens', + 'timestamp': 1411997190, + 'view_count': int, + 'comment_count': int, + } + }, { + # secret protected + 'url': 'http://www.viddler.com/v/890c0985?secret=34051570', + 'info_dict': { + 'id': '890c0985', + 'ext': 'mp4', + 'title': 'Complete Property Training - Traineeships', + 'description': ' ', + 'upload_date': '20130606', + 'uploader': 'TiffanyBowtell', + 'timestamp': 1370496993, + 'view_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id, secret = self._match_valid_url(url).groups() + + query = { + 'video_id': video_id, + 'key': 'v0vhrt7bg2xq1vyxhkct', + } + if secret: + query['secret'] = secret + + data = self._download_json( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', + video_id, headers={'Referer': url}, query=query)['video'] + + formats = [] + for filed in data['files']: + if filed.get('status', 'ready') != 'ready': + continue + format_id = filed.get('profile_id') or filed['profile_name'] + f = { + 'format_id': format_id, + 'format_note': filed['profile_name'], + 'url': self._proto_relative_url(filed['url']), + 'width': int_or_none(filed.get('width')), + 'height': int_or_none(filed.get('height')), + 'filesize': int_or_none(filed.get('size')), + 'ext': filed.get('ext'), + 'source_preference': -1, + } + formats.append(f) + + if filed.get('cdn_url'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:') + f['format_id'] = format_id + '-cdn' + f['source_preference'] = 1 + formats.append(f) + + if filed.get('html5_video_source'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['html5_video_source']) + f['format_id'] = format_id + '-html5' + f['source_preference'] = 0 + formats.append(f) + self._sort_formats(formats) + + categories = [ + t.get('text') for t in data.get('tags', []) if 'text' in t] + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'description': data.get('description'), + 'timestamp': int_or_none(data.get('upload_time')), + 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), + 'uploader': data.get('author'), + 'duration': float_or_none(data.get('length')), + 'view_count': int_or_none(data.get('view_count')), + 'comment_count': int_or_none(data.get('comment_count')), + 'categories': categories, + } diff --git 
a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py new file mode 100644 index 000000000..512ade7af --- /dev/null +++ b/yt_dlp/extractor/videa.py @@ -0,0 +1,199 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re +import string + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + mimetype2ext, + parse_codecs, + parse_qs, + update_url_query, + urljoin, + xpath_element, + xpath_text, +) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_struct_pack, +) + + +class VideaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + videa(?:kid)?\.hu/ + (?: + videok/(?:[^/]+/)*[^?#&]+-| + (?:videojs_)?player\?.*?\bv=| + player/v/ + ) + (?P<id>[^?#&]+) + ''' + _TESTS = [{ + 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': r're:^https?://.*', + 'duration': 21, + }, + }, { + 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'md5': 'd57ccd8812c7fd491d33b1eab8c99975', + 'info_dict': { + 'id': 'jAHDWfWSJH5XuFhH', + 'ext': 'mp4', + 'title': 'Supercars előzés', + 'thumbnail': r're:^https?://.*', + 'duration': 64, + }, + }, { + 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': r're:^https?://.*', + 'duration': 21, + }, + }, { + 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player?v=8YfIAjxwWGwT8HVQ', + 'only_matching': True, + }, { + 'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', + 'only_matching': True, + }] + _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', + webpage)] + + @staticmethod + def rc4(cipher_text, key): + res = b'' + + key_len = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % key_len])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(cipher_text)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) + + return res.decode() + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_page = self._download_webpage(url, video_id) + + if 'videa.hu/player' in url: + player_url = url + player_page = video_page + else: + player_url = self._search_regex( + r'<iframe.*?src="(/player\?[^"]+)"', video_page, 'player url') + player_url = urljoin(url, player_url) + player_page = self._download_webpage(player_url, video_id) + + nonce = self._search_regex( + r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] + + query = parse_qs(player_url) + random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + query['_s'] = 
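To unpack the descrambling above: rc4 is a textbook RC4 (key schedule plus PRGA), and the _xt nonce hides half of the decryption key behind index arithmetic on _STATIC_SECRET. The same shuffle as a standalone helper (a sketch; videa_descramble is not a real function in this file):

def videa_descramble(nonce, static_secret):
    # first 32 chars select positions via the static secret,
    # last 32 chars carry the scrambled payload
    l, s = nonce[:32], nonce[32:]
    return ''.join(
        s[i - (static_secret.index(l[i]) - 31)] for i in range(32))
# result[:16] becomes the _t query value; result[16:] seeds the RC4 key
# together with the random _s seed and the x-videa-xs response header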
random_seed + query['_t'] = result[:16] + + b64_info, handle = self._download_webpage_handle( + 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) + if b64_info.startswith('<?xml'): + info = self._parse_xml(b64_info, video_id) + else: + key = result[16:] + random_seed + handle.headers['x-videa-xs'] + info = self._parse_xml(self.rc4( + compat_b64decode(b64_info), key), video_id) + + video = xpath_element(info, './video', 'video') + if not video: + raise ExtractorError(xpath_element( + info, './error', fatal=True), expected=True) + sources = xpath_element( + info, './video_sources', 'sources', fatal=True) + hash_values = xpath_element( + info, './hash_values', 'hash values', fatal=False) + + title = xpath_text(video, './title', fatal=True) + + formats = [] + for source in sources.findall('./video_source'): + source_url = source.text + source_name = source.get('name') + source_exp = source.get('exp') + if not (source_url and source_name): + continue + hash_value = None + if hash_values: + hash_value = xpath_text(hash_values, 'hash_value_' + source_name) + if hash_value and source_exp: + source_url = update_url_query(source_url, { + 'md5': hash_value, + 'expires': source_exp, + }) + f = parse_codecs(source.get('codecs')) + f.update({ + 'url': self._proto_relative_url(source_url), + 'ext': mimetype2ext(source.get('mimetype')) or 'mp4', + 'format_id': source.get('name'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnail = self._proto_relative_url(xpath_text(video, './poster_src')) + + age_limit = None + is_adult = xpath_text(video, './is_adult_content', default=None) + if is_adult: + age_limit = 18 if is_adult == '1' else 0 + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': int_or_none(xpath_text(video, './duration')), + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/videodetective.py b/yt_dlp/extractor/videodetective.py index fe70db713..fe70db713 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/yt_dlp/extractor/videodetective.py diff --git a/youtube_dl/extractor/videofyme.py b/yt_dlp/extractor/videofyme.py index cd3f50a63..cd3f50a63 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/yt_dlp/extractor/videofyme.py diff --git a/yt_dlp/extractor/videomore.py b/yt_dlp/extractor/videomore.py new file mode 100644 index 000000000..17ef3b1b9 --- /dev/null +++ b/yt_dlp/extractor/videomore.py @@ -0,0 +1,320 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + int_or_none, + parse_qs, +) + + +class VideomoreBaseIE(InfoExtractor): + _API_BASE_URL = 'https://more.tv/api/v3/web/' + _VALID_URL_BASE = r'https?://(?:videomore\.ru|more\.tv)/' + + def _download_page_data(self, display_id): + return self._download_json( + self._API_BASE_URL + 'PageData', display_id, query={ + 'url': '/' + display_id, + })['attributes']['response']['data'] + + def _track_url_result(self, track): + track_vod = track['trackVod'] + video_url = track_vod.get('playerLink') or track_vod['link'] + return self.url_result( + video_url, VideomoreIE.ie_key(), track_vod.get('hubId')) + + +class VideomoreIE(InfoExtractor): + IE_NAME = 'videomore' + _VALID_URL = r'''(?x) + videomore:(?P<sid>\d+)$| + https?:// + (?: + videomore\.ru/ + (?: + embed| + [^/]+/[^/]+ + )/| + (?: + (?:player\.)?videomore\.ru| + 
siren\.more\.tv/player + )/[^/]*\?.*?\btrack_id=| + odysseus\.more.tv/player/(?P<partner_id>\d+)/ + ) + (?P<id>\d+) + (?:[/?#&]|\.(?:xml|json)|$) + ''' + _TESTS = [{ + 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', + 'md5': '44455a346edc0d509ac5b5a5b531dc35', + 'info_dict': { + 'id': '367617', + 'ext': 'flv', + 'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук', + 'series': 'Кино в деталях', + 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2910, + 'view_count': int, + 'comment_count': int, + 'age_limit': 16, + }, + 'skip': 'The video is not available for viewing.', + }, { + 'url': 'http://videomore.ru/embed/259974', + 'info_dict': { + 'id': '259974', + 'ext': 'mp4', + 'title': 'Молодежка 2 сезон 40 серия', + 'series': 'Молодежка', + 'season': '2 сезон', + 'episode': '40 серия', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2789, + 'view_count': int, + 'age_limit': 16, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://videomore.ru/molodezhka/sezon_promo/341073', + 'info_dict': { + 'id': '341073', + 'ext': 'flv', + 'title': 'Промо Команда проиграла из-за Бакина?', + 'episode': 'Команда проиграла из-за Бакина?', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 29, + 'age_limit': 16, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'The video is not available for viewing.', + }, { + 'url': 'http://videomore.ru/elki_3?track_id=364623', + 'only_matching': True, + }, { + 'url': 'http://videomore.ru/embed/364623', + 'only_matching': True, + }, { + 'url': 'http://videomore.ru/video/tracks/364623.xml', + 'only_matching': True, + }, { + 'url': 'http://videomore.ru/video/tracks/364623.json', + 'only_matching': True, + }, { + 'url': 'http://videomore.ru/video/tracks/158031/quotes/33248', + 'only_matching': True, + }, { + 'url': 'videomore:367617', + 'only_matching': True, + }, { + 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', + 'only_matching': True, + }, { + 'url': 'https://odysseus.more.tv/player/1788/352317', + 'only_matching': True, + }, { + 'url': 'https://siren.more.tv/player/config?track_id=352317&partner_id=1788&user_token=', + 'only_matching': True, + }] + _GEO_BYPASS = False + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', + webpage) + if not mobj: + mobj = re.search( + r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)', + webpage) + + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('sid') or mobj.group('id') + partner_id = mobj.group('partner_id') or parse_qs(url).get('partner_id', [None])[0] or '97' + + item = self._download_json( + 'https://siren.more.tv/player/config', video_id, query={ + 'partner_id': partner_id, + 'track_id': video_id, + })['data']['playlist']['items'][0] + + title = item.get('title') + series = item.get('project_name') + season = item.get('season_name') + episode = item.get('episode_name') + if not title: + title = [] + for v in (series, season, episode): + if v: + title.append(v) + title = ' '.join(title) + + streams = item.get('streams') or [] + for protocol in ('DASH', 'HLS'): + stream_url = item.get(protocol.lower() + '_url') + if stream_url: + streams.append({'protocol': protocol, 'url': stream_url}) + + 
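One readability note on the title fallback just above: the build-a-list-then-join dance can be collapsed into a single expression with identical behavior (sketch only, not a functional change):

title = item.get('title') or ' '.join(
    v for v in (series, season, episode) if v)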
formats = [] + for stream in streams: + stream_url = stream.get('url') + if not stream_url: + continue + protocol = stream.get('protocol') + if protocol == 'DASH': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif protocol == 'MSS': + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + + if not formats: + error = item.get('error') + if error: + if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'): + self.raise_geo_restricted(countries=['RU'], metadata_available=True) + self.raise_no_formats(error, expected=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'series': series, + 'season': season, + 'episode': episode, + 'thumbnail': item.get('thumbnail_url'), + 'duration': int_or_none(item.get('duration')), + 'view_count': int_or_none(item.get('views')), + 'age_limit': int_or_none(item.get('min_age')), + 'formats': formats, + } + + +class VideomoreVideoIE(VideomoreBaseIE): + IE_NAME = 'videomore:video' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?P<id>(?:(?:[^/]+/){2})?[^/?#&]+)(?:/*|[?#&].*?)$' + _TESTS = [{ + # single video with og:video:iframe + 'url': 'http://videomore.ru/elki_3', + 'info_dict': { + 'id': '364623', + 'ext': 'flv', + 'title': 'Ёлки 3', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 5579, + 'age_limit': 6, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires logging in', + }, { + # season single series with og:video:iframe + 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', + 'info_dict': { + 'id': '352317', + 'ext': 'mp4', + 'title': 'Последний мент 1 сезон 14 серия', + 'series': 'Последний мент', + 'season': '1 сезон', + 'episode': '14 серия', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2464, + 'age_limit': 16, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk', + 'only_matching': True, + }, { + # single video without og:video:iframe + 'url': 'http://videomore.ru/marin_i_ego_druzya', + 'info_dict': { + 'id': '359073', + 'ext': 'flv', + 'title': '1 серия. 
Здравствуй, Аквавилль!', + 'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 754, + 'age_limit': 6, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'redirects to https://more.tv/' + }, { + 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', + 'only_matching': True, + }, { + 'url': 'https://more.tv/poslednii_ment/1_sezon/14_seriya', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if VideomoreIE.suitable(url) else super(VideomoreVideoIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._track_url_result(self._download_page_data(display_id)) + + +class VideomoreSeasonIE(VideomoreBaseIE): + IE_NAME = 'videomore:season' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' + _TESTS = [{ + 'url': 'http://videomore.ru/molodezhka/film_o_filme', + 'info_dict': { + 'id': 'molodezhka/film_o_filme', + 'title': 'Фильм о фильме', + }, + 'playlist_mincount': 3, + }, { + 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', + 'only_matching': True, + }, { + 'url': 'https://more.tv/molodezhka/film_o_filme', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)) + else super(VideomoreSeasonIE, cls).suitable(url)) + + def _real_extract(self, url): + display_id = self._match_id(url) + season = self._download_page_data(display_id) + season_id = compat_str(season['id']) + tracks = self._download_json( + self._API_BASE_URL + 'seasons/%s/tracks' % season_id, + season_id)['data'] + entries = [] + for track in tracks: + entries.append(self._track_url_result(track)) + return self.playlist_result(entries, display_id, season.get('title')) diff --git a/youtube_dl/extractor/videopress.py b/yt_dlp/extractor/videopress.py index 6376ff096..6376ff096 100644 --- a/youtube_dl/extractor/videopress.py +++ b/yt_dlp/extractor/videopress.py diff --git a/yt_dlp/extractor/vidio.py b/yt_dlp/extractor/vidio.py new file mode 100644 index 000000000..571448bf2 --- /dev/null +++ b/yt_dlp/extractor/vidio.py @@ -0,0 +1,295 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + int_or_none, + parse_iso8601, + smuggle_url, + str_or_none, + strip_or_none, + try_get, + unsmuggle_url, + urlencode_postdata, +) + + +class VidioBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.vidio.com/users/login' + _NETRC_MACHINE = 'vidio' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def is_logged_in(): + res = self._download_json( + 'https://www.vidio.com/interactions.json', None, 'Checking if logged in', fatal=False) or {} + return bool(res.get('current_user')) + + if is_logged_in(): + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading log in page') + + login_form = self._form_hidden_inputs("login-form", login_page) + login_form.update({ + 'user[login]': username, + 'user[password]': password, + }) + login_post, login_post_urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), expected_status=[302, 401]) + + if login_post_urlh.status == 401: + if get_element_by_class('onboarding-content-register-popup__title', login_post): + raise 
ExtractorError( + 'Unable to log in: The provided email has not been registered yet.', expected=True) + + reason = get_element_by_class('onboarding-form__general-error', login_post) or get_element_by_class('onboarding-modal__title', login_post) + if 'Akun terhubung ke' in (reason or ''): + raise ExtractorError( + 'Unable to log in: Your account is linked to a social media account. ' + 'Use --cookies to provide account credentials instead', expected=True) + elif reason: + subreason = get_element_by_class('onboarding-modal__description-text', login_post) or '' + raise ExtractorError( + 'Unable to log in: %s. %s' % (reason, clean_html(subreason)), expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + self._login() + + def _call_api(self, url, video_id, note=None): + return self._download_json(url, video_id, note=note, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + + +class VidioIE(VidioBaseIE): + _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015', + 'md5': 'cd2801394afc164e9775db6a140b91fe', + 'info_dict': { + 'id': '165683', + 'display_id': 'dj_ambred-booyah-live-2015', + 'ext': 'mp4', + 'title': 'DJ_AMBRED - Booyah (Live 2015)', + 'description': 'md5:27dc15f819b6a78a626490881adbadf8', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 149, + 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', + }, + }, { + 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', + 'only_matching': True, + }, { + # Premier-exclusive video + 'url': 'https://www.vidio.com/watch/1550718-stand-by-me-doraemon', + 'only_matching': True + }] + + def _real_extract(self, url): + match = self._match_valid_url(url).groupdict() + video_id, display_id = match.get('id'), match.get('display_id') + data = self._call_api('https://api.vidio.com/videos/' + video_id, display_id) + video = data['videos'][0] + title = video['title'].strip() + is_premium = video.get('is_premium') + + if is_premium: + sources = self._download_json( + 'https://www.vidio.com/interactions_stream.json?video_id=%s&type=videos' % video_id, + display_id, note='Downloading premier API JSON') + if not (sources.get('source') or sources.get('source_dash')): + self.raise_login_required('This video is only available for registered users with the appropriate subscription') + + formats, subs = [], {} + if sources.get('source'): + hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles( + sources['source'], display_id, 'mp4', 'm3u8_native') + formats.extend(hls_formats) + subs.update(hls_subs) + if sources.get('source_dash'): # TODO: Find video example with source_dash + dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles( + sources['source_dash'], display_id, 'dash') + formats.extend(dash_formats) + subs.update(dash_subs) + else: + hls_url = data['clips'][0]['hls_url'] + formats, subs = self._extract_m3u8_formats_and_subtitles( + hls_url, display_id, 'mp4', 'm3u8_native') + + self._sort_formats(formats) + + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], 
dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), + 'formats': formats, + 'subtitles': subs, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), + } + + +class VidioPremierIE(VidioBaseIE): + _VALID_URL = r'https?://(?:www\.)?vidio\.com/premier/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.vidio.com/premier/2885/badai-pasti-berlalu', + 'playlist_mincount': 14, + }, { + # Series with both free and premier-exclusive videos + 'url': 'https://www.vidio.com/premier/2567/sosmed', + 'only_matching': True, + }] + + def _playlist_entries(self, playlist_url, display_id): + index = 1 + while playlist_url: + playlist_json = self._call_api(playlist_url, display_id, 'Downloading API JSON page %s' % index) + for video_json in playlist_json.get('data', []): + link = video_json['links']['watchpage'] + yield self.url_result(link, 'Vidio', video_json['id']) + playlist_url = try_get(playlist_json, lambda x: x['links']['next']) + index += 1 + + def _real_extract(self, url): + url, idata = unsmuggle_url(url, {}) + playlist_id, display_id = self._match_valid_url(url).groups() + + playlist_url = idata.get('url') + if playlist_url: # Smuggled data contains an API URL. 
Download only that playlist + playlist_id = idata['id'] + return self.playlist_result( + self._playlist_entries(playlist_url, playlist_id), + playlist_id=playlist_id, playlist_title=idata.get('title')) + + playlist_data = self._call_api('https://api.vidio.com/content_profiles/%s/playlists' % playlist_id, display_id) + + return self.playlist_from_matches( + playlist_data.get('data', []), playlist_id=playlist_id, ie=self.ie_key(), + getter=lambda data: smuggle_url(url, { + 'url': data['relationships']['videos']['links']['related'], + 'id': data['id'], + 'title': try_get(data, lambda x: x['attributes']['name']) + })) + + +class VidioLiveIE(VidioBaseIE): + _VALID_URL = r'https?://(?:www\.)?vidio\.com/live/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.vidio.com/live/204-sctv', + 'info_dict': { + 'id': '204', + 'title': 'SCTV', + 'uploader': 'SCTV', + 'uploader_id': 'sctv', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + # Premier-exclusive livestream + 'url': 'https://www.vidio.com/live/6362-tvn', + 'only_matching': True, + }, { + # DRM premier-exclusive livestream + 'url': 'https://www.vidio.com/live/6299-bein-1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + stream_data = self._call_api( + 'https://www.vidio.com/api/livestreamings/%s/detail' % video_id, display_id) + stream_meta = stream_data['livestreamings'][0] + user = stream_data.get('users', [{}])[0] + + title = stream_meta.get('title') + username = user.get('username') + + formats = [] + if stream_meta.get('is_drm'): + if not self.get_param('allow_unplayable_formats'): + self.report_drm(video_id) + if stream_meta.get('is_premium'): + sources = self._download_json( + 'https://www.vidio.com/interactions_stream.json?video_id=%s&type=livestreamings' % video_id, + display_id, note='Downloading premier API JSON') + if not (sources.get('source') or sources.get('source_dash')): + self.raise_login_required('This video is only available for registered users with the appropriate subscription') + + if str_or_none(sources.get('source')): + token_json = self._download_json( + 'https://www.vidio.com/live/%s/tokens' % video_id, + display_id, note='Downloading HLS token JSON', data=b'') + formats.extend(self._extract_m3u8_formats( + sources['source'] + '?' + token_json.get('token', ''), display_id, 'mp4', 'm3u8_native')) + if str_or_none(sources.get('source_dash')): + pass + else: + if stream_meta.get('stream_token_url'): + token_json = self._download_json( + 'https://www.vidio.com/live/%s/tokens' % video_id, + display_id, note='Downloading HLS token JSON', data=b'') + formats.extend(self._extract_m3u8_formats( + stream_meta['stream_token_url'] + '?' 
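Stepping back to VidioPremierIE above: smuggle_url/unsmuggle_url (real yt-dlp helpers) piggyback the per-playlist API endpoint on the page URL, so each playlist generated by playlist_from_matches round-trips through this same extractor. Schematically (the smuggled URL payload below is hypothetical):

from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url(
    'https://www.vidio.com/premier/2885/badai-pasti-berlalu',
    {'url': 'https://api.vidio.com/content_profiles/2885/playlists', 'id': '2885'})
url, data = unsmuggle_url(url, {})
# data -> {'url': 'https://api.vidio.com/content_profiles/2885/playlists', 'id': '2885'}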
+ token_json.get('token', ''), + display_id, 'mp4', 'm3u8_native')) + if stream_meta.get('stream_dash_url'): + pass + if stream_meta.get('stream_url'): + formats.extend(self._extract_m3u8_formats( + stream_meta['stream_url'], display_id, 'mp4', 'm3u8_native')) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'is_live': True, + 'description': strip_or_none(stream_meta.get('description')), + 'thumbnail': stream_meta.get('image'), + 'like_count': int_or_none(stream_meta.get('like')), + 'dislike_count': int_or_none(stream_meta.get('dislike')), + 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(stream_meta.get('start_time')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + } diff --git a/youtube_dl/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py index f4774256b..f4774256b 100644 --- a/youtube_dl/extractor/vidlii.py +++ b/yt_dlp/extractor/vidlii.py diff --git a/yt_dlp/extractor/vidzi.py b/yt_dlp/extractor/vidzi.py new file mode 100644 index 000000000..42ea4952c --- /dev/null +++ b/yt_dlp/extractor/vidzi.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + js_to_json, + NO_DEFAULT, + PACKED_CODES_RE, +) + + +class VidziIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'http://vidzi.tv/cghql9yq6emu.html', + 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', + 'info_dict': { + 'id': 'cghql9yq6emu', + 'ext': 'mp4', + 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', + 'only_matching': True, + }, { + 'url': 'http://vidzi.cc/cghql9yq6emu.html', + 'only_matching': True, + }, { + 'url': 'https://vidzi.si/rph9gztxj1et.html', + 'only_matching': True, + }, { + 'url': 'http://vidzi.nu/cghql9yq6emu.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://vidzi.tv/%s' % video_id, video_id) + title = self._html_search_regex( + r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') + + codes = [webpage] + codes.extend([ + decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') + for mobj in re.finditer(PACKED_CODES_RE, webpage)]) + for num, code in enumerate(codes, 1): + jwplayer_data = self._parse_json( + self._search_regex( + r'setup\(([^)]+)\)', code, 'jwplayer data', + default=NO_DEFAULT if num == len(codes) else '{}'), + video_id, transform_source=lambda s: js_to_json( + re.sub(r'\s*\+\s*window\[.+?\]', '', s))) + if jwplayer_data: + break + + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict['title'] = title + + return info_dict diff --git a/yt_dlp/extractor/vier.py b/yt_dlp/extractor/vier.py new file mode 100644 index 000000000..94aa350e7 --- /dev/null +++ b/yt_dlp/extractor/vier.py @@ -0,0 +1,264 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools + +from .common import InfoExtractor +from ..utils import ( + urlencode_postdata, + int_or_none, + unified_strdate, +) + + +class VierIE(InfoExtractor): + IE_NAME = 'vier' + IE_DESC = 'vier.be and vijf.be' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?P<site>vier|vijf)\.be/ + (?: + (?: + 
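Back on VidziIE above: the candidate loop tries the raw page first and then every p.a.c.k.e.r.-packed script, only erroring on the final candidate via NO_DEFAULT. The same idea as a generator (a sketch; unpack_candidates is illustrative, and both imported names are real yt-dlp utilities):

import re
from yt_dlp.utils import PACKED_CODES_RE, decode_packed_codes

def unpack_candidates(webpage):
    yield webpage  # the jwplayer setup(...) call may already be in plain JS
    for mobj in re.finditer(PACKED_CODES_RE, webpage):
        yield decode_packed_codes(mobj.group(0)).replace("\\'", "'")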
[^/]+/videos| + video(?:/[^/]+)* + )/ + (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| + (?: + video/v3/embed| + embed/video/public + )/(?P<embed_id>\d+) + ) + ''' + _NETRC_MACHINE = 'vier' + _TESTS = [{ + 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', + 'info_dict': { + 'id': '16129', + 'display_id': 'het-wordt-warm-de-moestuin', + 'ext': 'mp4', + 'title': 'Het wordt warm in De Moestuin', + 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', + 'upload_date': '20121025', + 'series': 'Plan B', + 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], + }, + }, { + 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', + 'info_dict': { + 'id': '2561614', + 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', + 'ext': 'mp4', + 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', + 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', + 'upload_date': '20170228', + 'series': 'Temptation Island', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'Jani gaat naar Tokio - Aflevering 4', + 'description': 'md5:aa8d611541db6ae9e863125704511f88', + 'upload_date': '20170501', + 'series': 'Jani gaat', + 'episode_number': 4, + 'tags': ['Jani Gaat', 'Volledige Aflevering'], + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # Requires account credentials but bypassed extraction via v3/embed page + # without metadata + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'jani-gaat-naar-tokio-aflevering-4', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Log in to extract metadata'], + }, { + # Without video id in URL + 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', + 'only_matching': True, + }, { + 'url': 'http://www.vier.be/video/v3/embed/16129', + 'only_matching': True, + }, { + 'url': 'https://www.vijf.be/embed/video/public/4093', + 'only_matching': True, + }, { + 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', + 'only_matching': True, + }, { + 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', + 'only_matching': True, + }] + + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + username, password = self._get_login_info() + if username is None or password is None: + return + + login_page = self._download_webpage( + 'http://www.%s.be/user/login' % site, + None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'form_id': 'user_login', + 'name': username, + 'pass': password, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', + login_page, 'login error', default=None) + if login_error: + self.report_warning('Unable to log in: %s' % login_error) + else: + self._logged_in = True + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + embed_id = 
mobj.group('embed_id') + display_id = mobj.group('display_id') or embed_id + video_id = mobj.group('id') or embed_id + site = mobj.group('site') + + if not self._logged_in: + self._login(site) + + webpage = self._download_webpage(url, display_id) + + if r'id="user-login"' in webpage: + self.report_warning( + 'Log in to extract metadata', video_id=display_id) + webpage = self._download_webpage( + 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), + display_id) + + video_id = self._search_regex( + [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], + webpage, 'video id', default=video_id or display_id) + + playlist_url = self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not playlist_url: + application = self._search_regex( + [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], + webpage, 'application', default=site + '_vod') + filename = self._search_regex( + [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], + webpage, 'filename') + playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) + + formats = self._extract_wowza_formats( + playlist_url, display_id, skip_protocols=['dash']) + self._sort_formats(formats) + + title = self._og_search_title(webpage, default=display_id) + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', + webpage, 'description', default=None, group='value') + thumbnail = self._og_search_thumbnail(webpage, default=None) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', + webpage, 'upload date', default=None, group='value')) + + series = self._search_regex( + r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'series', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?i)aflevering (\d+)', title, 'episode number', default=None)) + tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'series': series, + 'episode_number': episode_number, + 'tags': tags, + 'formats': formats, + } + + +class VierVideosIE(InfoExtractor): + IE_NAME = 'vier:videos' + _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' + _TESTS = [{ + 'url': 'http://www.vier.be/demoestuin/videos', + 'info_dict': { + 'id': 'demoestuin', + }, + 'playlist_mincount': 153, + }, { + 'url': 'http://www.vijf.be/temptationisland/videos', + 'info_dict': { + 'id': 'temptationisland', + }, + 'playlist_mincount': 159, + }, { + 'url': 'http://www.vier.be/demoestuin/videos?page=6', + 'info_dict': { + 'id': 'demoestuin-page6', + }, + 'playlist_mincount': 20, + }, { + 'url': 'http://www.vier.be/demoestuin/videos?page=7', + 'info_dict': { + 'id': 'demoestuin-page7', + }, + 'playlist_mincount': 13, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + program = mobj.group('program') + site = mobj.group('site') + + page_id = mobj.group('page') + if page_id: + page_id = int(page_id) + start_page = page_id + playlist_id = '%s-page%d' % (program, page_id) + else: + start_page = 0 + playlist_id = program + + entries = [] + for current_page_id 
in itertools.count(start_page): + current_page = self._download_webpage( + 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), + program, + 'Downloading page %d' % (current_page_id + 1)) + page_entries = [ + self.url_result('http://www.' + site + '.be' + video_url, 'Vier') + for video_url in re.findall( + r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] + entries.extend(page_entries) + if page_id or '>Meer<' not in current_page: + break + + return self.playlist_result(entries, playlist_id) diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py new file mode 100644 index 000000000..c3b2e863d --- /dev/null +++ b/yt_dlp/extractor/viewlift.py @@ -0,0 +1,250 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, +) + + +class ViewLiftBaseIE(InfoExtractor): + _API_BASE = 'https://prod-api.viewlift.com/' + _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' + _SITE_MAP = { + 'ftfnext': 'lax', + 'funnyforfree': 'snagfilms', + 'hoichoi': 'hoichoitv', + 'kiddovid': 'snagfilms', + 'laxsportsnetwork': 'lax', + 'legapallacanestro': 'lnp', + 'marquee': 'marquee-tv', + 'monumentalsportsnetwork': 'monumental-network', + 'moviespree': 'bingeflix', + 'pflmma': 'pfl', + 'snagxtreme': 'snagfilms', + 'theidentitytb': 'tampabay', + 'vayafilm': 'snagfilms', + } + _TOKENS = {} + + def _call_api(self, site, path, video_id, query): + token = self._TOKENS.get(site) + if not token: + token_query = {'site': site} + email, password = self._get_login_info(netrc_machine=site) + if email: + resp = self._download_json( + self._API_BASE + 'identity/signin', video_id, + 'Logging in', query=token_query, data=json.dumps({ + 'email': email, + 'password': password, + }).encode()) + else: + resp = self._download_json( + self._API_BASE + 'identity/anonymous-token', video_id, + 'Downloading authorization token', query=token_query) + self._TOKENS[site] = token = resp['authorizationToken'] + return self._download_json( + self._API_BASE + path, video_id, + headers={'Authorization': token}, query=query) + + +class ViewLiftEmbedIE(ViewLiftBaseIE): + IE_NAME = 'viewlift:embed' + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + _TESTS = [{ + 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', + 'md5': '2924e9215c6eff7a55ed35b72276bd93', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + 'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8', + 'timestamp': 1334350096, + 'upload_date': '20120413', + } + }, { + # invalid labels, 360p is better than 480p + 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', + 'md5': '882fca19b9eb27ef865efeeaed376a48', + 'info_dict': { + 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', + 'ext': 'mp4', + 'title': 'Life in Limbo', + }, + 'skip': 'The video does not exist', + }, { + 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 
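On ViewLiftBaseIE._call_api above: the authorization token is fetched once per site (via login when .netrc credentials exist, anonymously otherwise) and memoized in the class-level _TOKENS dict, so ViewLiftIE and ViewLiftEmbedIE share one token per site. The caching pattern in isolation (fetch_token stands in for the two API calls):

_TOKENS = {}

def get_token(site, fetch_token):
    token = _TOKENS.get(site)
    if not token:
        # first use for this site: hit the API once, then reuse the result
        _TOKENS[site] = token = fetch_token(site)['authorizationToken']
    return token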
'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + domain, film_id = self._match_valid_url(url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + try: + content_data = self._call_api( + site, 'entitlement/video/status', film_id, { + 'id': film_id + })['video'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') + if error_message == 'User does not have a valid subscription or has not purchased this content.': + self.raise_login_required() + raise ExtractorError(error_message, expected=True) + raise + gist = content_data['gist'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] + + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset_url: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) + + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + info = { + 'id': film_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info + + +class ViewLiftIE(ViewLiftBaseIE): + IE_NAME = 'viewlift' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX + _TESTS = [{ + 'url': 'http://www.snagfilms.com/films/title/lost_for_life', + 'md5': '19844f897b35af219773fd63bdec2942', + 'info_dict': { + 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', + 'display_id': 'lost_for_life', + 'ext': 'mp4', + 'title': 'Lost for Life', + 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 4489, + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656820, + } + }, { + 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', + 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', + 'info_dict': { + 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', + 'display_id': 'the_world_cut_project/india', + 'ext': 'mp4', + 'title': 'India', + 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 979, + 'timestamp': 1399478279, + 'upload_date': '20140507', + } + }, { + 'url': 'http://main.snagfilms.com/augie_alone/s_2_ep_12_love', + 'info_dict': { + 'id': 
'00000148-7b53-de26-a9fb-fbf306f70020', + 'display_id': 'augie_alone/s_2_ep_12_love', + 'ext': 'mp4', + 'title': 'S. 2 Ep. 12 - Love', + 'description': 'Augie finds love.', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 107, + 'upload_date': '20141012', + 'timestamp': 1413129540, + 'age_limit': 17, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://main.snagfilms.com/films/title/the_freebie', + 'only_matching': True, + }, { + # Film is not playable in your area. + 'url': 'http://www.snagfilms.com/films/title/inside_mecca', + 'only_matching': True, + }, { + # Film is not available. + 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', + 'only_matching': True, + }, { + 'url': 'http://www.winnersview.com/videos/the-good-son', + 'only_matching': True, + }, { + # Was once Kaltura embed + 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', + 'only_matching': True, + }, { + 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) + + def _real_extract(self, url): + domain, path, display_id = self._match_valid_url(url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + modules = self._call_api( + site, 'content/pages', display_id, { + 'includeContent': 'true', + 'moduleOffset': 1, + 'path': path, + 'site': site, + })['modules'] + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'ie_key': 'ViewLiftEmbed', + } diff --git a/yt_dlp/extractor/viidea.py b/yt_dlp/extractor/viidea.py new file mode 100644 index 000000000..0da06818b --- /dev/null +++ b/yt_dlp/extractor/viidea.py @@ -0,0 +1,202 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + js_to_json, + parse_duration, + parse_iso8601, +) + + +class ViideaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: + videolectures\.net| + flexilearn\.viidea\.net| + presentations\.ocwconsortium\.org| + video\.travel-zoom\.si| + video\.pomp-forum\.si| + tv\.nil\.si| + video\.hekovnik.com| + video\.szko\.si| + kpk\.viidea\.com| + inside\.viidea\.net| + video\.kiberpipa\.org| + bvvideo\.si| + kongres\.viidea\.net| + edemokracija\.viidea\.com + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$''' + + _TESTS = [{ + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', + 'info_dict': { + 'id': '20171', + 'display_id': 'promogram_igor_mekjavic_eng', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'thumbnail': r're:http://.*\.jpg', + 'timestamp': 1372349289, + 'upload_date': '20130627', + 'duration': 565, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # video with invalid direct format links (HTTP 403) + 'url': 'http://videolectures.net/russir2010_filippova_nlp/', + 'info_dict': { + 'id': '14891', + 'display_id': 'russir2010_filippova_nlp', + 'ext': 'flv', + 'title': 'NLP at Google', + 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', + 'thumbnail': 
r're:http://.*\.jpg', + 'timestamp': 1284375600, + 'upload_date': '20100913', + 'duration': 5352, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # event playlist + 'url': 'http://videolectures.net/deeplearning2015_montreal/', + 'info_dict': { + 'id': '23181', + 'title': 'Deep Learning Summer School, Montreal 2015', + 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'thumbnail': r're:http://.*\.jpg', + 'timestamp': 1438560000, + }, + 'playlist_count': 30, + }, { + # multi part lecture + 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', + 'info_dict': { + 'id': '9737', + 'display_id': 'mlss09uk_bishop_ibi', + 'title': 'Introduction To Bayesian Inference', + 'thumbnail': r're:http://.*\.jpg', + 'timestamp': 1251622800, + }, + 'playlist': [{ + 'info_dict': { + 'id': '9737_part1', + 'display_id': 'mlss09uk_bishop_ibi_part1', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference (Part 1)', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 4622, + 'timestamp': 1251622800, + 'upload_date': '20090830', + }, + }, { + 'info_dict': { + 'id': '9737_part2', + 'display_id': 'mlss09uk_bishop_ibi_part2', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference (Part 2)', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 5641, + 'timestamp': 1251622800, + 'upload_date': '20090830', + }, + }], + 'playlist_count': 2, + }] + + def _real_extract(self, url): + lecture_slug, explicit_part_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, lecture_slug) + + cfg = self._parse_json(self._search_regex( + [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', + r'cfg\s*:\s*({[^}]+})'], + webpage, 'cfg'), lecture_slug, js_to_json) + + lecture_id = compat_str(cfg['obj_id']) + + base_url = self._proto_relative_url(cfg['livepipe'], 'http:') + + try: + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json( + e.cause.read().decode('utf-8'), lecture_id) + raise ExtractorError(msg['detail'], expected=True) + raise + + lecture_info = { + 'id': lecture_id, + 'display_id': lecture_slug, + 'title': lecture_data['title'], + 'timestamp': parse_iso8601(lecture_data.get('time')), + 'description': lecture_data.get('description_wiki'), + 'thumbnail': lecture_data.get('thumb'), + } + + playlist_entries = [] + lecture_type = lecture_data.get('type') + parts = [compat_str(video) for video in cfg.get('videos', [])] + if parts: + multipart = len(parts) > 1 + + def extract_part(part_id): + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) + smil = self._download_smil(smil_url, lecture_id) + info = self._parse_smil(smil, smil_url, lecture_id) + self._sort_formats(info['formats']) + info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) + info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) + if multipart: + info['title'] += ' (Part %s)' % part_id + switch = smil.find('.//switch') + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) + item_info = lecture_info.copy() + item_info.update(info) + return item_info + + if explicit_part_id or not multipart: + result = extract_part(explicit_part_id or parts[0]) + else: + result = { + '_type': 'multi_video', + 'entries': [extract_part(part) for part in parts], + } + 
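+            # whichever branch ran above, merge the shared lecture-level metadata
+            # (title, timestamp, thumbnail, ...) into the part or multi_video result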
result.update(lecture_info) + + # Immediately return explicitly requested part or non event item + if explicit_part_id or lecture_type != 'evt': + return result + + playlist_entries.append(result) + + # It's probably a playlist + if not parts or lecture_type == 'evt': + playlist_webpage = self._download_webpage( + '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) + entries = [ + self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') + for _, video_url in re.findall( + r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] + playlist_entries.extend(entries) + + playlist = self.playlist_result(playlist_entries, lecture_id) + playlist.update(lecture_info) + return playlist diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py new file mode 100644 index 000000000..acb5ae550 --- /dev/null +++ b/yt_dlp/extractor/viki.py @@ -0,0 +1,365 @@ +# coding: utf-8 +from __future__ import unicode_literals +import hashlib +import hmac +import json +import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + parse_iso8601, + try_get, +) + + +class VikiBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' + _API_URL_TEMPLATE = 'https://api.viki.io%s' + + _DEVICE_ID = '86085977d' # used for android api + _APP = '100005a' + _APP_VERSION = '6.11.3' + _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472' + + _GEO_BYPASS = False + _NETRC_MACHINE = 'viki' + + _token = None + + _ERRORS = { + 'geo': 'Sorry, this content is not available in your region.', + 'upcoming': 'Sorry, this content is not yet available.', + 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', + } + + def _stream_headers(self, timestamp, sig): + return { + 'X-Viki-manufacturer': 'vivo', + 'X-Viki-device-model': 'vivo 1606', + 'X-Viki-device-os-ver': '6.0.1', + 'X-Viki-connection-type': 'WIFI', + 'X-Viki-carrier': '', + 'X-Viki-as-id': '100005a-1625321982-3932', + 'timestamp': str(timestamp), + 'signature': str(sig), + 'x-viki-app-ver': self._APP_VERSION + } + + def _api_query(self, path, version=4, **kwargs): + path += '?' if '?' 
not in path else '&' + query = f'/v{version}/{path}app={self._APP}' + if self._token: + query += '&token=%s' % self._token + return query + ''.join(f'&{name}={val}' for name, val in kwargs.items()) + + def _sign_query(self, path): + timestamp = int(time.time()) + query = self._api_query(path, version=5) + sig = hmac.new( + self._APP_SECRET.encode('ascii'), f'{query}&t={timestamp}'.encode('ascii'), hashlib.sha1).hexdigest() + return timestamp, sig, self._API_URL_TEMPLATE % query + + def _call_api( + self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True): + if query is None: + timestamp, sig, url = self._sign_query(path) + else: + url = self._API_URL_TEMPLATE % self._api_query(path, version=4) + resp = self._download_json( + url, video_id, note, fatal=fatal, query=query, + data=json.dumps(data).encode('utf-8') if data else None, + headers=({'x-viki-app-ver': self._APP_VERSION} if data + else self._stream_headers(timestamp, sig) if query is None + else None), expected_status=400) or {} + + self._raise_error(resp.get('error'), fatal) + return resp + + def _raise_error(self, error, fatal=True): + if error is None: + return + msg = '%s said: %s' % (self.IE_NAME, error) + if fatal: + raise ExtractorError(msg, expected=True) + else: + self.report_warning(msg) + + def _check_errors(self, data): + for reason, status in (data.get('blocking') or {}).items(): + if status and reason in self._ERRORS: + message = self._ERRORS[reason] + if reason == 'geo': + self.raise_geo_restricted(msg=message) + elif reason == 'paywall': + if try_get(data, lambda x: x['paywallable']['tvod']): + self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)') + self.raise_login_required(message) + self._raise_error(message) + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + self._token = self._call_api( + 'sessions.json', None, 'Logging in', fatal=False, + data={'username': username, 'password': password}).get('token') + if not self._token: + self.report_warning('Login Failed: Unable to get session token') + + @staticmethod + def dict_selection(dict_obj, preferred_key): + if preferred_key in dict_obj: + return dict_obj[preferred_key] + return (list(filter(None, dict_obj.values())) or [None])[0] + + +class VikiIE(VikiBaseIE): + IE_NAME = 'viki' + _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE + _TESTS = [{ + 'note': 'Free non-DRM video with storyboards in MPD', + 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', + 'info_dict': { + 'id': '1175236v', + 'ext': 'mp4', + 'title': 'Choosing Spouse by Lottery - Episode 1', + 'timestamp': 1606463239, + 'age_limit': 13, + 'uploader': 'FCC', + 'upload_date': '20201127', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', + 'info_dict': { + 'id': '1023585v', + 'ext': 'mp4', + 'title': 'Heirs - Episode 14', + 'uploader': 'SBS Contents Hub', + 'timestamp': 1385047627, + 'upload_date': '20131121', + 'age_limit': 13, + 'duration': 3570, + 'episode_number': 14, + }, + 'params': { + 'format': 'bestvideo', + }, + 'skip': 'Blocked in the US', + }, { + # clip + 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', + 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', + 'info_dict': { + 'id': '1067139v', + 'ext': 'mp4', + 'title': "'The Avengers: Age of Ultron' Press 
Conference", + 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', + 'duration': 352, + 'timestamp': 1430380829, + 'upload_date': '20150430', + 'uploader': 'Arirang TV', + 'like_count': int, + 'age_limit': 0, + }, + 'skip': 'Sorry. There was an error loading this video', + }, { + 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', + 'info_dict': { + 'id': '1048879v', + 'ext': 'mp4', + 'title': 'Ankhon Dekhi', + 'duration': 6512, + 'timestamp': 1408532356, + 'upload_date': '20140820', + 'uploader': 'Spuul', + 'like_count': int, + 'age_limit': 13, + }, + 'skip': 'Blocked in the US', + }, { + # episode + 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', + 'md5': '0a53dc252e6e690feccd756861495a8c', + 'info_dict': { + 'id': '44699v', + 'ext': 'mp4', + 'title': 'Boys Over Flowers - Episode 1', + 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', + 'duration': 4172, + 'timestamp': 1270496524, + 'upload_date': '20100405', + 'uploader': 'group8', + 'like_count': int, + 'age_limit': 13, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + # youtube external + 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', + 'md5': '63f8600c1da6f01b7640eee7eca4f1da', + 'info_dict': { + 'id': '50562v', + 'ext': 'webm', + 'title': 'Poor Nastya [COMPLETE] - Episode 1', + 'description': '', + 'duration': 606, + 'timestamp': 1274949505, + 'upload_date': '20101213', + 'uploader': 'ad14065n', + 'uploader_id': 'ad14065n', + 'like_count': int, + 'age_limit': 13, + }, + 'skip': 'Page not found!', + }, { + 'url': 'http://www.viki.com/player/44699v', + 'only_matching': True, + }, { + # non-English description + 'url': 'http://www.viki.com/videos/158036v-love-in-magic', + 'md5': '41faaba0de90483fb4848952af7c7d0d', + 'info_dict': { + 'id': '158036v', + 'ext': 'mp4', + 'uploader': 'I Planet Entertainment', + 'upload_date': '20111122', + 'timestamp': 1321985454, + 'description': 'md5:44b1e46619df3a072294645c770cef36', + 'title': 'Love In Magic', + 'age_limit': 13, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._call_api(f'videos/{video_id}.json', video_id, 'Downloading video JSON', query={}) + self._check_errors(video) + + title = try_get(video, lambda x: x['titles']['en'], str) + episode_number = int_or_none(video.get('number')) + if not title: + title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} + container_title = self.dict_selection(container_titles, 'en') + title = '%s - %s' % (container_title, title) + + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail['url'], + } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')] + + resp = self._call_api( + 'playback_streams/%s.json?drms=dt1,dt2&device_id=%s' % (video_id, self._DEVICE_ID), + video_id, 'Downloading video streams JSON')['main'][0] + + stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) + subtitles = dict((lang, [{ + 'ext': ext, + 'url': self._API_URL_TEMPLATE % self._api_query( + f'videos/{video_id}/auth_subtitles/{lang}.{ext}', stream_id=stream_id) + } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys()) + + mpd_url = resp['url'] + # 1080p is hidden in another mpd which can be found in the current manifest content + mpd_content = 
self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') + mpd_url = self._search_regex( + r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url) + formats = self._extract_mpd_formats(mpd_url, video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': self.dict_selection(video.get('descriptions', {}), 'en'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader': video.get('author'), + 'uploader_url': video.get('author_url'), + 'like_count': int_or_none(try_get(video, lambda x: x['likes']['count'])), + 'age_limit': parse_age_limit(video.get('rating')), + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'episode_number': episode_number, + } + + +class VikiChannelIE(VikiBaseIE): + IE_NAME = 'viki:channel' + _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', + 'info_dict': { + 'id': '50c', + 'title': 'Boys Over Flowers', + 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', + }, + 'playlist_mincount': 51, + }, { + 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', + 'info_dict': { + 'id': '1354c', + 'title': 'Poor Nastya [COMPLETE]', + 'description': 'md5:05bf5471385aa8b21c18ad450e350525', + }, + 'playlist_count': 127, + 'skip': 'Page not found', + }, { + 'url': 'http://www.viki.com/news/24569c-showbiz-korea', + 'only_matching': True, + }, { + 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', + 'only_matching': True, + }, { + 'url': 'http://www.viki.com/artists/2141c-shinee', + 'only_matching': True, + }] + + _video_types = ('episodes', 'movies', 'clips', 'trailers') + + def _entries(self, channel_id): + params = { + 'app': self._APP, 'token': self._token, 'only_ids': 'true', + 'direction': 'asc', 'sort': 'number', 'per_page': 30 + } + video_types = self._configuration_arg('video_types') or self._video_types + for video_type in video_types: + if video_type not in self._video_types: + self.report_warning(f'Unknown video_type: {video_type}') + page_num = 0 + while True: + page_num += 1 + params['page'] = page_num + res = self._call_api( + f'containers/{channel_id}/{video_type}.json', channel_id, query=params, fatal=False, + note='Downloading %s JSON page %d' % (video_type.title(), page_num)) + + for video_id in res.get('response') or []: + yield self.url_result(f'https://www.viki.com/videos/{video_id}', VikiIE.ie_key(), video_id) + if not res.get('more'): + break + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._call_api('containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') + self._check_errors(channel) + return self.playlist_result( + self._entries(channel_id), channel_id, + self.dict_selection(channel['titles'], 'en'), + self.dict_selection(channel['descriptions'], 'en')) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py new file mode 100644 index 000000000..8b367a4e6 --- /dev/null +++ b/yt_dlp/extractor/vimeo.py @@ -0,0 +1,1290 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import functools +import json +import re +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_kwargs, + compat_HTTPError, + compat_str, + compat_urlparse, +) +from ..utils import ( + clean_html, + determine_ext, + dict_get, + ExtractorError, + js_to_json, + 
int_or_none,
+    merge_dicts,
+    OnDemandPagedList,
+    parse_filesize,
+    parse_iso8601,
+    parse_qs,
+    RegexNotFoundError,
+    sanitized_Request,
+    smuggle_url,
+    std_headers,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    unsmuggle_url,
+    urlencode_postdata,
+    urljoin,
+    unescapeHTML,
+)
+
+
+class VimeoBaseInfoExtractor(InfoExtractor):
+    _NETRC_MACHINE = 'vimeo'
+    _LOGIN_REQUIRED = False
+    _LOGIN_URL = 'https://vimeo.com/log_in'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return
+        webpage = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+        token, vuid = self._extract_xsrft_and_vuid(webpage)
+        data = {
+            'action': 'login',
+            'email': username,
+            'password': password,
+            'service': 'vimeo',
+            'token': token,
+        }
+        self._set_vimeo_cookie('vuid', vuid)
+        try:
+            self._download_webpage(
+                self._LOGIN_URL, None, 'Logging in',
+                data=urlencode_postdata(data), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Referer': self._LOGIN_URL,
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418:
+                raise ExtractorError(
+                    'Unable to log in: bad username or password',
+                    expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _get_video_password(self):
+        password = self.get_param('videopassword')
+        if password is None:
+            raise ExtractorError(
+                'This video is protected by a password, use the --video-password option',
+                expected=True)
+        return password
+
+    def _verify_video_password(self, url, video_id, password, token, vuid):
+        if url.startswith('http://'):
+            # vimeo only supports https now, but the user can give an http url
+            url = url.replace('http://', 'https://')
+        self._set_vimeo_cookie('vuid', vuid)
+        return self._download_webpage(
+            url + '/password', video_id, 'Verifying the password',
+            'Wrong password', data=urlencode_postdata({
+                'password': password,
+                'token': token,
+            }), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': url,
+            })
+
+    def _extract_xsrft_and_vuid(self, webpage):
+        xsrft = self._search_regex(
+            r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
+            webpage, 'login token', group='xsrft')
+        vuid = self._search_regex(
+            r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1',
+            webpage, 'vuid', group='vuid')
+        return xsrft, vuid
+
+    def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs):
+        vimeo_config = self._search_regex(
+            r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));',
+            webpage, 'vimeo config', *args, **compat_kwargs(kwargs))
+        if vimeo_config:
+            return self._parse_json(vimeo_config, video_id)
+
+    def _set_vimeo_cookie(self, name, value):
+        self._set_cookie('vimeo.com', name, value)
+
+    def _vimeo_sort_formats(self, formats):
+        # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
+        # at the same time without actual units specified. This led to wrong sorting.
+ # But since yt-dlp prefers 'res,fps' anyway, 'field_preference' is not needed + self._sort_formats(formats) + + def _parse_config(self, config, video_id): + video_data = config['video'] + video_title = video_data['title'] + live_event = video_data.get('live_event') or {} + is_live = live_event.get('status') == 'started' + + formats = [] + config_files = video_data.get('files') or config['request'].get('files', {}) + for f in config_files.get('progressive', []): + video_url = f.get('url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': 'http-%s' % f.get('quality'), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'fps': int_or_none(f.get('fps')), + 'tbr': int_or_none(f.get('bitrate')), + }) + + # TODO: fix handling of 308 status code returned for live archive manifest requests + sep_pattern = r'/sep/video/' + for files_type in ('hls', 'dash'): + for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + manifest_url = cdn_data.get('url') + if not manifest_url: + continue + format_id = '%s-%s' % (files_type, cdn_name) + sep_manifest_urls = [] + if re.search(sep_pattern, manifest_url): + for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): + sep_manifest_urls.append((format_id + suffix, re.sub( + sep_pattern, '/%s/' % repl, manifest_url))) + else: + sep_manifest_urls = [(format_id, manifest_url)] + for f_id, m_url in sep_manifest_urls: + if files_type == 'hls': + formats.extend(self._extract_m3u8_formats( + m_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id, + note='Downloading %s m3u8 information' % cdn_name, + fatal=False)) + elif files_type == 'dash': + if 'json=1' in m_url: + real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') + if real_m_url: + m_url = real_m_url + mpd_formats = self._extract_mpd_formats( + m_url.replace('/master.json', '/master.mpd'), video_id, f_id, + 'Downloading %s MPD information' % cdn_name, + fatal=False) + formats.extend(mpd_formats) + + live_archive = live_event.get('archive') or {} + live_archive_source_url = live_archive.get('source_url') + if live_archive_source_url and live_archive.get('status') == 'done': + formats.append({ + 'format_id': 'live-archive-source', + 'url': live_archive_source_url, + 'quality': 10, + }) + + subtitles = {} + text_tracks = config['request'].get('text_tracks') + if text_tracks: + for tt in text_tracks: + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }] + + thumbnails = [] + if not is_live: + for key, thumb in video_data.get('thumbs', {}).items(): + thumbnails.append({ + 'id': key, + 'width': int_or_none(key), + 'url': thumb, + }) + thumbnail = video_data.get('thumbnail') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) + + owner = video_data.get('owner') or {} + video_uploader_url = owner.get('url') + + return { + 'id': str_or_none(video_data.get('id')) or video_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'uploader': owner.get('name'), + 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, + 'uploader_url': video_uploader_url, + 'thumbnails': thumbnails, + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } + + def _extract_original_format(self, url, video_id, unlisted_hash=None): + query = {'action': 'load_download_config'} + if unlisted_hash: + query['unlisted_hash'] = 
unlisted_hash + download_data = self._download_json( + url, video_id, fatal=False, query=query, + headers={'X-Requested-With': 'XMLHttpRequest'}) + if download_data: + source_file = download_data.get('source_file') + if isinstance(source_file, dict): + download_url = source_file.get('download_url') + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = (try_get( + source_file, lambda x: x['extension'], + compat_str) or determine_ext( + download_url, None) or 'mp4').lower() + return { + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'quality': 1, + } + + jwt_response = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} + if not jwt_response.get('jwt'): + return + headers = {'Authorization': 'jwt %s' % jwt_response['jwt']} + original_response = self._download_json( + f'https://api.vimeo.com/videos/{video_id}', video_id, + headers=headers, fatal=False) or {} + for download_data in original_response.get('download') or {}: + download_url = download_data.get('link') + if not download_url or download_data.get('quality') != 'source': + continue + query = parse_qs(download_url) + return { + 'url': download_url, + 'ext': determine_ext(query.get('filename', [''])[0].lower()), + 'format_id': download_data.get('public_name', 'Original'), + 'width': int_or_none(download_data.get('width')), + 'height': int_or_none(download_data.get('height')), + 'fps': int_or_none(download_data.get('fps')), + 'filesize': int_or_none(download_data.get('size')), + 'quality': 1, + } + + +class VimeoIE(VimeoBaseInfoExtractor): + """Information extractor for vimeo.com.""" + + # _VALID_URL matches Vimeo URLs + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + www| + player + ) + \. + )? + vimeo(?:pro)?\.com/ + (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) + (?:.*?/)? + (?: + (?: + play_redirect_hls| + moogaloop\.swf)\?clip_id= + )? + (?:videos?/)? + (?P<id>[0-9]+) + (?:/(?P<unlisted_hash>[\da-f]{10}))? 
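+                    # (?x) mode allows this comment: trailing slash, query string and fragment are all optional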
+ /?(?:[?&].*)?(?:[#].*)?$ + ''' + IE_NAME = 'vimeo' + _TESTS = [ + { + 'url': 'http://vimeo.com/56015672#at=0', + 'md5': '8879b6cc097e987f02484baf890129e5', + 'info_dict': { + 'id': '56015672', + 'ext': 'mp4', + 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'timestamp': 1355990239, + 'upload_date': '20121220', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', + 'uploader_id': 'user7108434', + 'uploader': 'Filippo Valsorda', + 'duration': 10, + 'license': 'by-sa', + }, + 'params': { + 'format': 'best[protocol=https]', + }, + }, + { + 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', + 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', + 'note': 'Vimeo Pro video (#1197)', + 'info_dict': { + 'id': '68093876', + 'ext': 'mp4', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', + 'uploader_id': 'openstreetmapus', + 'uploader': 'OpenStreetMap US', + 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'duration': 1595, + 'upload_date': '20130610', + 'timestamp': 1370893156, + }, + 'params': { + 'format': 'best[protocol=https]', + }, + }, + { + 'url': 'http://player.vimeo.com/video/54469442', + 'md5': '619b811a4417aa4abe78dc653becf511', + 'note': 'Videos that embed the url in the player page', + 'info_dict': { + 'id': '54469442', + 'ext': 'mp4', + 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', + 'uploader': 'Business of Software', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware', + 'uploader_id': 'businessofsoftware', + 'duration': 3610, + 'description': None, + }, + 'params': { + 'format': 'best[protocol=https]', + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + 'url': 'http://vimeo.com/68375962', + 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', + 'note': 'Video protected with password', + 'info_dict': { + 'id': '68375962', + 'ext': 'mp4', + 'title': 'youtube-dl password protected test video', + 'timestamp': 1371200155, + 'upload_date': '20130614', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', + 'uploader_id': 'user18948128', + 'uploader': 'Jaime Marquínez Ferrándiz', + 'duration': 10, + 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', + }, + 'params': { + 'format': 'best[protocol=https]', + 'videopassword': 'youtube-dl', + }, + }, + { + 'url': 'http://vimeo.com/channels/keypeele/75629013', + 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', + 'info_dict': { + 'id': '75629013', + 'ext': 'mp4', + 'title': 'Key & Peele: Terrorist Interrogation', + 'description': 'md5:8678b246399b070816b12313e8b4eb5c', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', + 'uploader_id': 'atencio', + 'uploader': 'Peter Atencio', + 'channel_id': 'keypeele', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele', + 'timestamp': 1380339469, + 'upload_date': '20130928', + 'duration': 187, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + 'url': 'http://vimeo.com/76979871', + 'note': 'Video with subtitles', + 'info_dict': { + 'id': '76979871', + 'ext': 'mp4', + 'title': 'The New Vimeo Player (You Know, For Videos)', + 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', + 'timestamp': 1381846109, + 'upload_date': '20131015', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', + 'uploader_id': 'staff', + 
'uploader': 'Vimeo Staff', + 'duration': 62, + } + }, + { + # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ + 'url': 'https://player.vimeo.com/video/98044508', + 'note': 'The js code contains assignments to the same variable as the config', + 'info_dict': { + 'id': '98044508', + 'ext': 'mp4', + 'title': 'Pier Solar OUYA Official Trailer', + 'uploader': 'Tulio Gonçalves', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593', + 'uploader_id': 'user28849593', + }, + }, + { + # contains original format + 'url': 'https://vimeo.com/33951933', + 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'info_dict': { + 'id': '33951933', + 'ext': 'mp4', + 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', + 'uploader': 'The DMCI', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', + 'uploader_id': 'dmci', + 'timestamp': 1324343742, + 'upload_date': '20111220', + 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + }, + }, + { + 'note': 'Contains original format not accessible in webpage', + 'url': 'https://vimeo.com/393756517', + 'md5': 'c464af248b592190a5ffbb5d33f382b0', + 'info_dict': { + 'id': '393756517', + 'ext': 'mov', + 'timestamp': 1582642091, + 'uploader_id': 'frameworkla', + 'title': 'Straight To Hell - Sabrina: Netflix', + 'uploader': 'Framework Studio', + 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', + 'upload_date': '20200225', + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes', + 'channel_id': 'tributes', + 'timestamp': 1250886430, + 'upload_date': '20090821', + 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + # redirects to ondemand extractor and should be passed through it + # for successful extraction + 'url': 'https://vimeo.com/73445910', + 'info_dict': { + 'id': '73445910', + 'ext': 'mp4', + 'title': 'The Reluctant Revolutionary', + 'uploader': '10Ft Films', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', + 'uploader_id': 'tenfootfilms', + 'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384', + 'upload_date': '20130830', + 'timestamp': 1377853339, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + 'skip': 'this page is no longer available.', + }, + { + 'url': 'http://player.vimeo.com/video/68375962', + 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', + 'info_dict': { + 'id': '68375962', + 'ext': 'mp4', + 'title': 'youtube-dl password protected test video', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', + 'uploader_id': 'user18948128', + 'uploader': 'Jaime Marquínez Ferrándiz', + 'duration': 10, + }, + 'params': { + 'format': 'best[protocol=https]', + 'videopassword': 'youtube-dl', + }, + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, + { + 'url': 'https://vimeo.com/109815029', + 'note': 'Video not completely processed, "failed" seed status', + 'only_matching': True, + }, + { + 
'url': 'https://vimeo.com/groups/travelhd/videos/22439234', + 'only_matching': True, + }, + { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, + { + 'url': 'https://vimeo.com/showcase/3253534/video/119195465', + 'note': 'A video in a password protected album (showcase)', + 'info_dict': { + 'id': '119195465', + 'ext': 'mp4', + 'title': 'youtube-dl test video \'ä"BaW_jenozKc', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'user20132939', + 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'upload_date': '20150209', + 'timestamp': 1423518307, + }, + 'params': { + 'format': 'best[protocol=https]', + 'videopassword': 'youtube-dl', + }, + }, + { + # source file returns 403: Forbidden + 'url': 'https://vimeo.com/7809605', + 'only_matching': True, + }, + { + 'url': 'https://vimeo.com/160743502/abd0e13fb4', + 'only_matching': True, + }, + { + # requires passing unlisted_hash(a52724358e) to load_download_config request + 'url': 'https://vimeo.com/392479337/a52724358e', + 'only_matching': True, + } + # https://gettingthingsdone.com/workflowmap/ + # vimeo embed with check-password page protected by Referer header + ] + + @staticmethod + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + + @staticmethod + def _extract_urls(url, webpage): + urls = [] + # Look for embedded (iframe) Vimeo player + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', + webpage): + urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) + PLAIN_EMBED_RE = ( + # Look for embedded (swf embed) Vimeo player + r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', + # Look more for non-standard embedded Vimeo player + r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', + ) + for embed_re in PLAIN_EMBED_RE: + for mobj in re.finditer(embed_re, webpage): + urls.append(mobj.group('url')) + return urls + + @staticmethod + def _extract_url(url, webpage): + urls = VimeoIE._extract_urls(url, webpage) + return urls[0] if urls else None + + def _verify_player_video_password(self, url, video_id, headers): + password = self._get_video_password() + data = urlencode_postdata({ + 'password': base64.b64encode(password.encode()), + }) + headers = merge_dicts(headers, { + 'Content-Type': 'application/x-www-form-urlencoded', + }) + checked = self._download_json( + url + '/check-password', video_id, + 'Verifying the password', data=data, headers=headers) + if checked is False: + raise ExtractorError('Wrong video password', expected=True) + return checked + + def _real_initialize(self): + self._login() + + def _try_album_password(self, url): + album_id = self._search_regex( + r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None) + if not album_id: + return + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] + jwt = viewer['jwt'] + album = self._download_json( + 'https://api.vimeo.com/albums/' + album_id, + album_id, headers={'Authorization': 'jwt ' + jwt}, + query={'fields': 'description,name,privacy'}) + if try_get(album, lambda x: x['privacy']['view']) == 'password': + password = self.get_param('videopassword') + if not password: 
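+                # the showcase (album) is password-protected; without --video-password
+                # there is nothing useful to fetch, so fail early with a clear message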
+                raise ExtractorError(
+                    'This album is protected by a password, use the --video-password option',
+                    expected=True)
+            self._set_vimeo_cookie('vuid', viewer['vuid'])
+            try:
+                self._download_json(
+                    'https://vimeo.com/showcase/%s/auth' % album_id,
+                    album_id, 'Verifying the password', data=urlencode_postdata({
+                        'password': password,
+                        'token': viewer['xsrft'],
+                    }), headers={
+                        'X-Requested-With': 'XMLHttpRequest',
+                    })
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    raise ExtractorError('Wrong password', expected=True)
+                raise
+
+    def _real_extract(self, url):
+        url, data = unsmuggle_url(url, {})
+        headers = std_headers.copy()
+        if 'http_headers' in data:
+            headers.update(data['http_headers'])
+        if 'Referer' not in headers:
+            headers['Referer'] = url
+
+        # Extract ID from URL
+        video_id, unlisted_hash = self._match_valid_url(url).groups()
+        if unlisted_hash:
+            token = self._download_json(
+                'https://vimeo.com/_rv/jwt', video_id, headers={
+                    'X-Requested-With': 'XMLHttpRequest'
+                })['token']
+            video = self._download_json(
+                'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash),
+                video_id, headers={
+                    'Authorization': 'jwt ' + token,
+                }, query={
+                    'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
+                })
+            info = self._parse_config(self._download_json(
+                video['config_url'], video_id), video_id)
+            self._vimeo_sort_formats(info['formats'])
+            get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
+            info.update({
+                'description': video.get('description'),
+                'license': video.get('license'),
+                'release_timestamp': get_timestamp('release'),
+                'timestamp': get_timestamp('created'),
+                'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
+            })
+            connections = try_get(
+                video, lambda x: x['metadata']['connections'], dict) or {}
+            for k in ('comment', 'like'):
+                info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
+            return info
+
+        orig_url = url
+        is_pro = 'vimeopro.com/' in url
+        is_player = '://player.vimeo.com/video/' in url
+        if is_pro:
+            # some videos require portfolio_id to be present in player url
+            # https://github.com/ytdl-org/youtube-dl/issues/20070
+            url = self._extract_url(url, self._download_webpage(url, video_id))
+            if not url:
+                url = 'https://vimeo.com/' + video_id
+        elif is_player:
+            url = 'https://player.vimeo.com/video/' + video_id
+        elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
+            url = 'https://vimeo.com/' + video_id
+
+        self._try_album_password(url)
+        try:
+            # Retrieve video webpage to extract further information
+            webpage, urlh = self._download_webpage_handle(
+                url, video_id, headers=headers)
+            redirect_url = urlh.geturl()
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                errmsg = ee.cause.read()
+                if b'Because of its privacy settings, this video cannot be played here' in errmsg:
+                    raise ExtractorError(
+                        'Cannot download embed-only video without embedding '
+                        'URL. Please call yt-dlp with the URL of the page '
+                        'that embeds this video.',
+                        expected=True)
+            raise
+
+        # Now we begin extracting as much information as we can from what we
+        # retrieved. First we extract the information common to all extractors,
+        # and later we extract those that are Vimeo specific.
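+        # a 'failed' seed status below reflects Vimeo's own transcoding state:
+        # the upload was never processed, so abort with Vimeo's error message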
+ self.report_extraction(video_id) + + vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) + if vimeo_config: + seed_status = vimeo_config.get('seed_status', {}) + if seed_status.get('state') == 'failed': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, seed_status['title']), + expected=True) + + cc_license = None + timestamp = None + video_description = None + + # Extract the config JSON + try: + try: + config_url = self._html_search_regex( + r' data-config-url="(.+?)"', webpage, + 'config URL', default=None) + if not config_url: + # Sometimes new react-based page is served instead of old one that require + # different config URL extraction approach (see + # https://github.com/ytdl-org/youtube-dl/pull/7209) + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config'), video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + timestamp = try_get( + page_config, lambda x: x['clip']['uploaded_on'], + compat_str) + video_description = clean_html(dict_get( + page_config, ('description', 'description_html_escaped'))) + config = self._download_json(config_url, video_id) + except RegexNotFoundError: + # For pro videos or player.vimeo.com urls + # We try to find out to which variable is assigned the config dic + m_variable_name = re.search(r'(\w)\.video\.id', webpage) + if m_variable_name is not None: + config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] + else: + config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') + config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') + config = self._search_regex(config_re, webpage, 'info section', + flags=re.DOTALL) + config = json.loads(config) + except Exception as e: + if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): + raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') + + if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None: + if '_video_password_verified' in data: + raise ExtractorError('video password verification failed!') + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) + return self._real_extract( + smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) + else: + raise ExtractorError('Unable to extract info section', + cause=e) + else: + if config.get('view') == 4: + config = self._verify_player_video_password(redirect_url, video_id, headers) + + video = config.get('video') or {} + vod = video.get('vod') or {} + + def is_rented(): + if '>You rented this title.<' in webpage: + return True + if config.get('user', {}).get('purchased'): + return True + for purchase_option in vod.get('purchase_options', []): + if purchase_option.get('purchased'): + return True + label = purchase_option.get('label_string') + if label and (label.startswith('You rented this') or label.endswith(' remaining')): + return True + return False + + if is_rented() and vod.get('is_trailer'): + feature_id = vod.get('feature_id') + if feature_id and not data.get('force_feature_id', False): + return self.url_result(smuggle_url( + 'https://player.vimeo.com/player/%s' % feature_id, + {'force_feature_id': True}), 'Vimeo') + + # Extract video description + if not video_description: + 
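+            # the config JSON carried no description: fall back to scraping the
+            # description <div>, then the <meta name="description"> tag below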
video_description = self._html_search_regex( + r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', + webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_meta( + 'description', webpage, default=None) + if not video_description and is_pro: + orig_webpage = self._download_webpage( + orig_url, video_id, + note='Downloading webpage for description', + fatal=False) + if orig_webpage: + video_description = self._html_search_meta( + 'description', orig_webpage, default=None) + if not video_description and not is_player: + self.report_warning('Cannot find video description') + + # Extract upload date + if not timestamp: + timestamp = self._search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, + 'timestamp', default=None) + + try: + view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) + like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) + comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) + except RegexNotFoundError: + # This info is only available in vimeo.com/{id} urls + view_count = None + like_count = None + comment_count = None + + formats = [] + + source_format = self._extract_original_format( + 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) + if source_format: + formats.append(source_format) + + info_dict_config = self._parse_config(config, video_id) + formats.extend(info_dict_config['formats']) + self._vimeo_sort_formats(formats) + + json_ld = self._search_json_ld(webpage, video_id, default={}) + + if not cc_license: + cc_license = self._search_regex( + r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', + webpage, 'license', default=None, group='license') + + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None + + info_dict = { + 'formats': formats, + 'timestamp': unified_timestamp(timestamp), + 'description': video_description, + 'webpage_url': url, + 'view_count': view_count, + 'like_count': like_count, + 'comment_count': comment_count, + 'license': cc_license, + 'channel_id': channel_id, + 'channel_url': channel_url, + } + + info_dict = merge_dicts(info_dict, info_dict_config, json_ld) + + return info_dict + + +class VimeoOndemandIE(VimeoIE): + IE_NAME = 'vimeo:ondemand' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)' + _TESTS = [{ + # ondemand video not available via https://vimeo.com/id + 'url': 'https://vimeo.com/ondemand/20704', + 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35', + 'info_dict': { + 'id': '105442900', + 'ext': 'mp4', + 'title': 'המעבדה - במאי יותם פלדמן', + 'uploader': 'גם סרטים', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', + 'uploader_id': 'gumfilms', + 'description': 'md5:4c027c965e439de4baab621e48b60791', + 'upload_date': '20140906', + 'timestamp': 1410032453, + }, + 'params': { + 'format': 'best[protocol=https]', + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # requires Referer to be passed along with og:video:url + 'url': 'https://vimeo.com/ondemand/36938/126682985', + 'info_dict': { + 'id': '126584684', + 'ext': 'mp4', + 'title': 'Rävlock, rätt läte på rätt plats', + 'uploader': 'Lindroth & Norin', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin', + 'uploader_id': 'lindrothnorin', + 'description': 
'md5:c3c46a90529612c8279fb6af803fc0df', + 'upload_date': '20150502', + 'timestamp': 1430586422, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + 'url': 'https://vimeo.com/ondemand/nazmaalik', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/141692381', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832', + 'only_matching': True, + }] + + +class VimeoChannelIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:channel' + _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])' + _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' + _TITLE = None + _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' + _TESTS = [{ + 'url': 'https://vimeo.com/channels/tributes', + 'info_dict': { + 'id': 'tributes', + 'title': 'Vimeo Tributes', + }, + 'playlist_mincount': 25, + }] + _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s' + + def _page_url(self, base_url, pagenum): + return '%s/videos/page:%d/' % (base_url, pagenum) + + def _extract_list_title(self, webpage): + return self._TITLE or self._html_search_regex( + self._TITLE_RE, webpage, 'list title', fatal=False) + + def _title_and_entries(self, list_id, base_url): + for pagenum in itertools.count(1): + page_url = self._page_url(base_url, pagenum) + webpage = self._download_webpage( + page_url, list_id, + 'Downloading page %s' % pagenum) + + if pagenum == 1: + yield self._extract_list_title(webpage) + + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage) + if clips: + for video_id, video_url, video_title in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id, video_title=video_title) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) + + if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: + break + + def _extract_videos(self, list_id, base_url): + title_and_entries = self._title_and_entries(list_id, base_url) + list_title = next(title_and_entries) + return self.playlist_result(title_and_entries, list_id, list_title) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) + + +class VimeoUserIE(VimeoChannelIE): + IE_NAME = 'vimeo:user' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos|[#?]|$)' + _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' + _TESTS = [{ + 'url': 'https://vimeo.com/nkistudio/videos', + 'info_dict': { + 'title': 'Nki', + 'id': 'nkistudio', + }, + 'playlist_mincount': 66, + }] + _BASE_URL_TEMPL = 'https://vimeo.com/%s' + + +class VimeoAlbumIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:album' + _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P<id>\d+)(?:$|[?#]|/(?!video))' + _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' + _TESTS = [{ + 'url': 'https://vimeo.com/album/2632481', + 'info_dict': { + 'id': '2632481', + 'title': 'Staff Favorites: November 2013', + }, + 'playlist_mincount': 13, + }, { + 'note': 'Password-protected album', + 'url': 'https://vimeo.com/album/3253534', + 'info_dict': { + 'title': 'test', 
+ 'id': '3253534', + }, + 'playlist_count': 1, + 'params': { + 'videopassword': 'youtube-dl', + } + }] + _PAGE_SIZE = 100 + + def _fetch_page(self, album_id, authorization, hashed_pass, page): + api_page = page + 1 + query = { + 'fields': 'link,uri', + 'page': api_page, + 'per_page': self._PAGE_SIZE, + } + if hashed_pass: + query['_hashed_pass'] = hashed_pass + try: + videos = self._download_json( + 'https://api.vimeo.com/albums/%s/videos' % album_id, + album_id, 'Downloading page %d' % api_page, query=query, headers={ + 'Authorization': 'jwt ' + authorization, + })['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return + for video in videos: + link = video.get('link') + if not link: + continue + uri = video.get('uri') + video_id = self._search_regex(r'/videos/(\d+)', uri, 'video_id', default=None) if uri else None + yield self.url_result(link, VimeoIE.ie_key(), video_id) + + def _real_extract(self, url): + album_id = self._match_id(url) + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] + jwt = viewer['jwt'] + album = self._download_json( + 'https://api.vimeo.com/albums/' + album_id, + album_id, headers={'Authorization': 'jwt ' + jwt}, + query={'fields': 'description,name,privacy'}) + hashed_pass = None + if try_get(album, lambda x: x['privacy']['view']) == 'password': + password = self.get_param('videopassword') + if not password: + raise ExtractorError( + 'This album is protected by a password, use the --video-password option', + expected=True) + self._set_vimeo_cookie('vuid', viewer['vuid']) + try: + hashed_pass = self._download_json( + 'https://vimeo.com/showcase/%s/auth' % album_id, + album_id, 'Verifying the password', data=urlencode_postdata({ + 'password': password, + 'token': viewer['xsrft'], + }), headers={ + 'X-Requested-With': 'XMLHttpRequest', + })['hashed_pass'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError('Wrong password', expected=True) + raise + entries = OnDemandPagedList(functools.partial( + self._fetch_page, album_id, jwt, hashed_pass), self._PAGE_SIZE) + return self.playlist_result( + entries, album_id, album.get('name'), album.get('description')) + + +class VimeoGroupsIE(VimeoChannelIE): + IE_NAME = 'vimeo:group' + _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)' + _TESTS = [{ + 'url': 'https://vimeo.com/groups/kattykay', + 'info_dict': { + 'id': 'kattykay', + 'title': 'Katty Kay', + }, + 'playlist_mincount': 27, + }] + _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s' + + +class VimeoReviewIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:review' + IE_DESC = 'Review pages on vimeo' + _VALID_URL = r'(?P<url>https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)/[0-9a-f]{10})' + _TESTS = [{ + 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', + 'md5': 'c507a72f780cacc12b2248bb4006d253', + 'info_dict': { + 'id': '75524534', + 'ext': 'mp4', + 'title': "DICK HARDWICK 'Comedian'", + 'uploader': 'Richard Hardwick', + 'uploader_id': 'user21297594', + 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + 'note': 'video 
player needs Referer', + 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', + 'md5': '6295fdab8f4bf6a002d058b2c6dce276', + 'info_dict': { + 'id': '91613211', + 'ext': 'mp4', + 'title': 're:(?i)^Death by dogma versus assembling agile . Sander Hoogendoorn', + 'uploader': 'DevWeek Events', + 'duration': 2773, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader_id': 'user22258446', + }, + 'skip': 'video gone', + }, { + 'note': 'Password protected', + 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', + 'info_dict': { + 'id': '138823582', + 'ext': 'mp4', + 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1', + 'uploader': 'TMB', + 'uploader_id': 'user37284429', + }, + 'params': { + 'videopassword': 'holygrail', + }, + 'skip': 'video gone', + }] + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + page_url, video_id = self._match_valid_url(url).groups() + data = self._download_json( + page_url.replace('/review/', '/review/data/'), video_id) + if data.get('isLocked') is True: + video_password = self._get_video_password() + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id) + webpage = self._verify_video_password( + 'https://vimeo.com/' + video_id, video_id, + video_password, viewer['xsrft'], viewer['vuid']) + clip_page_config = self._parse_json(self._search_regex( + r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', + webpage, 'clip page config'), video_id) + config_url = clip_page_config['player']['config_url'] + clip_data = clip_page_config.get('clip') or {} + else: + clip_data = data['clipData'] + config_url = clip_data['configUrl'] + config = self._download_json(config_url, video_id) + info_dict = self._parse_config(config, video_id) + source_format = self._extract_original_format( + page_url + '/action', video_id) + if source_format: + info_dict['formats'].append(source_format) + self._vimeo_sort_formats(info_dict['formats']) + info_dict['description'] = clean_html(clip_data.get('description')) + return info_dict + + +class VimeoWatchLaterIE(VimeoChannelIE): + IE_NAME = 'vimeo:watchlater' + IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' + _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater' + _TITLE = 'Watch Later' + _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': 'https://vimeo.com/watchlater', + 'only_matching': True, + }] + + def _real_initialize(self): + self._login() + + def _page_url(self, base_url, pagenum): + url = '%s/page:%d/' % (base_url, pagenum) + request = sanitized_Request(url) + # Set the header to get a partial html page with the ids, + # the normal page doesn't contain them. 
+ request.add_header('X-Requested-With', 'XMLHttpRequest') + return request + + def _real_extract(self, url): + return self._extract_videos('watchlater', 'https://vimeo.com/watchlater') + + +class VimeoLikesIE(VimeoChannelIE): + _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P<id>[^/]+)/likes/?(?:$|[?#]|sort:)' + IE_NAME = 'vimeo:likes' + IE_DESC = 'Vimeo user likes' + _TESTS = [{ + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, + 'info_dict': { + 'id': 'user755559', + 'title': 'urza’s Likes', + }, + }, { + 'url': 'https://vimeo.com/stormlapse/likes', + 'only_matching': True, + }] + + def _page_url(self, base_url, pagenum): + return '%s/page:%d/' % (base_url, pagenum) + + def _real_extract(self, url): + user_id = self._match_id(url) + return self._extract_videos(user_id, 'https://vimeo.com/%s/likes' % user_id) + + +class VHXEmbedIE(VimeoBaseInfoExtractor): + IE_NAME = 'vhx:embed' + _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) + return unescapeHTML(mobj.group(1)) if mobj else None + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + config_url = self._parse_json(self._search_regex( + r'window\.OTTData\s*=\s*({.+})', webpage, + 'ott data'), video_id, js_to_json)['config_url'] + config = self._download_json(config_url, video_id) + info = self._parse_config(config, video_id) + info['id'] = video_id + self._vimeo_sort_formats(info['formats']) + return info diff --git a/youtube_dl/extractor/vimple.py b/yt_dlp/extractor/vimple.py index c74b43766..c74b43766 100644 --- a/youtube_dl/extractor/vimple.py +++ b/yt_dlp/extractor/vimple.py diff --git a/yt_dlp/extractor/vine.py b/yt_dlp/extractor/vine.py new file mode 100644 index 000000000..07fce0daa --- /dev/null +++ b/yt_dlp/extractor/vine.py @@ -0,0 +1,154 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + unified_timestamp, +) + + +class VineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://vine.co/v/b9KOOWX7HUx', + 'md5': '2f36fed6235b16da96ce9b4dc890940d', + 'info_dict': { + 'id': 'b9KOOWX7HUx', + 'ext': 'mp4', + 'title': 'Chicken.', + 'alt_title': 'Vine by Jack', + 'timestamp': 1368997951, + 'upload_date': '20130519', + 'uploader': 'Jack', + 'uploader_id': '76', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, { + 'url': 'https://vine.co/v/e192BnZnZ9V', + 'info_dict': { + 'id': 'e192BnZnZ9V', + 'ext': 'mp4', + 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'alt_title': 'Vine by Pimry_zaa', + 'timestamp': 1436057405, + 'upload_date': '20150705', + 'uploader': 'Pimry_zaa', + 'uploader_id': '1135760698325307392', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://vine.co/v/MYxVapFvz2z', + 'only_matching': True, + }, { + 'url': 'https://vine.co/v/bxVjBbZlPUH', + 'only_matching': True, + }, { + 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 
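+            # Vine itself is discontinued; post metadata is served from the static archive endpoint below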
'https://archive.vine.co/posts/%s.json' % video_id, video_id) + + def video_url(kind): + for url_suffix in ('Url', 'URL'): + format_url = data.get('video%s%s' % (kind, url_suffix)) + if format_url: + return format_url + + formats = [] + for quality, format_id in enumerate(('low', '', 'dash')): + format_url = video_url(format_id.capitalize()) + if not format_url: + continue + # DASH link returns plain mp4 + if format_id == 'dash' and determine_ext(format_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id or 'standard', + 'quality': quality, + }) + self._check_formats(formats, video_id) + self._sort_formats(formats) + + username = data.get('username') + + alt_title = 'Vine by %s' % username if username else None + + return { + 'id': video_id, + 'title': data.get('description') or alt_title or 'Vine video', + 'alt_title': alt_title, + 'thumbnail': data.get('thumbnailUrl'), + 'timestamp': unified_timestamp(data.get('created')), + 'uploader': username, + 'uploader_id': data.get('userIdStr'), + 'view_count': int_or_none(data.get('loops')), + 'like_count': int_or_none(data.get('likes')), + 'comment_count': int_or_none(data.get('comments')), + 'repost_count': int_or_none(data.get('reposts')), + 'formats': formats, + } + + +class VineUserIE(InfoExtractor): + IE_NAME = 'vine:user' + _VALID_URL = r'https?://vine\.co/(?P<u>u/)?(?P<user>[^/]+)' + _VINE_BASE_URL = 'https://vine.co/' + _TESTS = [{ + 'url': 'https://vine.co/itsruthb', + 'info_dict': { + 'id': 'itsruthb', + 'title': 'Ruth B', + 'description': '| Instagram/Twitter: itsruthb | still a lost boy from neverland', + }, + 'playlist_mincount': 611, + }, { + 'url': 'https://vine.co/u/942914934646415360', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if VineIE.suitable(url) else super(VineUserIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user = mobj.group('user') + u = mobj.group('u') + + profile_url = '%sapi/users/profiles/%s%s' % ( + self._VINE_BASE_URL, 'vanity/' if not u else '', user) + profile_data = self._download_json( + profile_url, user, note='Downloading user profile data') + + data = profile_data['data'] + user_id = data.get('userId') or data['userIdStr'] + profile = self._download_json( + 'https://archive.vine.co/profiles/%s.json' % user_id, user_id) + entries = [ + self.url_result( + 'https://vine.co/v/%s' % post_id, ie='Vine', video_id=post_id) + for post_id in profile['posts'] + if post_id and isinstance(post_id, compat_str)] + return self.playlist_result( + entries, user, profile.get('username'), profile.get('description')) diff --git a/youtube_dl/extractor/viqeo.py b/yt_dlp/extractor/viqeo.py index be7dfa814..be7dfa814 100644 --- a/youtube_dl/extractor/viqeo.py +++ b/yt_dlp/extractor/viqeo.py diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py new file mode 100644 index 000000000..1b34c5296 --- /dev/null +++ b/yt_dlp/extractor/viu.py @@ -0,0 +1,403 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import ( + compat_kwargs, + compat_str, + compat_urlparse, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + smuggle_url, + unsmuggle_url, +) + + +class ViuBaseIE(InfoExtractor): + def _real_initialize(self): + viu_auth_res = self._request_webpage( + 
'https://www.viu.com/api/apps/v2/authenticate', None, + 'Requesting Viu auth', query={ + 'acct': 'test', + 'appid': 'viu_desktop', + 'fmt': 'json', + 'iid': 'guest', + 'languageid': 'default', + 'platform': 'desktop', + 'userid': 'guest', + 'useridtype': 'guest', + 'ver': '1.0' + }, headers=self.geo_verification_headers()) + self._auth_token = viu_auth_res.info()['X-VIU-AUTH'] + + def _call_api(self, path, *args, **kwargs): + headers = self.geo_verification_headers() + headers.update({ + 'X-VIU-AUTH': self._auth_token + }) + headers.update(kwargs.get('headers', {})) + kwargs['headers'] = headers + response = self._download_json( + 'https://www.viu.com/api/' + path, *args, + **compat_kwargs(kwargs))['response'] + if response.get('status') != 'success': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + return response + + +class ViuIE(ViuBaseIE): + _VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059', + 'info_dict': { + 'id': '1116705532', + 'ext': 'mp4', + 'title': 'Citizen Khan - Ep 1', + 'description': 'md5:d7ea1604f49e5ba79c212c551ce2110e', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to India', + }, { + 'url': 'https://www.viu.com/en/media/1130599965', + 'info_dict': { + 'id': '1130599965', + 'ext': 'mp4', + 'title': 'Jealousy Incarnate - Episode 1', + 'description': 'md5:d3d82375cab969415d2720b6894361e9', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to Indonesia', + }, { + 'url': 'https://india.viu.com/en/media/1126286865', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'clip/load', video_id, 'Downloading video data', query={ + 'appid': 'viu_desktop', + 'fmt': 'json', + 'id': video_id + })['item'][0] + + title = video_data['title'] + + m3u8_url = None + url_path = video_data.get('urlpathd') or video_data.get('urlpath') + tdirforwhole = video_data.get('tdirforwhole') + # #EXT-X-BYTERANGE is not supported by native hls downloader + # and ffmpeg (#10955) + # hls_file = video_data.get('hlsfile') + hls_file = video_data.get('jwhlsfile') + if url_path and tdirforwhole and hls_file: + m3u8_url = '%s/%s/%s' % (url_path, tdirforwhole, hls_file) + else: + # m3u8_url = re.sub( + # r'(/hlsc_)[a-z]+(\d+\.m3u8)', + # r'\1whe\2', video_data['href']) + m3u8_url = video_data['href'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + for key, value in video_data.items(): + mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) + if not mobj: + continue + subtitles.setdefault(mobj.group('lang'), []).append({ + 'url': value, + 'ext': mobj.group('ext') + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'series': video_data.get('moviealbumshowname'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episodeno')), + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ViuPlaylistIE(ViuBaseIE): + IE_NAME = 'viu:playlist' + _VALID_URL = r'https?://www\.viu\.com/[^/]+/listing/playlist-(?P<id>\d+)' + _TEST = { + 'url': 'https://www.viu.com/en/listing/playlist-22461380', + 'info_dict': { + 'id': '22461380', + 'title': 'The Good Wife', + }, + 'playlist_count': 16, + 'skip': 
'Geo-restricted to Indonesia', + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_data = self._call_api( + 'container/load', playlist_id, + 'Downloading playlist info', query={ + 'appid': 'viu_desktop', + 'fmt': 'json', + 'id': 'playlist-' + playlist_id + })['container'] + + entries = [] + for item in playlist_data.get('item', []): + item_id = item.get('id') + if not item_id: + continue + item_id = compat_str(item_id) + entries.append(self.url_result( + 'viu:' + item_id, 'Viu', item_id)) + + return self.playlist_result( + entries, playlist_id, playlist_data.get('title')) + + +class ViuOTTIE(InfoExtractor): + IE_NAME = 'viu:ott' + _NETRC_MACHINE = 'viu' + _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/(?P<lang_code>[a-z]{2}-[a-z]{2})/vod/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I', + 'info_dict': { + 'id': '3421', + 'ext': 'mp4', + 'title': 'A New Beginning', + 'description': 'md5:1e7486a619b6399b25ba6a41c0fe5b2c', + }, + 'params': { + 'skip_download': 'm3u8 download', + 'noplaylist': True, + }, + 'skip': 'Geo-restricted to Singapore', + }, { + 'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90', + 'info_dict': { + 'id': '7123', + 'ext': 'mp4', + 'title': '這就是我的生活之道', + 'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f', + }, + 'params': { + 'skip_download': 'm3u8 download', + 'noplaylist': True, + }, + 'skip': 'Geo-restricted to Hong Kong', + }, { + 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/68776/%E6%99%82%E5%B0%9A%E5%AA%BD%E5%92%AA', + 'playlist_count': 12, + 'info_dict': { + 'id': '3916', + 'title': '時尚媽咪', + }, + 'params': { + 'skip_download': 'm3u8 download', + 'noplaylist': False, + }, + 'skip': 'Geo-restricted to Hong Kong', + }] + + _AREA_ID = { + 'HK': 1, + 'SG': 2, + 'TH': 4, + 'PH': 5, + } + _LANGUAGE_FLAG = { + 'zh-hk': 1, + 'zh-cn': 2, + 'en-us': 3, + } + _user_info = None + + def _detect_error(self, response): + code = response.get('status', {}).get('code') + if code > 0: + message = try_get(response, lambda x: x['status']['message']) + raise ExtractorError('%s said: %s (%s)' % ( + self.IE_NAME, message, code), expected=True) + return response['data'] + + def _raise_login_required(self): + raise ExtractorError( + 'This video requires login. ' + 'Specify --username and --password or --netrc (machine: %s) ' + 'to provide account credentials.' 
% self._NETRC_MACHINE, + expected=True) + + def _login(self, country_code, video_id): + if not self._user_info: + username, password = self._get_login_info() + if username is None or password is None: + return + + data = self._download_json( + compat_urllib_request.Request( + 'https://www.viu.com/ott/%s/index.php' % country_code, method='POST'), + video_id, 'Logging in', errnote=False, fatal=False, + query={'r': 'user/login'}, + data=json.dumps({ + 'username': username, + 'password': password, + 'platform_flag_label': 'web', + }).encode()) + self._user_info = self._detect_error(data)['user'] + + return self._user_info + + def _real_extract(self, url): + url, idata = unsmuggle_url(url, {}) + country_code, lang_code, video_id = self._match_valid_url(url).groups() + + query = { + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + } + + area_id = self._AREA_ID.get(country_code.upper()) + if area_id: + query['area_id'] = area_id + + product_data = self._download_json( + 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, + 'Downloading video info', query=query)['data'] + + video_data = product_data.get('current_product') + if not video_data: + raise ExtractorError('This video is not available in your region.', expected=True) + + series_id = video_data.get('series_id') + if not self.get_param('noplaylist') and not idata.get('force_noplaylist'): + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % series_id) + series = product_data.get('series', {}) + product = series.get('product') + if product: + entries = [] + for entry in sorted(product, key=lambda x: int_or_none(x.get('number', 0))): + item_id = entry.get('product_id') + if not item_id: + continue + item_id = compat_str(item_id) + entries.append(self.url_result( + smuggle_url( + 'http://www.viu.com/ott/%s/%s/vod/%s/' % (country_code, lang_code, item_id), + {'force_noplaylist': True}), # prevent infinite recursion + 'ViuOTT', + item_id, + entry.get('synopsis', '').strip())) + + return self.playlist_result(entries, series_id, series.get('name'), series.get('description')) + + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + duration_limit = False + query = { + 'ccs_product_id': video_data['ccs_product_id'], + 'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3', + } + headers = { + 'Referer': url, + 'Origin': url, + } + try: + stream_data = self._download_json( + 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, + video_id, 'Downloading stream info', query=query, headers=headers) + stream_data = self._detect_error(stream_data)['stream'] + except (ExtractorError, KeyError): + stream_data = None + if video_data.get('user_level', 0) > 0: + user = self._login(country_code, video_id) + if user: + query['identity'] = user['identity'] + stream_data = self._download_json( + 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, + video_id, 'Downloading stream info', query=query, headers=headers) + stream_data = self._detect_error(stream_data).get('stream') + else: + # preview is limited to 3min for non-members + # try to bypass the duration limit + duration_limit = True + query['duration'] = '180' + stream_data = self._download_json( + 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, + video_id, 'Downloading stream info', query=query, headers=headers) + try: + stream_data = 
self._detect_error(stream_data)['stream'] + except (ExtractorError, KeyError): # if still not working, give up + self._raise_login_required() + + if not stream_data: + raise ExtractorError('Cannot get stream info', expected=True) + + stream_sizes = stream_data.get('size', {}) + formats = [] + for vid_format, stream_url in stream_data.get('url', {}).items(): + height = int_or_none(self._search_regex( + r's(\d+)p', vid_format, 'height', default=None)) + + # bypass preview duration limit + if duration_limit: + stream_url = compat_urlparse.urlparse(stream_url) + query = dict(compat_urlparse.parse_qsl(stream_url.query, keep_blank_values=True)) + time_duration = int_or_none(video_data.get('time_duration')) + query.update({ + 'duration': time_duration if time_duration > 0 else '9999999', + 'duration_start': '0', + }) + stream_url = stream_url._replace(query=compat_urlparse.urlencode(query)).geturl() + + formats.append({ + 'format_id': vid_format, + 'url': stream_url, + 'height': height, + 'ext': 'mp4', + 'filesize': int_or_none(stream_sizes.get(vid_format)) + }) + self._sort_formats(formats) + + subtitles = {} + for sub in video_data.get('subtitle', []): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('name'), []).append({ + 'url': sub_url, + 'ext': 'srt', + }) + + title = video_data['synopsis'].strip() + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'series': product_data.get('series', {}).get('name'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('number')), + 'duration': int_or_none(stream_data.get('duration')), + 'thumbnail': video_data.get('cover_image_url'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py new file mode 100644 index 000000000..d8a9b9ab4 --- /dev/null +++ b/yt_dlp/extractor/vk.py @@ -0,0 +1,689 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import collections +import functools +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + int_or_none, + OnDemandPagedList, + orderedSet, + str_or_none, + str_to_int, + unescapeHTML, + unified_timestamp, + url_or_none, + urlencode_postdata, +) +from .dailymotion import DailymotionIE +from .odnoklassniki import OdnoklassnikiIE +from .pladform import PladformIE +from .vimeo import VimeoIE +from .youtube import YoutubeIE + + +class VKBaseIE(InfoExtractor): + _NETRC_MACHINE = 'vk' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page, url_handle = self._download_webpage_handle( + 'https://vk.com', None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username.encode('cp1251'), + 'pass': password.encode('cp1251'), + }) + + # vk serves two same remixlhk cookies in Set-Cookie header and expects + # first one to be actually set + self._apply_first_set_cookie_header(url_handle, 'remixlhk') + + login_page = self._download_webpage( + 'https://login.vk.com/?act=login', None, + note='Logging in', + data=urlencode_postdata(login_form)) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError( + 'Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() + + def _download_payload(self, path, video_id, data, fatal=True): + data['al'] = 1 + code, payload = 
self._download_json( + 'https://vk.com/%s.php' % path, video_id, + data=urlencode_postdata(data), fatal=fatal, + headers={'X-Requested-With': 'XMLHttpRequest'})['payload'] + if code == '3': + self.raise_login_required() + elif code == '8': + raise ExtractorError(clean_html(payload[0][1:-1]), expected=True) + return payload + + +class VKIE(VKBaseIE): + IE_NAME = 'vk' + IE_DESC = 'VK' + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + (?:(?:m|new)\.)?vk\.com/video_| + (?:www\.)?daxab.com/ + ) + ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| + (?: + (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| + (?:www\.)?daxab.com/embed/ + ) + (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? + ) + ''' + _TESTS = [ + { + 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', + 'info_dict': { + 'id': '-77521_162222515', + 'ext': 'mp4', + 'title': 'ProtivoGunz - Хуёвая песня', + 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'uploader_id': '-77521', + 'duration': 195, + 'timestamp': 1329049880, + 'upload_date': '20120212', + }, + }, + { + 'url': 'http://vk.com/video205387401_165548505', + 'info_dict': { + 'id': '205387401_165548505', + 'ext': 'mp4', + 'title': 'No name', + 'uploader': 'Tom Cruise', + 'uploader_id': '205387401', + 'duration': 9, + 'timestamp': 1374364108, + 'upload_date': '20130720', + } + }, + { + 'note': 'Embedded video', + 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', + 'info_dict': { + 'id': '-77521_162222515', + 'ext': 'mp4', + 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'title': 'ProtivoGunz - Хуёвая песня', + 'duration': 195, + 'upload_date': '20120212', + 'timestamp': 1329049880, + 'uploader_id': '-77521', + }, + }, + { + # VIDEO NOW REMOVED + # please update if you find a video whose URL follows the same pattern + 'url': 'http://vk.com/video-8871596_164049491', + 'md5': 'a590bcaf3d543576c9bd162812387666', + 'note': 'Only available for registered users', + 'info_dict': { + 'id': '-8871596_164049491', + 'ext': 'mp4', + 'uploader': 'Триллеры', + 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', + 'duration': 8352, + 'upload_date': '20121218', + 'view_count': int, + }, + 'skip': 'Removed', + }, + { + 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', + 'info_dict': { + 'id': '-43215063_168067957', + 'ext': 'mp4', + 'uploader': 'Bro Mazter', + 'title': ' ', + 'duration': 7291, + 'upload_date': '20140328', + 'uploader_id': '223413403', + 'timestamp': 1396018030, + }, + 'skip': 'Requires vk account credentials', + }, + { + 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', + 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', + 'note': 'ivi.ru embed', + 'info_dict': { + 'id': '-43215063_169084319', + 'ext': 'mp4', + 'title': 'Книга Илая', + 'duration': 6771, + 'upload_date': '20140626', + 'view_count': int, + }, + 'skip': 'Removed', + }, + { + # video (removed?) 
only available with list id + 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', + 'md5': '091287af5402239a1051c37ec7b92913', + 'info_dict': { + 'id': '30481095_171201961', + 'ext': 'mp4', + 'title': 'ТюменцевВВ_09.07.2015', + 'uploader': 'Anton Ivanov', + 'duration': 109, + 'upload_date': '20150709', + 'view_count': int, + }, + 'skip': 'Removed', + }, + { + # youtube embed + 'url': 'https://vk.com/video276849682_170681728', + 'info_dict': { + 'id': 'V3K4mi0SYkc', + 'ext': 'mp4', + 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'duration': 178, + 'upload_date': '20130116', + 'uploader': "Children's Joy Foundation Inc.", + 'uploader_id': 'thecjf', + 'view_count': int, + }, + }, + { + # dailymotion embed + 'url': 'https://vk.com/video-37468416_456239855', + 'info_dict': { + 'id': 'k3lz2cmXyRuJQSjGHUv', + 'ext': 'mp4', + 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', + 'description': 'md5:424b8e88cc873217f520e582ba28bb36', + 'uploader': 'AniLibria.Tv', + 'upload_date': '20160914', + 'uploader_id': 'x1p5vl5', + 'timestamp': 1473877246, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # video key is extra_data not url\d+ + 'url': 'http://vk.com/video-110305615_171782105', + 'md5': 'e13fcda136f99764872e739d13fac1d1', + 'info_dict': { + 'id': '-110305615_171782105', + 'ext': 'mp4', + 'title': 'S-Dance, репетиции к The way show', + 'uploader': 'THE WAY SHOW | 17 апреля', + 'uploader_id': '-110305615', + 'timestamp': 1454859345, + 'upload_date': '20160207', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # finished live stream, postlive_mp4 + 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', + 'info_dict': { + 'id': '-387766_456242764', + 'ext': 'mp4', + 'title': 'ИгроМир 2016 День 1 — Игромания Утром', + 'uploader': 'Игромания', + 'duration': 5239, + # TODO: use act=show to extract view_count + # 'view_count': int, + 'upload_date': '20160929', + 'uploader_id': '-387766', + 'timestamp': 1475137527, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # live stream, hls and rtmp links, most likely already finished live + # stream by the time you are reading this comment + 'url': 'https://vk.com/video-140332_456239111', + 'only_matching': True, + }, + { + # removed video, just testing that we match the pattern + 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', + 'only_matching': True, + }, + { + # age restricted video, requires vk account credentials + 'url': 'https://vk.com/video205387401_164765225', + 'only_matching': True, + }, + { + # pladform embed + 'url': 'https://vk.com/video-76116461_171554880', + 'only_matching': True, + }, + { + 'url': 'http://new.vk.com/video205387401_165548505', + 'only_matching': True, + }, + { + # This video is no longer available, because its author has been blocked. + 'url': 'https://vk.com/video-10639516_456240611', + 'only_matching': True, + }, + { + # The video is not available in your region. 
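+            # (only the URL pattern is checked here; extraction itself would fail)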
+ 'url': 'https://vk.com/video-51812607_171445436', + 'only_matching': True, + }] + + @staticmethod + def _extract_sibnet_urls(webpage): + # https://help.sibnet.ru/?sibnet_video_embed + return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('videoid') + + mv_data = {} + if video_id: + data = { + 'act': 'show_inline', + 'video': video_id, + } + # Some videos (removed?) can only be downloaded with list id specified + list_id = mobj.group('list_id') + if list_id: + data['list'] = list_id + + payload = self._download_payload('al_video', video_id, data) + info_page = payload[1] + opts = payload[-1] + mv_data = opts.get('mvData') or {} + player = opts.get('player') or {} + else: + video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) + + info_page = self._download_webpage( + 'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) + + error_message = self._html_search_regex( + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], + info_page, 'error message', default=None) + if error_message: + raise ExtractorError(error_message, expected=True) + + if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): + raise ExtractorError( + 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', + expected=True) + + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + + ERRORS = { + r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': + ERROR_COPYRIGHT, + + r'>The video .*? was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, + + r'<!>Please log in or <': + 'Video %s is only available for registered users, ' + 'use --username and --password options to provide account credentials.', + + r'<!>Unknown error': + 'Video %s does not exist.', + + r'<!>Видео временно недоступно': + 'Video %s is temporarily unavailable.', + + r'<!>Access denied': + 'Access denied to video %s.', + + r'<!>Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', + + r'<!>The video .+? 
is not available in your region.': + 'Video %s is not available in your region.', + } + + for error_re, error_msg in ERRORS.items(): + if re.search(error_re, info_page): + raise ExtractorError(error_msg % video_id, expected=True) + + player = self._parse_json(self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', + info_page, 'player params'), video_id) + + youtube_url = YoutubeIE._extract_url(info_page) + if youtube_url: + return self.url_result(youtube_url, YoutubeIE.ie_key()) + + vimeo_url = VimeoIE._extract_url(url, info_page) + if vimeo_url is not None: + return self.url_result(vimeo_url, VimeoIE.ie_key()) + + pladform_url = PladformIE._extract_url(info_page) + if pladform_url: + return self.url_result(pladform_url, PladformIE.ie_key()) + + m_rutube = re.search( + r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) + if m_rutube is not None: + rutube_url = self._proto_relative_url( + m_rutube.group(1).replace('\\', '')) + return self.url_result(rutube_url) + + dailymotion_urls = DailymotionIE._extract_urls(info_page) + if dailymotion_urls: + return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + + odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + + sibnet_urls = self._extract_sibnet_urls(info_page) + if sibnet_urls: + return self.url_result(sibnet_urls[0]) + + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) + if m_opts: + m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) + if m_opts_url: + opts_url = m_opts_url.group(1) + if opts_url.startswith('//'): + opts_url = 'http:' + opts_url + return self.url_result(opts_url) + + data = player['params'][0] + title = unescapeHTML(data['md_title']) + + # 2 = live + # 3 = post live (finished live) + is_live = data.get('live') == 2 + if is_live: + title = self._live_title(title) + + timestamp = unified_timestamp(self._html_search_regex( + r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, + 'upload date', default=None)) or int_or_none(data.get('date')) + + view_count = str_to_int(self._search_regex( + r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', + info_page, 'view count', default=None)) + + formats = [] + for format_id, format_url in data.items(): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//', 'rtmp')): + continue + if (format_id.startswith(('url', 'cache')) + or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): + height = int_or_none(self._search_regex( + r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'height': height, + }) + elif format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False, live=is_live)) + elif format_id == 'rtmp': + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': 'flv', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': data.get('jpg'), + 'uploader': data.get('md_author'), + 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), + 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), + 'timestamp': timestamp, + 'view_count': view_count, + 'like_count': int_or_none(mv_data.get('likes')), + 'comment_count': int_or_none(mv_data.get('commcount')), + 
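+            # mv_data is only populated for vk.com video pages; for video_ext.php
+            # embeds it stays empty, so the mv_data-based fields above may be None.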
'is_live': is_live, + } + + +class VKUserVideosIE(VKBaseIE): + IE_NAME = 'vk:uservideos' + IE_DESC = "VK - User's Videos" + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)' + _TEMPLATE_URL = 'https://vk.com/videos' + _TESTS = [{ + 'url': 'https://vk.com/videos-767561', + 'info_dict': { + 'id': '-767561_all', + }, + 'playlist_mincount': 1150, + }, { + 'url': 'https://vk.com/videos-767561?section=uploaded', + 'info_dict': { + 'id': '-767561_uploaded', + }, + 'playlist_mincount': 425, + }, { + 'url': 'http://vk.com/videos205387401', + 'only_matching': True, + }, { + 'url': 'http://vk.com/videos-77521', + 'only_matching': True, + }, { + 'url': 'http://vk.com/videos-97664626?section=all', + 'only_matching': True, + }, { + 'url': 'http://m.vk.com/videos205387401', + 'only_matching': True, + }, { + 'url': 'http://new.vk.com/videos205387401', + 'only_matching': True, + }] + _PAGE_SIZE = 1000 + _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) + + def _fetch_page(self, page_id, section, page): + l = self._download_payload('al_video', page_id, { + 'act': 'load_videos_silent', + 'offset': page * self._PAGE_SIZE, + 'oid': page_id, + 'section': section, + })[0][section]['list'] + + for video in l: + v = self._VIDEO._make(video[:2]) + video_id = '%d_%d' % (v.owner_id, v.id) + yield self.url_result( + 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id) + + def _real_extract(self, url): + page_id, section = self._match_valid_url(url).groups() + if not section: + section = 'all' + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, page_id, section), + self._PAGE_SIZE) + + return self.playlist_result(entries, '%s_%s' % (page_id, section)) + + +class VKWallPostIE(VKBaseIE): + IE_NAME = 'vk:wallpost' + _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))' + _TESTS = [{ + # public page URL, audio playlist + 'url': 'https://vk.com/bs.official?w=wall-23538238_35', + 'info_dict': { + 'id': '-23538238_35', + 'title': 'Black Shadow - Wall post -23538238_35', + 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', + }, + 'playlist': [{ + 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', + 'info_dict': { + 'id': '135220665_111806521', + 'ext': 'mp4', + 'title': 'Black Shadow - Слепое Верование', + 'duration': 370, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Слепое Верование', + }, + }, { + 'md5': '4cc7e804579122b17ea95af7834c9233', + 'info_dict': { + 'id': '135220665_111802303', + 'ext': 'mp4', + 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', + 'duration': 423, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Война - Негасимое Бездны Пламя!', + }, + }], + 'params': { + 'skip_download': True, + 'usenetrc': True, + }, + 'skip': 'Requires vk account credentials', + }, { + # single YouTube embed, no leading - + 'url': 'https://vk.com/wall85155021_6319', + 'info_dict': { + 'id': '85155021_6319', + 'title': 'Сергей Горбунов - Wall post 85155021_6319', + }, + 'playlist_count': 1, + 'params': { + 'usenetrc': True, + }, + 'skip': 'Requires vk account credentials', + }, { + # wall page URL + 'url': 'https://vk.com/wall-23538238_35', + 'only_matching': True, + }, { + # mobile wall page URL + 'url': 'https://m.vk.com/wall-23538238_35', + 'only_matching': True, + }] + _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' + _AUDIO = collections.namedtuple('Audio', ['id', 'owner_id', 
'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads']) + + def _decode(self, enc): + dec = '' + e = n = 0 + for c in enc: + r = self._BASE64_CHARS.index(c) + cond = n % 4 + e = 64 * e + r if cond else r + n += 1 + if cond: + dec += chr(255 & e >> (-2 * n & 6)) + return dec + + def _unmask_url(self, mask_url, vk_id): + if 'audio_api_unavailable' in mask_url: + extra = mask_url.split('?extra=')[1].split('#') + func, base = self._decode(extra[1]).split(chr(11)) + mask_url = list(self._decode(extra[0])) + url_len = len(mask_url) + indexes = [None] * url_len + index = int(base) ^ vk_id + for n in range(url_len - 1, -1, -1): + index = (url_len * (n + 1) ^ index + n) % url_len + indexes[n] = index + for n in range(1, url_len): + c = mask_url[n] + index = indexes[url_len - 1 - n] + mask_url[n] = mask_url[index] + mask_url[index] = c + mask_url = ''.join(mask_url) + return mask_url + + def _real_extract(self, url): + post_id = self._match_id(url) + + webpage = self._download_payload('wkview', post_id, { + 'act': 'show', + 'w': 'wall' + post_id, + })[1] + + description = clean_html(get_element_by_class('wall_post_text', webpage)) + uploader = clean_html(get_element_by_class('author', webpage)) + + entries = [] + + for audio in re.findall(r'data-audio="([^"]+)', webpage): + audio = self._parse_json(unescapeHTML(audio), post_id) + a = self._AUDIO._make(audio[:16]) + if not a.url: + continue + title = unescapeHTML(a.title) + performer = unescapeHTML(a.performer) + entries.append({ + 'id': '%s_%s' % (a.owner_id, a.id), + 'url': self._unmask_url(a.url, a.ads['vk_id']), + 'title': '%s - %s' % (performer, title) if performer else title, + 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, + 'duration': int_or_none(a.duration), + 'uploader': uploader, + 'artist': performer, + 'track': title, + 'ext': 'mp4', + 'protocol': 'm3u8', + }) + + for video in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) + + title = 'Wall post %s' % post_id + + return self.playlist_result( + orderedSet(entries), post_id, + '%s - %s' % (uploader, title) if uploader else title, + description) diff --git a/yt_dlp/extractor/vlive.py b/yt_dlp/extractor/vlive.py new file mode 100644 index 000000000..84f51a544 --- /dev/null +++ b/yt_dlp/extractor/vlive.py @@ -0,0 +1,383 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json + +from .naver import NaverBaseIE +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + merge_dicts, + str_or_none, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class VLiveBaseIE(NaverBaseIE): + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE): + IE_NAME = 'vlive' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' + _NETRC_MACHINE = 'vlive' + _TESTS = [{ + 'url': 'http://www.vlive.tv/video/1326', + 'md5': 'cc7314812855ce56de70a06a27314983', + 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': "Girl's Day's Broadcast", + 'creator': "Girl's Day", + 'view_count': int, + 'uploader_id': 'muploader_a', + }, + }, { + 'url': 'http://www.vlive.tv/video/16937', + 'info_dict': { + 'id': '16937', + 'ext': 'mp4', + 'title': '첸백시 걍방', + 'creator': 'EXO', + 'view_count': int, + 
'subtitles': 'mincount:12', + 'uploader_id': 'muploader_j', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.vlive.tv/video/129100', + 'md5': 'ca2569453b79d66e5b919e5d308bff6b', + 'info_dict': { + 'id': '129100', + 'ext': 'mp4', + 'title': '[V LIVE] [BTS+] Run BTS! 2019 - EP.71 :: Behind the scene', + 'creator': 'BTS+', + 'view_count': int, + 'subtitles': 'mincount:10', + }, + 'skip': 'This video is only available for CH+ subscribers', + }, { + 'url': 'https://www.vlive.tv/embed/1326', + 'only_matching': True, + }, { + # works only with gcc=KR + 'url': 'https://www.vlive.tv/video/225019', + 'only_matching': True, + }, { + 'url': 'https://www.vlive.tv/video/223906', + 'info_dict': { + 'id': '58', + 'title': 'RUN BTS!' + }, + 'playlist_mincount': 120 + }] + + def _real_initialize(self): + self._login() + + def _login(self): + email, password = self._get_login_info() + if None in (email, password): + return + + def is_logged_in(): + login_info = self._download_json( + 'https://www.vlive.tv/auth/loginInfo', None, + note='Downloading login info', + headers={'Referer': 'https://www.vlive.tv/home'}) + return try_get( + login_info, lambda x: x['message']['login'], bool) or False + + LOGIN_URL = 'https://www.vlive.tv/auth/email/login' + self._request_webpage( + LOGIN_URL, None, note='Downloading login cookies') + + self._download_webpage( + LOGIN_URL, None, note='Logging in', + data=urlencode_postdata({'email': email, 'pwd': password}), + headers={ + 'Referer': LOGIN_URL, + 'Content-Type': 'application/x-www-form-urlencoded' + }) + + if not is_logged_in(): + raise ExtractorError('Unable to log in', expected=True) + + def _call_api(self, path_template, video_id, fields=None, limit=None): + query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} + if fields: + query['fields'] = fields + if limit: + query['limit'] = limit + try: + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) + raise + + def _real_extract(self, url): + video_id = self._match_id(url) + + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId},playlist{playlistSeq,totalCount,name}') + + playlist = post.get('playlist') + if not playlist or self.get_param('noplaylist'): + if playlist: + self.to_screen( + 'Downloading just video %s because of --no-playlist' + % video_id) + + video = post['officialVideo'] + return self._get_vlive_info(post, video, video_id) + else: + playlist_name = playlist.get('name') + playlist_id = str_or_none(playlist.get('playlistSeq')) + playlist_count = str_or_none(playlist.get('totalCount')) + + playlist = self._call_api( + 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', limit=playlist_count) + + entries = [] + for video_data in playlist['data']: + video = video_data.get('officialVideo') + video_id = str_or_none(video.get('videoSeq')) + entries.append(self._get_vlive_info(video_data, video, video_id)) + + return self.playlist_result(entries, playlist_id, playlist_name) + + def _get_vlive_info(self, post, video, video_id): + def 
get_common_fields(): + channel = post.get('channel') or {} + return { + 'title': video.get('title'), + 'creator': post.get('author', {}).get('nickname'), + 'channel': channel.get('channelName'), + 'channel_id': channel.get('channelCode'), + 'duration': int_or_none(video.get('playTime')), + 'view_count': int_or_none(video.get('playCount')), + 'like_count': int_or_none(video.get('likeCount')), + 'comment_count': int_or_none(video.get('commentCount')), + } + + video_type = video.get('type') + if video_type == 'VOD': + inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] + vod_id = video['vodId'] + info_dict = merge_dicts( + get_common_fields(), + self._extract_video_info(video_id, vod_id, inkey)) + thumbnail = video.get('thumb') + if thumbnail: + if not info_dict.get('thumbnails') and info_dict.get('thumbnail'): + info_dict['thumbnails'] = [{'url': info_dict.pop('thumbnail')}] + info_dict.setdefault('thumbnails', []).append({'url': thumbnail, 'preference': 1}) + return info_dict + elif video_type == 'LIVE': + status = video.get('status') + if status == 'ON_AIR': + stream_url = self._call_api( + 'old/v3/live/%s/playInfo', + video_id)['result']['adaptiveStreamUrl'] + formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') + self._sort_formats(formats) + info = get_common_fields() + info.update({ + 'title': self._live_title(video['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + elif status == 'ENDED': + raise ExtractorError( + 'Uploading for replay. Please wait...', expected=True) + elif status == 'RESERVED': + raise ExtractorError('Coming soon!', expected=True) + elif video.get('exposeStatus') == 'CANCEL': + raise ExtractorError( + 'We are sorry, but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status ' + status) + + +class VLivePostIE(VLiveIE): + IE_NAME = 'vlive:post' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)' + _TESTS = [{ + # uploadType = SOS + 'url': 'https://www.vlive.tv/post/1-20088044', + 'info_dict': { + 'id': '1-20088044', + 'title': 'Hola estrellitas la tierra les dice hola (si era así no?) 
Ha...', + 'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407', + }, + 'playlist_count': 3, + }, { + # uploadType = V + 'url': 'https://www.vlive.tv/post/1-20087926', + 'info_dict': { + 'id': '1-20087926', + 'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭', + }, + 'playlist_count': 1, + }] + _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' + _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' + _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' + + def _real_extract(self, url): + post_id = self._match_id(url) + + post = self._call_api( + 'post/v1.0/post-%s', post_id, + 'attachments{video},officialVideo{videoSeq},plainBody,title') + + video_seq = str_or_none(try_get( + post, lambda x: x['officialVideo']['videoSeq'])) + if video_seq: + return self.url_result( + 'http://www.vlive.tv/video/' + video_seq, + VLiveIE.ie_key(), video_seq) + + title = post['title'] + entries = [] + for idx, video in enumerate(post['attachments']['video'].values()): + video_id = video.get('videoId') + if not video_id: + continue + upload_type = video.get('uploadType') + upload_info = video.get('uploadInfo') or {} + entry = None + if upload_type == 'SOS': + download = self._call_api( + self._SOS_TMPL, video_id)['videoUrl']['download'] + formats = [] + for f_id, f_url in download.items(): + formats.append({ + 'format_id': f_id, + 'url': f_url, + 'height': int_or_none(f_id[:-1]), + }) + self._sort_formats(formats) + entry = { + 'formats': formats, + 'id': video_id, + 'thumbnail': upload_info.get('imageUrl'), + } + elif upload_type == 'V': + vod_id = upload_info.get('videoId') + if not vod_id: + continue + inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey'] + entry = self._extract_video_info(video_id, vod_id, inkey) + if entry: + entry['title'] = '%s_part%s' % (title, idx) + entries.append(entry) + return self.playlist_result( + entries, post_id, title, strip_or_none(post.get('plainBody'))) + + +class VLiveChannelIE(VLiveBaseIE): + IE_NAME = 'vlive:channel' + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' + _TESTS = [{ + 'url': 'http://channels.vlive.tv/FCD4B', + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B', + 'only_matching': True, + }] + + def _call_api(self, path, channel_key_suffix, channel_value, note, query): + q = { + 'app_id': self._APP_ID, + 'channel' + channel_key_suffix: channel_value, + } + q.update(query) + return self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, + channel_value, note='Downloading ' + note, query=q)['result'] + + def _real_extract(self, url): + channel_code = self._match_id(url) + + channel_seq = self._call_api( + 'decodeChannelCode', 'Code', channel_code, + 'decode channel code', {})['channelSeq'] + + channel_name = None + entries = [] + + for page_num in itertools.count(1): + video_list = self._call_api( + 'getChannelVideoList', 'Seq', channel_seq, + 'channel list page #%d' % page_num, { + # Large values of maxNumOfRows (~300 or above) may cause + # empty responses (see [1]), e.g. this happens for [2] that + # has more than 300 videos. + # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 + # 2. http://channels.vlive.tv/EDBF. 
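+                    # Hence the conservative page size below: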
+ 'maxNumOfRows': 100, + 'pageNo': page_num + } + ) + + if not channel_name: + channel_name = try_get( + video_list, + lambda x: x['channelInfo']['channelName'], + compat_str) + + videos = try_get( + video_list, lambda x: x['videoList'], list) + if not videos: + break + + for video in videos: + video_id = video.get('videoSeq') + video_type = video.get('videoType') + + if not video_id or not video_type: + continue + video_id = compat_str(video_id) + + if video_type in ('PLAYLIST'): + first_video_id = try_get( + video, + lambda x: x['videoPlaylist']['videoList'][0]['videoSeq'], int) + + if not first_video_id: + continue + + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % first_video_id, + ie=VLiveIE.ie_key(), video_id=first_video_id)) + else: + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % video_id, + ie=VLiveIE.ie_key(), video_id=video_id)) + + return self.playlist_result( + entries, channel_code, channel_name) diff --git a/youtube_dl/extractor/vodlocker.py b/yt_dlp/extractor/vodlocker.py index 02c9617d2..02c9617d2 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/yt_dlp/extractor/vodlocker.py diff --git a/youtube_dl/extractor/vodpl.py b/yt_dlp/extractor/vodpl.py index 9e919708e..9e919708e 100644 --- a/youtube_dl/extractor/vodpl.py +++ b/yt_dlp/extractor/vodpl.py diff --git a/youtube_dl/extractor/vodplatform.py b/yt_dlp/extractor/vodplatform.py index 74d2257e7..74d2257e7 100644 --- a/youtube_dl/extractor/vodplatform.py +++ b/yt_dlp/extractor/vodplatform.py diff --git a/youtube_dl/extractor/voicerepublic.py b/yt_dlp/extractor/voicerepublic.py index a52e40afa..a52e40afa 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/yt_dlp/extractor/voicerepublic.py diff --git a/yt_dlp/extractor/voicy.py b/yt_dlp/extractor/voicy.py new file mode 100644 index 000000000..11ebe76e1 --- /dev/null +++ b/yt_dlp/extractor/voicy.py @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + smuggle_url, + traverse_obj, + unsmuggle_url, + unified_strdate, +) + +import itertools + + +class VoicyBaseIE(InfoExtractor): + def _extract_from_playlist_data(self, value): + voice_id = compat_str(value.get('PlaylistId')) + upload_date = unified_strdate(value.get('Published'), False) + items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']] + return { + '_type': 'multi_video', + 'entries': items, + 'id': voice_id, + 'title': compat_str(value.get('PlaylistName')), + 'uploader': value.get('SpeakerName'), + 'uploader_id': compat_str(value.get('SpeakerId')), + 'channel': value.get('ChannelName'), + 'channel_id': compat_str(value.get('ChannelId')), + 'upload_date': upload_date, + } + + def _extract_single_article(self, entry): + formats = [{ + 'url': entry['VoiceHlsFile'], + 'format_id': 'hls', + 'ext': 'm4a', + 'acodec': 'aac', + 'vcodec': 'none', + 'protocol': 'm3u8_native', + }, { + 'url': entry['VoiceFile'], + 'format_id': 'mp3', + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', + }] + self._sort_formats(formats) + return { + 'id': compat_str(entry.get('ArticleId')), + 'title': entry.get('ArticleTitle'), + 'description': entry.get('MediaName'), + 'formats': formats, + } + + def _call_api(self, url, video_id, **kwargs): + response = self._download_json(url, video_id, **kwargs) + if response.get('Status') != 0: + message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=compat_str) + if not 
message: + message = 'There was a error in the response: %d' % response.get('Status') + raise ExtractorError(message, expected=False) + return response.get('Value') + + +class VoicyIE(VoicyBaseIE): + IE_NAME = 'voicy' + _VALID_URL = r'https?://voicy\.jp/channel/(?P<channel_id>\d+)/(?P<id>\d+)' + ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s' + _TESTS = [{ + 'url': 'https://voicy.jp/channel/1253/122754', + 'info_dict': { + 'id': '122754', + 'title': '1/21(木)声日記:ついに原稿終わった!!', + 'uploader': 'ちょまど@ ITエンジニアなオタク', + 'uploader_id': '7339', + }, + 'playlist_mincount': 9, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + assert mobj + voice_id = mobj.group('id') + channel_id = mobj.group('channel_id') + url, article_list = unsmuggle_url(url) + if not article_list: + article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id) + return self._extract_from_playlist_data(article_list) + + +class VoicyChannelIE(VoicyBaseIE): + IE_NAME = 'voicy:channel' + _VALID_URL = r'https?://voicy\.jp/channel/(?P<id>\d+)' + PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s' + _TESTS = [{ + 'url': 'https://voicy.jp/channel/1253/', + 'info_dict': { + 'id': '7339', + 'title': 'ゆるふわ日常ラジオ #ちょまラジ', + 'uploader': 'ちょまど@ ITエンジニアなオタク', + 'uploader_id': '7339', + }, + 'playlist_mincount': 54, + }] + + @classmethod + def suitable(cls, url): + return not VoicyIE.suitable(url) and super(VoicyChannelIE, cls).suitable(url) + + def _entries(self, channel_id): + pager = '' + for count in itertools.count(1): + article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note='Paging #%d' % count) + playlist_data = article_list.get('PlaylistData') + if not playlist_data: + break + yield from playlist_data + last = playlist_data[-1] + pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount']) + + def _real_extract(self, url): + channel_id = self._match_id(url) + articles = self._entries(channel_id) + + first_article = next(articles, None) + title = traverse_obj(first_article, ('ChannelName', ), expected_type=compat_str) + speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=compat_str) + if not title and speaker_name: + title = 'Uploads from %s' % speaker_name + if not title: + title = 'Uploads from channel ID %s' % channel_id + + articles = itertools.chain([first_article], articles) if first_article else articles + + playlist = ( + self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key()) + for value in articles) + return { + '_type': 'playlist', + 'entries': playlist, + 'id': channel_id, + 'title': title, + 'channel': speaker_name, + 'channel_id': channel_id, + } diff --git a/yt_dlp/extractor/voot.py b/yt_dlp/extractor/voot.py new file mode 100644 index 000000000..e2944ec63 --- /dev/null +++ b/yt_dlp/extractor/voot.py @@ -0,0 +1,150 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + unified_timestamp, +) + + +class VootIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + voot:| + (?:https?://)(?:www\.)?voot\.com/? 
+ (?: + movies/[^/]+/| + (?:shows|kids)/(?:[^/]+/){4} + ) + ) + (?P<id>\d{3,}) + ''' + _GEO_COUNTRIES = ['IN'] + _TESTS = [{ + 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', + 'info_dict': { + 'id': '0_8ledb18o', + 'ext': 'mp4', + 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', + 'timestamp': 1472162937, + 'upload_date': '20160825', + 'series': 'Ishq Ka Rang Safed', + 'season_number': 1, + 'episode': 'Is this the end of Kamini?', + 'episode_number': 340, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', + 'only_matching': True, + }, { + 'url': 'https://www.voot.com/movies/pandavas-5/424627', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + media_info = self._download_json( + 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, + query={ + 'platform': 'Web', + 'pId': 2, + 'mediaId': video_id, + }) + + status_code = try_get(media_info, lambda x: x['status']['code'], int) + if status_code != 0: + raise ExtractorError(media_info['status']['message'], expected=True) + + media = media_info['assets'] + + entry_id = media['EntryId'] + title = media['MediaName'] + formats = self._extract_m3u8_formats( + 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, + video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + description, series, season_number, episode, episode_number = [None] * 5 + + for meta in try_get(media, lambda x: x['Metas'], list) or []: + key, value = meta.get('Key'), meta.get('Value') + if not key or not value: + continue + if key == 'ContentSynopsis': + description = value + elif key == 'RefSeriesTitle': + series = value + elif key == 'RefSeriesSeason': + season_number = int_or_none(value) + elif key == 'EpisodeMainTitle': + episode = value + elif key == 'EpisodeNo': + episode_number = int_or_none(value) + return { + 'extractor_key': 'Kaltura', + 'id': entry_id, + 'title': title, + 'description': description, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'timestamp': unified_timestamp(media.get('CreationDate')), + 'duration': int_or_none(media.get('Duration')), + 'view_count': int_or_none(media.get('ViewCounter')), + 'like_count': int_or_none(media.get('like_counter')), + 'formats': formats, + } + + +class VootSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})' + _TESTS = [{ + 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002', + 'playlist_mincount': 442, + 'info_dict': { + 'id': '100002', + }, + }, { + 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/100003', + 'playlist_mincount': 341, + 'info_dict': { + 'id': '100003', + }, + }] + _SHOW_API = 'https://psapi.voot.com/media/voot/v1/voot-web/content/generic/season-by-show?sort=season%3Aasc&id={}&responseType=common' + _SEASON_API = 'https://psapi.voot.com/media/voot/v1/voot-web/content/generic/series-wise-episode?sort=episode%3Aasc&id={}&responseType=common&page={:d}' + + def _entries(self, show_id): + show_json = self._download_json(self._SHOW_API.format(show_id), video_id=show_id) + for season in show_json.get('result', []): + page_num = 1 + 
season_id = try_get(season, lambda x: x['id'], compat_str) + season_json = self._download_json(self._SEASON_API.format(season_id, page_num), + video_id=season_id, + note='Downloading JSON metadata page %d' % page_num) + episodes_json = season_json.get('result', []) + while episodes_json: + page_num += 1 + for episode in episodes_json: + video_id = episode.get('id') + yield self.url_result( + 'voot:%s' % video_id, ie=VootIE.ie_key(), video_id=video_id) + episodes_json = self._download_json(self._SEASON_API.format(season_id, page_num), + video_id=season_id, + note='Downloading JSON metadata page %d' % page_num)['result'] + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) diff --git a/youtube_dl/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index 661208125..661208125 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py diff --git a/youtube_dl/extractor/vrak.py b/yt_dlp/extractor/vrak.py index daa247cce..daa247cce 100644 --- a/youtube_dl/extractor/vrak.py +++ b/yt_dlp/extractor/vrak.py diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py new file mode 100644 index 000000000..10dc94abc --- /dev/null +++ b/yt_dlp/extractor/vrt.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + float_or_none, + get_element_by_class, + strip_or_none, + unified_timestamp, +) + + +class VRTIE(InfoExtractor): + IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' + _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', + 'md5': 'e1663accf5cf13f375f3cd0d10476669', + 'info_dict': { + 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', + 'ext': 'mp4', + 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', + 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', + 'timestamp': 1557924660, + 'upload_date': '20190515', + 'duration': 31.2, + }, + }, { + 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', + 'md5': '910bba927566e9ab992278f647eb4b75', + 'info_dict': { + 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', + 'ext': 'mp4', + 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', + 'timestamp': 1557923760, + 'upload_date': '20190515', + 'duration': 115.17, + }, + }, { + 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', + 'only_matching': True, + }, { + 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', + 'only_matching': True, + }] + _CLIENT_MAP = { + 'vrt.be/vrtnws': 'vrtnieuws', + 'sporza.be': 'sporza', + } + + def _real_extract(self, url): + site, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + attrs = extract_attributes(self._search_regex( + r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video')) + + asset_id = attrs['data-video-id'] + publication_id = attrs.get('data-publication-id') + if publication_id: + asset_id = publication_id + '$' + asset_id + client = attrs.get('data-client-code') or 
self._CLIENT_MAP[site]
+
+        title = strip_or_none(get_element_by_class(
+            'vrt-title', webpage) or self._html_search_meta(
+            ['og:title', 'twitter:title', 'name'], webpage))
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'], webpage)
+        if description == '…':
+            description = None
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage))
+
+        # the media itself is served by the Canvas platform, so hand the
+        # mediazone URL over to the Canvas extractor via url_transparent
+        return {
+            '_type': 'url_transparent',
+            'id': asset_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': attrs.get('data-posterimage'),
+            'timestamp': timestamp,
+            'duration': float_or_none(attrs.get('data-duration'), 1000),
+            'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id),
+            'ie_key': 'Canvas',
+        }
diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py
new file mode 100644
index 000000000..419602148
--- /dev/null
+++ b/yt_dlp/extractor/vrv.py
@@ -0,0 +1,278 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import json
+import hashlib
+import hmac
+import random
+import string
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse_urlencode,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    traverse_obj,
+)
+
+
+class VRVBaseIE(InfoExtractor):
+    _API_DOMAIN = None
+    _API_PARAMS = {}
+    _CMS_SIGNING = {}
+    _TOKEN = None
+    _TOKEN_SECRET = ''
+
+    def _call_api(self, path, video_id, note, data=None):
+        # requests are signed with OAuth 1.0, see
+        # https://tools.ietf.org/html/rfc5849#section-3
+        base_url = self._API_DOMAIN + '/core/' + path
+        query = [
+            ('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
+            ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])),
+            ('oauth_signature_method', 'HMAC-SHA1'),
+            ('oauth_timestamp', int(time.time())),
+        ]
+        if self._TOKEN:
+            query.append(('oauth_token', self._TOKEN))
+        encoded_query = compat_urllib_parse_urlencode(query)
+        headers = self.geo_verification_headers()
+        if data:
+            data = json.dumps(data).encode()
+            headers['Content-Type'] = 'application/json'
+        # signature base string: the HTTP method, the percent-encoded URL and
+        # the percent-encoded query string, joined with '&'
+        base_string = '&'.join([
+            'POST' if data else 'GET',
+            compat_urllib_parse.quote(base_url, ''),
+            compat_urllib_parse.quote(encoded_query, '')])
+        # sign with HMAC-SHA1; the key is the consumer secret and the token
+        # secret (empty unless logged in), also joined with '&'
+        oauth_signature = base64.b64encode(hmac.new(
+            (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'),
+            base_string.encode(), hashlib.sha1).digest()).decode()
+        encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '')
+        try:
+            return self._download_json(
+                '?'.join([base_url, encoded_query]), video_id,
+                note='Downloading %s JSON metadata' % note, headers=headers, data=data)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True)
+            raise
+
+    def _call_cms(self, path, video_id, note):
+        if not self._CMS_SIGNING:
+            index = self._call_api('index', video_id, 'CMS Signing')
+            self._CMS_SIGNING = index.get('cms_signing') or {}
+            if not self._CMS_SIGNING:
+                # fall back to collecting the /cms/ signing values from
+                # 'signing_policies'
+                for signing_policy in index.get('signing_policies', []):
+                    signing_path = signing_policy.get('path')
+                    if signing_path and signing_path.startswith('/cms/'):
+                        name, value = signing_policy.get('name'), signing_policy.get('value')
+                        if name and value:
+                            self._CMS_SIGNING[name] = value
+        return self._download_json(
+            self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
+            note='Downloading %s JSON metadata' % 
note, headers=self.geo_verification_headers()) + + def _get_cms_resource(self, resource_key, video_id): + return self._call_api( + 'cms_resource', video_id, 'resource path', data={ + 'resource_key': resource_key, + })['__links__']['cms_resource']['href'] + + def _real_initialize(self): + webpage = self._download_webpage( + 'https://vrv.co/', None, headers=self.geo_verification_headers()) + self._API_PARAMS = self._parse_json(self._search_regex( + [ + r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)', + r'window\.__APP_CONFIG__\s*=\s*({.+})' + ], webpage, 'app config'), None)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + + +class VRVIE(VRVBaseIE): + IE_NAME = 'vrv' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' + _TESTS = [{ + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # movie listing + 'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT', + 'info_dict': { + 'id': 'G6NQXZ1J6', + 'title': 'Lily C.A.T', + 'description': 'md5:988b031e7809a6aeb60968be4af7db07', + }, + 'playlist_count': 2, + }] + _NETRC_MACHINE = 'vrv' + + def _real_initialize(self): + super(VRVIE, self)._real_initialize() + + email, password = self._get_login_info() + if email is None: + return + + token_credentials = self._call_api( + 'authenticate/by:credentials', None, 'Token Credentials', data={ + 'email': email, + 'password': password, + }) + self._TOKEN = token_credentials['oauth_token'] + self._TOKEN_SECRET = token_credentials['oauth_token_secret'] + + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): + return [] + stream_id_list = [] + if audio_lang: + stream_id_list.append('audio-%s' % audio_lang) + if hardsub_lang: + stream_id_list.append('hardsub-%s' % hardsub_lang) + format_id = stream_format + if stream_id_list: + format_id += '-' + '-'.join(stream_id_list) + if 'hls' in stream_format: + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + + object_data = self._call_cms(self._get_cms_resource( + 'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0] + resource_path = object_data['__links__']['resource']['href'] + video_data = self._call_cms(resource_path, video_id, 'video') + title = video_data['title'] + description = video_data.get('description') + + if video_data.get('__class__') == 'movie_listing': + items = self._call_cms( + video_data['__links__']['movie_listing/movies']['href'], + video_id, 'movie listing').get('items') or [] + if len(items) != 1: + entries = [] + for item in items: + item_id = item.get('id') + if not item_id: + continue + entries.append(self.url_result( + 'https://vrv.co/watch/' + item_id, + 
self.ie_key(), item_id, item.get('title'))) + return self.playlist_result(entries, video_id, title, description) + video_data = items[0] + + streams_path = video_data['__links__'].get('streams', {}).get('href') + if not streams_path: + self.raise_login_required() + streams_json = self._call_cms(streams_path, video_id, 'streams') + + audio_locale = streams_json.get('audio_locale') + formats = [] + for stream_type, streams in streams_json.get('streams', {}).items(): + if stream_type in ('adaptive_hls', 'adaptive_dash'): + for stream in streams.values(): + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream_type.split('_')[1], + audio_locale, stream.get('hardsub_locale'))) + self._sort_formats(formats) + + subtitles = {} + for k in ('captions', 'subtitles'): + for subtitle in streams_json.get(k, {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + + thumbnails = [] + for thumbnail in traverse_obj(video_data, ('images', 'thumbnail', ..., ...)) or []: + thumbnail_url = thumbnail.get('source') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': description, + 'duration': float_or_none(video_data.get('duration_ms'), 1000), + 'uploader_id': video_data.get('channel_id'), + 'series': video_data.get('series_title'), + 'season': video_data.get('season_title'), + 'season_number': int_or_none(video_data.get('season_number')), + 'season_id': video_data.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episode_number')), + 'episode_id': video_data.get('production_episode_id'), + } + + +class VRVSeriesIE(VRVBaseIE): + IE_NAME = 'vrv:series' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider', + 'info_dict': { + 'id': 'G68VXG3G6', + }, + 'playlist_mincount': 11, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + + seasons_path = self._get_cms_resource( + 'cms:/seasons?series_id=' + series_id, series_id) + seasons_data = self._call_cms(seasons_path, series_id, 'seasons') + + entries = [] + for season in seasons_data.get('items', []): + episodes_path = season['__links__']['season/episodes']['href'] + episodes = self._call_cms(episodes_path, series_id, 'episodes') + for episode in episodes.get('items', []): + episode_id = episode['id'] + entries.append(self.url_result( + 'https://vrv.co/watch/' + episode_id, + 'VRV', episode_id, episode.get('title'))) + + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/vshare.py b/yt_dlp/extractor/vshare.py index c631ac1fa..c631ac1fa 100644 --- a/youtube_dl/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py diff --git a/youtube_dl/extractor/vtm.py b/yt_dlp/extractor/vtm.py index 093f1aa69..093f1aa69 100644 --- a/youtube_dl/extractor/vtm.py +++ b/yt_dlp/extractor/vtm.py diff --git a/yt_dlp/extractor/vube.py b/yt_dlp/extractor/vube.py new file mode 100644 index 000000000..1c8f80ae9 --- /dev/null +++ b/yt_dlp/extractor/vube.py @@ -0,0 +1,170 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from 
..compat import ( + compat_str, +) +from ..utils import ( + int_or_none, +) + + +class VubeIE(InfoExtractor): + IE_NAME = 'vube' + IE_DESC = 'Vube.com' + _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' + + _TESTS = [ + { + 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s', + 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42', + 'info_dict': { + 'id': 'Y8NUZ69Tf7', + 'ext': 'mp4', + 'title': 'Best Drummer Ever [HD]', + 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'William', + 'timestamp': 1406876915, + 'upload_date': '20140801', + 'duration': 258.051, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], + }, + 'skip': 'Not accessible from Travis CI server', + }, { + 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', + 'md5': 'db7aba89d4603dadd627e9d1973946fe', + 'info_dict': { + 'id': 'YL2qNPkqon', + 'ext': 'mp4', + 'title': 'Chiara Grispo - Price Tag by Jessie J', + 'description': 'md5:8ea652a1f36818352428cb5134933313', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', + 'uploader': 'Chiara.Grispo', + 'timestamp': 1388743358, + 'upload_date': '20140103', + 'duration': 170.56, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], + }, + 'skip': 'Removed due to DMCA', + }, + { + 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', + 'md5': '5d4a52492d76f72712117ce6b0d98d08', + 'info_dict': { + 'id': 'UeBhTudbfS', + 'ext': 'mp4', + 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', + 'description': 'md5:40bcacb97796339f1690642c21d56f4a', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', + 'uploader': 'Seraina', + 'timestamp': 1396492438, + 'upload_date': '20140403', + 'duration': 240.107, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'categories': ['seraina', 'jessica', 'krewella', 'alive'], + }, + 'skip': 'Removed due to DMCA', + }, { + 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', + 'md5': '0584fc13b50f887127d9d1007589d27f', + 'info_dict': { + 'id': '0nmsMY5vEq', + 'ext': 'mp4', + 'title': 'Frozen - Let It Go Cover by Siren Gene', + 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', + 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', + 'uploader': 'Siren', + 'timestamp': 1395448018, + 'upload_date': '20140322', + 'duration': 221.788, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], + }, + 'skip': 'Removed due to DMCA', + } + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + video = self._download_json( + 'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON') + + public_id = video['public_id'] + + formats = [] + + for media in video['media'].get('video', []) + video['media'].get('audio', []): + if media['transcoding_status'] != 'processed': + continue + fmt = { + 'url': 
'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id), + 'abr': int(media['audio_bitrate']), + 'format_id': compat_str(media['media_resolution_id']), + } + vbr = int(media['video_bitrate']) + if vbr: + fmt.update({ + 'vbr': vbr, + 'height': int(media['height']), + }) + formats.append(fmt) + + if not formats and video.get('vst') == 'dmca': + self.raise_no_formats( + 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.', + expected=True) + + self._sort_formats(formats) + + title = video['title'] + description = video.get('description') + thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') + uploader = video.get('user_alias') or video.get('channel') + timestamp = int_or_none(video.get('upload_time')) + duration = video['duration'] + view_count = video.get('raw_view_count') + like_count = video.get('total_likes') + dislike_count = video.get('total_hates') + + comments = video.get('comments') + comment_count = None + if comments is None: + comment_data = self._download_json( + 'http://vube.com/api/video/%s/comment' % video_id, + video_id, 'Downloading video comment JSON', fatal=False) + if comment_data is not None: + comment_count = int_or_none(comment_data.get('total')) + else: + comment_count = len(comments) + + categories = [tag['text'] for tag in video['tags']] + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'categories': categories, + } diff --git a/youtube_dl/extractor/vuclip.py b/yt_dlp/extractor/vuclip.py index 55e087bdb..55e087bdb 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/yt_dlp/extractor/vuclip.py diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py new file mode 100644 index 000000000..9846ababc --- /dev/null +++ b/yt_dlp/extractor/vupload.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_filesize, + extract_attributes, + int_or_none, +) + + +class VuploadIE(InfoExtractor): + _VALID_URL = r'https://vupload\.com/v/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://vupload.com/v/u28d0pl2tphy', + 'md5': '9b42a4a193cca64d80248e58527d83c8', + 'info_dict': { + 'id': 'u28d0pl2tphy', + 'ext': 'mp4', + 'description': 'md5:e9e6c0045c78cbf0d5bb19a55ce199fb', + 'title': 'md5:e9e6c0045c78cbf0d5bb19a55ce199fb', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') + video_e = self._html_search_regex(r'\|([a-z0-9]{60})\|', webpage, 'video') + video_url = f'https://wurize.megaupload.to/{video_e}/v.mp4' + duration = parse_duration(self._html_search_regex( + r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False)) + filesize_approx = parse_filesize(self._html_search_regex( + r'<i\s*class=["\']fad\s*fa-save["\']></i>\s*([^<]+)\s*</div>', webpage, 'filesize', fatal=False)) + extra_video_info = extract_attributes(self._html_search_regex( + r'(<video[^>]+>)', webpage, 'video_info', fatal=False)) + description = self._html_search_meta('description', webpage) + + return { + 'id': 
video_id, + 'url': video_url, + 'duration': duration, + 'filesize_approx': filesize_approx, + 'width': int_or_none(extra_video_info.get('width')), + 'height': int_or_none(extra_video_info.get('height')), + 'format_id': extra_video_info.get('height', '') + 'p', + 'title': title, + 'description': description, + } diff --git a/yt_dlp/extractor/vvvvid.py b/yt_dlp/extractor/vvvvid.py new file mode 100644 index 000000000..3faa90fbd --- /dev/null +++ b/yt_dlp/extractor/vvvvid.py @@ -0,0 +1,284 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, +) + + +class VVVVIDIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' + _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE + _TESTS = [{ + # video_type == 'video/vvvvid' + 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', + 'md5': 'b8d3cecc2e981adc3835adf07f6df91b', + 'info_dict': { + 'id': '489048', + 'ext': 'mp4', + 'title': 'Ping Pong', + 'duration': 239, + 'series': '"Perché dovrei guardarlo?" di Dario Moccia', + 'season_id': '437', + 'episode': 'Ping Pong', + 'episode_number': 1, + 'episode_id': '3334', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + # video_type == 'video/rcs' + 'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': '482493', + 'ext': 'mp4', + 'title': 'Episodio 01', + }, + 'params': { + 'skip_download': True, + }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', + 'only_matching': True + }] + _conn_id = None + + def _real_initialize(self): + self._conn_id = self._download_json( + 'https://www.vvvvid.it/user/login', + None, headers=self.geo_verification_headers())['data']['conn_id'] + + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) + response = self._download_json( + 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) + if not (response or fatal): + return + if response.get('result') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + return response['data'] + + def _extract_common_video_info(self, video_data): + return { + 'thumbnail': video_data.get('thumbnail'), + 'episode_id': str_or_none(video_data.get('id')), + } + + def _real_extract(self, url): + show_id, season_id, video_id = self._match_valid_url(url).groups() + + response = self._download_info( + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) + + vid = int(video_id) + video_data = list(filter( + lambda episode: 
episode.get('video_id') == vid, response))[0]
+        title = video_data['title']
+        formats = []
+
+        # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js
+        def ds(h):
+            # the site's shuffled base64 alphabet
+            g = "MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij"
+
+            def f(m):
+                # standard base64 bit-packing: regroup each run of four 6-bit
+                # values in m into (up to) three 8-bit bytes
+                l = []
+                o = 0
+                b = False
+                m_len = len(m)
+                while ((not b) and o < m_len):
+                    n = m[o] << 2
+                    o += 1
+                    k = -1
+                    j = -1
+                    if o < m_len:
+                        n += m[o] >> 4
+                        o += 1
+                        if o < m_len:
+                            k = (m[o - 1] << 4) & 255
+                            k += m[o] >> 2
+                            o += 1
+                            if o < m_len:
+                                j = (m[o - 1] << 6) & 255
+                                j += m[o]
+                                o += 1
+                            else:
+                                b = True
+                        else:
+                            b = True
+                    else:
+                        b = True
+                    l.append(n)
+                    if k != -1:
+                        l.append(k)
+                    if j != -1:
+                        l.append(j)
+                return l
+
+            # map each character to its 6-bit value in the custom alphabet
+            c = []
+            for e in h:
+                c.append(g.index(e))
+
+            # de-obfuscate: XOR every value with its right neighbour, sweeping
+            # backwards over the buffer twice
+            c_len = len(c)
+            for e in range(c_len * 2 - 1, -1, -1):
+                a = c[e % c_len] ^ c[(e + 1) % c_len]
+                c[e % c_len] = a
+
+            # finally base64-decode the de-obfuscated values into a string
+            c = f(c)
+            d = ''
+            for e in c:
+                d += chr(e)
+
+            return d
+
+        info = {}
+
+        # season/episode numbers are sometimes only encoded in the media URL
+        # (e.g. '..._S1Ep01...'), so try to recover them from there
+        def metadata_from_url(r_url):
+            if not info and r_url:
+                mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url)
+                if mobj:
+                    info['episode_number'] = int(mobj.group(2))
+                    season_number = mobj.group(1)
+                    if season_number:
+                        info['season_number'] = int(season_number)
+
+        video_type = video_data.get('video_type')
+        is_youtube = False
+        # try the default embed code first, then the SD variant
+        for quality in ('', '_sd'):
+            embed_code = video_data.get('embed_info' + quality)
+            if not embed_code:
+                continue
+            embed_code = ds(embed_code)
+            if video_type == 'video/kenc':
+                # kenc-protected streams need an extra token from /kenc,
+                # which is appended to the manifest URL
+                embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8')
+                kenc = self._download_json(
+                    'https://www.vvvvid.it/kenc', video_id, query={
+                        'action': 'kt',
+                        'conn_id': self._conn_id,
+                        'url': embed_code,
+                    }, fatal=False) or {}
+                kenc_message = kenc.get('message')
+                if kenc_message:
+                    embed_code += '?' 
+ ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif video_type == 'video/rcs': + formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break + else: + formats.extend(self._extract_wowza_formats( + 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + metadata_from_url(embed_code) + + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats + + metadata_from_url(video_data.get('thumbnail')) + info.update(self._extract_common_video_info(video_data)) + info.update({ + 'id': video_id, + 'title': title, + 'duration': int_or_none(video_data.get('length')), + 'series': video_data.get('show_title'), + 'season_id': season_id, + 'episode': title, + 'view_count': int_or_none(video_data.get('views')), + 'like_count': int_or_none(video_data.get('video_likes')), + 'repost_count': int_or_none(video_data.get('video_shares')), + }) + return info + + +class VVVVIDShowIE(VVVVIDIE): + _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)(?:/(?P<show_title>[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.vvvvid.it/show/156/psyco-pass', + 'info_dict': { + 'id': '156', + 'title': 'Psycho-Pass', + 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', + }, + 'playlist_count': 46, + }, { + 'url': 'https://www.vvvvid.it/show/156', + 'only_matching': True, + }] + + def _real_extract(self, url): + base_url, show_id, show_title = self._match_valid_url(url).groups() + + seasons = self._download_info( + show_id, 'seasons/', show_title) + + show_info = self._download_info( + show_id, 'info/', show_title, fatal=False) + + if not show_title: + base_url += "/title" + + entries = [] + for season in (seasons or []): + episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') + for episode in episodes: + if episode.get('playable') is False: + continue + season_id = str_or_none(episode.get('season_id')) + video_id = str_or_none(episode.get('video_id')) + if not (season_id and video_id): + continue + info = self._extract_common_video_info(episode) + info.update({ + '_type': 'url_transparent', + 'ie_key': VVVVIDIE.ie_key(), + 'url': '/'.join([base_url, season_id, video_id]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'season_id': season_id, + 'playlist_title': playlist_title, + }) + entries.append(info) + + return self.playlist_result( + entries, show_id, show_info.get('title'), show_info.get('description')) diff --git a/youtube_dl/extractor/vyborymos.py b/yt_dlp/extractor/vyborymos.py index 9e703c4b6..9e703c4b6 100644 --- a/youtube_dl/extractor/vyborymos.py +++ b/yt_dlp/extractor/vyborymos.py diff --git a/yt_dlp/extractor/vzaar.py b/yt_dlp/extractor/vzaar.py new file mode 100644 index 000000000..54f88bba8 --- /dev/null +++ b/yt_dlp/extractor/vzaar.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + unified_timestamp, + url_or_none, +) + + +class VzaarIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)' + _TESTS = [{ + # HTTP and HLS + 'url': 'https://vzaar.com/videos/1152805', + 'md5': 
'bde5ddfeb104a6c56a93a06b04901dbf', + 'info_dict': { + 'id': '1152805', + 'ext': 'mp4', + 'title': 'sample video (public)', + }, + }, { + 'url': 'https://view.vzaar.com/27272/player', + 'md5': '3b50012ac9bbce7f445550d54e0508f2', + 'info_dict': { + 'id': '27272', + 'ext': 'mp3', + 'title': 'MP3', + }, + }, { + # hlsAes = true + 'url': 'https://view.vzaar.com/11379930/player', + 'info_dict': { + 'id': '11379930', + 'ext': 'mp4', + 'title': 'Videoaula', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # with null videoTitle + 'url': 'https://view.vzaar.com/20313539/download', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://view.vzaar.com/v2/%s/video' % video_id, video_id) + + title = video_data.get('videoTitle') or video_id + + formats = [] + + source_url = url_or_none(video_data.get('sourceUrl')) + if source_url: + f = { + 'url': source_url, + 'format_id': 'http', + 'quality': 1, + } + if 'audio' in source_url: + f.update({ + 'vcodec': 'none', + 'ext': 'mp3', + }) + else: + f.update({ + 'width': int_or_none(video_data.get('width')), + 'height': int_or_none(video_data.get('height')), + 'ext': 'mp4', + 'fps': float_or_none(video_data.get('fps')), + }) + formats.append(f) + + video_guid = video_data.get('guid') + usp = video_data.get('usp') + if video_data.get('uspEnabled') and isinstance(video_guid, compat_str) and isinstance(usp, dict): + hls_aes = video_data.get('hlsAes') + qs = '&'.join('%s=%s' % (k, v) for k, v in usp.items()) + url_templ = 'http://%%s.vzaar.com/v5/usp%s/%s/%s.ism%%s?' 
% ('aes' if hls_aes else '', video_guid, video_id) + m3u8_formats = self._extract_m3u8_formats( + url_templ % ('fable', '/.m3u8') + qs, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + if hls_aes: + for f in m3u8_formats: + f['_decryption_key_url'] = url_templ % ('goose', '') + qs + formats.extend(m3u8_formats) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': self._proto_relative_url(video_data.get('poster')), + 'duration': float_or_none(video_data.get('videoDuration')), + 'timestamp': unified_timestamp(video_data.get('ts')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/wakanim.py b/yt_dlp/extractor/wakanim.py new file mode 100644 index 000000000..c956d616e --- /dev/null +++ b/yt_dlp/extractor/wakanim.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + merge_dicts, + urljoin, +) + + +class WakanimIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu', + 'info_dict': { + 'id': '2997', + 'ext': 'mp4', + 'title': 'Episode 02', + 'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d', + 'series': 'The Asterisk War (OmU.)', + 'season_number': 1, + 'episode': 'Episode 02', + 'episode_number': 2, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + # DRM Protected + 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + m3u8_url = urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url', + group='url')) + if not self.get_param('allow_unplayable_formats'): + # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls + encryption = self._search_regex( + r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', + m3u8_url, 'encryption', default=None) + if encryption in ('cenc', 'cbcs-aapl'): + self.report_drm(video_id) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._search_regex( + (r'<h1[^>]+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, group='title') + + return merge_dicts(info, { + 'id': video_id, + 'title': title, + 'formats': formats, + }) diff --git a/yt_dlp/extractor/walla.py b/yt_dlp/extractor/walla.py new file mode 100644 index 000000000..00f081bca --- /dev/null +++ b/yt_dlp/extractor/walla.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + int_or_none, +) + + +class WallaIE(InfoExtractor): + _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' + _TEST = { + 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', + 'info_dict': { + 'id': '2642630', + 'display_id': 'one-direction-all-for-one', + 'ext': 'flv', + 'title': 'וואן דיירקשן: ההיסטריה', + 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', + 
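+            # Hebrew-language metadata; the title transliterates to roughly
+            # "One Direction: The Hysteria"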
'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } + + _SUBTITLE_LANGS = { + 'עברית': 'heb', + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + video = self._download_xml( + 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id, + display_id) + + item = video.find('./items/item') + + title = xpath_text(item, './title', 'title') + description = xpath_text(item, './synopsis', 'description') + thumbnail = xpath_text(item, './preview_pic', 'thumbnail') + duration = int_or_none(xpath_text(item, './duration', 'duration')) + + subtitles = {} + for subtitle in item.findall('./subtitles/subtitle'): + lang = xpath_text(subtitle, './title') + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'ext': 'srt', + 'url': xpath_text(subtitle, './src'), + }] + + formats = [] + for quality in item.findall('./qualities/quality'): + format_id = xpath_text(quality, './title') + fmt = { + 'url': 'rtmp://wafla.walla.co.il/vod', + 'play_path': xpath_text(quality, './src'), + 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', + 'page_url': url, + 'ext': 'flv', + 'format_id': xpath_text(quality, './title'), + } + m = re.search(r'^(?P<height>\d+)[Pp]', format_id) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/washingtonpost.py b/yt_dlp/extractor/washingtonpost.py index 8afb1af83..8afb1af83 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/yt_dlp/extractor/washingtonpost.py diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py new file mode 100644 index 000000000..9ff4523db --- /dev/null +++ b/yt_dlp/extractor/wat.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + unified_strdate, +) + + +class WatIE(InfoExtractor): + _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)' + IE_NAME = 'wat.tv' + _TESTS = [ + { + 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', + 'info_dict': { + 'id': '11713067', + 'ext': 'mp4', + 'title': 'Soupe de figues à l\'orange et aux épices', + 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', + 'upload_date': '20140819', + 'duration': 120, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', + }, + { + 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', + 'md5': 'b16574df2c3cd1a36ca0098f2a791925', + 'info_dict': { + 'id': '11713075', + 'ext': 'mp4', + 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', + 'upload_date': '20140816', + }, + 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This content is no longer available', + }, + ] + _GEO_BYPASS = False + + def _real_extract(self, url): + video_id = self._match_id(url) + video_id = video_id if video_id.isdigit() and len(video_id) > 6 else 
compat_str(int(video_id, 36)) + + # 'contentv4' is used in the website, but it also returns the related + # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + video_data = self._download_json( + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) + video_info = video_data['media'] + + error_desc = video_info.get('error_desc') + if error_desc: + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc, expected=True) + + title = video_info['title'] + + formats = [] + subtitles = {} + + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False) + elif f == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + if delivery.get('drm'): + self.report_drm(video_id) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/watchbox.py b/yt_dlp/extractor/watchbox.py new file mode 100644 index 000000000..7469fe962 --- /dev/null +++ b/yt_dlp/extractor/watchbox.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + strip_or_none, + try_get, + unescapeHTML, + unified_timestamp, +) + + +class WatchBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)' + _TESTS = [{ + # film + 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html', + 'info_dict': { + 'id': '341368', + 'ext': 'mp4', + 'title': 'Free Jimmy', + 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4890, + 'age_limit': 16, + 'release_year': 2009, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + # episode + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html', + 'info_dict': { + 'id': '328286', + 'ext': 'mp4', + 'title': 'S01 E01 - Date in der Hölle', + 'description': 'md5:2f31c74a8186899f33cb5114491dae2b', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1291, + 'age_limit': 12, + 'release_year': 2010, + 'series': 'Ugly Americans', + 'season_number': 1, + 'episode': 'Date in der Hölle', + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': 
['Failed to download m3u8 information'], + }, { + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + kind, video_id = mobj.group('kind', 'id') + + webpage = self._download_webpage(url, video_id) + + player_config = self._parse_json( + self._search_regex( + r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage, + 'player config', default='{}', group='data'), + video_id, transform_source=unescapeHTML, fatal=False) + + if not player_config: + player_config = self._parse_json( + self._search_regex( + r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + source = player_config.get('source') or {} + + video_id = compat_str(source.get('videoId') or video_id) + + devapi = self._download_json( + 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={ + 'format': 'json', + 'apikey': 'hbbtv', + }, fatal=False) + + item = try_get(devapi, lambda x: x['items'][0], dict) or {} + + title = item.get('title') or try_get( + item, lambda x: x['movie']['headline_movie'], + compat_str) or source['title'] + + formats = [] + hls_url = item.get('media_videourl_hls') or source.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + dash_url = item.get('media_videourl_wv') or source.get('dash') + if dash_url: + formats.extend(self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', fatal=False)) + mp4_url = item.get('media_videourl') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + 'tbr': int_or_none(item.get('bitrate')), + }) + self._sort_formats(formats) + + description = strip_or_none(item.get('descr')) + thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') + duration = int_or_none(item.get('media_length') or source.get('length')) + timestamp = unified_timestamp(item.get('pubDate')) + view_count = int_or_none(item.get('media_views')) + age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk'])) + release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year'])) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'age_limit': age_limit, + 'release_year': release_year, + 'formats': formats, + } + + if kind.lower() == 'serien': + series = try_get( + item, lambda x: x['special']['title'], + compat_str) or source.get('format') + season_number = int_or_none(self._search_regex( + r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number', + default=None) or self._search_regex( + r'/staffel-(\d+)/', url, 'season number', default=None)) + episode = source.get('title') + episode_number = int_or_none(self._search_regex( + r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number', + default=None)) + info.update({ + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info diff --git a/yt_dlp/extractor/watchindianporn.py b/yt_dlp/extractor/watchindianporn.py new file mode 100644 index 000000000..a86819173 --- /dev/null +++ b/yt_dlp/extractor/watchindianporn.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from 
__future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class WatchIndianPornIE(InfoExtractor): + IE_DESC = 'Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TEST = { + 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html', + 'md5': '249589a164dde236ec65832bfce17440', + 'info_dict': { + 'id': 'RZa2avywNPa', + 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', + 'ext': 'mp4', + 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 226, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_regex(( + r'<title>(.+?)\s*-\s*Indian\s+Porn</title>', + r'<h4>(.+?)</h4>' + ), webpage, 'title') + + duration = parse_duration(self._search_regex( + r'Time:\s*<strong>\s*(.+?)\s*</strong>', + webpage, 'duration', fatal=False)) + + view_count = int(self._search_regex( + r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>', + webpage, 'view count', fatal=False)) + + categories = re.findall( + r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>', + webpage) + + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + 'http_headers': { + 'Referer': url, + }, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'categories': categories, + 'age_limit': 18, + }) + + return info_dict diff --git a/yt_dlp/extractor/wdr.py b/yt_dlp/extractor/wdr.py new file mode 100644 index 000000000..f54aa6ff9 --- /dev/null +++ b/yt_dlp/extractor/wdr.py @@ -0,0 +1,354 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + determine_ext, + ExtractorError, + js_to_json, + strip_jsonp, + try_get, + unified_strdate, + update_url_query, + urlhandle_detect_ext, + url_or_none, +) + + +class WDRIE(InfoExtractor): + _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _GEO_COUNTRIES = ['DE'] + _TEST = { + 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', + 'info_dict': { + 'id': 'mdb-1557833', + 'ext': 'mp4', + 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', + 'upload_date': '20180112', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._download_json( + url, video_id, transform_source=strip_jsonp) + + is_live = metadata.get('mediaType') == 'live' + + tracker_data = metadata['trackerData'] + title = tracker_data['trackerClipTitle'] + media_resource = metadata['mediaResource'] + + formats = [] + subtitles = {} + + # check if the metadata contains a direct URL to a file + for kind, media in media_resource.items(): + if kind == 'captionsHash': + for ext, url in media.items(): + subtitles.setdefault('de', []).append({ + 'url': url, + 'ext': ext, + }) + continue + + if kind not in ('dflt', 'alt'): + continue + if not isinstance(media, dict): + continue + + for tag_name, medium_url in 
media.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, video_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) + + self._sort_formats(formats) + + caption_url = media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + captions_hash = media_resource.get('captionsHash') + if isinstance(captions_hash, dict): + for ext, format_url in captions_hash.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + subtitles.setdefault('de', []).append({ + 'url': format_url, + 'ext': determine_ext(format_url, None) or ext, + }) + + return { + 'id': tracker_data.get('trackerClipId', video_id), + 'title': self._live_title(title) if is_live else title, + 'alt_title': tracker_data.get('trackerClipSubcategory'), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')), + 'is_live': is_live, + } + + +class WDRPageIE(InfoExtractor): + _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' + _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + + _TESTS = [ + { + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', + # HDS download, MD5 is unstable + 'info_dict': { + 'id': 'mdb-1058683', + 'ext': 'flv', + 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', + 'title': 'Geheimnis Aachener Dom', + 'alt_title': 'Doku am Freitag', + 'upload_date': '20160304', + 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', + 'is_live': False, + 'subtitles': {'de': [{ + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml', + 'ext': 'ttml', + }]}, + }, + 'skip': 'HTTP Error 404: Not Found', + }, + { + 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', + 'md5': 'f4c1f96d01cf285240f53ea4309663d8', + 'info_dict': { + 'id': 'mdb-1072000', + 'ext': 'mp3', + 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', + 'title': 'Schriftstellerin Juli Zeh', + 'alt_title': 'WDR 3 Gespräch am Samstag', + 'upload_date': '20160312', + 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', + 'is_live': False, + 'subtitles': {} + }, + 'skip': 'HTTP Error 404: Not Found', + }, + { + 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', + 'info_dict': { + 'id': 'mdb-1406149', + 'ext': 'mp4', + 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'WDR Fernsehen Live', + 'upload_date': '20150101', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # m3u8 download + }, 
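+        # The following test covers an overview page that embeds several
+        # videos; such pages are extracted as playlists (note the
+        # 'playlist_mincount' entry).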
+ }, + { + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'aktuelle-stunde-120', + }, + }, + { + 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', + 'info_dict': { + 'id': 'mdb-1552552', + 'ext': 'mp4', + 'upload_date': 're:^[0-9]{8}$', + 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + }, + 'skip': 'The id changes from week to week because of the new episode' + }, + { + 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', + 'md5': '803138901f6368ee497b4d195bb164f2', + 'info_dict': { + 'id': 'mdb-186083', + 'ext': 'mp4', + 'upload_date': '20130919', + 'title': 'Sachgeschichte - Achterbahn ', + }, + }, + { + 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', + # Live stream, MD5 unstable + 'info_dict': { + 'id': 'mdb-869971', + 'ext': 'mp4', + 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'upload_date': '20160101', + }, + 'params': { + 'skip_download': True, # m3u8 download + } + }, + { + 'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html', + 'info_dict': { + 'id': 'mdb-1556012', + 'ext': 'mp4', + 'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"', + 'upload_date': '20180111', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', + 'only_matching': True, + }, + { + 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + + entries = [] + + # Article with several videos + + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" + # for wdrmaus, in a tag with the class "videoButton" (previously a link + # to the page in a multiline "videoLink"-tag) + for mobj in re.finditer( + r'''(?sx)class= + (?: + (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| + (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* + )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 + ''', webpage): + media_link_obj = self._parse_json( + mobj.group('data'), display_id, transform_source=js_to_json, + fatal=False) + if not media_link_obj: + continue + jsonp_url = try_get( + media_link_obj, lambda x: x['mediaObj']['url'], compat_str) + if jsonp_url: + entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) + + # Playlist (e.g. 
https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(url, mobj.group('href')), + ie=WDRPageIE.ie_key()) + for mobj in re.finditer( + r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=', + webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) + ] + + return self.playlist_result(entries, playlist_id=display_id) + + +class WDRElefantIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)' + _TEST = { + 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'info_dict': { + 'title': 'Folge Oster-Spezial 2015', + 'id': 'mdb-1088195', + 'ext': 'mp4', + 'age_limit': None, + 'upload_date': '20150406' + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + # Table of Contents seems to always be at this address, so fetch it directly. + # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. + table_of_contents = self._download_json( + 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', + display_id) + if display_id not in table_of_contents: + raise ExtractorError( + 'No entry in site\'s table of contents for this URL. ' + 'Is the fragment part of the URL (after the #) correct?', + expected=True) + xml_metadata_path = table_of_contents[display_id]['xmlPath'] + xml_metadata = self._download_xml( + 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, + display_id) + zmdb_url_element = xml_metadata.find('./movie/zmdb_url') + if zmdb_url_element is None: + raise ExtractorError( + '%s is not a video' % display_id, expected=True) + return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key()) + + +class WDRMobileIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://mobile-ondemand\.wdr\.de/ + .*?/fsk(?P<age_limit>[0-9]+) + /[0-9]+/[0-9]+/ + (?P<id>[0-9]+)_(?P<title>[0-9]+)''' + IE_NAME = 'wdr:mobile' + _TEST = { + 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', + 'info_dict': { + 'title': '4283021', + 'id': '421735', + 'ext': 'mp4', + 'age_limit': 0, + }, + 'skip': 'Problems with loading data.' 
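+        # No media is fetched here: _real_extract below derives id, title and
+        # age_limit from the URL itself and sets a 'mobile' User-Agent via
+        # 'http_headers'.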
+ } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + return { + 'id': mobj.group('id'), + 'title': mobj.group('title'), + 'age_limit': int(mobj.group('age_limit')), + 'url': url, + 'http_headers': { + 'User-Agent': 'mobile', + }, + } diff --git a/youtube_dl/extractor/webcaster.py b/yt_dlp/extractor/webcaster.py index e4b65f54f..e4b65f54f 100644 --- a/youtube_dl/extractor/webcaster.py +++ b/yt_dlp/extractor/webcaster.py diff --git a/youtube_dl/extractor/webofstories.py b/yt_dlp/extractor/webofstories.py index f2b8d19b4..f2b8d19b4 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/yt_dlp/extractor/webofstories.py diff --git a/youtube_dl/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 621df5b54..621df5b54 100644 --- a/youtube_dl/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py diff --git a/youtube_dl/extractor/weiqitv.py b/yt_dlp/extractor/weiqitv.py index 7e0befd39..7e0befd39 100644 --- a/youtube_dl/extractor/weiqitv.py +++ b/yt_dlp/extractor/weiqitv.py diff --git a/yt_dlp/extractor/whowatch.py b/yt_dlp/extractor/whowatch.py new file mode 100644 index 000000000..f8bc2e73a --- /dev/null +++ b/yt_dlp/extractor/whowatch.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + qualities, + try_get, + ExtractorError, +) +from ..compat import compat_str + + +class WhoWatchIE(InfoExtractor): + IE_NAME = 'whowatch' + _VALID_URL = r'https?://whowatch\.tv/viewer/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://whowatch.tv/viewer/21450171', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + self._download_webpage(url, video_id) + metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id) + live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id) + + title = try_get(None, ( + lambda x: live_data['share_info']['live_title'][1:-1], + lambda x: metadata['live']['title'], + ), compat_str) + + hls_url = live_data.get('hls_url') + if not hls_url: + raise ExtractorError(live_data.get('error_message') or 'The user is offline.', expected=True) + + QUALITIES = qualities(['low', 'medium', 'high', 'veryhigh']) + formats = [] + + for i, fmt in enumerate(live_data.get('streams') or []): + name = fmt.get('quality') or fmt.get('name') or compat_str(i) + hls_url = fmt.get('hls_url') + rtmp_url = fmt.get('rtmp_url') + audio_only = fmt.get('audio_only') + quality = QUALITIES(fmt.get('quality')) + + if hls_url: + hls_fmts = self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', m3u8_id='hls-%s' % name, quality=quality) + formats.extend(hls_fmts) + else: + hls_fmts = [] + + # RTMP url for audio_only is same as high format, so skip it + if rtmp_url and not audio_only: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp-%s' % name, + 'ext': 'mp4', + 'protocol': 'rtmp_ffmpeg', # ffmpeg can, while rtmpdump can't + 'vcodec': 'h264', + 'acodec': 'aac', + 'quality': quality, + 'format_note': fmt.get('label'), + # note: HLS and RTMP have same resolution for now, so it's acceptable + 'width': try_get(hls_fmts, lambda x: x[0]['width'], int), + 'height': try_get(hls_fmts, lambda x: x[0]['height'], int), + }) + + # This contains the same formats as the above manifests and is used only as a fallback + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', m3u8_id='hls')) + self._remove_duplicate_formats(formats) + self._sort_formats(formats) + + uploader_url = 
try_get(metadata, lambda x: x['live']['user']['user_path'], compat_str) + if uploader_url: + uploader_url = 'https://whowatch.tv/profile/%s' % uploader_url + uploader_id = compat_str(try_get(metadata, lambda x: x['live']['user']['id'], int)) + uploader = try_get(metadata, lambda x: x['live']['user']['name'], compat_str) + thumbnail = try_get(metadata, lambda x: x['live']['latest_thumbnail_url'], compat_str) + timestamp = int_or_none(try_get(metadata, lambda x: x['live']['started_at'], int), scale=1000) + view_count = try_get(metadata, lambda x: x['live']['total_view_count'], int) + comment_count = try_get(metadata, lambda x: x['live']['comment_count'], int) + + return { + 'id': video_id, + 'title': title, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'uploader': uploader, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'is_live': True, + } diff --git a/yt_dlp/extractor/wimtv.py b/yt_dlp/extractor/wimtv.py new file mode 100644 index 000000000..ea953bf77 --- /dev/null +++ b/yt_dlp/extractor/wimtv.py @@ -0,0 +1,163 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_duration, + urlencode_postdata, + ExtractorError, +) + + +class WimTVIE(InfoExtractor): + _player = None + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'''(?x) + https?://platform.wim.tv/ + (?: + (?:embed/)?\? + |\#/webtv/.+?/ + ) + (?P<type>vod|live|cast)[=/] + (?P<id>%s).*?''' % _UUID_RE + _TESTS = [{ + # vod stream + 'url': 'https://platform.wim.tv/embed/?vod=db29fb32-bade-47b6-a3a6-cb69fe80267a', + 'md5': 'db29fb32-bade-47b6-a3a6-cb69fe80267a', + 'info_dict': { + 'id': 'db29fb32-bade-47b6-a3a6-cb69fe80267a', + 'ext': 'mp4', + 'title': 'AMA SUPERCROSS 2020 - R2 ST. 
LOUIS', + 'duration': 6481, + 'thumbnail': r're:https?://.+?/thumbnail/.+?/720$' + }, + 'params': { + 'skip_download': True, + }, + }, { + # live stream + 'url': 'https://platform.wim.tv/embed/?live=28e22c22-49db-40f3-8c37-8cbb0ff44556&autostart=true', + 'info_dict': { + 'id': '28e22c22-49db-40f3-8c37-8cbb0ff44556', + 'ext': 'mp4', + 'title': 'Streaming MSmotorTV', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://platform.wim.tv/#/webtv/automotornews/vod/422492b6-539e-474d-9c6b-68c9d5893365', + 'only_matching': True, + }, { + 'url': 'https://platform.wim.tv/#/webtv/renzoarborechannel/cast/f47e0d15-5b45-455e-bf0d-dba8ffa96365', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+src=["\'](?P<url>%s)' % WimTVIE._VALID_URL, + webpage)] + + def _real_initialize(self): + if not self._player: + self._get_player_data() + + def _get_player_data(self): + msg_id = 'Player data' + self._player = {} + + datas = [{ + 'url': 'https://platform.wim.tv/common/libs/player/wimtv/wim-rest.js', + 'vars': [{ + 'regex': r'appAuth = "(.+?)"', + 'variable': 'app_auth', + }] + }, { + 'url': 'https://platform.wim.tv/common/config/endpointconfig.js', + 'vars': [{ + 'regex': r'PRODUCTION_HOSTNAME_THUMB = "(.+?)"', + 'variable': 'thumb_server', + }, { + 'regex': r'PRODUCTION_HOSTNAME_THUMB\s*\+\s*"(.+?)"', + 'variable': 'thumb_server_path', + }] + }] + + for data in datas: + temp = self._download_webpage(data['url'], msg_id) + for var in data['vars']: + val = self._search_regex(var['regex'], temp, msg_id) + if not val: + raise ExtractorError('%s not found' % var['variable']) + self._player[var['variable']] = val + + def _generate_token(self): + json = self._download_json( + 'https://platform.wim.tv/wimtv-server/oauth/token', 'Token generation', + headers={'Authorization': 'Basic %s' % self._player['app_auth']}, + data=urlencode_postdata({'grant_type': 'client_credentials'})) + token = json.get('access_token') + if not token: + raise ExtractorError('access token not generated') + return token + + def _generate_thumbnail(self, thumb_id, width='720'): + if not thumb_id or not self._player.get('thumb_server'): + return None + if not self._player.get('thumb_server_path'): + self._player['thumb_server_path'] = '' + return '%s%s/asset/thumbnail/%s/%s' % ( + self._player['thumb_server'], + self._player['thumb_server_path'], + thumb_id, width) + + def _real_extract(self, url): + urlc = self._match_valid_url(url).groupdict() + video_id = urlc['id'] + stream_type = is_live = None + if urlc['type'] in {'live', 'cast'}: + stream_type = urlc['type'] + '/channel' + is_live = True + else: + stream_type = 'vod' + is_live = False + token = self._generate_token() + json = self._download_json( + 'https://platform.wim.tv/wimtv-server/api/public/%s/%s/play' % ( + stream_type, video_id), video_id, + headers={'Authorization': 'Bearer %s' % token, + 'Content-Type': 'application/json'}, + data=bytes('{}', 'utf-8')) + + formats = [] + for src in json.get('srcs') or []: + if src.get('mimeType') == 'application/x-mpegurl': + formats.extend( + self._extract_m3u8_formats( + src.get('uniqueStreamer'), video_id, 'mp4')) + if src.get('mimeType') == 'video/flash': + formats.append({ + 'format_id': 'rtmp', + 'url': src.get('uniqueStreamer'), + 'ext': determine_ext(src.get('uniqueStreamer'), 'flv'), + 'rtmp_live': is_live, + }) + json = json.get('resource') + thumb = 
self._generate_thumbnail(json.get('thumbnailId')) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': json.get('title') or json.get('name'), + 'duration': parse_duration(json.get('duration')), + 'formats': formats, + 'thumbnail': thumb, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py new file mode 100644 index 000000000..a170966c3 --- /dev/null +++ b/yt_dlp/extractor/wistia.py @@ -0,0 +1,199 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + try_get, + unescapeHTML, +) + + +class WistiaBaseIE(InfoExtractor): + _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})' + _VALID_URL_BASE = r'https?://(?:fast\.)?wistia\.(?:net|com)/embed/' + _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' + + def _download_embed_config(self, config_type, config_id, referer): + base_url = self._EMBED_BASE_URL + '%ss/%s' % (config_type, config_id) + embed_config = self._download_json( + base_url + '.json', config_id, headers={ + 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. + }) + + if isinstance(embed_config, dict) and embed_config.get('error'): + raise ExtractorError( + 'Error while getting the playlist', expected=True) + + return embed_config + + def _extract_media(self, embed_config): + data = embed_config['media'] + video_id = data['hashedId'] + title = data['name'] + + formats = [] + thumbnails = [] + for a in data['assets']: + aurl = a.get('url') + if not aurl: + continue + astatus = a.get('status') + atype = a.get('type') + if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'): + continue + elif atype in ('still', 'still_image'): + thumbnails.append({ + 'url': aurl, + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), + }) + else: + aext = a.get('ext') + display_name = a.get('display_name') + format_id = atype + if atype and atype.endswith('_video') and display_name: + format_id = '%s-%s' % (atype[:-6], display_name) + f = { + 'format_id': format_id, + 'url': aurl, + 'tbr': int_or_none(a.get('bitrate')) or None, + 'quality': 1 if atype == 'original' else None, + } + if display_name == 'Audio': + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'vcodec': a.get('codec'), + }) + if a.get('container') == 'm3u8' or aext == 'm3u8': + ts_f = f.copy() + ts_f.update({ + 'ext': 'ts', + 'format_id': f['format_id'].replace('hls-', 'ts-'), + 'url': f['url'].replace('.bin', '.ts'), + }) + formats.append(ts_f) + f.update({ + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }) + else: + f.update({ + 'container': a.get('container'), + 'ext': aext, + 'filesize': int_or_none(a.get('size')), + }) + formats.append(f) + + self._sort_formats(formats) + + subtitles = {} + for caption in data.get('captions', []): + language = caption.get('language') + if not language: + continue + subtitles[language] = [{ + 'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language, + }] + + return { + 'id': video_id, + 'title': title, + 'description': data.get('seoDescription'), + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': float_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('createdAt')), + 'subtitles': subtitles, + } + + +class WistiaIE(WistiaBaseIE): + _VALID_URL = 
r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) + + _TESTS = [{ + # with hls video + 'url': 'wistia:807fafadvk', + 'md5': 'daff0f3687a41d9a71b40e0e8c2610fe', + 'info_dict': { + 'id': '807fafadvk', + 'ext': 'mp4', + 'title': 'Drip Brennan Dunn Workshop', + 'description': 'a JV Webinars video', + 'upload_date': '20160518', + 'timestamp': 1463607249, + 'duration': 4987.11, + }, + }, { + 'url': 'wistia:sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt', + 'only_matching': True, + }, { + 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json', + 'only_matching': True, + }] + + # https://wistia.com/support/embed-and-share/video-on-your-website + @staticmethod + def _extract_url(webpage): + urls = WistiaIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + urls = [] + for match in re.finditer( + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): + urls.append(unescapeHTML(match.group('url'))) + for match in re.finditer( + r'''(?sx) + <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 + ''', webpage): + urls.append('wistia:%s' % match.group('id')) + for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): + urls.append('wistia:%s' % match.group('id')) + return urls + + def _real_extract(self, url): + video_id = self._match_id(url) + embed_config = self._download_embed_config('media', video_id, url) + return self._extract_media(embed_config) + + +class WistiaPlaylistIE(WistiaBaseIE): + _VALID_URL = r'%splaylists/%s' % (WistiaIE._VALID_URL_BASE, WistiaIE._VALID_ID_REGEX) + + _TEST = { + 'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc', + 'info_dict': { + 'id': 'aodt9etokc', + }, + 'playlist_count': 3, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist = self._download_embed_config('playlist', playlist_id, url) + + entries = [] + for media in (try_get(playlist, lambda x: x[0]['medias']) or []): + embed_config = media.get('embed_config') + if not embed_config: + continue + entries.append(self._extract_media(embed_config)) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/yt_dlp/extractor/worldstarhiphop.py index 82587b4ce..82587b4ce 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/yt_dlp/extractor/worldstarhiphop.py diff --git a/youtube_dl/extractor/wsj.py b/yt_dlp/extractor/wsj.py index 67236f377..67236f377 100644 --- a/youtube_dl/extractor/wsj.py +++ b/yt_dlp/extractor/wsj.py diff --git a/youtube_dl/extractor/wwe.py b/yt_dlp/extractor/wwe.py index bebc77bb5..bebc77bb5 100644 --- a/youtube_dl/extractor/wwe.py +++ b/yt_dlp/extractor/wwe.py diff --git a/youtube_dl/extractor/xbef.py b/yt_dlp/extractor/xbef.py index 4c41e98b2..4c41e98b2 100644 --- a/youtube_dl/extractor/xbef.py +++ b/yt_dlp/extractor/xbef.py diff --git a/yt_dlp/extractor/xboxclips.py b/yt_dlp/extractor/xboxclips.py new file mode 100644 index 000000000..9bac982f8 --- /dev/null +++ b/yt_dlp/extractor/xboxclips.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + 
int_or_none, + month_by_abbreviation, + parse_filesize, + parse_qs, +) + + +class XboxClipsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', + 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', + 'info_dict': { + 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', + 'ext': 'mp4', + 'title': 'iAbdulElah playing Titanfall', + 'filesize_approx': 26800000, + 'upload_date': '20140807', + 'duration': 56, + } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + if '/video.php' in url: + qs = parse_qs(url) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) + filesize = parse_filesize(self._html_search_regex( + r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) + duration = int_or_none(self._html_search_regex( + r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._html_search_regex( + r'>Views: (\d+)<', webpage, 'view count', fatal=False)) + + info.update({ + 'id': video_id, + 'title': title, + 'upload_date': upload_date, + 'filesize_approx': filesize, + 'duration': duration, + 'view_count': view_count, + }) + return info diff --git a/yt_dlp/extractor/xfileshare.py b/yt_dlp/extractor/xfileshare.py new file mode 100644 index 000000000..cd97c77dc --- /dev/null +++ b/yt_dlp/extractor/xfileshare.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import ( + decode_packed_codes, + determine_ext, + ExtractorError, + int_or_none, + js_to_json, + urlencode_postdata, +) + + +# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 +def aa_decode(aa_code): + symbol_table = [ + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('2', '((o^_^o) - (゚Θ゚))'), + ('4', '(゚ー゚)'), + ('3', '(o^_^o)'), + ('1', '(゚Θ゚)'), + ('0', '(c^_^o)'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aa_char in aa_code.split(delim): + for val, pat in symbol_table: + aa_char = aa_char.replace(pat, val) + aa_char = aa_char.replace('+ ', '') + m = re.match(r'^\d+', aa_char) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aa_char) + if m: + ret += compat_chr(int(m.group(1), 16)) + return ret + + +class XFileShareIE(InfoExtractor): + _SITES = ( + (r'aparat\.cam', 'Aparat'), + (r'clipwatching\.com', 'ClipWatching'), + (r'gounlimited\.to', 'GoUnlimited'), + (r'govid\.me', 'GoVid'), + (r'holavid\.com', 'HolaVid'), + (r'streamty\.com', 'Streamty'), + (r'thevideobee\.to', 'TheVideoBee'), + (r'uqload\.com', 'Uqload'), + (r'vidbom\.com', 'VidBom'), + (r'vidlo\.us', 'vidlo'), + (r'vidlocker\.xyz', 'VidLocker'), + (r'vidshare\.tv', 'VidShare'), + (r'vup\.to', 
'VUp'), + (r'wolfstream\.tv', 'WolfStream'), + (r'xvideosharing\.com', 'XVideoSharing'), + ) + + IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) + _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + % '|'.join(site for site in list(zip(*_SITES))[0])) + + _FILE_NOT_FOUND_REGEXES = ( + r'>(?:404 - )?File Not Found<', + r'>The file was removed by administrator<', + ) + + _TESTS = [{ + 'url': 'http://xvideosharing.com/fq65f94nd2ve', + 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', + 'info_dict': { + 'id': 'fq65f94nd2ve', + 'ext': 'mp4', + 'title': 'sample', + 'thumbnail': r're:http://.*\.jpg', + }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' + % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), + webpage)] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).groups() + + url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) + webpage = self._download_webpage(url, video_id) + + if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + fields = self._hidden_inputs(webpage) + + if fields.get('op') == 'download1': + countdown = int_or_none(self._search_regex( + r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>', + webpage, 'countdown', default=None)) + if countdown: + self._sleep(countdown, video_id) + + webpage = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(fields), headers={ + 'Referer': url, + 'Content-type': 'application/x-www-form-urlencoded', + }) + + title = (self._search_regex( + (r'style="z-index: [0-9]+;">([^<]+)</span>', + r'<td nowrap>([^<]+)</td>', + r'h4-fine[^>]*>([^<]+)<', + r'>Watch (.+)[ <]', + r'<h2 class="video-page-head">([^<]+)</h2>', + r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to + r'title\s*:\s*"([^"]+)"'), # govid.me + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or video_id).strip() + + for regex, func in ( + (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + + formats = [] + + jwplayer_data = self._search_regex( + [ + r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', + r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', + ], webpage, + 'jwplayer data', default=None) + if jwplayer_data: + jwplayer_data = self._parse_json( + jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + if jwplayer_data: + formats = self._parse_jwplayer_data( + jwplayer_data, video_id, False, + m3u8_id='hls', mpd_id='dash')['formats'] + + if not formats: + urls = [] + for regex in ( + r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)', + r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + for mobj in re.finditer(regex, webpage): + 
video_url = mobj.group('url') + if video_url not in urls: + urls.append(video_url) + + sources = self._search_regex( + r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) + if sources: + urls.extend(self._parse_json(sources, video_id)) + + formats = [] + for video_url in urls: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'sd', + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + [ + r'<video[^>]+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ], webpage, 'thumbnail', default=None) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py new file mode 100644 index 000000000..9d4ed47d4 --- /dev/null +++ b/yt_dlp/extractor/xhamster.py @@ -0,0 +1,455 @@ +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + determine_ext, + dict_get, + extract_attributes, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + str_or_none, + try_get, + unified_strdate, + url_or_none, + urljoin, +) + + +class XHamsterIE(InfoExtractor): + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' + _VALID_URL = r'''(?x) + https?:// + (?:.+?\.)?%s/ + (?: + movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html| + videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+) + ) + ''' % _DOMAINS + _TESTS = [{ + 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'md5': '98b4687efb1ffd331c4197854dc09e8f', + 'info_dict': { + 'id': '1509445', + 'display_id': 'femaleagent-shy-beauty-takes-the-bait', + 'ext': 'mp4', + 'title': 'FemaleAgent Shy beauty takes the bait', + 'timestamp': 1350194821, + 'upload_date': '20121014', + 'uploader': 'Ruseful2011', + 'duration': 893, + 'age_limit': 18, + }, + }, { + 'url': 'https://xhamster.com/videos/britney-spears-sexy-booty-2221348?hd=', + 'info_dict': { + 'id': '2221348', + 'display_id': 'britney-spears-sexy-booty', + 'ext': 'mp4', + 'title': 'Britney Spears Sexy Booty', + 'timestamp': 1379123460, + 'upload_date': '20130914', + 'uploader': 'jojo747400', + 'duration': 200, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + # empty seo, unavailable via new URL schema + 'url': 'http://xhamster.com/movies/5667973/.html', + 'info_dict': { + 'id': '5667973', + 'ext': 'mp4', + 'title': '....', + 'timestamp': 1454948101, + 'upload_date': '20160208', + 'uploader': 'parejafree', + 'duration': 72, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + # mobile site + 'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111', + 'only_matching': True, + }, { + 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', + 'only_matching': True, + }, { + # This video is visible for marcoalfa123456's friends only + 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', + 'only_matching': True, + }, { + # new URL schema + 'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821', + 'only_matching': True, + }, { + 'url': 'https://xhamster.one/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 
'https://xhamster.desi/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'https://xhamster2.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'https://xhamster11.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'https://xhamster26.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', + 'only_matching': True, + }, { + 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'only_matching': True, + }, { + 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('id_2') + display_id = mobj.group('display_id') or mobj.group('display_id_2') + + desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) + webpage, urlh = self._download_webpage_handle(desktop_url, video_id) + + error = self._html_search_regex( + r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + age_limit = self._rta_search(webpage) + + def get_height(s): + return int_or_none(self._search_regex( + r'^(\d+)[pP]', s, 'height', default=None)) + + initials = self._parse_json( + self._search_regex( + (r'window\.initials\s*=\s*({.+?})\s*;\s*</script>', + r'window\.initials\s*=\s*({.+?})\s*;'), webpage, 'initials', + default='{}'), + video_id, fatal=False) + if initials: + video = initials['videoModel'] + title = video['title'] + formats = [] + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) + for quality, format_item in formats_dict.items(): + if format_id == 'download': + # Download link takes some time to be generated, + # skipping for now + continue + format_url = format_item + format_url = url_or_none(format_url) + if not format_url or format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': format_url, + 'ext': determine_ext(format_url, 'mp4'), + 'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': urlh.geturl(), + }, + }) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if 
not isinstance(formats_list, list):
+                            continue
+                        for standard_format in formats_list:
+                            if not isinstance(standard_format, dict):
+                                continue
+                            for standard_format_key in ('url', 'fallback'):
+                                standard_url = standard_format.get(standard_format_key)
+                                if not standard_url:
+                                    continue
+                                standard_url = urljoin(url, standard_url)
+                                if not standard_url or standard_url in format_urls:
+                                    continue
+                                format_urls.add(standard_url)
+                                ext = determine_ext(standard_url, 'mp4')
+                                if ext == 'm3u8':
+                                    formats.extend(self._extract_m3u8_formats(
+                                        standard_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                                        m3u8_id='hls', fatal=False))
+                                    continue
+                                quality = (str_or_none(standard_format.get('quality'))
+                                           or str_or_none(standard_format.get('label'))
+                                           or '')
+                                formats.append({
+                                    'format_id': '%s-%s' % (format_id, quality),
+                                    'url': standard_url,
+                                    'ext': ext,
+                                    'height': get_height(quality),
+                                    'filesize': format_sizes.get(quality),
+                                    'http_headers': {
+                                        'Referer': standard_url,
+                                    },
+                                })
+            self._sort_formats(formats)
+
+            categories_list = video.get('categories')
+            if isinstance(categories_list, list):
+                categories = []
+                for c in categories_list:
+                    if not isinstance(c, dict):
+                        continue
+                    c_name = c.get('name')
+                    if isinstance(c_name, compat_str):
+                        categories.append(c_name)
+            else:
+                categories = None
+
+            uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL']))
+
+            return {
+                'id': video_id,
+                'display_id': display_id,
+                'title': title,
+                'description': video.get('description'),
+                'timestamp': int_or_none(video.get('created')),
+                'uploader': try_get(
+                    video, lambda x: x['author']['name'], compat_str),
+                'uploader_url': uploader_url,
+                'uploader_id': uploader_url.split('/')[-1] if uploader_url else None,
+                'thumbnail': video.get('thumbURL'),
+                'duration': int_or_none(video.get('duration')),
+                'view_count': int_or_none(video.get('views')),
+                'like_count': int_or_none(try_get(
+                    video, lambda x: x['rating']['likes'], int)),
+                'dislike_count': int_or_none(try_get(
+                    video, lambda x: x['rating']['dislikes'], int)),
+                'comment_count': int_or_none(video.get('views')),
+                'age_limit': age_limit,
+                'categories': categories,
+                'formats': formats,
+            }
+
+        # Old layout fallback
+
+        title = self._html_search_regex(
+            [r'<h1[^>]*>([^<]+)</h1>',
+             r'<meta[^>]+itemprop=".*?caption.*?"[^>]+content="(.+?)"',
+             r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
+            webpage, 'title')
+
+        formats = []
+        format_urls = set()
+
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
+                default='{}'),
+            video_id, fatal=False)
+        for format_id, format_url in sources.items():
+            format_url = url_or_none(format_url)
+            if not format_url:
+                continue
+            if format_url in format_urls:
+                continue
+            format_urls.add(format_url)
+            formats.append({
+                'format_id': format_id,
+                'url': format_url,
+                'height': get_height(format_id),
+            })
+
+        video_url = self._search_regex(
+            [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+             r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+             r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+            webpage, 'video url', group='mp4', default=None)
+        if video_url and video_url not in format_urls:
+            formats.append({
+                'url': video_url,
+            })
+
+        self._sort_formats(formats)
+
+        # Only a few videos have a description
+        mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
+        description = mobj.group(1) if mobj else None
+
+        upload_date = unified_strdate(self._search_regex(
+            
r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}', + webpage, 'upload date', fatal=False)) + + uploader = self._html_search_regex( + r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)', + webpage, 'uploader', default='anonymous') + + thumbnail = self._search_regex( + [r'''["']thumbUrl["']\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', + r'''<video[^>]+"poster"=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], + webpage, 'thumbnail', fatal=False, group='thumbnail') + + duration = parse_duration(self._search_regex( + [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', + r'Runtime:\s*</span>\s*([\d:]+)'], webpage, + 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + r'content=["\']User(?:View|Play)s:(\d+)', + webpage, 'view count', fatal=False)) + + mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage) + (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) + + mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage) + comment_count = mobj.group('commentcount') if mobj else 0 + + categories_html = self._search_regex( + r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage, + 'categories', default=None) + categories = [clean_html(category) for category in re.findall( + r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader.lower() if uploader else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'like_count': int_or_none(like_count), + 'dislike_count': int_or_none(dislike_count), + 'comment_count': int_or_none(comment_count), + 'age_limit': age_limit, + 'categories': categories, + 'formats': formats, + } + + +class XHamsterEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS + _TEST = { + 'url': 'http://xhamster.com/xembed.php?video=3328539', + 'info_dict': { + 'id': '3328539', + 'ext': 'mp4', + 'title': 'Pen Masturbation', + 'timestamp': 1406581861, + 'upload_date': '20140728', + 'uploader': 'ManyakisArt', + 'duration': 5, + 'age_limit': 18, + } + } + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'href="(https?://xhamster\.com/(?:movies/{0}/[^"]*\.html|videos/[^/]*-{0})[^"]*)"'.format(video_id), + webpage, 'xhamster url', default=None) + + if not video_url: + vars = self._parse_json( + self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), + video_id) + video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) + + return self.url_result(video_url, 'XHamster') + + +class XHamsterUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)' % XHamsterIE._DOMAINS + _TESTS = [{ + # Paginated user profile + 'url': 'https://xhamster.com/users/netvideogirls/videos', + 'info_dict': { + 'id': 'netvideogirls', + }, + 'playlist_mincount': 267, + }, { + # Non-paginated user profile + 'url': 'https://xhamster.com/users/firatkaan/videos', + 
'info_dict': { + 'id': 'firatkaan', + }, + 'playlist_mincount': 1, + }] + + def _entries(self, user_id): + next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id + for pagenum in itertools.count(1): + page = self._download_webpage( + next_page_url, user_id, 'Downloading page %s' % pagenum) + for video_tag in re.findall( + r'(<a[^>]+class=["\'].*?\bvideo-thumb__image-container[^>]+>)', + page): + video = extract_attributes(video_tag) + video_url = url_or_none(video.get('href')) + if not video_url or not XHamsterIE.suitable(video_url): + continue + video_id = XHamsterIE._match_id(video_url) + yield self.url_result( + video_url, ie=XHamsterIE.ie_key(), video_id=video_id) + mobj = re.search(r'<a[^>]+data-page=["\']next[^>]+>', page) + if not mobj: + break + next_page = extract_attributes(mobj.group(0)) + next_page_url = url_or_none(next_page.get('href')) + if not next_page_url: + break + + def _real_extract(self, url): + user_id = self._match_id(url) + return self.playlist_result(self._entries(user_id), user_id) diff --git a/youtube_dl/extractor/xiami.py b/yt_dlp/extractor/xiami.py index 769aab331..769aab331 100644 --- a/youtube_dl/extractor/xiami.py +++ b/yt_dlp/extractor/xiami.py diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py new file mode 100644 index 000000000..802d1bb1b --- /dev/null +++ b/yt_dlp/extractor/ximalaya.py @@ -0,0 +1,233 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor + + +class XimalayaBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CN'] + + +class XimalayaIE(XimalayaBaseIE): + IE_NAME = 'ximalaya' + IE_DESC = '喜马拉雅FM' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)' + _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' + _TESTS = [ + { + 'url': 'http://www.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'http://m.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'https://www.ximalaya.com/11045267/sound/15705996/', + 'info_dict': { + 'id': '15705996', + 'ext': 'm4a', + 'uploader': '李延隆老师', + 'uploader_id': 11045267, + 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/', + 'title': 'Lesson 1 Excuse me!', + 'description': "contains:Listen to the tape then answer\xa0this question. 
Whose handbag is it?\n"
+                           "听录音,然后回答问题,这是谁的手袋?",
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': r're:^https?://.*\.jpg$',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': r're:^https?://.*\.jpg$',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['train', '外语'],
+                'duration': 40,
+                'view_count': int,
+                'like_count': int,
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+
+        is_m = 'm.ximalaya' in url
+        scheme = 'https' if url.startswith('https') else 'http'
+
+        audio_id = self._match_id(url)
+        webpage = self._download_webpage(url, audio_id,
+                                         note='Downloading sound page for %s' % audio_id,
+                                         errnote='Unable to get sound page')
+
+        audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
+        audio_info = self._download_json(audio_info_file, audio_id,
+                                         'Downloading info json %s' % audio_info_file,
+                                         'Unable to download info file')
+
+        formats = []
+        for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')):
+            if audio_info.get(k):
+                formats.append({
+                    'format_id': bps,
+                    'url': audio_info[k],
+                })
+
+        thumbnails = []
+        for k in audio_info.keys():
+            # cover pic keys look like 'cover_url', 'cover_url_142'
+            if k.startswith('cover_url'):
+                thumbnail = {'name': k, 'url': audio_info[k]}
+                if k == 'cover_url_142':
+                    thumbnail['width'] = 180
+                    thumbnail['height'] = 180
+                thumbnails.append(thumbnail)
+
+        audio_uploader_id = audio_info.get('uid')
+
+        if is_m:
+            audio_description = self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
+                                                        webpage, 'audio_description', fatal=False)
+        else:
+            audio_description = self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
+                                                        webpage, 'audio_description', fatal=False)
+
+        if not audio_description:
+            audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
+            audio_description = self._download_webpage(audio_description_file, audio_id,
+                                                       note='Downloading description file %s' % audio_description_file,
+                                                       errnote='Unable to download description file',
+                                                       fatal=False)
+        audio_description = audio_description.strip() if audio_description else None
+
+        return {
+            'id': audio_id,
+            'uploader': audio_info.get('nickname'),
+            'uploader_id': audio_uploader_id,
+            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
+            'title': audio_info['title'],
+            'thumbnails': thumbnails,
+            'description': audio_description,
+            'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
+            'duration': audio_info.get('duration'),
+            'view_count': audio_info.get('play_count'),
+            'like_count': audio_info.get('favorites_count'),
+            'formats': formats,
+        }
+
+
+class XimalayaAlbumIE(XimalayaBaseIE):
+    IE_NAME = 'ximalaya:album'
+    IE_DESC = '喜马拉雅FM 专辑'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
+    _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
+    _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
+    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
+    _TESTS = [{
+        'url': 'http://www.ximalaya.com/61425525/album/5534601/',
+        'info_dict': {
+            'title': '唐诗三百首(含赏析)',
+            'id': '5534601',
+        },
+        'playlist_count': 312,
+    }, {
+        'url': 'http://m.ximalaya.com/61425525/album/5534601',
+        'info_dict': {
+            'title': '唐诗三百首(含赏析)',
+            'id': '5534601',
+        },
+        'playlist_count': 312,
+    },
+    ]
+
+    def _real_extract(self, url):
+        self.scheme = scheme = 'https' if url.startswith('https') else 'http'
+
+        mobj = self._match_valid_url(url)
+        
uid, playlist_id = mobj.group('uid'), mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id, + note='Download album page for %s' % playlist_id, + errnote='Unable to get album info') + + title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>', + webpage, 'title', fatal=False) + + return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title) + + def _entries(self, page, playlist_id, uid): + html = page + for page_num in itertools.count(1): + for entry in self._process_page(html, uid): + yield entry + + next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3', + html, 'list_next_url', default=None, group='more') + if not next_url: + break + + next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url) + html = self._download_webpage(next_full_url, playlist_id) + + def _process_page(self, html, uid): + find_from = html.index('album_soundlist') + for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]): + yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')), + XimalayaIE.ie_key(), + mobj.group('id'), + mobj.group('title')) diff --git a/youtube_dl/extractor/xminus.py b/yt_dlp/extractor/xminus.py index 36e5ead1e..36e5ead1e 100644 --- a/youtube_dl/extractor/xminus.py +++ b/yt_dlp/extractor/xminus.py diff --git a/yt_dlp/extractor/xnxx.py b/yt_dlp/extractor/xnxx.py new file mode 100644 index 000000000..dd4fb54d4 --- /dev/null +++ b/yt_dlp/extractor/xnxx.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + NO_DEFAULT, + str_to_int, +) + + +class XNXXIE(InfoExtractor): + _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' + _TESTS = [{ + 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', + 'md5': '7583e96c15c0f21e9da3453d9920fbba', + 'info_dict': { + 'id': '55awb78', + 'ext': 'mp4', + 'title': 'Skyrim Test Video', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 469, + 'view_count': int, + 'age_limit': 18, + }, + }, { + 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', + 'only_matching': True, + }, { + 'url': 'http://www.xnxx.com/video-55awb78/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + def get(meta, default=NO_DEFAULT, fatal=True): + return self._search_regex( + r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % meta, + webpage, meta, default=default, fatal=fatal, group='value') + + title = self._og_search_title( + webpage, default=None) or get('VideoTitle') + + formats = [] + for mobj in re.finditer( + r'setVideo(?:Url(?P<id>Low|High)|HLS)\s*\(\s*(?P<q>["\'])(?P<url>(?:https?:)?//.+?)(?P=q)', webpage): + format_url = mobj.group('url') + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + quality=1, m3u8_id='hls', fatal=False)) + else: + format_id = mobj.group('id') + if format_id: + format_id = format_id.lower() + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': -1 if format_id == 'low' else 0, + }) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage, default=None) or get( + 'ThumbUrl', fatal=False) or get('ThumbUrl169', fatal=False) + duration = 
int_or_none(self._og_search_property('duration', webpage)) + view_count = str_to_int(self._search_regex( + r'id=["\']nb-views-number[^>]+>([\d,.]+)', webpage, 'view count', + default=None)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'age_limit': 18, + 'formats': formats, + } diff --git a/yt_dlp/extractor/xstream.py b/yt_dlp/extractor/xstream.py new file mode 100644 index 000000000..792843df5 --- /dev/null +++ b/yt_dlp/extractor/xstream.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + xpath_with_ns, + xpath_text, + find_xpath_attr, +) + + +class XstreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + xstream:| + https?://frontend\.xstream\.(?:dk|net)/ + ) + (?P<partner_id>[^/]+) + (?: + :| + /feed/video/\?.*?\bid= + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588', + 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', + 'info_dict': { + 'id': '86588', + 'ext': 'mov', + 'title': 'Otto Wollertsen', + 'description': 'Vestlendingen Otto Fredrik Wollertsen', + 'timestamp': 1430473209, + 'upload_date': '20150501', + }, + }, { + 'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039', + 'only_matching': True, + }] + + def _extract_video_info(self, partner_id, video_id): + data = self._download_xml( + 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' + % (partner_id, video_id), + video_id) + + NS_MAP = { + 'atom': 'http://www.w3.org/2005/Atom', + 'xt': 'http://xstream.dk/', + 'media': 'http://search.yahoo.com/mrss/', + } + + entry = data.find(xpath_with_ns('./atom:entry', NS_MAP)) + + title = xpath_text( + entry, xpath_with_ns('./atom:title', NS_MAP), 'title') + description = xpath_text( + entry, xpath_with_ns('./atom:summary', NS_MAP), 'description') + timestamp = parse_iso8601(xpath_text( + entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date')) + + formats = [] + media_group = entry.find(xpath_with_ns('./media:group', NS_MAP)) + for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)): + media_url = media_content.get('url') + if not media_url: + continue + tbr = int_or_none(media_content.get('bitrate')) + mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url) + if mobj: + formats.append({ + 'url': mobj.group('url'), + 'play_path': 'mp4:%s' % mobj.group('playpath'), + 'app': mobj.group('app'), + 'ext': 'flv', + 'tbr': tbr, + 'format_id': 'rtmp-%d' % tbr, + }) + else: + formats.append({ + 'url': media_url, + 'tbr': tbr, + }) + self._sort_formats(formats) + + link = find_xpath_attr( + entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') + if link is not None: + formats.append({ + 'url': link.get('href'), + 'format_id': link.get('rel'), + 'quality': 1, + }) + + thumbnails = [{ + 'url': splash.get('url'), + 'width': int_or_none(splash.get('width')), + 'height': int_or_none(splash.get('height')), + } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'thumbnails': thumbnails, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + partner_id = mobj.group('partner_id') + video_id = mobj.group('id') + + return self._extract_video_info(partner_id, 
video_id) diff --git a/yt_dlp/extractor/xtube.py b/yt_dlp/extractor/xtube.py new file mode 100644 index 000000000..abd319188 --- /dev/null +++ b/yt_dlp/extractor/xtube.py @@ -0,0 +1,217 @@ +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + orderedSet, + parse_duration, + sanitized_Request, + str_to_int, + url_or_none, +) + + +class XTubeIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + xtube:| + https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-) + ) + (?P<id>[^/?&#]+) + ''' + + _TESTS = [{ + # old URL schema + 'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_', + 'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab', + 'info_dict': { + 'id': 'kVTUy_G222_', + 'ext': 'mp4', + 'title': 'strange erotica', + 'description': 'contains:an ET kind of thing', + 'uploader': 'greenshowers', + 'duration': 450, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + } + }, { + # new URL schema + 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', + 'only_matching': True, + }, { + 'url': 'xtube:625837', + 'only_matching': True, + }, { + 'url': 'xtube:kVTUy_G222_', + 'only_matching': True, + }, { + 'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + if not display_id: + display_id = video_id + + if video_id.isdigit() and len(video_id) < 11: + url_pattern = 'http://www.xtube.com/video-watch/-%s' + else: + url_pattern = 'http://www.xtube.com/watch.php?v=%s' + + webpage = self._download_webpage( + url_pattern % video_id, display_id, headers={ + 'Cookie': 'age_verified=1; cookiesAccepted=1', + }) + + title, thumbnail, duration, sources, media_definition = [None] * 5 + + config = self._parse_json(self._search_regex( + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', + default='{}'), video_id, transform_source=js_to_json, fatal=False) + if config: + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') + + if not isinstance(sources, dict) and not media_definition: + sources = self._parse_json(self._search_regex( + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) + + formats = [] + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) 
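+                # progressive entries expose their pixel height via the 'quality' field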
+ elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + + self._remove_duplicate_formats(formats) + self._sort_formats(formats) + + if not title: + title = self._search_regex( + (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, default=None) or self._search_regex( + r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) + uploader = self._search_regex( + (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', + r'<span[^>]+class="nickname"[^>]*>([^<]+)'), + webpage, 'uploader', fatal=False) + if not duration: + duration = parse_duration(self._search_regex( + r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + (r'["\']viewsCount["\'][^>]*>(\d+)\s+views', + r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'), + webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'>Comments? \(([\d,\.]+)\)<', + webpage, 'comment count', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'age_limit': 18, + 'formats': formats, + } + + +class XTubeUserIE(InfoExtractor): + IE_DESC = 'XTube user profile' + _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)' + _TEST = { + 'url': 'http://www.xtube.com/profile/greenshowers-4056496', + 'info_dict': { + 'id': 'greenshowers-4056496', + 'age_limit': 18, + }, + 'playlist_mincount': 154, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + entries = [] + for pagenum in itertools.count(1): + request = sanitized_Request( + 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum), + headers={ + 'Cookie': 'popunder=4', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }) + + page = self._download_json( + request, user_id, 'Downloading videos JSON page %d' % pagenum) + + html = page.get('html') + if not html: + break + + for video_id in orderedSet([video_id for _, video_id in re.findall( + r'data-plid=(["\'])(.+?)\1', html)]): + entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key())) + + page_count = int_or_none(page.get('pageCount')) + if not page_count or pagenum == page_count: + break + + playlist = self.playlist_result(entries, user_id) + playlist['age_limit'] = 18 + return playlist diff --git a/youtube_dl/extractor/xuite.py b/yt_dlp/extractor/xuite.py index 0276c0dbb..0276c0dbb 100644 --- a/youtube_dl/extractor/xuite.py +++ b/yt_dlp/extractor/xuite.py diff --git a/youtube_dl/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 8fc64914c..8fc64914c 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py diff --git a/yt_dlp/extractor/xxxymovies.py b/yt_dlp/extractor/xxxymovies.py new file mode 100644 index 000000000..0d536015c --- /dev/null +++ b/yt_dlp/extractor/xxxymovies.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, +) + + +class XXXYMoviesIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?xxxymovies\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)' + _TEST = { + 'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/', + 'md5': '810b1bdbbffff89dd13bdb369fe7be4b', + 'info_dict': { + 'id': '138669', + 'display_id': 'ecstatic-orgasm-sofcore', + 'ext': 'mp4', + 'title': 'Ecstatic Orgasm Sofcore', + 'duration': 931, + 'categories': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') + + title = self._html_search_regex( + [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<', + r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'], + webpage, 'title') + + thumbnail = self._search_regex( + r"preview_url\s*:\s*'([^']+)'", + webpage, 'thumbnail', fatal=False) + + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + duration = parse_duration(self._search_regex( + r'<span>Duration:</span>\s*(\d+:\d+)', + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._html_search_regex( + r'<div class="video_views">\s*(\d+)', + webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'>\s*Likes? <b>\((\d+)\)', + webpage, 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'>\s*Dislike <b>\((\d+)\)</b>', + webpage, 'dislike count', fatal=False)) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'categories': categories, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'age_limit': age_limit, + } diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py new file mode 100644 index 000000000..53556de00 --- /dev/null +++ b/yt_dlp/extractor/yahoo.py @@ -0,0 +1,572 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import itertools +import re + +from .common import InfoExtractor, SearchInfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + mimetype2ext, + parse_iso8601, + smuggle_url, + try_get, + url_or_none, +) + +from .brightcove import BrightcoveNewIE +from .youtube import YoutubeIE + + +class YahooIE(InfoExtractor): + IE_DESC = 'Yahoo screen and movies' + _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' + _TESTS = [{ + 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + 'info_dict': { + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', + 'ext': 'mp4', + 'title': 'Julian Smith & Travis Legg Watch Julian Smith', + 'description': 'Julian and Travis watch Julian Smith', + 'duration': 6863, + 'timestamp': 1369812016, + 'upload_date': '20130529', + }, + 'skip': 'No longer exists', + }, { + 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', + 'md5': '7993e572fac98e044588d0b5260f4352', + 'info_dict': { + 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', + 'ext': 'mp4', + 'title': "Yahoo Saves 'Community'", + 
'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', + 'duration': 170, + 'timestamp': 1406838636, + 'upload_date': '20140731', + }, + 'skip': 'Unfortunately, this video is not available in your region', + }, { + 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', + 'info_dict': { + 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', + 'ext': 'webm', + 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', + 'description': 'md5:f66c890e1490f4910a9953c941dee944', + 'duration': 97, + 'timestamp': 1414489862, + 'upload_date': '20141028', + }, + 'skip': 'No longer exists', + }, { + 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + 'md5': '88e209b417f173d86186bef6e4d1f160', + 'info_dict': { + 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', + 'ext': 'mp4', + 'title': 'China Moses Is Crazy About the Blues', + 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', + 'duration': 128, + 'timestamp': 1385722202, + 'upload_date': '20131129', + } + }, { + 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', + 'info_dict': { + 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', + 'ext': 'mp4', + 'title': '\'True Story\' Trailer', + 'description': 'True Story', + 'duration': 150, + 'timestamp': 1418919206, + 'upload_date': '20141218', + }, + }, { + 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', + 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + 'timestamp': 1426270238, + }, + }, { + 'url': 'https://tw.news.yahoo.com/-100120367.html', + 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + 'timestamp': 1440436550, + 'upload_date': '20150824', + 'series': 'Communitary', + 'season_number': 6, + 'episode_number': 1, + }, + 'skip': 'No longer exists', + }, { + # ytwnews://cavideo/ + 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', + 'info_dict': { + 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', + 'ext': 'mp4', + 'title': '單車天使 - 中文版預', + 'description': '中文版預', + 'timestamp': 1476696196, + 'upload_date': '20161017', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Contains both a Yahoo hosted video and multiple Youtube embeds + 'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html', + 'info_dict': { + 'id': '46c5d95a-528f-3d03-b732-732fcadd51de', + 'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead', + 'description': 'Gwen decided not to record 
this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.', + }, + 'playlist': [{ + 'info_dict': { + 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6', + 'ext': 'mp4', + 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs', + 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor Swift which Sia hit was almost hers.', + 'timestamp': 1572406500, + 'upload_date': '20191030', + }, + }, { + 'info_dict': { + 'id': '352CFDOQrKg', + 'ext': 'mp4', + 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019', + 'description': 'md5:7fe8e3d5806f96002e55f190d1d94479', + 'uploader': 'The Voice', + 'uploader_id': 'NBCTheVoice', + 'upload_date': '20191029', + }, + }], + 'params': { + 'playlistend': 2, + }, + 'expected_warnings': ['HTTP Error 404', 'Ignoring subtitle tracks'], + }, { + 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', + 'only_matching': True, + }, { + 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', + 'only_matching': True, + }, { + 'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html', + 'only_matching': True, + }] + + def _extract_yahoo_video(self, video_id, country): + video = self._download_json( + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), + video_id, 'Downloading video JSON metadata')[0] + title = video['title'] + + if country == 'malaysia': + country = 'my' + + is_live = video.get('live_state') == 'live' + fmts = ('m3u8',) if is_live else ('webm', 'mp4') + + urls = [] + formats = [] + subtitles = {} + for fmt in fmts: + media_obj = self._download_json( + 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + video_id, 'Downloading %s JSON metadata' % fmt, + headers=self.geo_verification_headers(), query={ + 'format': fmt, + 'region': country.upper(), + })['query']['results']['mediaObj'][0] + msg = media_obj.get('status', {}).get('msg') + + for s in media_obj.get('streams', []): + host = s.get('host') + path = s.get('path') + if not host or not path: + continue + s_url = host + path + if s.get('format') == 'm3u8': + formats.extend(self._extract_m3u8_formats( + s_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + continue + tbr = int_or_none(s.get('bitrate')) + formats.append({ + 'url': s_url, + 'format_id': fmt + ('-%d' % tbr if tbr else ''), + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'tbr': tbr, + 'fps': int_or_none(s.get('framerate')), + }) + + for cc in media_obj.get('closedcaptions', []): + cc_url = cc.get('url') + if not cc_url or cc_url in urls: + continue + urls.append(cc_url) + subtitles.setdefault(cc.get('lang') or 'en-US', []).append({ + 'url': cc_url, + 'ext': mimetype2ext(cc.get('content_type')), + }) + + streaming_url = video.get('streaming_url') + if streaming_url and not is_live: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + if not formats and msg == 'geo restricted': + self.raise_geo_restricted(metadata_available=True) + + self._sort_formats(formats) + + thumbnails = [] + for thumb in video.get('thumbnails', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'id': thumb.get('tag'), + 'url': thumb.get('url'), + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + 
series_info = video.get('series_info') or {} + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(video.get('description')), + 'timestamp': parse_iso8601(video.get('publish_time')), + 'subtitles': subtitles, + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('view_count')), + 'is_live': is_live, + 'series': video.get('show_name'), + 'season_number': int_or_none(series_info.get('season_number')), + 'episode_number': int_or_none(series_info.get('episode_number')), + } + + def _real_extract(self, url): + url, country, display_id = self._match_valid_url(url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + items = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0] + + item = items['data']['partnerData'] + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if iframe_url: + entries.append(self.url_result(iframe_url)) + + if item.get('type') == 'storywithleadvideo': + iframe_url = try_get(item, lambda x: x['meta']['player']['url']) + if iframe_url: + entries.append(self.url_result(iframe_url)) + else: + self.report_warning("Yahoo didn't provide an iframe url for this storywithleadvideo") + + if items.get('markup'): + entries.extend( + self.url_result(yt_url) for yt_url in YoutubeIE._extract_urls(items['markup'])) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return info + + +class YahooSearchIE(SearchInfoExtractor): + IE_DESC = 'Yahoo screen search' + _MAX_RESULTS = 1000 + IE_NAME = 'screen.yahoo:search' + _SEARCH_KEY = 'yvsearch' + + def _search_results(self, query): + for pagenum in itertools.count(0): + result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) + info = self._download_json(result_url, query, + note='Downloading results page ' + str(pagenum + 1)) + yield from (self.url_result(result['rurl']) for result in info['results']) + if info['m']['last'] >= info['m']['total'] - 1: + break + + +class YahooGyaOPlayerIE(InfoExtractor): + IE_NAME = 'yahoo:gyao:player' + _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', + 'info_dict': { + 'id': '5993125228001', + 'ext': 'mp4', + 'title': 'フューリー 【字幕版】', + 'description': 'md5:21e691c798a15330eda4db17a8fe45a5', + 'uploader_id': '4235717419001', + 'upload_date': '20190124', + 'timestamp': 1548294365, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', + 'only_matching': True, + }, { + 'url': 
'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', + 'only_matching': True, + }, { + 'url': 'https://gyao.yahoo.co.jp/episode/5fa1226c-ef8d-4e93-af7a-fd92f4e30597', + 'only_matching': True, + }] + _GEO_BYPASS = False + + def _real_extract(self, url): + video_id = self._match_id(url).replace('/', ':') + headers = self.geo_verification_headers() + headers['Accept'] = 'application/json' + resp = self._download_json( + 'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id, query={ + 'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-', + 'query': '''{ + content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) { + video { + delivery { + id + } + title + } + } +}''' % video_id, + }, headers=headers) + content = resp['data']['content'] + if not content: + msg = resp['errors'][0]['message'] + if msg == 'not in japan': + self.raise_geo_restricted(countries=['JP']) + raise ExtractorError(msg) + video = content['video'] + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': video['title'], + 'url': smuggle_url( + 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'], + {'geo_countries': ['JP']}), + 'ie_key': BrightcoveNewIE.ie_key(), + } + + +class YahooGyaOIE(InfoExtractor): + IE_NAME = 'yahoo:gyao' + _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', + 'info_dict': { + 'id': '00449:v03102', + }, + 'playlist_count': 2, + }, { + 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', + 'only_matching': True, + }, { + 'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf', + 'only_matching': True, + }, { + 'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf', + 'only_matching': True, + }] + + def _real_extract(self, url): + program_id = self._match_id(url).replace('/', ':') + videos = self._download_json( + 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos'] + entries = [] + for video in videos: + video_id = video.get('id') + if not video_id: + continue + entries.append(self.url_result( + 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), + YahooGyaOPlayerIE.ie_key(), video_id)) + return self.playlist_result(entries, program_id) + + +class YahooJapanNewsIE(InfoExtractor): + IE_NAME = 'yahoo:japannews' + IE_DESC = 'Yahoo! Japan News' + _VALID_URL = r'https?://(?P<host>(?:news|headlines)\.yahoo\.co\.jp)[^\d]*(?P<id>\d[\d-]*\d)?' 
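+    # Note: the numeric id group is optional. Bare host/listing URLs
+    # (e.g. https://news.yahoo.co.jp or /videonews/) match without an id;
+    # _real_extract then uses the host as the display id and returns a
+    # playlist of Brightcove entries instead of a single video.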
+ _GEO_COUNTRIES = ['JP'] + _TESTS = [{ + 'url': 'https://headlines.yahoo.co.jp/videonews/ann?a=20190716-00000071-ann-int', + 'info_dict': { + 'id': '1736242', + 'ext': 'mp4', + 'title': 'ムン大統領が対日批判を強化“現金化”効果は?(テレビ朝日系(ANN)) - Yahoo!ニュース', + 'description': '韓国の元徴用工らを巡る裁判の原告が弁護士が差し押さえた三菱重工業の資産を売却して - Yahoo!ニュース(テレビ朝日系(ANN))', + 'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$', + }, + 'params': { + 'skip_download': True, + }, + }, { + # geo restricted + 'url': 'https://headlines.yahoo.co.jp/hl?a=20190721-00000001-oxv-l04', + 'only_matching': True, + }, { + 'url': 'https://headlines.yahoo.co.jp/videonews/', + 'only_matching': True, + }, { + 'url': 'https://news.yahoo.co.jp', + 'only_matching': True, + }, { + 'url': 'https://news.yahoo.co.jp/byline/hashimotojunji/20190628-00131977/', + 'only_matching': True, + }, { + 'url': 'https://news.yahoo.co.jp/feature/1356', + 'only_matching': True + }] + + def _extract_formats(self, json_data, content_id): + formats = [] + + video_data = try_get( + json_data, + lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], + list) + for vid in video_data or []: + delivery = vid.get('delivery') + url = url_or_none(vid.get('Url')) + if not delivery or not url: + continue + elif delivery == 'hls': + formats.extend( + self._extract_m3u8_formats( + url, content_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': url, + 'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')), + 'height': int_or_none(vid.get('height')), + 'width': int_or_none(vid.get('width')), + 'tbr': int_or_none(vid.get('bitrate')), + }) + self._remove_duplicate_formats(formats) + self._sort_formats(formats) + + return formats + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + display_id = mobj.group('id') or host + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None + ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title') + + if display_id == host: + # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) 
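+            # Each plist id found in the markup is expanded below into a
+            # geo-smuggled BrightcoveNew player URL.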
+ stream_plists = re.findall(r'plist=(\d+)', webpage) or re.findall(r'plist["\']:\s*["\']([^"\']+)', webpage) + entries = [ + self.url_result( + smuggle_url( + 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=%s' % plist_id, + {'geo_countries': ['JP']}), + ie='BrightcoveNew', video_id=plist_id) + for plist_id in stream_plists] + return self.playlist_result(entries, playlist_title=title) + + # Article page + description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, 'description', default=None) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail', default=None) + space_id = self._search_regex([ + r'<script[^>]+class=["\']yvpub-player["\'][^>]+spaceid=([^&"\']+)', + r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)', + r'<!--\s+SpaceID=(\d+)' + ], webpage, 'spaceid') + + content_id = self._search_regex( + r'<script[^>]+class=["\']yvpub-player["\'][^>]+contentid=(?P<contentid>[^&"\']+)', + webpage, 'contentid', group='contentid') + + json_data = self._download_json( + 'https://feapi-yvpub.yahooapis.jp/v1/content/%s' % content_id, + content_id, + query={ + 'appid': 'dj0zaiZpPVZMTVFJR0FwZWpiMyZzPWNvbnN1bWVyc2VjcmV0Jng9YjU-', + 'output': 'json', + 'space_id': space_id, + 'domain': host, + 'ak': hashlib.md5('_'.join((space_id, host)).encode()).hexdigest(), + 'device_type': '1100', + }) + formats = self._extract_formats(json_data, content_id) + + return { + 'id': content_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/yt_dlp/extractor/yandexdisk.py b/yt_dlp/extractor/yandexdisk.py new file mode 100644 index 000000000..c15f3a4f3 --- /dev/null +++ b/yt_dlp/extractor/yandexdisk.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + try_get, + urljoin, +) + + +class YandexDiskIE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?P<domain> + yadi\.sk| + disk\.yandex\. 
+ (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P<id>[^/?#&]+)''' + + _TESTS = [{ + 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', + 'md5': 'a4a8d52958c8fddcf9845935070402ae', + 'info_dict': { + 'id': 'VdOeDou8eZs6Y', + 'ext': 'mp4', + 'title': '4.mp4', + 'duration': 168.6, + 'uploader': 'y.botova', + 'uploader_id': '300043621', + 'view_count': int, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', + 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, video_id) + store = self._parse_json(self._search_regex( + r'<script[^>]+id="store-prefetch"[^>]*>\s*({.+?})\s*</script>', + webpage, 'store'), video_id) + resource = store['resources'][store['rootResourceId']] + + title = resource['name'] + meta = resource.get('meta') or {} + + public_url = meta.get('short_url') + if public_url: + video_id = self._match_id(public_url) + + source_url = (self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources/download', + video_id, query={'public_key': url}, fatal=False) or {}).get('href') + video_streams = resource.get('videoStreams') or {} + video_hash = resource.get('hash') or url + environment = store.get('environment') or {} + sk = environment.get('sk') + yandexuid = environment.get('yandexuid') + if sk and yandexuid and not (source_url and video_streams): + self._set_cookie(domain, 'yandexuid', yandexuid) + + def call_api(action): + return (self._download_json( + urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ + 'hash': video_hash, + 'sk': sk, + }).encode(), headers={ + 'Content-Type': 'text/plain', + }, fatal=False) or {}).get('data') or {} + if not source_url: + # TODO: figure out how to detect if download limit has + # been reached and then avoid unnecessary source format + # extraction requests + source_url = call_api('download-url').get('url') + if not video_streams: + video_streams = call_api('get-video-streams') + + formats = [] + if source_url: + formats.append({ + 'url': source_url, + 'format_id': 'source', + 'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'), + 'quality': 1, + 'filesize': int_or_none(meta.get('size')) + }) + + for video in (video_streams.get('videos') or []): + format_url = video.get('url') + if not format_url: + continue + if video.get('dimension') == 'adaptive': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + size = video.get('size') or {} + height = int_or_none(size.get('height')) + format_id = 'hls' + if height: + format_id += '-%dp' % height + formats.append({ + 'ext': 'mp4', + 'format_id': format_id, + 'height': height, + 'protocol': 'm3u8_native', + 'url': format_url, + 'width': int_or_none(size.get('width')), + }) + self._sort_formats(formats) + + uid = resource.get('uid') + display_name = try_get(store, lambda x: x['users'][uid]['displayName']) + + return { + 'id': video_id, + 'title': title, + 'duration': float_or_none(video_streams.get('duration'), 1000), + 'uploader': display_name, + 'uploader_id': uid, + 'view_count': int_or_none(meta.get('views_counter')), + 'formats': formats, + } diff --git 
a/yt_dlp/extractor/yandexmusic.py b/yt_dlp/extractor/yandexmusic.py new file mode 100644 index 000000000..a3558cc12 --- /dev/null +++ b/yt_dlp/extractor/yandexmusic.py @@ -0,0 +1,458 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + try_get, +) + + +class YandexMusicBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by|com)' + + @staticmethod + def _handle_error(response): + if isinstance(response, dict): + error = response.get('error') + if error: + raise ExtractorError(error, expected=True) + if response.get('type') == 'captcha' or 'captcha' in response: + YandexMusicBaseIE._raise_captcha() + + @staticmethod + def _raise_captcha(): + raise ExtractorError( + 'YandexMusic has considered yt-dlp requests automated and ' + 'asks you to solve a CAPTCHA. You can either wait for some ' + 'time until unblocked and optionally use --sleep-interval ' + 'in future or alternatively you can go to https://music.yandex.ru/ ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'yt-dlp with --cookies', + expected=True) + + def _download_webpage_handle(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs) + if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: + self._raise_captcha() + return webpage + + def _download_json(self, *args, **kwargs): + response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) + self._handle_error(response) + return response + + def _call_api(self, ep, tld, url, item_id, note, query): + return self._download_json( + 'https://music.yandex.%s/handlers/%s.jsx' % (tld, ep), + item_id, note, + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query=query) + + +class YandexMusicTrackIE(YandexMusicBaseIE): + IE_NAME = 'yandexmusic:track' + IE_DESC = 'Яндекс.Музыка - Трек' + _VALID_URL = r'%s/album/(?P<album_id>\d+)/track/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE + + _TESTS = [{ + 'url': 'http://music.yandex.ru/album/540508/track/4878838', + 'md5': 'dec8b661f12027ceaba33318787fff76', + 'info_dict': { + 'id': '4878838', + 'ext': 'mp3', + 'title': 'md5:c63e19341fdbe84e43425a30bc777856', + 'filesize': int, + 'duration': 193.04, + 'track': 'md5:210508c6ffdfd67a493a6c378f22c3ff', + 'album': 'md5:cd04fb13c4efeafdfa0a6a6aca36d01a', + 'album_artist': 'md5:5f54c35462c07952df33d97cfb5fc200', + 'artist': 'md5:e6fd86621825f14dc0b25db3acd68160', + 'release_year': 2009, + }, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }, { + # multiple disks + 'url': 'http://music.yandex.ru/album/3840501/track/705105', + 'md5': '82a54e9e787301dd45aba093cf6e58c0', + 'info_dict': { + 'id': '705105', + 'ext': 'mp3', + 'title': 'md5:f86d4a9188279860a83000277024c1a6', + 'filesize': int, + 'duration': 239.27, + 'track': 'md5:40f887f0666ba1aa10b835aca44807d1', + 'album': 'md5:624f5224b14f5c88a8e812fd7fbf1873', + 'album_artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', + 'artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12', + 'release_year': 2016, + 'genre': 'pop', + 'disc_number': 2, + 'track_number': 9, + }, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }, { + 'url': 'http://music.yandex.com/album/540508/track/4878838', + 'only_matching': True, + }] + + def _real_extract(self, 
url): + mobj = self._match_valid_url(url) + tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id') + + track = self._call_api( + 'track', tld, url, track_id, 'Downloading track JSON', + {'track': '%s:%s' % (track_id, album_id)})['track'] + track_title = track['title'] + + download_data = self._download_json( + 'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id), + track_id, 'Downloading track location url JSON', + headers={'X-Retpath-Y': url}) + + fd_data = self._download_json( + download_data['src'], track_id, + 'Downloading track location JSON', + query={'format': 'json'}) + key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest() + f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id']) + + thumbnail = None + cover_uri = track.get('albums', [{}])[0].get('coverUri') + if cover_uri: + thumbnail = cover_uri.replace('%%', 'orig') + if not thumbnail.startswith('http'): + thumbnail = 'http://' + thumbnail + + track_info = { + 'id': track_id, + 'ext': 'mp3', + 'url': f_url, + 'filesize': int_or_none(track.get('fileSize')), + 'duration': float_or_none(track.get('durationMs'), 1000), + 'thumbnail': thumbnail, + 'track': track_title, + 'acodec': download_data.get('codec'), + 'abr': int_or_none(download_data.get('bitrate')), + } + + def extract_artist_name(artist): + decomposed = artist.get('decomposed') + if not isinstance(decomposed, list): + return artist['name'] + parts = [artist['name']] + for element in decomposed: + if isinstance(element, dict) and element.get('name'): + parts.append(element['name']) + elif isinstance(element, compat_str): + parts.append(element) + return ''.join(parts) + + def extract_artist(artist_list): + if artist_list and isinstance(artist_list, list): + artists_names = [extract_artist_name(a) for a in artist_list if a.get('name')] + if artists_names: + return ', '.join(artists_names) + + albums = track.get('albums') + if albums and isinstance(albums, list): + album = albums[0] + if isinstance(album, dict): + year = album.get('year') + disc_number = int_or_none(try_get( + album, lambda x: x['trackPosition']['volume'])) + track_number = int_or_none(try_get( + album, lambda x: x['trackPosition']['index'])) + track_info.update({ + 'album': album.get('title'), + 'album_artist': extract_artist(album.get('artists')), + 'release_year': int_or_none(year), + 'genre': album.get('genre'), + 'disc_number': disc_number, + 'track_number': track_number, + }) + + track_artist = extract_artist(track.get('artists')) + if track_artist: + track_info.update({ + 'artist': track_artist, + 'title': '%s - %s' % (track_artist, track_title), + }) + else: + track_info['title'] = track_title + + return track_info + + +class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): + def _extract_tracks(self, source, item_id, url, tld): + tracks = source['tracks'] + track_ids = [compat_str(track_id) for track_id in source['trackIds']] + + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, + # missing tracks should be retrieved manually. 
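+        # (e.g. a 437-track playlist, see the YandexMusicPlaylistIE test
+        # below, ships only the first 150 track objects; the remaining ids
+        # are resolved via the track-entries handler in chunks of
+        # _TRACKS_PER_CHUNK)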
+ if len(tracks) < len(track_ids): + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] + # Request missing tracks in chunks to avoid exceeding max HTTP header size, + # see https://github.com/ytdl-org/youtube-dl/issues/27355 + _TRACKS_PER_CHUNK = 250 + for chunk_num in itertools.count(0): + start = chunk_num * _TRACKS_PER_CHUNK + end = start + _TRACKS_PER_CHUNK + missing_track_ids_req = missing_track_ids[start:end] + assert missing_track_ids_req + missing_tracks = self._call_api( + 'track-entries', tld, url, item_id, + 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), { + 'entries': ','.join(missing_track_ids_req), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) + if missing_tracks: + tracks.extend(missing_tracks) + if end >= len(missing_track_ids): + break + + return tracks + + def _build_playlist(self, tracks): + entries = [] + for track in tracks: + track_id = track.get('id') or track.get('realId') + if not track_id: + continue + albums = track.get('albums') + if not albums or not isinstance(albums, list): + continue + album = albums[0] + if not isinstance(album, dict): + continue + album_id = album.get('id') + if not album_id: + continue + entries.append(self.url_result( + 'http://music.yandex.ru/album/%s/track/%s' % (album_id, track_id), + ie=YandexMusicTrackIE.ie_key(), video_id=track_id)) + return entries + + +class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): + IE_NAME = 'yandexmusic:album' + IE_DESC = 'Яндекс.Музыка - Альбом' + _VALID_URL = r'%s/album/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE + + _TESTS = [{ + 'url': 'http://music.yandex.ru/album/540508', + 'info_dict': { + 'id': '540508', + 'title': 'md5:7ed1c3567f28d14be9f61179116f5571', + }, + 'playlist_count': 50, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }, { + 'url': 'https://music.yandex.ru/album/3840501', + 'info_dict': { + 'id': '3840501', + 'title': 'md5:36733472cdaa7dcb1fd9473f7da8e50f', + }, + 'playlist_count': 33, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }, { + # empty artists + 'url': 'https://music.yandex.ru/album/9091882', + 'info_dict': { + 'id': '9091882', + 'title': 'ТЕД на русском', + }, + 'playlist_count': 187, + }] + + @classmethod + def suitable(cls, url): + return False if YandexMusicTrackIE.suitable(url) else super(YandexMusicAlbumIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + tld = mobj.group('tld') + album_id = mobj.group('id') + + album = self._call_api( + 'album', tld, url, album_id, 'Downloading album JSON', + {'album': album_id}) + + entries = self._build_playlist([track for volume in album['volumes'] for track in volume]) + + title = album['title'] + artist = try_get(album, lambda x: x['artists'][0]['name'], compat_str) + if artist: + title = '%s - %s' % (artist, title) + year = album.get('year') + if year: + title += ' (%s)' % year + + return self.playlist_result(entries, compat_str(album['id']), title) + + +class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): + IE_NAME = 'yandexmusic:playlist' + IE_DESC = 'Яндекс.Музыка - Плейлист' + _VALID_URL = r'%s/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE + + _TESTS = [{ + 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', + 'info_dict': { + 'id': '1245', + 'title': 
'md5:841559b3fe2b998eca88d0d2e22a3097', + 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', + }, + 'playlist_count': 5, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }, { + 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', + 'only_matching': True, + }, { + # playlist exceeding the limit of 150 tracks (see + # https://github.com/ytdl-org/youtube-dl/issues/6666) + 'url': 'https://music.yandex.ru/users/mesiaz/playlists/1364', + 'info_dict': { + 'id': '1364', + 'title': 'md5:b3b400f997d3f878a13ae0699653f7db', + }, + 'playlist_mincount': 437, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + tld = mobj.group('tld') + user = mobj.group('user') + playlist_id = mobj.group('id') + + playlist = self._call_api( + 'playlist', tld, url, playlist_id, 'Downloading playlist JSON', { + 'owner': user, + 'kinds': playlist_id, + 'light': 'true', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + })['playlist'] + + tracks = self._extract_tracks(playlist, playlist_id, url, tld) + + return self.playlist_result( + self._build_playlist(tracks), + compat_str(playlist_id), + playlist.get('title'), playlist.get('description')) + + +class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE): + def _call_artist(self, tld, url, artist_id): + return self._call_api( + 'artist', tld, url, artist_id, + 'Downloading artist %s JSON' % self._ARTIST_WHAT, { + 'artist': artist_id, + 'what': self._ARTIST_WHAT, + 'sort': self._ARTIST_SORT or '', + 'dir': '', + 'period': '', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + }) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + tracks = self._extract_tracks(data, artist_id, url, tld) + title = try_get(data, lambda x: x['artist']['name'], compat_str) + return self.playlist_result( + self._build_playlist(tracks), artist_id, title) + + +class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE): + IE_NAME = 'yandexmusic:artist:tracks' + IE_DESC = 'Яндекс.Музыка - Артист - Треки' + _VALID_URL = r'%s/artist/(?P<id>\d+)/tracks' % YandexMusicBaseIE._VALID_URL_BASE + + _TESTS = [{ + 'url': 'https://music.yandex.ru/artist/617526/tracks', + 'info_dict': { + 'id': '617526', + 'title': 'md5:131aef29d45fd5a965ca613e708c040b', + }, + 'playlist_count': 507, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + _ARTIST_SORT = '' + _ARTIST_WHAT = 'tracks' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + tracks = self._extract_tracks(data, artist_id, url, tld) + artist = try_get(data, lambda x: x['artist']['name'], compat_str) + title = '%s - %s' % (artist or artist_id, 'Треки') + return self.playlist_result( + self._build_playlist(tracks), artist_id, title) + + +class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE): + IE_NAME = 'yandexmusic:artist:albums' + IE_DESC = 'Яндекс.Музыка - Артист - Альбомы' + _VALID_URL = r'%s/artist/(?P<id>\d+)/albums' % YandexMusicBaseIE._VALID_URL_BASE + + _TESTS = [{ + 'url': 'https://music.yandex.ru/artist/617526/albums', + 'info_dict': { + 'id': '617526', + 'title': 'md5:55dc58d5c85699b7fb41ee926700236c', + }, + 'playlist_count': 8, + # 'skip': 'Travis CI servers blocked by YandexMusic', + }] + + _ARTIST_SORT = 'year' + 
_ARTIST_WHAT = 'albums' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + tld = mobj.group('tld') + artist_id = mobj.group('id') + data = self._call_artist(tld, url, artist_id) + entries = [] + for album in data['albums']: + if not isinstance(album, dict): + continue + album_id = album.get('id') + if not album_id: + continue + entries.append(self.url_result( + 'http://music.yandex.ru/album/%s' % album_id, + ie=YandexMusicAlbumIE.ie_key(), video_id=album_id)) + artist = try_get(data, lambda x: x['artist']['name'], compat_str) + title = '%s - %s' % (artist or artist_id, 'Альбомы') + return self.playlist_result(entries, artist_id, title) diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py new file mode 100644 index 000000000..9974d65d6 --- /dev/null +++ b/yt_dlp/extractor/yandexvideo.py @@ -0,0 +1,232 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + try_get, + url_or_none, +) + + +class YandexVideoIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=| + frontend\.vh\.yandex\.ru/player/ + ) + (?P<id>(?:[\da-f]{32}|[\w-]{12})) + ''' + _TESTS = [{ + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', + 'info_dict': { + 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'ext': 'mp4', + 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', + 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', + 'thumbnail': r're:^https?://', + 'timestamp': 1549972939, + 'duration': 5575, + 'age_limit': 18, + 'upload_date': '20190212', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + }, { + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', + 'only_matching': True, + }, { + 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', + 'only_matching': True, + }, { + 'url': 'https://frontend.vh.yandex.ru/player/4dbb262b4fe5cf15a215de4f34eee34d?from=morda', + 'only_matching': True, + }, { + # vod-episode, series episode + 'url': 'https://yandex.ru/portal/video?stream_id=45b11db6e4b68797919c93751a938cee', + 'only_matching': True, + }, { + # episode, sports + 'url': 'https://yandex.ru/?stream_channel=1538487871&stream_id=4132a07f71fb0396be93d74b3477131d', + 'only_matching': True, + }, { + # DASH with DRM + 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', + 'only_matching': True, + }, { + 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player = try_get((self._download_json( + 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ + player(content_id: "%s") { + computed_title + content_url + description + dislikes + duration + likes + program_title + release_date + release_date_ut + release_year + restriction_age + season + start_time + streams + thumbnail + title + views_count + } +}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) + if not player or player.get('error'): + player = self._download_json( + 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + }) + content = 
player['content'] + + title = content.get('title') or content['computed_title'] + + formats = [] + streams = content.get('streams') or [] + streams.append({'url': content.get('content_url')}) + for stream in streams: + content_url = url_or_none(stream.get('url')) + if not content_url: + continue + ext = determine_ext(content_url) + if ext == 'ismc': + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + content_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({'url': content_url}) + + self._sort_formats(formats) + + timestamp = (int_or_none(content.get('release_date')) + or int_or_none(content.get('release_date_ut')) + or int_or_none(content.get('start_time'))) + season = content.get('season') or {} + + return { + 'id': video_id, + 'title': title, + 'description': content.get('description'), + 'thumbnail': content.get('thumbnail'), + 'timestamp': timestamp, + 'duration': int_or_none(content.get('duration')), + 'series': content.get('program_title'), + 'age_limit': int_or_none(content.get('restriction_age')), + 'view_count': int_or_none(content.get('views_count')), + 'like_count': int_or_none(content.get('likes')), + 'dislike_count': int_or_none(content.get('dislikes')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': season.get('id'), + 'release_year': int_or_none(content.get('release_year')), + 'formats': formats, + } + + +class ZenYandexIE(InfoExtractor): + _VALID_URL = r'https?://zen\.yandex\.ru/media/(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-(?P<id>[a-z0-9-]+)' + _TESTS = [{ + 'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3', + 'info_dict': { + 'id': '6002240ff8b1af50bb2da5e3', + 'ext': 'mp4', + 'title': 'Извержение вулкана из спичек: зрелищный опыт', + 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', + 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/3558619/pub_6002240ff8b1af50bb2da5e3_600bad814d953e4132a30b5e/orig', + 'uploader': 'Популярная механика', + }, + }, { + 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', + 'info_dict': { + 'id': '60c7c443da18892ebfe85ed7', + 'ext': 'mp4', + 'title': 'ВОТ ЭТО Focus. 
Деды Морозы на гидроциклах', + 'description': 'md5:8684912f6086f298f8078d4af0e8a600', + 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/4410519/pub_60c7c443da18892ebfe85ed7_60c7c48e060a163121f42cc3/orig', + 'uploader': 'AcademeG DailyStream' + }, + }, { + 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360', + 'only_matching': True, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex(r'w\._data\s?=\s?({.+?});', webpage, 'metadata'), id) + stream_json = try_get(data_json, lambda x: x['publication']['content']['gifContent'], dict) + stream_url = stream_json.get('stream') or try_get(stream_json, lambda x: x['streams']['url']) + formats = self._extract_m3u8_formats(stream_url, id) + self._sort_formats(formats) + return { + 'id': id, + 'title': try_get(data_json, (lambda x: x['og']['title'], lambda x: x['publication']['content']['preview']['title'])), + 'uploader': data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), + 'description': try_get(data_json, lambda x: x['og']['description']), + 'thumbnail': try_get(data_json, lambda x: x['og']['imageUrl']), + 'formats': formats, + } + + +class ZenYandexChannelIE(InfoExtractor): + _VALID_URL = r'https?://zen\.yandex\.ru/(?!media)(?:id/)?(?P<id>[a-z0-9-_]+)' + _TESTS = [{ + 'url': 'https://zen.yandex.ru/tok_media', + 'info_dict': { + 'id': 'tok_media', + }, + 'playlist_mincount': 169, + }, { + 'url': 'https://zen.yandex.ru/id/606fd806cc13cb3c58c05cf5', + 'info_dict': { + 'id': '606fd806cc13cb3c58c05cf5', + }, + 'playlist_mincount': 657, + }] + + def _entries(self, id, url): + webpage = self._download_webpage(url, id) + data_json = self._parse_json(re.findall(r'var\s?data\s?=\s?({.+?})\s?;', webpage)[-1], id) + for key in data_json.keys(): + if key.startswith('__serverState__'): + data_json = data_json[key] + items = list(try_get(data_json, lambda x: x['feed']['items'], dict).values()) + more = try_get(data_json, lambda x: x['links']['more']) or None + for page in itertools.count(1): + for item in items: + video_id = item.get('publication_id') or item.get('publicationId') + video_url = item.get('link') + yield self.url_result(video_url, ie=ZenYandexIE.ie_key(), video_id=video_id.split(':')[-1]) + if not more: + break + data_json = self._download_json(more, id, note='Downloading Page %d' % page) + items = data_json.get('items', []) + more = try_get(data_json, lambda x: x['more']['link']) or None + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id, url), playlist_id=id) diff --git a/youtube_dl/extractor/yapfiles.py b/yt_dlp/extractor/yapfiles.py index cfb368de9..cfb368de9 100644 --- a/youtube_dl/extractor/yapfiles.py +++ b/yt_dlp/extractor/yapfiles.py diff --git a/youtube_dl/extractor/yesjapan.py b/yt_dlp/extractor/yesjapan.py index 681338c96..681338c96 100644 --- a/youtube_dl/extractor/yesjapan.py +++ b/yt_dlp/extractor/yesjapan.py diff --git a/youtube_dl/extractor/yinyuetai.py b/yt_dlp/extractor/yinyuetai.py index 1fd8d35c6..1fd8d35c6 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/yt_dlp/extractor/yinyuetai.py diff --git a/youtube_dl/extractor/ynet.py b/yt_dlp/extractor/ynet.py index c4ae4d88e..c4ae4d88e 100644 --- a/youtube_dl/extractor/ynet.py +++ b/yt_dlp/extractor/ynet.py diff --git a/yt_dlp/extractor/youjizz.py 
b/yt_dlp/extractor/youjizz.py new file mode 100644 index 000000000..5f5fbf21c --- /dev/null +++ b/yt_dlp/extractor/youjizz.py @@ -0,0 +1,94 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + url_or_none, +) + + +class YouJizzIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))' + _TESTS = [{ + 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', + 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4', + 'info_dict': { + 'id': '2189178', + 'ext': 'mp4', + 'title': 'Zeichentrick 1', + 'age_limit': 18, + 'duration': 2874, + } + }, { + 'url': 'http://www.youjizz.com/videos/-2189178.html', + 'only_matching': True, + }, { + 'url': 'https://www.youjizz.com/videos/embed/31991001', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') or mobj.group('embed_id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'title') + + formats = [] + + encodings = self._parse_json( + self._search_regex( + r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + default='[]'), + video_id, fatal=False) + for encoding in encodings: + if not isinstance(encoding, dict): + continue + format_url = url_or_none(encoding.get('filename')) + if not format_url: + continue + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + format_id = encoding.get('name') or encoding.get('quality') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': height, + }) + + if formats: + info_dict = { + 'formats': formats, + } + else: + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') + info_dict = self._parse_html5_media_entries( + url, webpage, video_id)[0] + + duration = parse_duration(self._search_regex( + r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration', + default=None)) + uploader = self._search_regex( + r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader', + default=None) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'age_limit': self._rta_search(webpage), + 'duration': duration, + 'uploader': uploader, + }) + + return info_dict diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py new file mode 100644 index 000000000..b50579915 --- /dev/null +++ b/yt_dlp/extractor/youku.py @@ -0,0 +1,309 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re +import string +import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + js_to_json, + str_or_none, + strip_jsonp, +) + + +class YoukuIE(InfoExtractor): + IE_NAME = 'youku' + IE_DESC = '优酷' + _VALID_URL = r'''(?x) + (?: + https?://( + (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + video\.tudou\.com/v/)| + youku:) + (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) + ''' + + _TESTS = [{ + # MD5 is unstable + 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', + 'info_dict': { + 'id': 'XMTc1ODE5Njcy', + 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', + 'ext': 'mp4', + 'duration': 74.73, + 'thumbnail': r're:^https?://.*', + 
'uploader': '。躲猫猫、', + 'uploader_id': '36017967', + 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', + 'tags': list, + } + }, { + 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', + 'only_matching': True, + }, { + 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', + 'info_dict': { + 'id': 'XODgxNjg1Mzk2', + 'ext': 'mp4', + 'title': '武媚娘传奇 85', + 'duration': 1999.61, + 'thumbnail': r're:^https?://.*', + 'uploader': '疯狂豆花', + 'uploader_id': '62583473', + 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', + 'tags': list, + }, + }, { + 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', + 'info_dict': { + 'id': 'XMTI1OTczNDM5Mg', + 'ext': 'mp4', + 'title': '花千骨 04', + 'duration': 2363, + 'thumbnail': r're:^https?://.*', + 'uploader': '放剧场-花千骨', + 'uploader_id': '772849359', + 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', + 'tags': list, + }, + }, { + 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', + 'note': 'Video protected with password', + 'info_dict': { + 'id': 'XNjA1NzA2Njgw', + 'ext': 'mp4', + 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', + 'duration': 7264.5, + 'thumbnail': r're:^https?://.*', + 'uploader': 'FoxJin1006', + 'uploader_id': '322014285', + 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', + 'tags': list, + }, + 'params': { + 'videopassword': '100600', + }, + }, { + # /play/get.json contains streams with "channel_type":"tail" + 'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html', + 'info_dict': { + 'id': 'XOTUxMzg4NDMy', + 'ext': 'mp4', + 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', + 'duration': 702.08, + 'thumbnail': r're:^https?://.*', + 'uploader': '明月庄主moon', + 'uploader_id': '38465621', + 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', + 'tags': list, + }, + }, { + 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', + 'info_dict': { + 'id': 'XMjIyNzAzMTQ4NA', + 'ext': 'mp4', + 'title': '卡马乔国足开大脚长传冲吊集锦', + 'duration': 289, + 'thumbnail': r're:^https?://.*', + 'uploader': '阿卜杜拉之星', + 'uploader_id': '2382249', + 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', + 'tags': list, + }, + }, { + 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', + 'only_matching': True, + }] + + @staticmethod + def get_ysuid(): + return '%d%s' % (int(time.time()), ''.join([ + random.choice(string.ascii_letters) for i in range(3)])) + + def get_format_name(self, fm): + _dict = { + '3gp': 'h6', + '3gphd': 'h5', + 'flv': 'h4', + 'flvhd': 'h4', + 'mp4': 'h3', + 'mp4hd': 'h3', + 'mp4hd2': 'h4', + 'mp4hd3': 'h4', + 'hd2': 'h2', + 'hd3': 'h1', + } + return _dict.get(fm) + + def _real_extract(self, url): + video_id = self._match_id(url) + + self._set_cookie('youku.com', '__ysuid', self.get_ysuid()) + self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') + + _, urlh = self._download_webpage_handle( + 'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info') + # The etag header is '"foobar"'; let's remove the double quotes + cna = urlh.headers['etag'][1:-1] + + # request basic data + basic_data_params = { + 'vid': video_id, + 'ccode': '0532', + 'client_ip': '192.168.1.1', + 'utid': cna, + 'client_ts': time.time() / 1000, + } + + video_password = self.get_param('videopassword') + if video_password: + basic_data_params['password'] = video_password + + headers = { + 'Referer': url, + } + headers.update(self.geo_verification_headers()) + data = self._download_json( + 'https://ups.youku.com/ups/get.json', video_id, + 'Downloading JSON metadata', + query=basic_data_params, 
headers=headers)['data'] + + error = data.get('error') + if error: + error_note = error.get('note') + if error_note is not None and '因版权原因无法观看此视频' in error_note: + raise ExtractorError( + 'Youku said: Sorry, this video is available in China only', expected=True) + elif error_note and '该视频被设为私密' in error_note: + raise ExtractorError( + 'Youku said: Sorry, this video is private', expected=True) + else: + msg = 'Youku server reported error %i' % error.get('code') + if error_note is not None: + msg += ': ' + error_note + raise ExtractorError(msg) + + # get video title + video_data = data['video'] + title = video_data['title'] + + formats = [{ + 'url': stream['m3u8_url'], + 'format_id': self.get_format_name(stream.get('stream_type')), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'filesize': int(stream.get('size')), + 'width': stream.get('width'), + 'height': stream.get('height'), + } for stream in data['stream'] if stream.get('channel_type') != 'tail'] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': video_data.get('seconds'), + 'thumbnail': video_data.get('logo'), + 'uploader': video_data.get('username'), + 'uploader_id': str_or_none(video_data.get('userid')), + 'uploader_url': data.get('uploader', {}).get('homepage'), + 'tags': video_data.get('tags'), + } + + +class YoukuShowIE(InfoExtractor): + _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' + IE_NAME = 'youku:show' + + _TESTS = [{ + 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', + 'info_dict': { + 'id': 'zc7c670be07ff11e48b3f', + 'title': '花千骨 DVD版', + 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', + }, + 'playlist_count': 50, + }, { + # Episode number not starting from 1 + 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', + 'info_dict': { + 'id': 'zefbfbd70efbfbd780bef', + 'title': '超级飞侠3', + 'description': 'md5:275715156abebe5ccc2a1992e9d56b98', + }, + 'playlist_count': 24, + }, { + # Ongoing playlist. The initial page is the last one + 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', + 'only_matching': True, + }, { + # No data-id value. + 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', + 'only_matching': True, + }, { + # Wrong number of reload_id. 
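+        # (presumably some advertised stage ids return no playlist data;
+        # _extract_entries returns None entries for those and they are
+        # skipped by the `is not None` guard in _real_extract)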
+ 'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html', + 'only_matching': True, + }] + + def _extract_entries(self, playlist_data_url, show_id, note, query): + query['callback'] = 'cb' + playlist_data = self._download_json( + playlist_data_url, show_id, query=query, note=note, + transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html') + if playlist_data is None: + return [None, None] + drama_list = (get_element_by_class('p-drama-grid', playlist_data) + or get_element_by_class('p-drama-half-row', playlist_data)) + if drama_list is None: + raise ExtractorError('No episodes found') + video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list) + return playlist_data, [ + self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key()) + for video_url in video_urls] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + + entries = [] + page_config = self._parse_json(self._search_regex( + r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), + show_id, transform_source=js_to_json) + first_page, initial_entries = self._extract_entries( + 'http://list.youku.com/show/module', show_id, + note='Downloading initial playlist data page', + query={ + 'id': page_config['showid'], + 'tab': 'showInfo', + }) + first_page_reload_id = self._html_search_regex( + r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id') + # The first reload_id has the same items as first_page + reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page) + entries.extend(initial_entries) + for idx, reload_id in enumerate(reload_ids): + if reload_id == first_page_reload_id: + continue + _, new_entries = self._extract_entries( + 'http://list.youku.com/show/episode', show_id, + note='Downloading playlist data page %d' % (idx + 1), + query={ + 'id': page_config['showid'], + 'stage': reload_id, + }) + if new_entries is not None: + entries.extend(new_entries) + desc = self._html_search_meta('description', webpage, fatal=False) + playlist_title = desc.split(',')[0] if desc else None + detail_li = get_element_by_class('p-intro', webpage) + playlist_description = get_element_by_class( + 'intro-more', detail_li) if detail_li else None + + return self.playlist_result( + entries, show_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/younow.py b/yt_dlp/extractor/younow.py index 04dbc87fc..04dbc87fc 100644 --- a/youtube_dl/extractor/younow.py +++ b/yt_dlp/extractor/younow.py diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py new file mode 100644 index 000000000..5feb568e7 --- /dev/null +++ b/yt_dlp/extractor/youporn.py @@ -0,0 +1,184 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + str_to_int, + unified_strdate, + url_or_none, +) + + +class YouPornIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' 
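+    # Both /watch/ and /embed/ URLs are accepted; the display_id slug is
+    # optional, and _real_extract falls back to the numeric video id.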
+ _TESTS = [{ + 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'md5': '3744d24c50438cf5b6f6d59feb5055c2', + 'info_dict': { + 'id': '505835', + 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', + 'ext': 'mp4', + 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', + 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, + 'uploader': 'Ask Dan And Jennifer', + 'upload_date': '20101217', + 'average_rating': int, + 'view_count': int, + 'categories': list, + 'tags': list, + 'age_limit': 18, + }, + 'skip': 'This video has been disabled', + }, { + # Unknown uploader + 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', + 'info_dict': { + 'id': '561726', + 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show', + 'ext': 'mp4', + 'title': 'Big Tits Awesome Brunette On amazing webcam show', + 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Unknown', + 'upload_date': '20110418', + 'average_rating': int, + 'view_count': int, + 'categories': list, + 'tags': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + 'skip': '404', + }, { + 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'only_matching': True, + }, { + 'url': 'http://www.youporn.com/watch/505835', + 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', + webpage) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + definitions = self._download_json( + 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, + display_id) + + formats = [] + for definition in definitions: + if not isinstance(definition, dict): + continue + video_url = url_or_none(definition.get('videoUrl')) + if not video_url: + continue + f = { + 'url': video_url, + 'filesize': int_or_none(definition.get('videoSize')), + } + height = int_or_none(definition.get('quality')) + # Video URL's path looks like this: + # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 + # We will benefit from it by extracting some metadata + mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'tbr': bitrate, + }) + f['height'] = height + formats.append(f) + self._sort_formats(formats) + + webpage = self._download_webpage( + 'http://www.youporn.com/watch/%s' % video_id, display_id, + headers={'Cookie': 'age_verified=1'}) + + title = self._html_search_regex( + r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, 
fatal=True) + + description = self._html_search_regex( + r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>', + webpage, 'description', + default=None) or self._og_search_description( + webpage, default=None) + thumbnail = self._search_regex( + r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', + webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) + + uploader = self._html_search_regex( + r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', + webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + [r'UPLOADED:\s*<span>([^<]+)', + r'Date\s+[Aa]dded:\s*<span>([^<]+)', + r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], + webpage, 'upload date', fatal=False)) + + age_limit = self._rta_search(webpage) + + view_count = None + views = self._search_regex( + r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage, + 'views', default=None) + if views: + view_count = str_to_int(extract_attributes(views).get('data-value')) + comment_count = str_to_int(self._search_regex( + r'>All [Cc]omments? \(([\d,.]+)\)', + webpage, 'comment count', default=None)) + + def extract_tag_box(regex, title): + tag_box = self._search_regex(regex, webpage, title, default=None) + if not tag_box: + return [] + return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box) + + categories = extract_tag_box( + r'(?s)Categories:.*?</[^>]+>(.+?)</div>', 'categories') + tags = extract_tag_box( + r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>', + 'tags') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/yourporn.py b/yt_dlp/extractor/yourporn.py index 98347491e..98347491e 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/yt_dlp/extractor/yourporn.py diff --git a/youtube_dl/extractor/yourupload.py b/yt_dlp/extractor/yourupload.py index 9fa772838..9fa772838 100644 --- a/youtube_dl/extractor/yourupload.py +++ b/yt_dlp/extractor/yourupload.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py new file mode 100644 index 000000000..892993c9b --- /dev/null +++ b/yt_dlp/extractor/youtube.py @@ -0,0 +1,4840 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import base64 +import calendar +import copy +import datetime +import hashlib +import itertools +import json +import os.path +import random +import re +import time +import traceback + +from .common import InfoExtractor, SearchInfoExtractor +from ..compat import ( + compat_chr, + compat_HTTPError, + compat_parse_qs, + compat_str, + compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, + compat_urllib_parse_urlparse, + compat_urlparse, +) +from ..jsinterp import JSInterpreter +from ..utils import ( + bytes_to_intlist, + clean_html, + datetime_from_str, + dict_get, + error_to_compat_str, + ExtractorError, + float_or_none, + format_field, + int_or_none, + intlist_to_bytes, + is_html, + mimetype2ext, + network_exceptions, + orderedSet, + parse_codecs, + parse_count, + parse_duration, + parse_iso8601, + parse_qs, + qualities, + remove_end, + remove_start, + smuggle_url, + 
str_or_none, + str_to_int, + traverse_obj, + try_get, + unescapeHTML, + unified_strdate, + unsmuggle_url, + update_url_query, + url_or_none, + urljoin, + variadic, +) + + +# any clients starting with _ cannot be explicity requested by the user +INNERTUBE_CLIENTS = { + 'web': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20210622.10.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 + }, + 'web_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_EMBEDDED_PLAYER', + 'clientVersion': '1.20210620.0.1', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 + }, + 'web_music': { + 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', + 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_REMIX', + 'clientVersion': '1.20210621.00.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, + }, + 'web_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_CREATOR', + 'clientVersion': '1.20210621.00.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, + }, + 'android': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID', + 'clientVersion': '16.20', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False + }, + 'android_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_EMBEDDED_PLAYER', + 'clientVersion': '16.20', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, + 'REQUIRE_JS_PLAYER': False + }, + 'android_music': { + 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', + 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_MUSIC', + 'clientVersion': '4.32', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, + 'REQUIRE_JS_PLAYER': False + }, + 'android_creator': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_CREATOR', + 'clientVersion': '21.24.100', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, + 'REQUIRE_JS_PLAYER': False + }, + # ios has HLS live streams + # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680 + 'ios': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS', + 'clientVersion': '16.20', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False + }, + 'ios_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_MESSAGES_EXTENSION', + 'clientVersion': '16.20', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, + 'REQUIRE_JS_PLAYER': False + }, + 'ios_music': { + 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og', + 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_MUSIC', + 'clientVersion': '4.32', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, + 'REQUIRE_JS_PLAYER': False + }, + 'ios_creator': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_CREATOR', + 'clientVersion': '21.24.100', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, + 'REQUIRE_JS_PLAYER': False + }, + # mweb has 'ultralow' formats + # See: https://github.com/yt-dlp/yt-dlp/pull/557 + 
'mweb': { + 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'MWEB', + 'clientVersion': '2.20210721.07.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 + }, +} + + +def build_innertube_clients(): + third_party = { + 'embedUrl': 'https://google.com', # Can be any valid URL + } + base_clients = ('android', 'web', 'ios', 'mweb') + priority = qualities(base_clients[::-1]) + + for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): + ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') + ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') + ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') + ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) + + if client in base_clients: + INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) + agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' + agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party + agegate_ytcfg['priority'] -= 1 + elif client.endswith('_embedded'): + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party + ytcfg['priority'] -= 2 + else: + ytcfg['priority'] -= 3 + + +build_innertube_clients() + + +class YoutubeBaseInfoExtractor(InfoExtractor): + """Provide base functions for Youtube extractors""" + + _RESERVED_NAMES = ( + r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' + r'shorts|movies|results|shared|hashtag|trending|feed|feeds|' + r'browse|oembed|get_video_info|iframe_api|s/player|' + r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') + + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' + + _NETRC_MACHINE = 'youtube' + + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = False + + r''' # Unused since login is broken + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' + _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + ''' + + def _login(self): + """ + Attempt to log in to YouTube. + True is returned if successful or skipped. + False is returned if login failed. + + If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. + """ + + def warn(message): + self.report_warning(message) + + # username+password login is broken + if (self._LOGIN_REQUIRED + and self.get_param('cookiefile') is None + and self.get_param('cookiesfrombrowser') is None): + self.raise_login_required( + 'Login details are needed to download this content', method='cookies') + username, password = self._get_login_info() + if username: + warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies']) + return + + # Everything below this is broken! + r''' + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None: + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) + # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them. 
+ # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!') + return True + + login_page = self._download_webpage( + self._LOGIN_URL, None, + note='Downloading login page', + errnote='unable to fetch login page', fatal=False) + if login_page is False: + return + + login_form = self._hidden_inputs(login_page) + + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + # TODO: reverse actual botguard identifier generation algo + 'bgRequest': '["identifier",""]', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) + + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], None, None, None, True + ], + username, + ] + + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') + + if lookup_results is False: + return False + + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False + + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] + + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') + + if challenge_results is False: + return + + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False + + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False + + login_challenge = try_get(res, lambda x: x[0][0], list) + if login_challenge: + challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) + if challenge_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(login_challenge, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + 
tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + CHALLENGES = { + 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", + 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', + 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", + } + challenge = CHALLENGES.get( + challenge_str, + '%s returned error %s.' % (self.IE_NAME, challenge_str)) + warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) + return False + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) + + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False + + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) + + if check_cookie_results is False: + return False + + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') + return False + + return True + ''' + + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + + def _real_initialize(self): + self._initialize_consent() + if self._downloader is None: + return + if not self._login(): + return + + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' + _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' + + def _get_default_ytcfg(self, client='web'): + return copy.deepcopy(INNERTUBE_CLIENTS[client]) + + def _get_innertube_host(self, client='web'): + return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST'] + + def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'): + # try_get but with fallback to default ytcfg client values when present + _func = lambda y: try_get(y, getter, expected_type) + return _func(ytcfg) or _func(self._get_default_ytcfg(default_client)) + + def _extract_client_name(self, ytcfg, default_client='web'): + return self._ytcfg_get_safe( + ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], + lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), 
compat_str, default_client) + + def _extract_client_version(self, ytcfg, default_client='web'): + return self._ytcfg_get_safe( + ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], + lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client) + + def _extract_api_key(self, ytcfg=None, default_client='web'): + return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) + + def _extract_context(self, ytcfg=None, default_client='web'): + _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) + context = _get_context(ytcfg) + if context: + return context + + context = _get_context(self._get_default_ytcfg(default_client)) + if not ytcfg: + return context + + # Recreate the client context (required) + context['client'].update({ + 'clientVersion': self._extract_client_version(ytcfg, default_client), + 'clientName': self._extract_client_name(ytcfg, default_client), + }) + visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) + if visitor_data: + context['client']['visitorData'] = visitor_data + return context + + _SAPISID = None + + def _generate_sapisidhash_header(self, origin='https://www.youtube.com'): + time_now = round(time.time()) + if self._SAPISID is None: + yt_cookies = self._get_cookies('https://www.youtube.com') + # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is. + # See: https://github.com/yt-dlp/yt-dlp/issues/393 + sapisid_cookie = dict_get( + yt_cookies, ('__Secure-3PAPISID', 'SAPISID')) + if sapisid_cookie and sapisid_cookie.value: + self._SAPISID = sapisid_cookie.value + self.write_debug('Extracted SAPISID cookie') + # SAPISID cookie is required if not already present + if not yt_cookies.get('SAPISID'): + self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie') + self._set_cookie( + '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600) + else: + self._SAPISID = False + if not self._SAPISID: + return None + # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323 + sapisidhash = hashlib.sha1( + f'{time_now} {self._SAPISID} {origin}'.encode('utf-8')).hexdigest() + return f'SAPISIDHASH {time_now}_{sapisidhash}' + + def _call_api(self, ep, query, video_id, fatal=True, headers=None, + note='Downloading API JSON', errnote='Unable to download API page', + context=None, api_key=None, api_hostname=None, default_client='web'): + + data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)} + data.update(query) + real_headers = self.generate_api_headers(default_client=default_client) + real_headers.update({'content-type': 'application/json'}) + if headers: + real_headers.update(headers) + return self._download_json( + 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep), + video_id=video_id, fatal=fatal, note=note, errnote=errnote, + data=json.dumps(data).encode('utf8'), headers=real_headers, + query={'key': api_key or self._extract_api_key()}) + + def extract_yt_initial_data(self, item_id, webpage, fatal=True): + data = self._search_regex( + (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE), + self._YT_INITIAL_DATA_RE), webpage, 'yt initial data', fatal=fatal) + if data: + return self._parse_json(data, item_id, fatal=fatal) + + @staticmethod + def _extract_session_index(*data): + """ + Index of current account in account list. 
+ See: https://github.com/yt-dlp/yt-dlp/pull/519 + """ + for ytcfg in data: + session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX'])) + if session_index is not None: + return session_index + + # Deprecated? + def _extract_identity_token(self, ytcfg=None, webpage=None): + if ytcfg: + token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) + if token: + return token + if webpage: + return self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None, fatal=False) + + @staticmethod + def _extract_account_syncid(*args): + """ + Extract syncId required to download private playlists of secondary channels + @params response and/or ytcfg + """ + for data in args: + # ytcfg includes channel_syncid if on secondary channel + delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str) + if delegated_sid: + return delegated_sid + sync_ids = (try_get( + data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], + lambda x: x['DATASYNC_ID']), compat_str) or '').split('||') + if len(sync_ids) >= 2 and sync_ids[1]: + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. We only want the channel_syncid + return sync_ids[0] + + @staticmethod + def _extract_visitor_data(*args): + """ + Extracts visitorData from an API response or ytcfg + Appears to be used to track session state + """ + return traverse_obj( + args, (..., ('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + expected_type=compat_str, get_all=False) + + @property + def is_authenticated(self): + return bool(self._generate_sapisidhash_header()) + + def extract_ytcfg(self, video_id, webpage): + if not webpage: + return {} + return self._parse_json( + self._search_regex( + r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', + default='{}'), video_id, fatal=False) or {} + + def generate_api_headers( + self, *, ytcfg=None, account_syncid=None, session_index=None, + visitor_data=None, identity_token=None, api_hostname=None, default_client='web'): + + origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client)) + headers = { + 'X-YouTube-Client-Name': compat_str( + self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), + 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), + 'Origin': origin, + 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), + 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg) + } + if session_index is None: + session_index = self._extract_session_index(ytcfg) + if account_syncid or session_index is not None: + headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 + + auth = self._generate_sapisidhash_header(origin) + if auth is not None: + headers['Authorization'] = auth + headers['X-Origin'] = origin + return {h: v for h, v in headers.items() if v is not None} + + @staticmethod + def _build_api_continuation_query(continuation, ctp=None): + query = { + 'continuation': continuation + } + # TODO: Inconsistency with clickTrackingParams. + # Currently we have a fixed ctp contained within context (from ytcfg) + # and a ctp in root query for continuation. 
+ if ctp: + query['clickTracking'] = {'clickTrackingParams': ctp} + return query + + @classmethod + def _extract_next_continuation_data(cls, renderer): + next_continuation = try_get( + renderer, (lambda x: x['continuations'][0]['nextContinuationData'], + lambda x: x['continuation']['reloadContinuationData']), dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation_ep_data(cls, continuation_ep: dict): + if isinstance(continuation_ep, dict): + continuation = try_get( + continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + if not continuation: + return + ctp = continuation_ep.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + + contents = [] + for key in ('contents', 'items'): + contents.extend(try_get(renderer, lambda x: x[key], list) or []) + + for content in contents: + if not isinstance(content, dict): + continue + continuation_ep = try_get( + content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'], + lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']), + dict) + continuation = cls._extract_continuation_ep_data(continuation_ep) + if continuation: + return continuation + + @classmethod + def _extract_alerts(cls, data): + for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: + if not isinstance(alert_dict, dict): + continue + for alert in alert_dict.values(): + alert_type = alert.get('type') + if not alert_type: + continue + message = cls._get_text(alert, 'text') + if message: + yield alert_type, message + + def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): + errors = [] + warnings = [] + for alert_type, alert_message in alerts: + if alert_type.lower() == 'error' and fatal: + errors.append([alert_type, alert_message]) + else: + warnings.append([alert_type, alert_message]) + + for alert_type, alert_message in (warnings + errors[:-1]): + self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message), only_once=only_once) + if errors: + raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected) + + def _extract_and_report_alerts(self, data, *args, **kwargs): + return self._report_alerts(self._extract_alerts(data), *args, **kwargs) + + def _extract_badges(self, renderer: dict): + badges = set() + for badge in try_get(renderer, lambda x: x['badges'], list) or []: + label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str) + if label: + badges.add(label.lower()) + return badges + + @staticmethod + def _get_text(data, *path_list, max_runs=None): + for path in path_list or [None]: + if path is None: + obj = [data] + else: + obj = traverse_obj(data, path, default=[]) + if not any(key is ... 
or isinstance(key, (list, tuple)) for key in variadic(path)): + obj = [obj] + for item in obj: + text = try_get(item, lambda x: x['simpleText'], compat_str) + if text: + return text + runs = try_get(item, lambda x: x['runs'], list) or [] + if not runs and isinstance(item, list): + runs = item + + runs = runs[:min(len(runs), max_runs or len(runs))] + text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[])) + if text: + return text + + def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, + ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, + default_client='web'): + response = None + last_error = None + count = -1 + retries = self.get_param('extractor_retries', 3) + if check_get_keys is None: + check_get_keys = [] + while count < retries: + count += 1 + if last_error: + self.report_warning('%s. Retrying ...' % remove_end(last_error, '.')) + try: + response = self._call_api( + ep=ep, fatal=True, headers=headers, + video_id=item_id, query=query, + context=self._extract_context(ytcfg, default_client), + api_key=self._extract_api_key(ytcfg, default_client), + api_hostname=api_hostname, default_client=default_client, + note='%s%s' % (note, ' (retry #%d)' % count if count else '')) + except ExtractorError as e: + if isinstance(e.cause, network_exceptions): + if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)): + e.cause.seek(0) + yt_error = try_get( + self._parse_json(e.cause.read().decode(), item_id, fatal=False), + lambda x: x['error']['message'], compat_str) + if yt_error: + self._report_alerts([('ERROR', yt_error)], fatal=False) + # Downloading page may result in intermittent 5xx HTTP error + # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # We also want to catch all other network exceptions since errors in later pages can be troublesome + # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 + if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): + last_error = error_to_compat_str(e.cause or e.msg) + if count < retries: + continue + if fatal: + raise + else: + self.report_warning(error_to_compat_str(e)) + return + + else: + try: + self._extract_and_report_alerts(response, only_once=True) + except ExtractorError as e: + # YouTube servers may return errors we want to retry on in a 200 OK response + # See: https://github.com/yt-dlp/yt-dlp/issues/839 + if 'unknown error' in e.msg.lower(): + last_error = e.msg + continue + if fatal: + raise + self.report_warning(error_to_compat_str(e)) + return + if not check_get_keys or dict_get(response, check_get_keys): + break + # Youtube sometimes sends incomplete data + # See: https://github.com/ytdl-org/youtube-dl/issues/28194 + last_error = 'Incomplete data received' + if count >= retries: + if fatal: + raise ExtractorError(last_error) + else: + self.report_warning(last_error) + return + return response + + @staticmethod + def is_music_url(url): + return re.match(r'https?://music\.youtube\.com/', url) is not None + + def _extract_video(self, renderer): + video_id = renderer.get('videoId') + title = self._get_text(renderer, 'title') + description = self._get_text(renderer, 'descriptionSnippet') + duration = parse_duration(self._get_text( + renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) + view_count_text = self._get_text(renderer, 'viewCountText') or '' + view_count = str_to_int(self._search_regex( + 
r'^([\d,]+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + + uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') + + return { + '_type': 'url', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': f'https://www.youtube.com/watch?v={video_id}', + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + + +class YoutubeIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com' + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:www\.)?invidious\.pussthecat\.org', + r'(?:www\.)?invidious\.zee\.li', + r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + # youtube-dl invidious instances list + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', + r'(?:www\.)?yewtu\.be', + r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + ) + _VALID_URL = r"""(?x)^ + ( + (?:https?://|//) # http(s):// or protocol-independent URL + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| + 
(?:www\.)?deturl\.com/www\.youtube\.com| + (?:www\.)?pwnyoutube\.com| + (?:www\.)?hooktube\.com| + (?:www\.)?yourepeat\.com| + tube\.majestyc\.net| + %(invidious)s| + youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?: # the various things that can precede the ID: + (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/ + |(?: # or the v= param in all its forms + (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) + v= + ) + )) + |(?: + youtu\.be| # just youtu.be/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx + %(invidious)s + )/ + |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= + ) + )? # all until now is optional -> you can pass the naked ID + (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + (?:\#|$)""" % { + 'invidious': '|'.join(_INVIDIOUS_SITES), + } + _PLAYER_INFO_RE = ( + r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', + r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', + r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', + ) + _formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 
'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 
'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'}, + '395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'}, + '396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'}, + '397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'}, + '398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'}, + '399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'}, + '400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, + '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, + } + 
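[Editor's note — not part of the patch: a rough illustration of how the static itag table above is meant to be consumed — known metadata for an itag gets merged into a format dict built around a stream URL. The two entries below are copied verbatim from `_formats`; the helper name, merge order, and sample URL are assumptions for demonstration only.]

```python
# Two entries copied verbatim from the _formats table above.
_FORMATS_SUBSET = {
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
}


def build_format(itag, url):
    """Hypothetical helper: start from the static itag metadata, if any
    is known, then attach the per-request fields."""
    fmt = dict(_FORMATS_SUBSET.get(itag, {}))
    fmt.update({'format_id': itag, 'url': url})
    return fmt


print(build_format('251', 'https://example.invalid/audio'))
# -> {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus',
#     'abr': 160, 'format_id': '251', 'url': 'https://example.invalid/audio'}
```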
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') + + _GEO_BYPASS = False + + IE_NAME = 'youtube' + _TESTS = [ + { + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'start_time': 1, + 'end_time': 9, + } + }, + { + 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', + 'note': 'Embed-only video (#1746)', + 'info_dict': { + 'id': 'yZIXLfi8CZQ', + 'ext': 'mp4', + 'upload_date': '20120608', + 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', + 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', + 'uploader': 'SET India', + 'uploader_id': 'setindia', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', + 'age_limit': 18, + }, + 'skip': 'Private video', + }, + { + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', + 'note': 'Use the first video ID in the URL', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', + 'note': '256k DASH audio (format 141) via DASH manifest', + 'info_dict': { + 'id': 'a9LDPn-MO4I', + 'ext': 'm4a', + 'upload_date': '20121002', + 'uploader_id': '8KVIDEO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', + 'description': '', + 'uploader': '8KVIDEO', + 'title': 'UHDTV TEST 8K VIDEO.mp4' + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141', + }, + 'skip': 'format 141 not served anymore', + }, + # DASH manifest with encrypted signature + { + 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', + 'info_dict': { + 'id': 'IB3lcPjvWLA', + 'ext': 'm4a', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, + 'uploader': 'AfrojackVEVO', + 'uploader_id': 'AfrojackVEVO', + 'upload_date': '20131011', + 'abr': 129.495, + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141/bestaudio[ext=m4a]', + }, + }, + # Age-gate videos. 
See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000 + { + 'note': 'Embed allowed age-gate video', + 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', + 'info_dict': { + 'id': 'HtVdAasjOgU', + 'ext': 'mp4', + 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', + 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, + 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'upload_date': '20140605', + 'age_limit': 18, + }, + }, + { + 'note': 'Age-gate video with embed allowed in public site', + 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', + 'info_dict': { + 'id': 'HsUATh_Nc2U', + 'ext': 'mp4', + 'title': 'Godzilla 2 (Official Video)', + 'description': 'md5:bf77e03fcae5529475e500129b05668a', + 'upload_date': '20200408', + 'uploader_id': 'FlyingKitty900', + 'uploader': 'FlyingKitty', + 'age_limit': 18, + }, + }, + { + 'note': 'Age-gate video embedable only with clientScreen=EMBED', + 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', + 'info_dict': { + 'id': 'Tq92D6wQ1mg', + 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', + 'ext': 'mp4', + 'upload_date': '20191227', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'uploader': 'Projekt Melody', + 'description': 'md5:17eccca93a786d51bc67646756894066', + 'age_limit': 18, + }, + }, + { + 'note': 'Non-Agegated non-embeddable video', + 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY', + 'info_dict': { + 'id': 'MeJVWBSsPAY', + 'ext': 'mp4', + 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)', + 'uploader': 'Herr Lurik', + 'uploader_id': 'st3in234', + 'description': 'Fan Video. Music & Lyrics by OOMPH!.', + 'upload_date': '20130730', + }, + }, + { + 'note': 'Non-bypassable age-gated video', + 'url': 'https://youtube.com/watch?v=Cr381pDsSsA', + 'only_matching': True, + }, + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'duration': 266, + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'creator': 'deadmau5', + 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'Some Chords', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) + { + 'url': 'lqQg6PlCWgI', + 'info_dict': { + 'id': 'lqQg6PlCWgI', + 'ext': 'mp4', + 'duration': 6085, + 'upload_date': '20150827', + 'uploader_id': 'olympic', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', + 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'uploader': 'Olympics', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, + # Non-square pixels + { + 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', + 'info_dict': { + 'id': '_b-2C3KPAM0', + 'ext': 'mp4', + 'stretched_ratio': 16 / 9., + 'duration': 85, + 'upload_date': '20110310', + 'uploader_id': 'AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', + 'uploader': '孫ᄋᄅ', + 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + }, + }, + # 
url_encoded_fmt_stream_map is empty string + { + 'url': 'qEJwOuvDf7I', + 'info_dict': { + 'id': 'qEJwOuvDf7I', + 'ext': 'webm', + 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', + 'description': '', + 'upload_date': '20150404', + 'uploader_id': 'spbelect', + 'uploader': 'Наблюдатели Петербурга', + }, + 'params': { + 'skip_download': 'requires avconv', + }, + 'skip': 'This live event has ended.', + }, + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'webm', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:31', + }, + 'skip': 'not actual anymore', + }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 + 'uploader': 'Airtek', + 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', + 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', + 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '135', # bestvideo + }, + 'skip': 'This live event has ended.', + }, + { + # Multifeed videos (multiple cameras), URL is for Main Camera + 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg', + 'info_dict': { + 'id': 'jvGDaLqkpTg', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jvGDaLqkpTg', + 'ext': 'mp4', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10643, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + }, + }, { + 'info_dict': { + 'id': '3AKt1R1aDnw', + 'ext': 'mp4', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10991, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + }, + }, { + 'info_dict': { + 'id': 'RtAMM00gpVc', + 'ext': 'mp4', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10995, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + }, + }, { + 'info_dict': { + 'id': '6N2fdlP3C5U', + 'ext': 'mp4', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10990, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + }, + }], + 'params': { + 'skip_download': True, + }, + 'skip': 'Not multifeed anymore', + }, + { + # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) + 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', + 'info_dict': { + 'id': 'gVfLd0zydlo', + 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', + }, + 'playlist_count': 2, + 'skip': 'Not multifeed anymore', + }, + { + 'url': 'https://vid.plus/FlRa-iH7PGw', + 'only_matching': True, + }, + { + 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', + 'only_matching': True, + }, + { + # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) + # Also tests cut-off URL expansion in video description (see + # https://github.com/ytdl-org/youtube-dl/issues/1892, + # https://github.com/ytdl-org/youtube-dl/issues/8164) + 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', + 'info_dict': { + 'id': 'lsguqyKfVQg', + 'ext': 'mp4', + 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', + 'alt_title': 'Dark Walk', + 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', + 'duration': 133, + 'upload_date': '20151119', + 'uploader_id': 'IronSoulElf', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', + 'uploader': 'IronSoulElf', + 'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk', + 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan', + 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) + 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', + 'only_matching': True, + }, + { + # Video with yt:stretch=17:0 + 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', + 'info_dict': { + 'id': 'Q39EVAstoRM', + 'ext': 'mp4', + 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', + 'description': 'md5:ee18a25c350637c8faff806845bddee9', + 'upload_date': '20151107', + 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', + 'uploader': 'CH GAMER DROID', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video does not exist.', + }, + { + # Video with incomplete 'yt:stretch=16:' + 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI', + 'only_matching': True, + }, + { + # Video licensed under Creative Commons + 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', + 'info_dict': { + 'id': 'M4gD1WSo5mA', + 'ext': 'mp4', + 'title': 'md5:e41008789470fc2533a3252216f1c1d1', + 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'duration': 721, + 'upload_date': '20150127', + 'uploader_id': 'BerkmanCenter', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader': 'The Berkman Klein Center for Internet & Society', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Channel-like uploader_url + 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', + 'info_dict': { + 'id': 'eQcmzGIKrzg', + 'ext': 'mp4', + 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', + 'description': 'md5:13a2503d7b5904ef4b223aa101628f39', + 'duration': 4060, + 'upload_date': '20151119', + 'uploader': 'Bernie Sanders', + 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', + 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', + 'only_matching': True, + }, + { + # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) + 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', + 'only_matching': True, + }, + { + # Rental video preview + 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', + 'info_dict': { + 'id': 'uGpuVWrhIzE', + 'ext': 'mp4', + 'title': 'Piku - Trailer', + 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', + 'upload_date': '20150811', + 'uploader': 'FlixMatrix', + 'uploader_id': 'FlixMatrixKaravan', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', + 'license': 'Standard YouTube License', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + }, + { + # YouTube Red video with episode data + 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', + 'info_dict': { + 'id': 'iqKdEhx-dD4', + 'ext': 'mp4', + 'title': 'Isolation - Mind Field (Ep 1)', + 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd', + 'duration': 2085, + 'upload_date': '20170118', + 'uploader': 'Vsauce', + 'uploader_id': 'Vsauce', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', + 'series': 'Mind Field', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': [ + 'Skipping DASH manifest', + ], + }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.', + }, + { + # itag 212 + 'url': '1t24XAntNCY', + 'only_matching': True, + }, + { + # geo restricted to JP + 'url': 'sJL6WA-aGkQ', + 'only_matching': True, + }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m + 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA', + 'only_matching': True, + }, + { + # DRM protected + 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', + 'only_matching': True, + }, + { + # Video with unsupported adaptive stream type formats + 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', + 'info_dict': { + 'id': 'Z4Vy8R84T1U', + 'ext': 'mp4', + 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 433, + 'upload_date': '20130923', + 'uploader': 'Amelia Putri Harwita', + 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', + 'formats': 'maxcount:10', + }, + 'params': { + 'skip_download': True, + 
'youtube_include_dash_manifest': False, + }, + 'skip': 'not actual anymore', + }, + { + # Youtube Music Auto-generated description + 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', + 'info_dict': { + 'id': 'MgNrAu2pzNs', + 'ext': 'mp4', + 'title': 'Voyeur Girl', + 'description': 'md5:7ae382a65843d6df2685993e90a8628f', + 'upload_date': '20190312', + 'uploader': 'Stephen - Topic', + 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', + 'artist': 'Stephen', + 'track': 'Voyeur Girl', + 'album': 'it\'s too much love to know my dear', + 'release_date': '20190313', + 'release_year': 2019, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', + 'only_matching': True, + }, + { + # invalid -> valid video id redirection + 'url': 'DJztXj2GPfl', + 'info_dict': { + 'id': 'DJztXj2GPfk', + 'ext': 'mp4', + 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', + 'description': 'md5:bf577a41da97918e94fa9798d9228825', + 'upload_date': '20090125', + 'uploader': 'Prochorowka', + 'uploader_id': 'Prochorowka', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', + 'artist': 'Panjabi MC', + 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', + 'album': 'Beware of the Boys (Mundian To Bach Ke)', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Video unavailable', + }, + { + # empty description results in an empty string + 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', + 'info_dict': { + 'id': 'x41yOUIvK2k', + 'ext': 'mp4', + 'title': 'IMG 3456', + 'description': '', + 'upload_date': '20170613', + 'uploader_id': 'ElevageOrVert', + 'uploader': 'ElevageOrVert', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # with '};' inside yt initial data (see [1]) + # see [2] for an example with '};' inside ytInitialPlayerResponse + # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 + # 2. 
https://github.com/ytdl-org/youtube-dl/issues/27216 + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # another example of '};' in ytInitialData + 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', + 'only_matching': True, + }, + { + 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', + 'only_matching': True, + }, + { + # https://github.com/ytdl-org/youtube-dl/pull/28094 + 'url': 'OtqTfy26tG0', + 'info_dict': { + 'id': 'OtqTfy26tG0', + 'ext': 'mp4', + 'title': 'Burn Out', + 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', + 'upload_date': '20141120', + 'uploader': 'The Cinematic Orchestra - Topic', + 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', + 'artist': 'The Cinematic Orchestra', + 'track': 'Burn Out', + 'album': 'Every Day', + 'release_date': None, + 'release_year': None, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # controversial video, only works with bpctr when authenticated with cookies + 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', + 'only_matching': True, + }, + { + # controversial video, requires bpctr/contentCheckOk + 'url': 'https://www.youtube.com/watch?v=SZJvDhaSDnc', + 'info_dict': { + 'id': 'SZJvDhaSDnc', + 'ext': 'mp4', + 'title': 'San Diego teen commits suicide after bullying over embarrassing video', + 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ', + 'uploader': 'CBS This Morning', + 'uploader_id': 'CBSThisMorning', + 'upload_date': '20140716', + 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7' + } + }, + { + # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 + 'url': 'cBvYw8_A0vQ', + 'info_dict': { + 'id': 'cBvYw8_A0vQ', + 'ext': 'mp4', + 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', + 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', + 'upload_date': '20201120', + 'uploader': 'Walk around Japan', + 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Has multiple audio streams + 'url': 'WaOKSUlf4TM', + 'only_matching': True + }, { + # Requires Premium: has format 141 when requested using YTM url + 'url': 'https://music.youtube.com/watch?v=XclachpHxis', + 'only_matching': True + }, { + # multiple subtitles with same lang_code + 'url': 'https://www.youtube.com/watch?v=wsQiKKfKxug', + 'only_matching': True, + }, { + # Force use android client fallback + 'url': 'https://www.youtube.com/watch?v=YOelRv7fMxY', + 'info_dict': { + 'id': 'YOelRv7fMxY', + 'title': 'DIGGING A SECRET TUNNEL Part 1', + 'ext': '3gp', + 'upload_date': '20210624', + 'channel_id': 'UCp68_FLety0O-n9QU6phsgw', + 'uploader': 'colinfurze', + 'uploader_id': 'colinfurze', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw', + 'description': 'md5:b5096f56af7ccd7a555c84db81738b22' + }, + 'params': { + 'format': '17', # 3gp format available on android + 'extractor_args': {'youtube': {'player_client': ['android']}}, + }, + }, + { + # Skip download of additional client configs (remix client config in this case) + 'url':
'https://music.youtube.com/watch?v=MgNrAu2pzNs', + 'only_matching': True, + 'params': { + 'extractor_args': {'youtube': {'player_skip': ['configs']}}, + }, + }, { + # shorts + 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY', + 'only_matching': True, + }, + ] + + @classmethod + def suitable(cls, url): + from ..utils import parse_qs + + qs = parse_qs(url) + if qs.get('list', [None])[0]: + return False + return super(YoutubeIE, cls).suitable(url) + + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._code_cache = {} + self._player_cache = {} + + def _extract_player_url(self, *ytcfgs, webpage=None): + player_url = traverse_obj( + ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), + get_all=False, expected_type=compat_str) + if not player_url: + return + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) + return player_url + + def _download_player_url(self, video_id, fatal=False): + res = self._download_webpage( + 'https://www.youtube.com/iframe_api', + note='Downloading iframe API JS', video_id=video_id, fatal=fatal) + if res: + player_version = self._search_regex( + r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal) + if player_version: + return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js' + + def _signature_cache_id(self, example_sig): + """ Return a string representation of a signature """ + return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) + + @classmethod + def _extract_player_info(cls, player_url): + for player_re in cls._PLAYER_INFO_RE: + id_m = re.search(player_re, player_url) + if id_m: + break + else: + raise ExtractorError('Cannot identify player %r' % player_url) + return id_m.group('id') + + def _load_player(self, video_id, player_url, fatal=True) -> bool: + player_id = self._extract_player_info(player_url) + if player_id not in self._code_cache: + code = self._download_webpage( + player_url, video_id, fatal=fatal, + note='Downloading player ' + player_id, + errnote='Download of %s failed' % player_url) + if code: + self._code_cache[player_id] = code + return player_id in self._code_cache + + def _extract_signature_function(self, video_id, player_url, example_sig): + player_id = self._extract_player_info(player_url) + + # Read from filesystem cache + func_id = 'js_%s_%s' % ( + player_id, self._signature_cache_id(example_sig)) + assert os.path.basename(func_id) == func_id + + cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) + if cache_spec is not None: + return lambda s: ''.join(s[i] for i in cache_spec) + + if self._load_player(video_id, player_url): + code = self._code_cache[player_id] + res = self._parse_sig_js(code) + + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] + + self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) + return res + + def _print_sig_code(self, func, example_sig): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = '' if start == 0 else str(start) + ends = (':%d' % (end + step)) if end + step >= 0 else ':' + steps = '' if step == 1 else (':%d' % step) + return 's[%s%s%s]' % (starts, ends, steps) + + step = None + # Quelch pyflakes warnings - start will be set when step is set + start = '(Never used)' + for 
i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield 's[%d]' % prev + if step is None: + yield 's[%d]' % i + else: + yield _genslice(start, i, step) + + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_res = func(test_string) + cache_spec = [ord(c) for c in cache_res] + expr_code = ' + '.join(gen_sig_code(cache_spec)) + signature_id_tuple = '(%s)' % ( + ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) + code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' + ' return %s\n') % (signature_id_tuple, expr_code) + self.to_screen('Extracted signature function:\n' + code) + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + # Obsolete patterns + r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') + + jsi = JSInterpreter(jscode) + initial_function = jsi.extract_function(funcname) + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, player_url): + """Turn the encrypted s field into a working signature""" + + if player_url is None: + raise ExtractorError('Cannot decrypt signature without player_url') + + try: + player_id = (player_url, self._signature_cache_id(s)) + if player_id not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url, s + ) + self._player_cache[player_id] = func + func = self._player_cache[player_id] + if self.get_param('youtube_print_sig_code'): + self._print_sig_code(func, s) + return func(s) + except Exception as e: + tb = traceback.format_exc() + raise ExtractorError( + 'Signature extraction failed: ' + tb, cause=e) + + def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): + """ + Extract signatureTimestamp (sts) + Required to tell API what sig/player version is in use. + """ + sts = None + if isinstance(ytcfg, dict): + sts = int_or_none(ytcfg.get('STS')) + + if not sts: + # Attempt to extract from player + if player_url is None: + error_msg = 'Cannot extract signature timestamp without player_url.' 
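+ # Illustrative example (hypothetical call, not part of this patch): _extract_signature_timestamp('dQw4w9WgXcQ', None) lands here, then either raises ExtractorError (fatal=True) or emits the warning above and returns None.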
+ if fatal: + raise ExtractorError(error_msg) + self.report_warning(error_msg) + return + if self._load_player(video_id, player_url, fatal=fatal): + player_id = self._extract_player_info(player_url) + code = self._code_cache[player_id] + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, + 'JS player signature timestamp', group='sts', fatal=fatal)) + return sts + + def _mark_watched(self, video_id, player_responses): + playback_url = traverse_obj( + player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), + expected_type=url_or_none, get_all=False) + if not playback_url: + self.report_warning('Unable to mark watched') + return + parsed_playback_url = compat_urlparse.urlparse(playback_url) + qs = compat_urlparse.parse_qs(parsed_playback_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. + CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + }) + playback_url = compat_urlparse.urlunparse( + parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + self._download_webpage( + playback_url, video_id, 'Marking watched', + 'Unable to mark watched', fatal=False) + + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) 
+ \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + + @classmethod + def extract_id(cls, url): + mobj = re.match(cls._VALID_URL, url, re.VERBOSE) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + return mobj.group('id') + + def _extract_chapters_from_json(self, data, duration): + chapter_list = traverse_obj( + data, ( + 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', + 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters' + ), expected_type=list) + + return self._extract_chapters( + chapter_list, + chapter_time=lambda chapter: float_or_none( + traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000), + chapter_title=lambda chapter: traverse_obj( + chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str), + duration=duration) + + def _extract_chapters_from_engagement_panel(self, data, duration): + content_list = traverse_obj( + data, + ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'), + expected_type=list, default=[]) + chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription')) + chapter_title = lambda chapter: self._get_text(chapter, 'title') + + return next(( + filter(None, ( + self._extract_chapters( + traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), + chapter_time, chapter_title, duration) + for contents in content_list + ))), []) + + def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration): + chapters = [] + last_chapter = {'start_time': 0} + for idx, chapter in enumerate(chapter_list or []): + title = chapter_title(chapter) + start_time = chapter_time(chapter) + if start_time is None: + continue + last_chapter['end_time'] = start_time + if start_time < last_chapter['start_time']: + if idx == 1: + chapters.pop() + self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title']) + else: + self.report_warning(f'Invalid start time for chapter "{title}"') + continue + last_chapter = {'start_time': start_time, 'title': title} + chapters.append(last_chapter) + last_chapter['end_time'] = duration + return chapters + + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) + + @staticmethod + def parse_time_text(time_text): + """ + Parse the comment time text + time_text is in the format 'X units ago (edited)' + """ + time_text_split = time_text.split(' ') + if len(time_text_split) >= 3: + try: + return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') + except ValueError: + return None + + def _extract_comment(self, comment_renderer, parent=None): + comment_id = comment_renderer.get('commentId') + if not comment_id: + return + + text = self._get_text(comment_renderer, 'contentText') + + # note: timestamp is 
an estimate calculated from the current time and time_text + time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' + time_text_dt = self.parse_time_text(time_text) + if isinstance(time_text_dt, datetime.datetime): + timestamp = calendar.timegm(time_text_dt.timetuple()) + author = self._get_text(comment_renderer, 'authorText') + author_id = try_get(comment_renderer, + lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) + + votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], + lambda x: x['likeCount']), compat_str)) or 0 + author_thumbnail = try_get(comment_renderer, + lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str) + + author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) + is_favorited = 'creatorHeart' in (try_get( + comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {}) + return { + 'id': comment_id, + 'text': text, + 'timestamp': timestamp, + 'time_text': time_text, + 'like_count': votes, + 'is_favorited': is_favorited, + 'author': author, + 'author_id': author_id, + 'author_thumbnail': author_thumbnail, + 'author_is_uploader': author_is_uploader, + 'parent': parent or 'root' + } + + def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None): + + def extract_header(contents): + _continuation = None + for content in contents: + comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer']) + expected_comment_count = parse_count(self._get_text( + comments_header_renderer, 'countText', 'commentsCount', max_runs=1)) + + if expected_comment_count: + comment_counts[1] = expected_comment_count + self.to_screen('Downloading ~%d comments' % expected_comment_count) + sort_mode_str = self._configuration_arg('comment_sort', [''])[0] + comment_sort_index = int(sort_mode_str != 'top') # 1 = new, 0 = top + + sort_menu_item = try_get( + comments_header_renderer, + lambda x: x['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems'][comment_sort_index], dict) or {} + sort_continuation_ep = sort_menu_item.get('serviceEndpoint') or {} + + _continuation = self._extract_continuation_ep_data(sort_continuation_ep) or self._extract_continuation(sort_menu_item) + if not _continuation: + continue + + sort_text = sort_menu_item.get('title') + if isinstance(sort_text, compat_str): + sort_text = sort_text.lower() + else: + sort_text = 'top comments' if comment_sort_index == 0 else 'newest first' + self.to_screen('Sorting comments by %s' % sort_text) + break + return _continuation + + def extract_thread(contents): + if not parent: + comment_counts[2] = 0 + for content in contents: + comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) + comment_renderer = try_get( + comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get( + content, (lambda x: x['commentRenderer'], dict)) + + if not comment_renderer: + continue + comment = self._extract_comment(comment_renderer, parent) + if not comment: + continue + comment_counts[0] += 1 + yield comment + # Attempt to get the replies + comment_replies_renderer = try_get( + comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict) + + if comment_replies_renderer: + comment_counts[2] += 1 + comment_entries_iter = self._comment_entries( + comment_replies_renderer, ytcfg, video_id, + parent=comment.get('id'), comment_counts=comment_counts) + + for reply_comment in 
comment_entries_iter: + yield reply_comment + + # YouTube comments have a max depth of 2 + max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf') + if max_depth == 1 and parent: + return + if not comment_counts: + # comment so far, est. total comments, current comment thread # + comment_counts = [0, 0, 0] + + continuation = self._extract_continuation(root_continuation_data) + if continuation and len(continuation['continuation']) < 27: + self.write_debug('Detected old API continuation token. Generating new API compatible token.') + continuation_token = self._generate_comment_continuation(video_id) + continuation = self._build_api_continuation_query(continuation_token, None) + + message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1) + if message and not parent: + self.report_warning(message, video_id=video_id) + + visitor_data = None + is_first_continuation = parent is None + + for page_num in itertools.count(0): + if not continuation: + break + headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) + comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) + if page_num == 0: + if is_first_continuation: + note_prefix = 'Downloading comment section API JSON' + else: + note_prefix = ' Downloading comment API JSON reply thread %d %s' % ( + comment_counts[2], comment_prog_str) + else: + note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( + ' ' if parent else '', ' replies' if parent else '', + page_num, comment_prog_str) + + response = self._extract_response( + item_id=None, query=continuation, + ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, + check_get_keys=('onResponseReceivedEndpoints', 'continuationContents')) + if not response: + break + visitor_data = try_get( + response, + lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'], + compat_str) or visitor_data + + continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents')) + + continuation = None + if isinstance(continuation_contents, list): + for continuation_section in continuation_contents: + if not isinstance(continuation_section, dict): + continue + continuation_items = try_get( + continuation_section, + (lambda x: x['reloadContinuationItemsCommand']['continuationItems'], + lambda x: x['appendContinuationItemsAction']['continuationItems']), + list) or [] + if is_first_continuation: + continuation = extract_header(continuation_items) + is_first_continuation = False + if continuation: + break + continue + count = 0 + for count, entry in enumerate(extract_thread(continuation_items)): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + if continuation: + # Sometimes YouTube provides a continuation without any comments + # In most cases we end up just downloading these with very little comments to come. 
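+ # Sketch of the guard below: YouTube can hand back a fresh continuation token for a page that parses to zero commentThreadRenderer entries; count then stays 0 and we assume the thread is exhausted instead of paginating forever.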
+ if count == 0: + if not parent: + self.report_warning('No comments received - assuming end of comments') + continuation = None + break + + # Deprecated response structure + elif isinstance(continuation_contents, dict): + known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation') + for key, continuation_renderer in continuation_contents.items(): + if key not in known_continuation_renderers: + continue + if not isinstance(continuation_renderer, dict): + continue + if is_first_continuation: + header_continuation_items = [continuation_renderer.get('header') or {}] + continuation = extract_header(header_continuation_items) + is_first_continuation = False + if continuation: + break + + # Sometimes YouTube provides a continuation without any comments + # In most cases we end up just downloading these with very little comments to come. + count = 0 + for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})): + yield entry + continuation = self._extract_continuation(continuation_renderer) + if count == 0: + if not parent: + self.report_warning('No comments received - assuming end of comments') + continuation = None + break + + @staticmethod + def _generate_comment_continuation(video_id): + """ + Generates initial comment section continuation token from given video id + """ + b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8'))) + parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u') + new_continuation_intlist = list(itertools.chain.from_iterable( + [bytes_to_intlist(base64.b64decode(part)) for part in parts])) + return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8') + + def _get_comments(self, ytcfg, video_id, contents, webpage): + """Entry for comment extraction""" + def _real_comment_extract(contents): + renderer = next(( + item for item in traverse_obj(contents, (..., 'itemSectionRenderer'), default={}) + if item.get('sectionIdentifier') == 'comment-item-section'), None) + yield from self._comment_entries(renderer, ytcfg, video_id) + + max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) + # Force English regardless of account setting to prevent parsing issues + # See: https://github.com/yt-dlp/yt-dlp/issues/532 + ytcfg = copy.deepcopy(ytcfg) + traverse_obj( + ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en' + return itertools.islice(_real_comment_extract(contents), 0, max_comments) + + @staticmethod + def _get_checkok_params(): + return {'contentCheckOk': True, 'racyCheckOk': True} + + @classmethod + def _generate_player_context(cls, sts=None): + context = { + 'html5Preference': 'HTML5_PREF_WANTS', + } + if sts is not None: + context['signatureTimestamp'] = sts + return { + 'playbackContext': { + 'contentPlaybackContext': context + }, + **cls._get_checkok_params() + } + + @staticmethod + def _is_agegated(player_response): + if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): + return True + + reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[]) + AGE_GATE_REASONS = ( + 'confirm your age', 'age-restricted', 'inappropriate', # reason + 'age_verification_required', 'age_check_required', # status + ) + return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons) + + @staticmethod + def _is_unplayable(player_response): + return traverse_obj(player_response, ('playabilityStatus', 'status')) == 
'UNPLAYABLE' + + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr): + + session_index = self._extract_session_index(player_ytcfg, master_ytcfg) + syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) + sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + headers = self.generate_api_headers( + ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) + + yt_query = {'videoId': video_id} + yt_query.update(self._generate_player_context(sts)) + return self._extract_response( + item_id=video_id, ep='player', query=yt_query, + ytcfg=player_ytcfg, headers=headers, fatal=True, + default_client=client, + note='Downloading %s player API JSON' % client.replace('_', ' ').strip() + ) or None + + def _get_requested_clients(self, url, smuggled_data): + requested_clients = [] + allowed_clients = sorted( + [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'], + key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) + for client in self._configuration_arg('player_client'): + if client in allowed_clients: + requested_clients.append(client) + elif client == 'all': + requested_clients.extend(allowed_clients) + else: + self.report_warning(f'Skipping unsupported client {client}') + if not requested_clients: + requested_clients = ['android', 'web'] + + if smuggled_data.get('is_music_url') or self.is_music_url(url): + requested_clients.extend( + f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS) + + return orderedSet(requested_clients) + + def _extract_player_ytcfg(self, client, video_id): + url = { + 'web_music': 'https://music.youtube.com', + 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1' + }.get(client) + if not url: + return {} + webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config') + return self.extract_ytcfg(video_id, webpage) or {} + + def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg): + initial_pr = None + if webpage: + initial_pr = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, + video_id, 'initial player response') + + original_clients = clients + clients = clients[::-1] + prs = [] + + def append_client(client_name): + if client_name in INNERTUBE_CLIENTS and client_name not in original_clients: + clients.append(client_name) + + # Android player_response does not have microFormats which are needed for + # extraction of some data. 
So we return the initial_pr with formats + # stripped out even if not requested by the user + # See: https://github.com/yt-dlp/yt-dlp/issues/501 + if initial_pr: + pr = dict(initial_pr) + pr['streamingData'] = None + prs.append(pr) + + last_error = None + tried_iframe_fallback = False + player_url = None + while clients: + client = clients.pop() + player_ytcfg = master_ytcfg if client == 'web' else {} + if 'configs' not in self._configuration_arg('player_skip'): + player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg + + player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) + require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER') + if 'js' in self._configuration_arg('player_skip'): + require_js_player = False + player_url = None + + if not player_url and not tried_iframe_fallback and require_js_player: + player_url = self._download_player_url(video_id) + tried_iframe_fallback = True + + try: + pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( + client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr) + except ExtractorError as e: + if last_error: + self.report_warning(last_error) + last_error = e + continue + + if pr: + prs.append(pr) + + # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in + if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated: + append_client(client.replace('_agegate', '_creator')) + elif self._is_agegated(pr): + append_client(f'{client}_agegate') + + if last_error: + if not len(prs): + raise last_error + self.report_warning(last_error) + return prs, player_url + + def _extract_formats(self, streaming_data, video_id, player_url, is_live): + itags, stream_ids = [], [] + itag_qualities, res_qualities = {}, {} + q = qualities([ + # Normally tiny is the smallest video-only formats. 
But + # audio-only formats with unknown quality may get tagged as tiny + 'tiny', + 'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats + 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres' + ]) + streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[]) + + for fmt in streaming_formats: + if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + continue + + itag = str_or_none(fmt.get('itag')) + audio_track = fmt.get('audioTrack') or {} + stream_id = '%s.%s' % (itag or '', audio_track.get('id', '')) + if stream_id in stream_ids: + continue + + quality = fmt.get('quality') + height = int_or_none(fmt.get('height')) + if quality == 'tiny' or not quality: + quality = fmt.get('audioQuality', '').lower() or quality + # The 3gp format (17) in android client has a quality of "small", + # but is actually worse than other formats + if itag == '17': + quality = 'tiny' + if quality: + if itag: + itag_qualities[itag] = quality + if height: + res_qualities[height] = quality + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing the emsg box to determine the + # number of fragments that would subsequently be requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue + + fmt_url = fmt.get('url') + if not fmt_url: + sc = compat_parse_qs(fmt.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not (sc and fmt_url and encrypted_sig): + continue + if not player_url: + continue + signature = self._decrypt_signature(sc['s'][0], video_id, player_url) + sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' + fmt_url += '&' + sp + '=' + signature + + if itag: + itags.append(itag) + stream_ids.append(stream_id) + + tbr = float_or_none( + fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + dct = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_id': itag, + 'format_note': ', '.join(filter(None, ( + '%s%s' % (audio_track.get('displayName') or '', + ' (default)' if audio_track.get('audioIsDefault') else ''), + fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))), + 'fps': int_or_none(fmt.get('fps')), + 'height': height, + 'quality': q(quality), + 'tbr': tbr, + 'url': fmt_url, + 'width': int_or_none(fmt.get('width')), + 'language': audio_track.get('id', '').split('.')[0], + 'language_preference': 1 if audio_track.get('audioIsDefault') else -1, + } + mime_mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') + if mime_mobj: + dct['ext'] = mimetype2ext(mime_mobj.group(1)) + dct.update(parse_codecs(mime_mobj.group(2))) + no_audio = dct.get('acodec') == 'none' + no_video = dct.get('vcodec') == 'none' + if no_audio: + dct['vbr'] = tbr + if no_video: + dct['abr'] = tbr + if no_audio or no_video: + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + if dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' + yield dct + + skip_manifests = self._configuration_arg('skip') + get_dash = ( + (not is_live or self._configuration_arg('include_live_dash')) + and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)) + get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True) +
def guess_quality(f): + for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)): + if val in qdict: + return q(qdict[val]) + return -1 + + for sd in streaming_data: + hls_manifest_url = get_hls and sd.get('hlsManifestUrl') + if hls_manifest_url: + for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): + itag = self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None) + if itag in itags: + itag += '-hls' + if itag in itags: + continue + if itag: + f['format_id'] = itag + itags.append(itag) + f['quality'] = guess_quality(f) + yield f + + dash_manifest_url = get_dash and sd.get('dashManifestUrl') + if dash_manifest_url: + for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: + itag += '-dash' + if itag in itags: + continue + if itag: + f['format_id'] = itag + itags.append(itag) + f['quality'] = guess_quality(f) + filesize = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') + or f['url'], 'file size', default=None)) + if filesize: + f['filesize'] = filesize + yield f + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + webpage = None + if 'webpage' not in self._configuration_arg('player_skip'): + webpage = self._download_webpage( + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) + + master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() + + player_responses, player_url = self._extract_player_responses( + self._get_requested_clients(url, smuggled_data), + video_id, webpage, master_ytcfg) + + get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) + + playability_statuses = traverse_obj( + player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[]) + + trailer_video_id = get_first( + playability_statuses, + ('errorScreen', 'playerLegacyDesktopYpcTrailerRenderer', 'trailerVideoId'), + expected_type=str) + if trailer_video_id: + return self.url_result( + trailer_video_id, self.ie_key(), trailer_video_id) + + search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) + if webpage else (lambda x: None)) + + video_details = traverse_obj( + player_responses, (..., 'videoDetails'), expected_type=dict, default=[]) + microformats = traverse_obj( + player_responses, (..., 'microformat', 'playerMicroformatRenderer'), + expected_type=dict, default=[]) + video_title = ( + get_first(video_details, 'title') + or self._get_text(microformats, (..., 'title')) + or search_meta(['og:title', 'twitter:title', 'title'])) + video_description = get_first(video_details, 'shortDescription') + + if not smuggled_data.get('force_singlefeed', False): + if not self.get_param('noplaylist'): + multifeed_metadata_list = get_first( + player_responses, + ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), + expected_type=str) + if multifeed_metadata_list: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/ytdl-org/youtube-dl/issues/8536) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get( + feed_data, lambda x: 
x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%swatch?v=%s' % (base_url, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': title, + }) + feed_ids.append(feed_id) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result( + entries, video_id, video_title, video_description) + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) + is_live = get_first(video_details, 'isLive') + if is_live is None: + is_live = get_first(live_broadcast_details, 'isLiveNow') + + streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) + formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live)) + + if not formats: + if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): + self.report_drm(video_id) + pemr = get_first( + playability_statuses, + ('errorScreen', 'playerErrorMessageRenderer'), expected_type=dict) or {} + reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason') + subreason = clean_html(self._get_text(pemr, 'subreason') or '') + if subreason: + if subreason == 'The uploader has not made this video available in your country.': + countries = get_first(microformats, 'availableCountries') + if not countries: + regions_allowed = search_meta('regionsAllowed') + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted(subreason, countries, metadata_available=True) + reason += f'. {subreason}' + if reason: + self.raise_no_formats(reason, expected=True) + + for f in formats: + if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled + f['source_preference'] = -10 + # TODO: this method is not reliable + f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)' + + # Source is given priority since formats that throttle are given lower source_preference + # When throttling issue is fully fixed, remove this + self._sort_formats(formats, ('quality', 'res', 'fps', 'source', 'codec:vp9.2', 'lang')) + + keywords = get_first(video_details, 'keywords', expected_type=list) or [] + if not keywords and webpage: + keywords = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] + for keyword in keywords: + if keyword.startswith('yt:stretch='): + mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) + if mobj: + # NB: float is intentional for forcing float division + w, h = (float(v) for v in mobj.groups()) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + break + + thumbnails = [] + thumbnail_dicts = traverse_obj( + (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...), + expected_type=dict, default=[]) + for thumbnail in thumbnail_dicts: + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. 
See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + thumbnail_url = search_meta(['og:image', 'twitter:image']) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + }) + # The best resolution thumbnails sometimes do not appear in the webpage + # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340 + # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029> + hq_thumbnail_names = ['maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3'] + # TODO: Test them also? - For some videos, even these don't exist + guaranteed_thumbnail_names = [ + 'hqdefault', 'hq1', 'hq2', 'hq3', '0', + 'mqdefault', 'mq1', 'mq2', 'mq3', + 'default', '1', '2', '3' + ] + thumbnail_names = hq_thumbnail_names + guaranteed_thumbnail_names + n_thumbnail_names = len(thumbnail_names) + + thumbnails.extend({ + 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( + video_id=video_id, name=name, ext=ext, + webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''), + '_test_url': name in hq_thumbnail_names, + } for name in thumbnail_names for ext in ('webp', 'jpg')) + for thumb in thumbnails: + i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) + thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i) + self._remove_duplicate_formats(thumbnails) + + category = get_first(microformats, 'category') or search_meta('genre') + channel_id = str_or_none( + get_first(video_details, 'channelId') + or get_first(microformats, 'externalChannelId') + or search_meta('channelId')) + duration = int_or_none( + get_first(video_details, 'lengthSeconds') + or get_first(microformats, 'lengthSeconds') + or parse_duration(search_meta('duration'))) or None + owner_profile_url = get_first(microformats, 'ownerProfileUrl') + + live_content = get_first(video_details, 'isLiveContent') + is_upcoming = get_first(video_details, 'isUpcoming') + if is_live is None: + if is_upcoming or live_content is False: + is_live = False + if is_upcoming is None and (live_content or is_live): + is_upcoming = False + live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) + live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) + if not duration and live_endtime and live_starttime: + duration = live_endtime - live_starttime + + info = { + 'id': video_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_description, + 'upload_date': unified_strdate( + get_first(microformats, 'uploadDate') + or search_meta('uploadDate')), + 'uploader': get_first(video_details, 'author'), + 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'duration': duration, + 'view_count': int_or_none( + get_first((video_details, microformats), (..., 'viewCount')) + or search_meta('interactionCount')), + 'average_rating':
float_or_none(get_first(video_details, 'averageRating')), + 'age_limit': 18 if ( + get_first(microformats, 'isFamilySafe') is False + or search_meta('isFamilyFriendly') == 'false' + or search_meta('og:restrictions:age') == '18+') else 0, + 'webpage_url': webpage_url, + 'categories': [category] if category else None, + 'tags': keywords, + 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), + 'is_live': is_live, + 'was_live': (False if is_live or is_upcoming or live_content is False + else None if is_live is None or is_upcoming is None + else live_content), + 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL + 'release_timestamp': live_starttime, + } + + pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) + if pctr: + def get_lang_code(track): + return (remove_start(track.get('vssId') or '', '.').replace('.', '-') + or track.get('languageCode')) + + # Converted into dicts to remove duplicates + captions = { + get_lang_code(sub): sub + for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])} + translation_languages = { + lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) + for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])} + + def process_language(container, base_url, lang_code, sub_name, query): + lang_subs = container.setdefault(lang_code, []) + for fmt in self._SUBTITLE_FORMATS: + query.update({ + 'fmt': fmt, + }) + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, query), + 'name': sub_name, + }) + + subtitles, automatic_captions = {}, {} + for lang_code, caption_track in captions.items(): + base_url = caption_track.get('baseUrl') + if not base_url: + continue + lang_name = self._get_text(caption_track, 'name', max_runs=1) + if caption_track.get('kind') != 'asr': + if not lang_code: + continue + process_language( + subtitles, base_url, lang_code, lang_name, {}) + if not caption_track.get('isTranslatable'): + continue + for trans_code, trans_name in translation_languages.items(): + if not trans_code: + continue + if caption_track.get('kind') != 'asr': + trans_code += f'-{lang_code}' + trans_name += format_field(lang_name, template=' from %s') + process_language( + automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code}) + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + for k, v in query.items(): + for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: + d_k += '_time' + if d_k not in info and k in s_ks: + info[d_k] = parse_duration(query[k][0]) + + # Youtube Music Auto-generated description + if video_description: + mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) + if mobj: + release_year = mobj.group('release_year') + release_date = mobj.group('release_date') + if release_date: + release_date = release_date.replace('-', '') + if not release_year: + release_year = release_date[:4] + info.update({ + 'album': mobj.group('album').strip(), + 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), +
'track': mobj.group('track').strip(), + 'release_date': release_date, + 'release_year': int_or_none(release_year), + }) + + initial_data = None + if webpage: + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, + 'yt initial data') + if not initial_data: + query = {'videoId': video_id} + query.update(self._get_checkok_params()) + initial_data = self._extract_response( + item_id=video_id, ep='next', fatal=False, + ytcfg=master_ytcfg, query=query, + headers=self.generate_api_headers(ytcfg=master_ytcfg), + note='Downloading initial data API JSON') + + try: + # This will error if there is no livechat + initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] + info.setdefault('subtitles', {})['live_chat'] = [{ + 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies + 'video_id': video_id, + 'ext': 'json', + 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay', + }] + except (KeyError, IndexError, TypeError): + pass + + if initial_data: + info['chapters'] = ( + self._extract_chapters_from_json(initial_data, duration) + or self._extract_chapters_from_engagement_panel(initial_data, duration) + or None) + + contents = try_get( + initial_data, + lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], + list) or [] + for content in contents: + vpir = content.get('videoPrimaryInfoRenderer') + if vpir: + stl = vpir.get('superTitleLink') + if stl: + stl = self._get_text(stl) + if try_get( + vpir, + lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': + info['location'] = stl + else: + mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + if mobj: + info.update({ + 'series': mobj.group(1), + 'season_number': int(mobj.group(2)), + 'episode_number': int(mobj.group(3)), + }) + for tlb in (try_get( + vpir, + lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], + list) or []): + tbr = tlb.get('toggleButtonRenderer') or {} + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break + sbr_tooltip = try_get( + vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) + if sbr_tooltip: + like_count, dislike_count = sbr_tooltip.split(' / ') + info.update({ + 'like_count': str_to_int(like_count), + 'dislike_count': str_to_int(dislike_count), + }) + vsir = content.get('videoSecondaryInfoRenderer') + if vsir: + info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title')) + rows = try_get( + vsir, + lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], + list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = mrr.get('title') + if not mrr_title: + continue + mrr_title = self._get_text(mrr, 'title') + mrr_contents_text = self._get_text(mrr, ('contents', 0)) + if 
mrr_title == 'License': + info['license'] = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + info['album'] = mrr_contents_text + elif mrr_title == 'Artist': + info['artist'] = mrr_contents_text + elif mrr_title == 'Song': + info['track'] = mrr_contents_text + + fallbacks = { + 'channel': 'uploader', + 'channel_id': 'uploader_id', + 'channel_url': 'uploader_url', + } + for to, frm in fallbacks.items(): + if not info.get(to): + info[to] = info.get(frm) + + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: + v = info.get(s_k) + if v: + info[d_k] = v + + is_private = get_first(video_details, 'isPrivate', expected_type=bool) + is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool) + is_membersonly = None + is_premium = None + if initial_data and is_private is not None: + is_membersonly = False + is_premium = False + contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] + badge_labels = set() + for content in contents: + if not isinstance(content, dict): + continue + badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer'))) + for badge_label in badge_labels: + if badge_label.lower() == 'members only': + is_membersonly = True + elif badge_label.lower() == 'premium': + is_premium = True + elif badge_label.lower() == 'unlisted': + is_unlisted = True + + info['availability'] = self._availability( + is_private=is_private, + needs_premium=is_premium, + needs_subscription=is_membersonly, + needs_auth=info['age_limit'] >= 18, + is_unlisted=None if is_private is None else is_unlisted) + + info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage) + + self.mark_watched(video_id, player_responses) + + return info + + +class YoutubeTabIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com tab' + _VALID_URL = r'''(?x) + https?:// + (?:\w+\.)? 
+ (?: + youtube(?:kids)?\.com| + invidio\.us + )/ + (?: + (?P<channel_type>channel|c|user|browse)/| + (?P<not_channel> + feed/|hashtag/| + (?:playlist|watch)\?.*?\blist= + )| + (?!(?:%s)\b) # Direct URLs + ) + (?P<id>[^/?\#&]+) + ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES + IE_NAME = 'youtube:tab' + + _TESTS = [{ + 'note': 'playlists, multipage', + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Игорь Клейнер', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + }, + }, { + 'note': 'playlists, multipage, different order', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'uploader': 'Игорь Клейнер', + }, + }, { + 'note': 'playlists, series', + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'uploader': '3Blue1Brown', + }, + }, { + 'note': 'playlists, singlepage', + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + 'note': 'basic, single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', + }, + 'playlist_count': 1, + }, { + 'note': 'empty playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', + }, + 'playlist_count': 0, + }, { + 'note': 'Home tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 2, + }, { + 'note': 'Videos tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 975, + }, { + 'note': 'Videos tab, sorted by popular', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 
'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 199, + }, { + 'note': 'Playlists tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 17, + }, { + 'note': 'Community tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 18, + }, { + 'note': 'Channels tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 12, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + }, + }, { + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', + 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'info_dict': { + 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', + }, + 'playlist_count': 96, + }, { + 'note': 'Large playlist', + 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', + 'info_dict': { + 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + }, + 'playlist_mincount': 1123, + }, { + 'note': 'even larger playlist, 8832 videos', + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + }, + 'playlist_mincount': 21, + }, { + 'note': 'Playlist with "show unavailable videos" button', + 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'info_dict': { + 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', + 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + }, + 'playlist_mincount': 200, + }, { + 'note': 'Playlist with unavailable videos in page 7', + 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', + 'info_dict': { + 'title': 'Uploads from BlankTV', + 'id': 'UU8l9frL61Yl5KFOl87nIm2w', + 'uploader': 'BlankTV', + 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', + }, + 'playlist_mincount': 1000, + }, { + 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'uploader': 'Computerphile', + 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'only_matching': True, + }, { + 'note': 'Playlist URL that does not actually serve a playlist', + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, { + 'url': 
'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '3yImotZU3tw', # This will keep changing + 'ext': 'mp4', + 'title': compat_str, + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': r're:\d{8}', + 'description': compat_str, + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '], + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'note': 'A channel that is not live. Should raise error', + 'url': 'https://www.youtube.com/user/numberphile/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + 'note': 'Recommended - redirects to home page.', + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + }, { + 'note': 'inline playlist with not always working continuations', + 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/course', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/zsecurity', + 'only_matching': True, + }, { + 'url': 'http://www.youtube.com/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, + }, { + 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', + 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'only_matching': True + }, { + 'note': '/browse/ should redirect to /channel/', + 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', + 'only_matching': True + }, { + 'note': 'VLPL, should redirect to 
playlist?list=PL...', + 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'info_dict': { + 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'uploader': 'NoCopyrightSounds', + 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'title': 'NCS Releases', + }, + 'playlist_mincount': 166, + }, { + 'note': 'Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + ], + 'playlist_mincount': 101, + }, { + 'note': 'Topic without a UU playlist', + 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', + 'info_dict': { + 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', + 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + 'Falling back to channel URL', + ], + 'playlist_mincount': 9, + }, { + 'note': 'Youtube music Album', + 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', + 'info_dict': { + 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', + 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', + }, + 'playlist_count': 50, + }, { + 'note': 'unlisted single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'info_dict': { + 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', + 'uploader': 'colethedj', + 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'title': 'yt-dlp unlisted playlist test', + 'availability': 'unlisted' + }, + 'playlist_count': 1, + }, { + 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData',
+        'url': 'https://www.youtube.com/feed/recommended',
+        'info_dict': {
+            'id': 'recommended',
+            'title': 'recommended',
+        },
+        'playlist_mincount': 50,
+        'params': {
+            'skip_download': True,
+            'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+        },
+    }, {
+        'note': 'API Fallback: /videos tab, sorted by oldest first',
+        'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid',
+        'info_dict': {
+            'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+            'title': 'Cody\'sLab - Videos',
+            'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
+            'uploader': 'Cody\'sLab',
+            'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+        },
+        'playlist_mincount': 650,
+        'params': {
+            'skip_download': True,
+            'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+        },
+    }, {
+        'note': 'API Fallback: Topic, should redirect to playlist?list=UU...',
+        'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+        'info_dict': {
+            'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+            'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+            'title': 'Uploads from Royalty Free Music - Topic',
+            'uploader': 'Royalty Free Music - Topic',
+        },
+        'expected_warnings': [
+            'A channel/user page was given',
+            'The URL does not have a videos tab',
+        ],
+        'playlist_mincount': 101,
+        'params': {
+            'skip_download': True,
+            'extractor_args': {'youtubetab': {'skip': ['webpage']}}
+        },
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if YoutubeIE.suitable(url) else super(
+            YoutubeTabIE, cls).suitable(url)
+
+    def _extract_channel_id(self, webpage):
+        channel_id = self._html_search_meta(
+            'channelId', webpage, 'channel id', default=None)
+        if channel_id:
+            return channel_id
+        channel_url = self._html_search_meta(
+            ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
+             'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
+             'twitter:app:url:googleplay'), webpage, 'channel url')
+        return self._search_regex(
+            r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)',
+            channel_url, 'channel id')
+
+    @staticmethod
+    def _extract_basic_item_renderer(item):
+        # Modified from _extract_grid_item_renderer
+        known_basic_renderers = (
+            'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
+        )
+        for key, renderer in item.items():
+            if not isinstance(renderer, dict):
+                continue
+            elif key in known_basic_renderers:
+                return renderer
+            elif key.startswith('grid') and key.endswith('Renderer'):
+                return renderer
+
+    def _grid_entries(self, grid_renderer):
+        for item in grid_renderer['items']:
+            if not isinstance(item, dict):
+                continue
+            renderer = self._extract_basic_item_renderer(item)
+            if not isinstance(renderer, dict):
+                continue
+            title = self._get_text(renderer, 'title')
+
+            # playlist
+            playlist_id = renderer.get('playlistId')
+            if playlist_id:
+                yield self.url_result(
+                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
+                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
+                    video_title=title)
+                continue
+            # video
+            video_id = renderer.get('videoId')
+            if video_id:
+                yield self._extract_video(renderer)
+                continue
+            # channel
+            channel_id = renderer.get('channelId')
+            if channel_id:
+                yield self.url_result(
+                    'https://www.youtube.com/channel/%s' % channel_id,
+                    ie=YoutubeTabIE.ie_key(), video_title=title)
+                continue
+            # generic endpoint URL support
+            ep_url = urljoin('https://www.youtube.com/', try_get(
+                renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
+                compat_str))
+            if ep_url:
+                for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
+                    if ie.suitable(ep_url):
+                        yield self.url_result(
+                            ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
+                        break
+
+    def _shelf_entries_from_content(self, shelf_renderer):
+        content = shelf_renderer.get('content')
+        if not isinstance(content, dict):
+            return
+        renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer')
+        if renderer:
+            # TODO: add support for nested playlists so each shelf is processed
+            # as a separate playlist
+            # TODO: this includes only first N items
+            for entry in self._grid_entries(renderer):
+                yield entry
+        renderer = content.get('horizontalListRenderer')
+        if renderer:
+            # TODO
+            pass
+
+    def _shelf_entries(self, shelf_renderer, skip_channels=False):
+        ep = try_get(
+            shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
+            compat_str)
+        shelf_url = urljoin('https://www.youtube.com', ep)
+        if shelf_url:
+            # Skip links to other channels; note that checking for
+            # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL
+            # will not work
+            if skip_channels and '/channels?' in shelf_url:
+                return
+            title = self._get_text(shelf_renderer, 'title')
+            yield self.url_result(shelf_url, video_title=title)
+        # Shelf may not contain shelf URL, fallback to extraction from content
+        for entry in self._shelf_entries_from_content(shelf_renderer):
+            yield entry
+
+    def _playlist_entries(self, video_list_renderer):
+        for content in video_list_renderer['contents']:
+            if not isinstance(content, dict):
+                continue
+            renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
+            if not isinstance(renderer, dict):
+                continue
+            video_id = renderer.get('videoId')
+            if not video_id:
+                continue
+            yield self._extract_video(renderer)
+
+    def _rich_entries(self, rich_grid_renderer):
+        renderer = try_get(
+            rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
+        video_id = renderer.get('videoId')
+        if not video_id:
+            return
+        yield self._extract_video(renderer)
+
+    def _video_entry(self, video_renderer):
+        video_id = video_renderer.get('videoId')
+        if video_id:
+            return self._extract_video(video_renderer)
+
+    def _post_thread_entries(self, post_thread_renderer):
+        post_renderer = try_get(
+            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
+        if not post_renderer:
+            return
+        # video attachment
+        video_renderer = try_get(
+            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
+        video_id = video_renderer.get('videoId')
+        if video_id:
+            entry = self._extract_video(video_renderer)
+            if entry:
+                yield entry
+        # playlist attachment
+        playlist_id = try_get(
+            post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str)
+        if playlist_id:
+            yield self.url_result(
+                'https://www.youtube.com/playlist?list=%s' % playlist_id,
+                ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+        # inline video links
+        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
+        for run in runs:
+            if not isinstance(run, dict):
+                continue
+            ep_url = try_get(
+                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
+            if not ep_url:
+                continue
+            if not YoutubeIE.suitable(ep_url):
+                continue
+            ep_video_id = YoutubeIE._match_id(ep_url)
+            if video_id == ep_video_id:
+                continue
+            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
+
+    def _post_thread_continuation_entries(self, post_thread_continuation):
+        contents = post_thread_continuation.get('contents')
+        if not isinstance(contents, list):
+            return
+        for content in contents:
+            renderer = content.get('backstagePostThreadRenderer')
+            if not isinstance(renderer, dict):
+                continue
+            for entry in self._post_thread_entries(renderer):
+                yield entry
+
+    r''' # unused
+    def _rich_grid_entries(self, contents):
+        for content in contents:
+            video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
+            if video_renderer:
+                entry = self._video_entry(video_renderer)
+                if entry:
+                    yield entry
+    '''
+    def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data):
+
+        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
+            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
+            for content in contents:
+                if not isinstance(content, dict):
+                    continue
+                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
+                if not is_renderer:
+                    renderer = content.get('richItemRenderer')
+                    if renderer:
+                        for entry in self._rich_entries(renderer):
+                            yield entry
+                        continuation_list[0] = self._extract_continuation(parent_renderer)
+                    continue
+                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
+                for isr_content in isr_contents:
+                    if not isinstance(isr_content, dict):
+                        continue
+
+                    known_renderers = {
+                        'playlistVideoListRenderer': self._playlist_entries,
+                        'gridRenderer': self._grid_entries,
+                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
+                        'backstagePostThreadRenderer': self._post_thread_entries,
+                        'videoRenderer': lambda x: [self._video_entry(x)],
+                    }
+                    for key, renderer in isr_content.items():
+                        if key not in known_renderers:
+                            continue
+                        for entry in known_renderers[key](renderer):
+                            if entry:
+                                yield entry
+                        continuation_list[0] = self._extract_continuation(renderer)
+                        break
+
+                if not continuation_list[0]:
+                    continuation_list[0] = self._extract_continuation(is_renderer)
+
+            if not continuation_list[0]:
+                continuation_list[0] = self._extract_continuation(parent_renderer)
+
+        continuation_list = [None]  # Python 2 does not support nonlocal
+        tab_content = try_get(tab, lambda x: x['content'], dict)
+        if not tab_content:
+            return
+        parent_renderer = (
+            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
+            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
+        for entry in extract_entries(parent_renderer):
+            yield entry
+        continuation = continuation_list[0]
+
+        for page_num in itertools.count(1):
+            if not continuation:
+                break
+            headers = self.generate_api_headers(
+                ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
+            response = self._extract_response(
+                item_id='%s page %s' % (item_id, page_num),
+                query=continuation, headers=headers, ytcfg=ytcfg,
+                check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints'))
+
+            if not response:
+                break
+            # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases
+            # See: https://github.com/ytdl-org/youtube-dl/issues/28702
+            visitor_data = self._extract_visitor_data(response) or visitor_data
+
+            known_continuation_renderers = {
+                'playlistVideoListContinuation': self._playlist_entries,
+                'gridContinuation': self._grid_entries,
+                'itemSectionContinuation': self._post_thread_continuation_entries,
+                'sectionListContinuation': extract_entries,  # for feeds
+            }
+            continuation_contents = try_get(
+                response, lambda x:
x['continuationContents'], dict) or {} + continuation_renderer = None + for key, value in continuation_contents.items(): + if key not in known_continuation_renderers: + continue + continuation_renderer = value + continuation_list = [None] + for entry in known_continuation_renderers[key](continuation_renderer): + yield entry + continuation = continuation_list[0] or self._extract_continuation(continuation_renderer) + break + if continuation_renderer: + continue + + known_renderers = { + 'gridPlaylistRenderer': (self._grid_entries, 'items'), + 'gridVideoRenderer': (self._grid_entries, 'items'), + 'gridChannelRenderer': (self._grid_entries, 'items'), + 'playlistVideoRenderer': (self._playlist_entries, 'contents'), + 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds + 'richItemRenderer': (extract_entries, 'contents'), # for hashtag + 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents') + } + on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) + continuation_items = try_get( + on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) + continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {} + video_items_renderer = None + for key, value in continuation_item.items(): + if key not in known_renderers: + continue + video_items_renderer = {known_renderers[key][1]: continuation_items} + continuation_list = [None] + for entry in known_renderers[key][0](video_items_renderer): + yield entry + continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) + break + if video_items_renderer: + continue + break + + @staticmethod + def _extract_selected_tab(tabs): + for tab in tabs: + renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} + if renderer.get('selected') is True: + return renderer + else: + raise ExtractorError('Unable to find selected tab') + + @classmethod + def _extract_uploader(cls, data): + uploader = {} + renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {} + owner = try_get( + renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) + if owner: + uploader['uploader'] = owner.get('text') + uploader['uploader_id'] = try_get( + owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) + uploader['uploader_url'] = urljoin( + 'https://www.youtube.com/', + try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + return {k: v for k, v in uploader.items() if v is not None} + + def _extract_from_tabs(self, item_id, ytcfg, data, tabs): + playlist_id = title = description = channel_url = channel_name = channel_id = None + thumbnails_list = [] + tags = [] + + selected_tab = self._extract_selected_tab(tabs) + renderer = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + if renderer: + channel_name = renderer.get('title') + channel_url = renderer.get('channelUrl') + channel_id = renderer.get('externalId') + else: + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + + if renderer: + title = renderer.get('title') + description = renderer.get('description', '') + playlist_id = channel_id + tags = renderer.get('keywords', '').split() + thumbnails_list = ( + try_get(renderer, lambda x: x['avatar']['thumbnails'], list) + or try_get( + self._extract_sidebar_info_renderer(data, 
'playlistSidebarPrimaryInfoRenderer'), + lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'], + list) + or []) + + thumbnails = [] + for t in thumbnails_list: + if not isinstance(t, dict): + continue + thumbnail_url = url_or_none(t.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(t.get('width')), + 'height': int_or_none(t.get('height')), + }) + if playlist_id is None: + playlist_id = item_id + if title is None: + title = ( + try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText']) + or playlist_id) + title += format_field(selected_tab, 'title', ' - %s') + title += format_field(selected_tab, 'expandedText', ' - %s') + metadata = { + 'playlist_id': playlist_id, + 'playlist_title': title, + 'playlist_description': description, + 'uploader': channel_name, + 'uploader_id': channel_id, + 'uploader_url': channel_url, + 'thumbnails': thumbnails, + 'tags': tags, + } + availability = self._extract_availability(data) + if availability: + metadata['availability'] = availability + if not channel_id: + metadata.update(self._extract_uploader(data)) + metadata.update({ + 'channel': metadata['uploader'], + 'channel_id': metadata['uploader_id'], + 'channel_url': metadata['uploader_url']}) + return self.playlist_result( + self._entries( + selected_tab, playlist_id, ytcfg, + self._extract_account_syncid(ytcfg, data), + self._extract_visitor_data(data, ytcfg)), + **metadata) + + def _extract_mix_playlist(self, playlist, playlist_id, data, ytcfg): + first_id = last_id = response = None + for page_num in itertools.count(1): + videos = list(self._playlist_entries(playlist)) + if not videos: + return + start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1 + if start >= len(videos): + return + for video in videos[start:]: + if video['id'] == first_id: + self.to_screen('First video %s found again; Assuming end of Mix' % first_id) + return + yield video + first_id = first_id or videos[0]['id'] + last_id = videos[-1]['id'] + watch_endpoint = try_get( + playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint']) + headers = self.generate_api_headers( + ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), + visitor_data=self._extract_visitor_data(response, data, ytcfg)) + query = { + 'playlistId': playlist_id, + 'videoId': watch_endpoint.get('videoId') or last_id, + 'index': watch_endpoint.get('index') or len(videos), + 'params': watch_endpoint.get('params') or 'OAE%3D' + } + response = self._extract_response( + item_id='%s page %d' % (playlist_id, page_num), + query=query, ep='next', headers=headers, ytcfg=ytcfg, + check_get_keys='contents' + ) + playlist = try_get( + response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + + def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg): + title = playlist.get('title') or try_get( + data, lambda x: x['titleText']['simpleText'], compat_str) + playlist_id = playlist.get('playlistId') or item_id + + # Delegating everything except mix playlists to regular tab-based playlist URL + playlist_url = urljoin(url, try_get( + playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str)) + if playlist_url and playlist_url != url: + return self.url_result( + playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) + + return self.playlist_result( + 
self._extract_mix_playlist(playlist, playlist_id, data, ytcfg), + playlist_id=playlist_id, playlist_title=title) + + def _extract_availability(self, data): + """ + Gets the availability of a given playlist/tab. + Note: Unless YouTube tells us explicitly, we do not assume it is public + @param data: response + """ + is_private = is_unlisted = None + renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} + badge_labels = self._extract_badges(renderer) + + # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge + privacy_dropdown_entries = try_get( + renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or [] + for renderer_dict in privacy_dropdown_entries: + is_selected = try_get( + renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False + if not is_selected: + continue + label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label')) + if label: + badge_labels.add(label.lower()) + break + + for badge_label in badge_labels: + if badge_label == 'unlisted': + is_unlisted = True + elif badge_label == 'private': + is_private = True + elif badge_label == 'public': + is_unlisted = is_private = False + return self._availability(is_private, False, False, False, is_unlisted) + + @staticmethod + def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): + sidebar_renderer = try_get( + data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or [] + for item in sidebar_renderer: + renderer = try_get(item, lambda x: x[info_renderer], expected_type) + if renderer: + return renderer + + def _reload_with_unavailable_videos(self, item_id, data, ytcfg): + """ + Get playlist with unavailable videos if the 'show unavailable videos' button exists. + """ + browse_id = params = None + renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') + if not renderer: + return + menu_renderer = try_get( + renderer, lambda x: x['menu']['menuRenderer']['items'], list) or [] + for menu_item in menu_renderer: + if not isinstance(menu_item, dict): + continue + nav_item_renderer = menu_item.get('menuNavigationItemRenderer') + text = try_get( + nav_item_renderer, lambda x: x['text']['simpleText'], compat_str) + if not text or text.lower() != 'show unavailable videos': + continue + browse_endpoint = try_get( + nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {} + browse_id = browse_endpoint.get('browseId') + params = browse_endpoint.get('params') + break + + headers = self.generate_api_headers( + ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), + visitor_data=self._extract_visitor_data(data, ytcfg)) + query = { + 'params': params or 'wgYCCAA=', + 'browseId': browse_id or 'VL%s' % item_id + } + return self._extract_response( + item_id=item_id, headers=headers, query=query, + check_get_keys='contents', fatal=False, ytcfg=ytcfg, + note='Downloading API JSON with unavailable videos') + + def _extract_webpage(self, url, item_id, fatal=True): + retries = self.get_param('extractor_retries', 3) + count = -1 + webpage = data = last_error = None + while count < retries: + count += 1 + # Sometimes youtube returns a webpage with incomplete ytInitialData + # See: https://github.com/yt-dlp/yt-dlp/issues/116 + if last_error: + self.report_warning('%s. Retrying ...' 
% last_error) + try: + webpage = self._download_webpage( + url, item_id, + note='Downloading webpage%s' % (' (retry #%d)' % count if count else '',)) + data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} + except ExtractorError as e: + if isinstance(e.cause, network_exceptions): + if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): + last_error = error_to_compat_str(e.cause or e.msg) + if count < retries: + continue + if fatal: + raise + self.report_warning(error_to_compat_str(e)) + break + else: + try: + self._extract_and_report_alerts(data) + except ExtractorError as e: + if fatal: + raise + self.report_warning(error_to_compat_str(e)) + break + + if dict_get(data, ('contents', 'currentVideoEndpoint')): + break + + last_error = 'Incomplete yt initial data received' + if count >= retries: + if fatal: + raise ExtractorError(last_error) + self.report_warning(last_error) + break + + return webpage, data + + def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=False, default_client='web'): + data = None + if 'webpage' not in self._configuration_arg('skip'): + webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) + ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) + if not data: + if not ytcfg and self.is_authenticated: + msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.' + if 'authcheck' not in self._configuration_arg('skip') and fatal: + raise ExtractorError( + msg + ' If you are not downloading private content, or your cookies are only for the first account and channel,' + ' pass "--extractor-args youtubetab:skip=authcheck" to skip this check', + expected=True) + self.report_warning(msg, only_once=True) + data = self._extract_tab_endpoint(url, item_id, ytcfg, fatal=fatal, default_client=default_client) + return data, ytcfg + + def _extract_tab_endpoint(self, url, item_id, ytcfg=None, fatal=True, default_client='web'): + headers = self.generate_api_headers(ytcfg=ytcfg, default_client=default_client) + resolve_response = self._extract_response( + item_id=item_id, query={'url': url}, check_get_keys='endpoint', headers=headers, ytcfg=ytcfg, fatal=fatal, + ep='navigation/resolve_url', note='Downloading API parameters API JSON', default_client=default_client) + endpoints = {'browseEndpoint': 'browse', 'watchEndpoint': 'next'} + for ep_key, ep in endpoints.items(): + params = try_get(resolve_response, lambda x: x['endpoint'][ep_key], dict) + if params: + return self._extract_response( + item_id=item_id, query=params, ep=ep, headers=headers, + ytcfg=ytcfg, fatal=fatal, default_client=default_client, + check_get_keys=('contents', 'currentVideoEndpoint')) + err_note = 'Failed to resolve url (does the playlist exist?)' + if fatal: + raise ExtractorError(err_note, expected=True) + self.report_warning(err_note, item_id) + + @staticmethod + def _smuggle_data(entries, data): + for entry in entries: + if data: + entry['url'] = smuggle_url(entry['url'], data) + yield entry + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + if self.is_music_url(url): + smuggled_data['is_music_url'] = True + info_dict = self.__real_extract(url, smuggled_data) + if info_dict.get('entries'): + info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data) + return info_dict + + _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL) + + def __real_extract(self, url, smuggled_data): + 
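+        # Rough flow, with a hypothetical URL for illustration: a music browse
+        # URL such as https://music.youtube.com/browse/VLPLxyz is first forced
+        # onto the www.youtube.com host; the 'VL' prefix is then stripped so the
+        # page is re-handled as https://www.youtube.com/playlist?list=PLxyz below.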
item_id = self._match_id(url) + url = compat_urlparse.urlunparse( + compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + compat_opts = self.get_param('compat_opts', []) + + def get_mobj(url): + mobj = self._url_re.match(url).groupdict() + mobj.update((k, '') for k, v in mobj.items() if v is None) + return mobj + + mobj = get_mobj(url) + # Youtube returns incomplete data if tabname is not lower case + pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel'] + if is_channel: + if smuggled_data.get('is_music_url'): + if item_id[:2] == 'VL': + # Youtube music VL channels have an equivalent playlist + item_id = item_id[2:] + pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False + elif item_id[:2] == 'MP': + # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist + mdata = self._extract_tab_endpoint( + 'https://music.youtube.com/channel/%s' % item_id, item_id, default_client='web_music') + murl = traverse_obj( + mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), get_all=False, expected_type=compat_str) + if not murl: + raise ExtractorError('Failed to resolve album to playlist.') + return self.url_result(murl, ie=YoutubeTabIE.ie_key()) + elif mobj['channel_type'] == 'browse': + # Youtube music /browse/ should be changed to /channel/ + pre = 'https://www.youtube.com/channel/%s' % item_id + if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts: + # Home URLs should redirect to /videos/ + self.report_warning( + 'A channel/user page was given. All the channel\'s videos will be downloaded. ' + 'To download only the videos in the home page, add a "/featured" to the URL') + tab = '/videos' + + url = ''.join((pre, tab, post)) + mobj = get_mobj(url) + + # Handle both video/playlist URLs + qs = parse_qs(url) + video_id = qs.get('v', [None])[0] + playlist_id = qs.get('list', [None])[0] + + if not video_id and mobj['not_channel'].startswith('watch'): + if not playlist_id: + # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable + raise ExtractorError('Unable to recognize tab page') + # Common mistake: https://www.youtube.com/watch?list=playlist_id + self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id) + url = 'https://www.youtube.com/playlist?list=%s' % playlist_id + mobj = get_mobj(url) + + if video_id and playlist_id: + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id) + self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id)) + + data, ytcfg = self._extract_data(url, item_id) + + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + selected_tab = self._extract_selected_tab(tabs) + tab_name = selected_tab.get('title', '') + if 'no-youtube-channel-redirect' not in compat_opts: + if mobj['tab'] == '/live': + # Live tab should have redirected to the video + raise ExtractorError('The channel is not currently live', expected=True) + if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]: + if not mobj['not_channel'] and item_id[:2] == 'UC': + # Topic channels don't have /videos. 
Use the equivalent playlist instead + self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:])) + pl_id = 'UU%s' % item_id[2:] + pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post']) + try: + data, ytcfg, item_id, url = *self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True), pl_id, pl_url + except ExtractorError: + self.report_warning('The playlist gave error. Falling back to channel URL') + else: + self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name)) + + self.write_debug('Final URL: %s' % url) + + # YouTube sometimes provides a button to reload playlist with unavailable videos. + if 'no-youtube-unavailable-videos' not in compat_opts: + data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data + self._extract_and_report_alerts(data, only_once=True) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, ytcfg, data, tabs) + + playlist = try_get( + data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + if playlist: + return self._extract_from_playlist(item_id, url, data, playlist, ytcfg) + + video_id = try_get( + data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], + compat_str) or video_id + if video_id: + if mobj['tab'] != '/live': # live tab is expected to redirect to video + self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id) + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id) + + raise ExtractorError('Unable to recognize tab page') + + +class YoutubePlaylistIE(InfoExtractor): + IE_DESC = 'YouTube.com playlists' + _VALID_URL = r'''(?x)(?: + (?:https?://)? + (?:\w+\.)? + (?: + (?: + youtube(?:kids)?\.com| + invidio\.us + ) + /.*?\?.*?\blist= + )? 
+ (?P<id>%(playlist_id)s) + )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + IE_NAME = 'youtube:playlist' + _TESTS = [{ + 'note': 'issue #673', + 'url': 'PLBB231211A4F62143', + 'info_dict': { + 'title': '[OLD]Team Fortress 2 (Class-based LP)', + 'id': 'PLBB231211A4F62143', + 'uploader': 'Wickydoo', + 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + 'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2', + }, + 'playlist_mincount': 29, + }, { + 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', + 'info_dict': { + 'title': 'YDL_safe_search', + 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', + }, + 'playlist_count': 2, + 'skip': 'This playlist is private', + }, { + 'note': 'embedded', + 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA15', + 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'uploader': 'milan', + 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + } + }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 654, + 'info_dict': { + 'title': '2018 Chinese New Singles (11/6 updated)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'uploader': 'LBK', + 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', + 'description': 'md5:da521864744d60a198e3a88af4db0d9d', + } + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + if YoutubeTabIE.suitable(url): + return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs + qs = parse_qs(url) + if qs.get('v', [None])[0]: + return False + return super(YoutubePlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + is_music_url = YoutubeBaseInfoExtractor.is_music_url(url) + url = update_url_query( + 'https://www.youtube.com/playlist', + parse_qs(url) or {'list': playlist_id}) + if is_music_url: + url = smuggle_url(url, {'is_music_url': True}) + return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + + +class YoutubeYtBeIE(InfoExtractor): + IE_DESC = 'youtu.be' + _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + _TESTS = [{ + 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', + 'info_dict': { + 'id': 'yeWKywCrFtk', + 'ext': 'mp4', + 'title': 'Small Scale Baler and Braiding Rugs', + 'uploader': 'Backus-Page House Museum', + 'uploader_id': 'backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'upload_date': '20161008', + 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', + 'categories': ['Nonprofits & Activism'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + playlist_id = mobj.group('playlist_id') + return self.url_result( + update_url_query('https://www.youtube.com/watch', { + 'v': video_id, + 'list': playlist_id, + 'feature': 'youtu.be', + }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + + +class 
YoutubeYtUserIE(InfoExtractor): + IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword' + _VALID_URL = r'ytuser:(?P<id>.+)' + _TESTS = [{ + 'url': 'ytuser:phihag', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + return self.url_result( + 'https://www.youtube.com/user/%s' % user_id, + ie=YoutubeTabIE.ie_key(), video_id=user_id) + + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' + _VALID_URL = r':ytfav(?:ou?rite)?s?' + _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': ':ytfav', + 'only_matching': True, + }, { + 'url': ':ytfavorites', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=LL', + ie=YoutubeTabIE.ie_key()) + + +class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE): + IE_DESC = 'YouTube.com searches, "ytsearch" keyword' + # there doesn't appear to be a real limit, for example if you search for + # 'python' you get more than 8.000.000 results + _MAX_RESULTS = float('inf') + IE_NAME = 'youtube:search' + _SEARCH_KEY = 'ytsearch' + _SEARCH_PARAMS = None + _TESTS = [] + + def _search_results(self, query): + data = {'query': query} + if self._SEARCH_PARAMS: + data['params'] = self._SEARCH_PARAMS + continuation = {} + for page_num in itertools.count(1): + data.update(continuation) + search = self._extract_response( + item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, + check_get_keys=('contents', 'onResponseReceivedCommands') + ) + if not search: + break + slr_contents = try_get( + search, + (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + if not slr_contents: + break + + # Youtube sometimes adds promoted content to searches, + # changing the index location of videos and token. + # So we search through all entries till we find them. 
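+            # (Illustrative, typical response shape: slr_contents resembles
+            #  [{'itemSectionRenderer': {...video results...}},
+            #   {'continuationItemRenderer': {...next-page token...}}],
+            #  with promoted entries sometimes shifting these positions.)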
+ continuation = None + for slr_content in slr_contents: + if not continuation: + continuation = self._extract_continuation({'contents': [slr_content]}) + + isr_contents = try_get( + slr_content, + lambda x: x['itemSectionRenderer']['contents'], + list) + if not isr_contents: + continue + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + continue + video_id = video.get('videoId') + if not video_id: + continue + + yield self._extract_video(video) + + if not continuation: + break + + +class YoutubeSearchDateIE(YoutubeSearchIE): + IE_NAME = YoutubeSearchIE.IE_NAME + ':date' + _SEARCH_KEY = 'ytsearchdate' + IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword' + _SEARCH_PARAMS = 'CAI%3D' + + +class YoutubeSearchURLIE(YoutubeSearchIE): + IE_DESC = 'YouTube.com search URLs' + IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' + # _MAX_RESULTS = 100 + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }, { + 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', + 'only_matching': True, + }] + + @classmethod + def _make_valid_url(cls): + return cls._VALID_URL + + def _real_extract(self, url): + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[0] + self._SEARCH_PARAMS = qs.get('sp', ('',))[0] + return self._get_n_results(query, self._MAX_RESULTS) + + +class YoutubeFeedsInfoExtractor(YoutubeTabIE): + """ + Base class for feed extractors + Subclasses must define the _FEED_NAME property. + """ + _LOGIN_REQUIRED = True + _TESTS = [] + + @property + def IE_NAME(self): + return 'youtube:%s' % self._FEED_NAME + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, + ie=YoutubeTabIE.ie_key()) + + +class YoutubeWatchLaterIE(InfoExtractor): + IE_NAME = 'youtube:watchlater' + IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + _VALID_URL = r':ytwatchlater' + _TESTS = [{ + 'url': ':ytwatchlater', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) + + +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _LOGIN_REQUIRED = False + _TESTS = [{ + 'url': ':ytrec', + 'only_matching': True, + }, { + 'url': ':ytrecommended', + 'only_matching': True, + }, { + 'url': 'https://youtube.com', + 'only_matching': True, + }] + + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' + _VALID_URL = r':ytsub(?:scription)?s?' + _FEED_NAME = 'subscriptions' + _TESTS = [{ + 'url': ':ytsubs', + 'only_matching': True, + }, { + 'url': ':ytsubscriptions', + 'only_matching': True, + }] + + +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)' + _VALID_URL = r':ythis(?:tory)?' 
+ _FEED_NAME = 'history' + _TESTS = [{ + 'url': ':ythistory', + 'only_matching': True, + }] + + +class YoutubeTruncatedURLIE(InfoExtractor): + IE_NAME = 'youtube:truncated_url' + IE_DESC = False # Do not list + _VALID_URL = r'''(?x) + (?:https?://)? + (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ + (?:watch\?(?: + feature=[a-z_]+| + annotation_id=annotation_[^&]+| + x-yt-cl=[0-9]+| + hl=[^&]*| + t=[0-9]+ + )? + | + attribution_link\?a=[^&]+ + ) + $ + ''' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?feature=foo', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?hl=en-GB', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?t=2372', + 'only_matching': True, + }] + + def _real_extract(self, url): + raise ExtractorError( + 'Did you forget to quote the URL? Remember that & is a meta ' + 'character in most shells, so you want to put the URL in quotes, ' + 'like youtube-dl ' + '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' + ' or simply youtube-dl BaW_jenozKc .', + expected=True) + + +class YoutubeClipIE(InfoExtractor): + IE_NAME = 'youtube:clip' + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/' + + def _real_extract(self, url): + self.report_warning('YouTube clips are not currently supported. The entire video will be downloaded instead') + return self.url_result(url, 'Generic') + + +class YoutubeTruncatedIDIE(InfoExtractor): + IE_NAME = 'youtube:truncated_id' + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + raise ExtractorError( + 'Incomplete YouTube ID %s. URL %s looks truncated.' 
% (video_id, url), + expected=True) diff --git a/yt_dlp/extractor/zapiks.py b/yt_dlp/extractor/zapiks.py new file mode 100644 index 000000000..161b011ab --- /dev/null +++ b/yt_dlp/extractor/zapiks.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + xpath_with_ns, + xpath_text, + int_or_none, +) + + +class ZapiksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))' + _TESTS = [ + { + 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', + 'md5': 'aeb3c473b2d564b2d46d664d28d5f050', + 'info_dict': { + 'id': '80798', + 'ext': 'mp4', + 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', + 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 528, + 'timestamp': 1359044972, + 'upload_date': '20130124', + 'view_count': int, + }, + }, + { + 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, + { + 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, + { + 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + if not video_id: + video_id = self._search_regex( + r'data-media-id="(\d+)"', webpage, 'video id') + + playlist = self._download_xml( + 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id, + display_id) + + NS_MAP = { + 'jwplayer': 'http://rss.jwpcdn.com/' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./channel/item') + + title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage) + description = self._og_search_description(webpage, default=None) + thumbnail = xpath_text( + item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage, 'upload date', default=None), ' ') + + view_count = int_or_none(self._search_regex( + r'UserPlays:(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'UserComments:(\d+)', webpage, 'comment count', default=None)) + + formats = [] + for source in item.findall(ns('./jwplayer:source')): + format_id = source.attrib['label'] + f = { + 'url': source.attrib['file'], + 'format_id': format_id, + } + m = re.search(r'^(?P<height>\d+)[pP]', format_id) + if m: + f['height'] = int(m.group('height')) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, + } diff --git a/yt_dlp/extractor/zaq1.py b/yt_dlp/extractor/zaq1.py new file mode 100644 index 000000000..889aff5d8 --- /dev/null +++ b/yt_dlp/extractor/zaq1.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + 
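+# Note on the Zapiks extractor above: xpath_with_ns only rewrites the NS_MAP
+# prefix, so ns('./jwplayer:image') yields the Clark-notation path
+# './{http://rss.jwpcdn.com/}image' that xpath_text can then resolve.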
+from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class Zaq1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://zaq1.pl/video/xev0e', + 'md5': '24a5eb3f052e604ae597c4d0d19b351e', + 'info_dict': { + 'id': 'xev0e', + 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', + 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', + 'ext': 'mp4', + 'duration': 511, + 'timestamp': 1490896361, + 'uploader': 'Anonim', + 'upload_date': '20170330', + 'view_count': int, + } + }, { + # malformed JSON-LD + 'url': 'http://zaq1.pl/video/x81vn', + 'info_dict': { + 'id': 'x81vn', + 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', + 'ext': 'mp4', + 'duration': 6234, + 'timestamp': 1493494860, + 'uploader': 'Anonim', + 'upload_date': '20170429', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + + def extract_data(field, name, fatal=False): + return self._search_regex( + r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, + webpage, field, fatal=fatal, group='field') + + if not info.get('title'): + info['title'] = extract_data('file-name', 'title', fatal=True) + + if not info.get('duration'): + info['duration'] = int_or_none(extract_data('duration', 'duration')) + + if not info.get('thumbnail'): + info['thumbnail'] = extract_data('photo-url', 'thumbnail') + + if not info.get('timestamp'): + info['timestamp'] = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + + if not info.get('interactionCount'): + info['view_count'] = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + uploader = self._html_search_regex( + r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', + fatal=False) + + width = int_or_none(self._html_search_meta( + 'width', webpage, fatal=False)) + height = int_or_none(self._html_search_meta( + 'height', webpage, fatal=False)) + + info.update({ + 'id': video_id, + 'formats': [{ + 'url': video_url, + 'width': width, + 'height': height, + 'http_headers': { + 'Referer': url, + }, + }], + 'uploader': uploader, + }) + + return info diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py new file mode 100644 index 000000000..a13d12436 --- /dev/null +++ b/yt_dlp/extractor/zattoo.py @@ -0,0 +1,433 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from uuid import uuid4 + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + url_or_none, + urlencode_postdata, +) + + +class ZattooPlatformBaseIE(InfoExtractor): + _power_guide_hash = None + + def _host_url(self): + return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) + + def _login(self): + username, password = self._get_login_info() + if not username or not password: + self.raise_login_required( + 'A valid %s account is needed to access this media.' 
+ % self._NETRC_MACHINE) + + try: + data = self._download_json( + '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in', + data=urlencode_postdata({ + 'login': username, + 'password': password, + 'remember': 'true', + }), headers={ + 'Referer': '%s/login' % self._host_url(), + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError( + 'Unable to login: incorrect username and/or password', + expected=True) + raise + + self._power_guide_hash = data['session']['power_guide_hash'] + + def _real_initialize(self): + webpage = self._download_webpage( + self._host_url(), None, 'Downloading app token') + app_token = self._html_search_regex( + r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', + webpage, 'app token', group='token') + app_version = self._html_search_regex( + r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') + + # Will setup appropriate cookies + self._request_webpage( + '%s/zapi/v2/session/hello' % self._host_url(), None, + 'Opening session', data=urlencode_postdata({ + 'client_app_token': app_token, + 'uuid': compat_str(uuid4()), + 'lang': 'en', + 'app_version': app_version, + 'format': 'json', + })) + + self._login() + + def _extract_cid(self, video_id, channel_name): + channel_groups = self._download_json( + '%s/zapi/v2/cached/channels/%s' % (self._host_url(), + self._power_guide_hash), + video_id, 'Downloading channel list', + query={'details': False})['channel_groups'] + channel_list = [] + for chgrp in channel_groups: + channel_list.extend(chgrp['channels']) + try: + return next( + chan['cid'] for chan in channel_list + if chan.get('cid') and ( + chan.get('display_alias') == channel_name + or chan.get('cid') == channel_name)) + except StopIteration: + raise ExtractorError('Could not extract channel id') + + def _extract_cid_and_video_info(self, video_id): + data = self._download_json( + '%s/zapi/v2/cached/program/power_details/%s' % ( + self._host_url(), self._power_guide_hash), + video_id, + 'Downloading video information', + query={ + 'program_ids': video_id, + 'complete': True, + }) + + p = data['programs'][0] + cid = p['cid'] + + info_dict = { + 'id': video_id, + 'title': p.get('t') or p['et'], + 'description': p.get('d'), + 'thumbnail': p.get('i_url'), + 'creator': p.get('channel_name'), + 'episode': p.get('et'), + 'episode_number': int_or_none(p.get('e_no')), + 'season_number': int_or_none(p.get('s_no')), + 'release_year': int_or_none(p.get('year')), + 'categories': try_get(p, lambda x: x['c'], list), + 'tags': try_get(p, lambda x: x['g'], list) + } + + return cid, info_dict + + def _extract_formats(self, cid, video_id, record_id=None, is_live=False): + postdata_common = { + 'https_watch_urls': True, + } + + if is_live: + postdata_common.update({'timeshift': 10800}) + url = '%s/zapi/watch/live/%s' % (self._host_url(), cid) + elif record_id: + url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id) + else: + url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id) + + formats = [] + for stream_type in ('dash', 'hls', 'hls5', 'hds'): + postdata = postdata_common.copy() + postdata['stream_type'] = stream_type + + data = self._download_json( + url, video_id, 'Downloading %s formats' % stream_type.upper(), + data=urlencode_postdata(postdata), fatal=False) + if not data: + continue + + watch_urls = try_get( + data, lambda x: x['stream']['watch_urls'], list) + if not watch_urls: + continue + + 
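+ # Illustrative shape of a watch_urls entry; only the keys read below are shown:
+ #   {'url': 'https://...', 'maxrate': 8000, 'audio_channel': 'A'}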
for watch in watch_urls: + if not isinstance(watch, dict): + continue + watch_url = url_or_none(watch.get('url')) + if not watch_url: + continue + format_id_list = [stream_type] + maxrate = watch.get('maxrate') + if maxrate: + format_id_list.append(compat_str(maxrate)) + audio_channel = watch.get('audio_channel') + if audio_channel: + format_id_list.append(compat_str(audio_channel)) + preference = 1 if audio_channel == 'A' else None + format_id = '-'.join(format_id_list) + if stream_type in ('dash', 'dash_widevine', 'dash_playready'): + this_formats = self._extract_mpd_formats( + watch_url, video_id, mpd_id=format_id, fatal=False) + elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): + this_formats = self._extract_m3u8_formats( + watch_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False) + elif stream_type == 'hds': + this_formats = self._extract_f4m_formats( + watch_url, video_id, f4m_id=format_id, fatal=False) + elif stream_type == 'smooth_playready': + this_formats = self._extract_ism_formats( + watch_url, video_id, ism_id=format_id, fatal=False) + else: + assert False + for this_format in this_formats: + this_format['quality'] = preference + formats.extend(this_formats) + self._sort_formats(formats) + return formats + + def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): + if is_live: + cid = self._extract_cid(video_id, channel_name) + info_dict = { + 'id': channel_name, + 'title': self._live_title(channel_name), + 'is_live': True, + } + else: + cid, info_dict = self._extract_cid_and_video_info(video_id) + formats = self._extract_formats( + cid, video_id, record_id=record_id, is_live=is_live) + info_dict['formats'] = formats + return info_dict + + +class QuicklineBaseIE(ZattooPlatformBaseIE): + _NETRC_MACHINE = 'quickline' + _HOST = 'mobiltv.quickline.com' + + +class QuicklineIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST) + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + } + + def _real_extract(self, url): + channel_name, video_id = self._match_valid_url(url).groups() + return self._extract_video(channel_name, video_id) + + +class QuicklineLiveIE(QuicklineBaseIE): + _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST) + + _TEST = { + 'url': 'https://mobiltv.quickline.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class ZattooBaseIE(ZattooPlatformBaseIE): + _NETRC_MACHINE = 'zattoo' + _HOST = 'zattoo.com' + + +def _make_valid_url(tmpl, host): + return tmpl % re.escape(host) + + +class ZattooIE(ZattooBaseIE): + _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' + _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) + + # Since regular videos are only available for 7 days and recorded videos + # are only available for a specific user, we cannot have detailed tests. 
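+ # For reference, _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) expands to:
+ # r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?'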
+ _TESTS = [{ + 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_name, video_id, record_id = self._match_valid_url(url).groups() + return self._extract_video(channel_name, video_id, record_id) + + +class ZattooLiveIE(ZattooBaseIE): + _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' + + _TEST = { + 'url': 'https://zattoo.com/watch/srf1', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = video_id = self._match_id(url) + return self._extract_video(channel_name, video_id, is_live=True) + + +class NetPlusIE(ZattooIE): + _NETRC_MACHINE = 'netplus' + _HOST = 'netplus.tv' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.netplus.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MNetTVIE(ZattooIE): + _NETRC_MACHINE = 'mnettv' + _HOST = 'tvplus.m-net.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class WalyTVIE(ZattooIE): + _NETRC_MACHINE = 'walytv' + _HOST = 'player.waly.tv' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://player.waly.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class BBVTVIE(ZattooIE): + _NETRC_MACHINE = 'bbvtv' + _HOST = 'bbv-tv.net' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', + 'only_matching': True, + }] + + +class VTXTVIE(ZattooIE): + _NETRC_MACHINE = 'vtxtv' + _HOST = 'vtxtv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class MyVisionTVIE(ZattooIE): + _NETRC_MACHINE = 'myvisiontv' + _HOST = 'myvisiontv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.myvisiontv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class GlattvisionTVIE(ZattooIE): + _NETRC_MACHINE = 'glattvisiontv' + _HOST = 'iptv.glattvision.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SAKTVIE(ZattooIE): + _NETRC_MACHINE = 'saktv' + _HOST = 'saktv.ch' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.saktv.ch/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EWETVIE(ZattooIE): + _NETRC_MACHINE = 'ewetv' + _HOST = 'tvonline.ewe.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class QuantumTVIE(ZattooIE): + _NETRC_MACHINE = 'quantumtv' + _HOST = 'quantum-tv.com' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = 
_make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', + 'only_matching': True, + }] + + +class OsnatelTVIE(ZattooIE): + _NETRC_MACHINE = 'osnateltv' + _HOST = 'tvonline.osnatel.de' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', + 'only_matching': True, + }] + + +class EinsUndEinsTVIE(ZattooIE): + _NETRC_MACHINE = '1und1tv' + _HOST = '1und1.tv' + _API_HOST = 'www.%s' % _HOST + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://www.1und1.tv/watch/abc/123-abc', + 'only_matching': True, + }] + + +class SaltTVIE(ZattooIE): + _NETRC_MACHINE = 'salttv' + _HOST = 'tv.salt.ch' + _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + + _TESTS = [{ + 'url': 'https://tv.salt.ch/watch/abc/123-abc', + 'only_matching': True, + }] diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py new file mode 100644 index 000000000..8c279c5ab --- /dev/null +++ b/yt_dlp/extractor/zdf.py @@ -0,0 +1,380 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + merge_dicts, + NO_DEFAULT, + orderedSet, + parse_codecs, + qualities, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urljoin, +) + + +class ZDFBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') + + def _call_api(self, url, video_id, item, api_token=None, referrer=None): + headers = {} + if api_token: + headers['Api-Auth'] = 'Bearer %s' % api_token + if referrer: + headers['Referer'] = referrer + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, headers=headers) + + @staticmethod + def _extract_subtitles(src): + subtitles = {} + for caption in try_get(src, lambda x: x['captions'], list) or []: + subtitle_url = url_or_none(caption.get('uri')) + if subtitle_url: + lang = caption.get('language', 'deu') + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + }) + return subtitles + + def _extract_format(self, video_id, formats, format_urls, meta): + format_url = url_or_none(meta.get('url')) + if not format_url or format_url in format_urls: + return + format_urls.add(format_url) + + mime_type, ext = meta.get('mimeType'), determine_ext(format_url) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': + new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False) + elif mime_type == 'application/f4m+xml' or ext == 'f4m': + new_formats = self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) + else: + f = parse_codecs(meta.get('mimeCodec')) + if not f and meta.get('type'): + data = meta['type'].split('_') + if try_get(data, lambda x: x[2]) == ext: + f = {'vcodec': data[0], 'acodec': data[1]} + f.update({ + 'url': format_url, + 'format_id': '-'.join(filter(str_or_none, ('http', meta.get('type'), meta.get('quality')))), + }) + new_formats = [f] + formats.extend(merge_dicts(f, { + 'format_note': ', '.join(filter(None, (meta.get('quality'), meta.get('class')))), + 'language': meta.get('language'), + 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, + 'quality': 
qualities(self._QUALITIES)(meta.get('quality')), + }) for f in new_formats) + + def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): + ptmd = self._call_api( + ptmd_url, video_id, 'metadata', api_token, referrer) + + content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] + + formats = [] + track_uris = set() + for p in ptmd['priorityList']: + formitaeten = p.get('formitaeten') + if not isinstance(formitaeten, list): + continue + for f in formitaeten: + f_qualities = f.get('qualities') + if not isinstance(f_qualities, list): + continue + for quality in f_qualities: + tracks = try_get(quality, lambda x: x['audio']['tracks'], list) + if not tracks: + continue + for track in tracks: + self._extract_format( + content_id, formats, track_uris, { + 'url': track.get('uri'), + 'type': f.get('type'), + 'mimeType': f.get('mimeType'), + 'quality': quality.get('quality'), + 'class': track.get('class'), + 'language': track.get('language'), + }) + self._sort_formats(formats, ('hasaud', 'res', 'quality', 'language_preference')) + + duration = float_or_none(try_get( + ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) + + return { + 'extractor_key': ZDFIE.ie_key(), + 'id': content_id, + 'duration': duration, + 'formats': formats, + 'subtitles': self._extract_subtitles(ptmd), + } + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613948400, + 'upload_date': '20210221', + }, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! 
- 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', + }, + }, { + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'info_dict': { + 'id': '151025_magie_farben2_tex', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html + 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, + }] + + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'ngplayer_2_4') + + info = self._extract_ptmd( + urljoin(url, ptmd_path), video_id, player['apiToken'], url) + + thumbnails = [] + layouts = try_get( + content, lambda x: x['teaserImageRef']['layouts'], dict) + if layouts: + for layout_key, layout_url in layouts.items(): + layout_url = url_or_none(layout_url) + if not layout_url: + continue + thumbnail = { + 'url': layout_url, + 'format_id': layout_key, + } + mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) + if mobj: + thumbnail.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + }) + thumbnails.append(thumbnail) + + return merge_dicts(info, { + 'title': title, + 'description': content.get('leadParagraph') or content.get('teasertext'), + 'duration': int_or_none(t.get('duration')), + 'timestamp': unified_timestamp(content.get('editorialDate')), + 'thumbnails': thumbnails, + }) + + def _extract_regular(self, url, player, video_id): + content = self._call_api( + player['content'], video_id, 'content', player['apiToken'], url) + return self._extract_entry(player['content'], player, content, video_id) + + def _extract_mobile(self, video_id): + video = self._download_json( + 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, + video_id) + + document = video['document'] + + title = document['titel'] + content_id = document['basename'] + + formats = [] + format_urls = set() + for f in document['formitaeten']: + self._extract_format(content_id, formats, format_urls, f) + self._sort_formats(formats) + + thumbnails = [] + teaser_bild = document.get('teaserBild') + if isinstance(teaser_bild, dict): + for 
thumbnail_key, thumbnail in teaser_bild.items(): + thumbnail_url = try_get( + thumbnail, lambda x: x['url'], compat_str) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': content_id, + 'title': title, + 'description': document.get('beschreibung'), + 'duration': int_or_none(document.get('length')), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], compat_str)), + 'thumbnails': thumbnails, + 'subtitles': self._extract_subtitles(document), + 'formats': formats, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + player = self._extract_player(webpage, url, fatal=False) + if player: + return self._extract_regular(url, player, video_id) + + return self._extract_mobile(video_id) + + +class ZDFChannelIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', + 'info_dict': { + 'id': 'das-aktuelle-sportstudio', + 'title': 'das aktuelle sportstudio | ZDF', + }, + 'playlist_mincount': 23, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e', + 'info_dict': { + 'id': 'planet-e', + 'title': 'planet e.', + }, + 'playlist_mincount': 50, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + webpage = self._download_webpage(url, channel_id) + + entries = [ + self.url_result(item_url, ie=ZDFIE.ie_key()) + for item_url in orderedSet(re.findall( + r'data-plusbar-url=["\'](http.+?\.html)', webpage))] + + return self.playlist_result( + entries, channel_id, self._og_search_title(webpage, fatal=False)) + + r""" + player = self._extract_player(webpage, channel_id) + + channel_id = self._search_regex( + r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, + 'channel id', group='id') + + channel = self._call_api( + 'https://api.zdf.de/content/documents/%s.json' % channel_id, + player, url, channel_id) + + items = [] + for module in channel['module']: + for teaser in try_get(module, lambda x: x['teaser'], list) or []: + t = try_get( + teaser, lambda x: x['http://zdf.de/rels/target'], dict) + if not t: + continue + items.extend(try_get( + t, + lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], + list) or []) + items.extend(try_get( + module, + lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], + list) or []) + + entries = [] + entry_urls = set() + for item in items: + t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) + if not t: + continue + sharing_url = t.get('http://zdf.de/rels/sharing-url') + if not sharing_url or not isinstance(sharing_url, compat_str): + continue + if sharing_url in entry_urls: + continue + entry_urls.add(sharing_url) + entries.append(self.url_result( + sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) + + return self.playlist_result(entries, channel_id, channel.get('title')) + """ diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py new file mode 100644 index 000000000..536604167 --- /dev/null +++ b/yt_dlp/extractor/zee5.py @@ -0,0 +1,244 @@ 
+# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + str_or_none, + try_get, + unified_strdate, + unified_timestamp, + url_or_none, +) + + +class Zee5IE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + zee5:| + (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)? + (?: + (?:tvshows|kids|zee5originals)(?:/[^#/?]+){3} + |movies/[^#/?]+ + )/(?P<display_id>[^#/?]+)/ + ) + (?P<id>[^#/?]+)/?(?:$|[?#]) + ''' + _TESTS = [{ + 'url': 'https://www.zee5.com/movies/details/krishna-the-birth/0-0-63098', + 'info_dict': { + 'id': '0-0-63098', + 'ext': 'mp4', + 'display_id': 'krishna-the-birth', + 'title': 'Krishna - The Birth', + 'duration': 4368, + 'average_rating': 4, + 'description': compat_str, + 'alt_title': 'Krishna - The Birth', + 'uploader': 'Zee Entertainment Enterprises Ltd', + 'release_date': '20060101', + 'upload_date': '20060101', + 'timestamp': 1136073600, + 'thumbnail': 'https://akamaividz.zee5.com/resources/0-0-63098/list/270x152/0063098_list_80888170.jpg', + 'tags': list + }, + 'params': { + 'format': 'bv', + }, + }, { + 'url': 'https://zee5.com/tvshows/details/krishna-balram/0-6-1871/episode-1-the-test-of-bramha/0-1-233402', + 'info_dict': { + 'id': '0-1-233402', + 'ext': 'mp4', + 'display_id': 'episode-1-the-test-of-bramha', + 'title': 'Episode 1 - The Test Of Bramha', + 'duration': 1336, + 'average_rating': 4, + 'description': compat_str, + 'alt_title': 'Episode 1 - The Test Of Bramha', + 'uploader': 'Zee Entertainment Enterprises Ltd', + 'release_date': '20090101', + 'upload_date': '20090101', + 'timestamp': 1230768000, + 'thumbnail': 'https://akamaividz.zee5.com/resources/0-1-233402/list/270x152/01233402_list.jpg', + 'series': 'Krishna Balram', + 'season_number': 1, + 'episode_number': 1, + 'tags': list, + }, + 'params': { + 'format': 'bv', + }, + }, { + 'url': 'https://www.zee5.com/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN', + 'only_matching': True + }, { + 'url': 'https://www.zee5.com/global/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730', + 'only_matching': True + }] + _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' + _DEVICE_ID = 'iIxsxYf40cqO3koIkwzKHZhnJzHN13zb' + _USER_TOKEN = None + _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.' 
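+ # Hypothetical invocations matching the hint above; the OTP is read through
+ # the standard --twofactor option (or an interactive prompt) via _get_tfa_info:
+ #   yt-dlp --username 9876543210 --twofactor 123456 <zee5-url>
+ #   yt-dlp --username token --password <user_token> <zee5-url>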
+ _NETRC_MACHINE = 'zee5' + + def _login(self): + username, password = self._get_login_info() + if username: + if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None: + self.report_login() + otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username), + None, note='Sending OTP') + if otp_request_json['code'] == 0: + self.to_screen(otp_request_json['message']) + else: + raise ExtractorError(otp_request_json['message'], expected=True) + otp_code = self._get_tfa_info('OTP') + otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID), + None, note='Verifying OTP', fatal=False) + if not otp_verify_json: + raise ExtractorError('Unable to verify OTP.', expected=True) + self._USER_TOKEN = otp_verify_json.get('token') + if not self._USER_TOKEN: + raise ExtractorError(otp_request_json['message'], expected=True) + elif username.lower() == 'token' and len(password) > 1198: + self._USER_TOKEN = password + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + access_token_request = self._download_json( + 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app', + video_id, note='Downloading access token') + data = { + 'x-access-token': access_token_request['token'] + } + if self._USER_TOKEN: + data['Authorization'] = 'bearer %s' % self._USER_TOKEN + else: + data['X-Z5-Guest-Token'] = self._DEVICE_ID + + json_data = self._download_json( + self._DETAIL_API_URL.format(video_id, self._DEVICE_ID), + video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) + asset_data = json_data['assetDetails'] + show_data = json_data.get('showDetails', {}) + if 'premium' in asset_data['business_type']: + raise ExtractorError('Premium content is DRM protected.', expected=True) + if not asset_data.get('hls_url'): + self.raise_login_required(self._LOGIN_HINT, metadata_available=True, method=None) + formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(asset_data['hls_url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + + subtitles = {} + for sub in asset_data.get('subtitle_url', []): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('language', 'en'), []).append({ + 'url': self._proto_relative_url(sub_url), + }) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + return { + 'id': video_id, + 'display_id': display_id, + 'title': asset_data['title'], + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(asset_data.get('duration')), + 'average_rating': int_or_none(asset_data.get('rating')), + 'description': str_or_none(asset_data.get('description')), + 'alt_title': str_or_none(asset_data.get('original_title')), + 'uploader': str_or_none(asset_data.get('content_owner')), + 'age_limit': parse_age_limit(asset_data.get('age_rating')), + 'release_date': unified_strdate(asset_data.get('release_date')), + 'timestamp': unified_timestamp(asset_data.get('release_date')), + 'thumbnail': url_or_none(asset_data.get('image_url')), + 'series': str_or_none(asset_data.get('tvshow_name')), + 'season': try_get(show_data, lambda x: x['seasons']['title'], str), + 'season_number': int_or_none(try_get(show_data, lambda x: x['seasons'][0]['orderid'])), + 'episode_number': int_or_none(try_get(asset_data, 
lambda x: x['orderid'])), + 'tags': try_get(asset_data, lambda x: x['tags'], list) + } + + +class Zee5SeriesIE(InfoExtractor): + IE_NAME = 'zee5:series' + _VALID_URL = r'''(?x) + (?: + zee5:series:| + (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)? + (?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/ + ) + (?P<id>[^#/?]+)/?(?:$|[?#]) + ''' + _TESTS = [{ + 'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871', + 'playlist_mincount': 43, + 'info_dict': { + 'id': '0-6-1871', + }, + }, { + 'url': 'https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199', + 'playlist_mincount': 1500, + 'info_dict': { + 'id': '0-6-199', + }, + }, { + 'url': 'https://www.zee5.com/tvshows/details/agent-raghav-crime-branch/0-6-965', + 'playlist_mincount': 24, + 'info_dict': { + 'id': '0-6-965', + }, + }, { + 'url': 'https://www.zee5.com/ta/tvshows/details/nagabhairavi/0-6-3201', + 'playlist_mincount': 3, + 'info_dict': { + 'id': '0-6-3201', + }, + }, { + 'url': 'https://www.zee5.com/global/hi/tvshows/details/khwaabon-ki-zamin-par/0-6-270', + 'playlist_mincount': 150, + 'info_dict': { + 'id': '0-6-270', + }, + } + ] + + def _entries(self, show_id): + access_token_request = self._download_json( + 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app', + show_id, note='Downloading access token') + headers = { + 'X-Access-Token': access_token_request['token'], + 'Referer': 'https://www.zee5.com/', + } + show_url = 'https://gwapi.zee5.com/content/tvshow/{}?translation=en&country=IN'.format(show_id) + + page_num = 0 + show_json = self._download_json(show_url, video_id=show_id, headers=headers) + for season in show_json.get('seasons') or []: + season_id = try_get(season, lambda x: x['id'], compat_str) + next_url = 'https://gwapi.zee5.com/content/tvshow/?season_id={}&type=episode&translation=en&country=IN&on_air=false&asset_subtype=tvshow&page=1&limit=100'.format(season_id) + while next_url: + page_num += 1 + episodes_json = self._download_json( + next_url, video_id=show_id, headers=headers, + note='Downloading JSON metadata page %d' % page_num) + for episode in try_get(episodes_json, lambda x: x['episode'], list) or []: + video_id = episode.get('id') + yield self.url_result( + 'zee5:%s' % video_id, + ie=Zee5IE.ie_key(), video_id=video_id) + next_url = url_or_none(episodes_json.get('next_episode_api')) + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) diff --git a/youtube_dl/extractor/zhihu.py b/yt_dlp/extractor/zhihu.py index d1ed55be3..d1ed55be3 100644 --- a/youtube_dl/extractor/zhihu.py +++ b/yt_dlp/extractor/zhihu.py diff --git a/yt_dlp/extractor/zingmp3.py b/yt_dlp/extractor/zingmp3.py new file mode 100644 index 000000000..a3edc158f --- /dev/null +++ b/yt_dlp/extractor/zingmp3.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, +) + + +class ZingMp3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html' + _GEO_COUNTRIES = ['VN'] + + def _extract_item(self, item, fatal): + item_id = item['id'] + title = item.get('name') or item['title'] + + formats = [] + for k, v in (item.get('source') or {}).items(): + if not v: + continue + if k in ('mp4', 'hls'): + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, item_id, 'mp4', + 'm3u8_native', m3u8_id=k, 
fatal=False)) + elif k == 'mp4': + formats.append({ + 'format_id': 'mp4-' + res, + 'url': video_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)p', res, 'resolution', default=None)), + }) + else: + formats.append({ + 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', + }) + if not formats: + if not fatal: + return + msg = item['msg'] + if msg == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + self.raise_no_formats(msg, expected=True) + self._sort_formats(formats) + + subtitles = None + lyric = item.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }], + } + + album = item.get('album') or {} + + return { + 'id': item_id, + 'title': title, + 'formats': formats, + 'thumbnail': item.get('thumbnail'), + 'subtitles': subtitles, + 'duration': int_or_none(item.get('duration')), + 'track': title, + 'artist': item.get('artists_names'), + 'album': album.get('name') or album.get('title'), + 'album_artist': album.get('artists_names'), + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), + page_id, query={'play_song': 1}) + data_path = self._search_regex( + r'data-xml="([^"]+)', webpage, 'data path') + return self._process_data(self._download_json( + 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) + + +class ZingMp3IE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' + _TESTS = [{ + 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', + 'md5': 'ead7ae13693b3205cbc89536a077daed', + 'info_dict': { + 'id': 'ZWZB9WAB', + 'title': 'Xa Mãi Xa', + 'ext': 'mp3', + 'thumbnail': r're:^https?://.+\.jpg', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }] + }, + 'duration': 255, + 'track': 'Xa Mãi Xa', + 'artist': 'Bảo Thy', + 'album': 'Special Album', + 'album_artist': 'Bảo Thy', + }, + }, { + 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', + 'md5': 'e9c972b693aa88301ef981c8151c4343', + 'info_dict': { + 'id': 'ZO8ZF7C7', + 'title': 'Sương Hoa Đưa Lối', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 207, + 'track': 'Sương Hoa Đưa Lối', + 'artist': 'K-ICM, RYO', + }, + }, { + 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', + 'only_matching': True, + }] + IE_NAME = 'zingmp3' + IE_DESC = 'mp3.zing.vn' + + def _process_data(self, data): + return self._extract_item(data, True) + + +class ZingMp3AlbumIE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' + _TESTS = [{ + 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'info_dict': { + '_type': 'playlist', + 'id': 'ZWZBWDAF', + 'title': 'Lâu Đài Tình Ái', + }, + 'playlist_count': 10, + }, { + 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'only_matching': True, + }, { + 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'only_matching': True, + }] + IE_NAME = 'zingmp3:album' + + def _process_data(self, data): + def entries(): + for item in (data.get('items') or []): + entry = self._extract_item(item, False) + if entry: + yield entry + info = data.get('info') or {} + return self.playlist_result( + entries(), info.get('id'), info.get('name') or info.get('title')) diff --git a/yt_dlp/extractor/zoom.py 
b/yt_dlp/extractor/zoom.py new file mode 100644 index 000000000..25a0902f6 --- /dev/null +++ b/yt_dlp/extractor/zoom.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + parse_filesize, + urlencode_postdata, + urljoin, +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)' + _TEST = { + 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', + 'info_dict': { + 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'ext': 'mp4', + 'title': 'China\'s "two sessions" and the new five-year plan', + } + } + + def _real_extract(self, url): + base_url, play_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, play_id) + + try: + form = self._form_hidden_inputs('password_form', webpage) + except ExtractorError: + form = None + if form: + password = self.get_param('videopassword') + if not password: + raise ExtractorError( + 'This video is protected by a passcode, use the --video-password option', expected=True) + is_meeting = form.get('useWhichPasswd') == 'meeting' + validation = self._download_json( + base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), + play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ + 'id': form[('meet' if is_meeting else 'file') + 'Id'], + 'passwd': password, + 'action': form.get('action'), + })) + if not validation.get('status'): + raise ExtractorError(validation['errorMessage'], expected=True) + webpage = self._download_webpage(url, play_id) + + data = self._parse_json(self._search_regex( + r'(?s)window\.__data__\s*=\s*({.+?});', + webpage, 'data'), play_id, js_to_json) + + subtitles = {} + for _type in ('transcript', 'cc'): + if data.get('%sUrl' % _type): + subtitles[_type] = [{ + 'url': urljoin(base_url, data['%sUrl' % _type]), + 'ext': 'vtt', + }] + + return { + 'id': play_id, + 'title': data['topic'], + 'url': data['viewMp4Url'], + 'subtitles': subtitles, + 'width': int_or_none(data.get('viewResolvtionsWidth')), + 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'http_headers': { + 'Referer': base_url, + }, + 'filesize_approx': parse_filesize(data.get('fileSize')), + } diff --git a/yt_dlp/extractor/zype.py b/yt_dlp/extractor/zype.py new file mode 100644 index 000000000..7663cb36b --- /dev/null +++ b/yt_dlp/extractor/zype.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + dict_get, + ExtractorError, + int_or_none, + js_to_json, + parse_iso8601, +) + + +class ZypeIE(InfoExtractor): + _ID_RE = r'[\da-fA-F]+' + _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)=' + _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE)) + _TEST = { + 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', + 'md5': 'eaee31d474c76a955bdaba02a505c595', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue 
Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + 'timestamp': 1504915200, + 'upload_date': '20170909', + }, + } + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE), + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + try: + response = self._download_json(re.sub( + r'\.(?:js|html)\?', '.json?', url), video_id)['response'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403): + raise ExtractorError(self._parse_json( + e.cause.read().decode(), video_id)['message'], expected=True) + raise + + body = response['body'] + video = response['video'] + title = video['title'] + + subtitles = {} + + if isinstance(body, dict): + formats = [] + for output in body.get('outputs', []): + output_url = output.get('url') + if not output_url: + continue + name = output.get('name') + if name == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + output_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + else: + f = { + 'format_id': name, + 'tbr': int_or_none(output.get('bitrate')), + 'url': output_url, + } + if name in ('m4a', 'mp3'): + f['vcodec'] = 'none' + else: + f.update({ + 'height': int_or_none(output.get('height')), + 'width': int_or_none(output.get('width')), + }) + formats.append(f) + text_tracks = body.get('subtitles') or [] + else: + m3u8_url = self._search_regex( + r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', + body, 'm3u8 url', group='url', default=None) + if not m3u8_url: + source = self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') + + def get_attr(key): + return self._search_regex( + r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, + source, key, group='val') + + if get_attr('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + text_tracks = self._search_regex( + r'textTracks\s*:\s*(\[[^]]+\])', + body, 'text tracks', default=None) + if text_tracks: + text_tracks = self._parse_json( + text_tracks, video_id, js_to_json, False) + self._sort_formats(formats) + + if text_tracks: + for text_track in text_tracks: + tt_url = dict_get(text_track, ('file', 'src')) + if not tt_url: + continue + subtitles.setdefault(text_track.get('label') or 'English', []).append({ + 'url': tt_url, + }) + + thumbnails = [] + for thumbnail in video.get('thumbnails', []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'display_id': video.get('friendly_title'), + 'title': title, + 'thumbnails': thumbnails, + 'description': dict_get(video, ('description', 'ott_description', 'short_description')), + 'timestamp': parse_iso8601(video.get('published_at')), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('request_count')), + 'average_rating': int_or_none(video.get('rating')), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git 
a/youtube_dl/jsinterp.py b/yt_dlp/jsinterp.py index 7bda59610..7bda59610 100644 --- a/youtube_dl/jsinterp.py +++ b/yt_dlp/jsinterp.py diff --git a/yt_dlp/minicurses.py b/yt_dlp/minicurses.py new file mode 100644 index 000000000..a6e159a14 --- /dev/null +++ b/yt_dlp/minicurses.py @@ -0,0 +1,109 @@ +import functools +from threading import Lock +from .utils import supports_terminal_sequences, TERMINAL_SEQUENCES, write_string + + +class MultilinePrinterBase: + def __init__(self, stream=None, lines=1): + self.stream = stream + self.maximum = lines - 1 + + def __enter__(self): + return self + + def __exit__(self, *args): + self.end() + + def print_at_line(self, text, pos): + pass + + def end(self): + pass + + def _add_line_number(self, text, line): + if self.maximum: + return f'{line + 1}: {text}' + return text + + def write(self, *text): + write_string(''.join(text), self.stream) + + +class QuietMultilinePrinter(MultilinePrinterBase): + pass + + +class MultilineLogger(MultilinePrinterBase): + def write(self, *text): + self.stream.debug(''.join(text)) + + def print_at_line(self, text, pos): + # stream is the logger object, not an actual stream + self.write(self._add_line_number(text, pos)) + + +class BreaklineStatusPrinter(MultilinePrinterBase): + def print_at_line(self, text, pos): + self.write(self._add_line_number(text, pos), '\n') + + +class MultilinePrinter(MultilinePrinterBase): + def __init__(self, stream=None, lines=1, preserve_output=True): + super().__init__(stream, lines) + self.preserve_output = preserve_output + self._lastline = self._lastlength = 0 + self._movelock = Lock() + self._HAVE_FULLCAP = supports_terminal_sequences(self.stream) + + def lock(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + with self._movelock: + return func(self, *args, **kwargs) + return wrapper + + def _move_cursor(self, dest): + current = min(self._lastline, self.maximum) + yield '\r' + distance = dest - current + if distance < 0: + yield TERMINAL_SEQUENCES['UP'] * -distance + elif distance > 0: + yield TERMINAL_SEQUENCES['DOWN'] * distance + self._lastline = dest + + @lock + def print_at_line(self, text, pos): + if self._HAVE_FULLCAP: + self.write(*self._move_cursor(pos), TERMINAL_SEQUENCES['ERASE_LINE'], text) + + text = self._add_line_number(text, pos) + textlen = len(text) + if self._lastline == pos: + # move cursor at the start of progress when writing to same line + prefix = '\r' + if self._lastlength > textlen: + text += ' ' * (self._lastlength - textlen) + self._lastlength = textlen + else: + # otherwise, break the line + prefix = '\n' + self._lastlength = textlen + self.write(prefix, text) + self._lastline = pos + + @lock + def end(self): + # move cursor to the end of the last line, and write line break + # so that other to_screen calls can precede + text = self._move_cursor(self.maximum) if self._HAVE_FULLCAP else [] + if self.preserve_output: + self.write(*text, '\n') + return + + if self._HAVE_FULLCAP: + self.write( + *text, TERMINAL_SEQUENCES['ERASE_LINE'], + f'{TERMINAL_SEQUENCES["UP"]}{TERMINAL_SEQUENCES["ERASE_LINE"]}' * self.maximum) + else: + self.write(*text, ' ' * self._lastlength) diff --git a/yt_dlp/options.py b/yt_dlp/options.py new file mode 100644 index 000000000..aa774616c --- /dev/null +++ b/yt_dlp/options.py @@ -0,0 +1,1616 @@ +from __future__ import unicode_literals + +import os.path +import optparse +import re +import sys + +from .compat import ( + compat_expanduser, + compat_get_terminal_size, + compat_getenv, + compat_kwargs, + compat_shlex_split, +) 
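+# A minimal usage sketch for the MultilinePrinter defined in minicurses.py
+# above (assumes an ANSI-capable tty stream; each call redraws one row in place):
+#   with MultilinePrinter(sys.stderr, lines=2) as mp:
+#       mp.print_at_line('[download]  10.5% of ~120.00MiB', 0)
+#       mp.print_at_line('[download] Fragment 1/24', 1)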
+from .utils import ( + expand_path, + get_executable_path, + OUTTMPL_TYPES, + preferredencoding, + remove_end, + write_string, +) +from .cookies import SUPPORTED_BROWSERS +from .version import __version__ + +from .downloader.external import list_external_downloaders +from .postprocessor import ( + FFmpegExtractAudioPP, + FFmpegSubtitlesConvertorPP, + FFmpegThumbnailsConvertorPP, + FFmpegVideoRemuxerPP, + SponsorBlockPP, +) +from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE + + +def _hide_login_info(opts): + PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) + eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for idx, opt in enumerate(opts): + if opt in PRIVATE_OPTS and idx + 1 < len(opts): + opts[idx + 1] = 'PRIVATE' + return opts + + +def parseOpts(overrideArguments=None): + def _readOptions(filename_bytes, default=[]): + try: + optionf = open(filename_bytes) + except IOError: + return default # silently skip if file is not present + try: + # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read() + if sys.version_info < (3,): + contents = contents.decode(preferredencoding()) + res = compat_shlex_split(contents, comments=True) + finally: + optionf.close() + return res + + def _readUserConf(package_name, default=[]): + # .config + xdg_config_home = compat_getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') + userConfFile = os.path.join(xdg_config_home, package_name, 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name) + userConf = _readOptions(userConfFile, default=None) + if userConf is not None: + return userConf, userConfFile + + # appdata + appdata_dir = compat_getenv('appdata') + if appdata_dir: + userConfFile = os.path.join(appdata_dir, package_name, 'config') + userConf = _readOptions(userConfFile, default=None) + if userConf is None: + userConfFile += '.txt' + userConf = _readOptions(userConfFile, default=None) + if userConf is not None: + return userConf, userConfFile + + # home + userConfFile = os.path.join(compat_expanduser('~'), '%s.conf' % package_name) + userConf = _readOptions(userConfFile, default=None) + if userConf is None: + userConfFile += '.txt' + userConf = _readOptions(userConfFile, default=None) + if userConf is not None: + return userConf, userConfFile + + return default, None + + def _format_option_string(option): + ''' ('-o', '--option') -> -o, --format METAVAR''' + + opts = [] + + if option._short_opts: + opts.append(option._short_opts[0]) + if option._long_opts: + opts.append(option._long_opts[0]) + if len(opts) > 1: + opts.insert(1, ', ') + + if option.takes_value(): + opts.append(' %s' % option.metavar) + + return ''.join(opts) + + def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): + # append can be True, False or -1 (prepend) + current = getattr(parser.values, option.dest) if append else [] + value = list(filter(None, [process(value)] if delim is None else map(process, value.split(delim)))) + setattr( + parser.values, option.dest, + current + value if append is True else value + current) + + def _set_from_options_callback( + option, opt_str, value, parser, delim=',', 
allowed_values=None, aliases={},
+            process=lambda x: x.lower().strip()):
+        current = getattr(parser.values, option.dest)
+        values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1]))
+        while values:
+            actual_val = val = values.pop()
+            if val == 'all':
+                current.update(allowed_values)
+            elif val == '-all':
+                current = set()
+            elif val in aliases:
+                values.extend(aliases[val])
+            else:
+                if val[0] == '-':
+                    val = val[1:]
+                    current.discard(val)
+                else:
+                    current.update([val])
+                if allowed_values is not None and val not in allowed_values:
+                    raise optparse.OptionValueError(f'wrong {option.metavar} for {opt_str}: {actual_val}')
+
+        setattr(parser.values, option.dest, current)
+
+    def _dict_from_options_callback(
+            option, opt_str, value, parser,
+            allowed_keys=r'[\w-]+', delimiter=':', default_key=None, process=None, multiple_keys=True):
+
+        out_dict = getattr(parser.values, option.dest)
+        if multiple_keys:
+            allowed_keys = r'(%s)(,(%s))*' % (allowed_keys, allowed_keys)
+        mobj = re.match(r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter), value)
+        if mobj is not None:
+            keys = [k.strip() for k in mobj.group('keys').lower().split(',')]
+            val = mobj.group('val')
+        elif default_key is not None:
+            keys, val = [default_key], value
+        else:
+            raise optparse.OptionValueError(
+                'wrong %s formatting; it should be %s, not "%s"' % (opt_str, option.metavar, value))
+        try:
+            val = process(val) if process else val
+        except Exception as err:
+            raise optparse.OptionValueError(
+                'wrong %s formatting; %s' % (opt_str, err))
+        for key in keys:
+            out_dict[key] = val
+
+    # No need to wrap help messages if we're on a wide console
+    columns = compat_get_terminal_size().columns
+    max_width = columns if columns else 80
+    # 47% is chosen because that is how README.md is currently formatted
+    # and moving help text even further to the right is undesirable.
+    # This can be reduced in the future to get a prettier output
+    max_help_position = int(0.47 * max_width)
+
+    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
+    fmt.format_option_strings = _format_option_string
+
+    kw = {
+        'version': __version__,
+        'formatter': fmt,
+        'usage': '%prog [OPTIONS] URL [URL...]',
+        'conflict_handler': 'resolve',
+    }
+
+    parser = optparse.OptionParser(**compat_kwargs(kw))
+
+    general = optparse.OptionGroup(parser, 'General Options')
+    general.add_option(
+        '-h', '--help',
+        action='help',
+        help='Print this help text and exit')
+    general.add_option(
+        '--version',
+        action='version',
+        help='Print program version and exit')
+    general.add_option(
+        '-i', '--ignore-errors',
+        action='store_true', dest='ignoreerrors',
+        help='Ignore download and postprocessing errors. The download will be considered successful even if the postprocessing fails')
+    general.add_option(
+        '--no-abort-on-error',
+        action='store_const', dest='ignoreerrors', const='only_download',
+        help='Continue with next video on download errors; e.g. 
to skip unavailable videos in a playlist (default)') + general.add_option( + '--abort-on-error', '--no-ignore-errors', + action='store_false', dest='ignoreerrors', + help='Abort downloading of further videos if an error occurs (Alias: --no-ignore-errors)') + general.add_option( + '--dump-user-agent', + action='store_true', dest='dump_user_agent', default=False, + help='Display the current user-agent and exit') + general.add_option( + '--list-extractors', + action='store_true', dest='list_extractors', default=False, + help='List all supported extractors and exit') + general.add_option( + '--extractor-descriptions', + action='store_true', dest='list_extractor_descriptions', default=False, + help='Output descriptions of all supported extractors and exit') + general.add_option( + '--force-generic-extractor', + action='store_true', dest='force_generic_extractor', default=False, + help='Force extraction to use the generic extractor') + general.add_option( + '--default-search', + dest='default_search', metavar='PREFIX', + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for the search term "large apple". Use the value "auto" to let yt-dlp guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching') + general.add_option( + '--ignore-config', '--no-config', + action='store_true', dest='ignoreconfig', + help=( + 'Disable loading any configuration files except the one provided by --config-location. ' + 'When given inside a configuration file, no further configuration files are loaded. ' + 'Additionally, (for backward compatibility) if this option is found inside the ' + 'system configuration file, the user configuration is not loaded')) + general.add_option( + '--config-location', + dest='config_location', metavar='PATH', + help='Location of the main configuration file; either the path to the config or its containing directory') + general.add_option( + '--flat-playlist', + action='store_const', dest='extract_flat', const='in_playlist', default=False, + help='Do not extract the videos of a playlist, only list them') + general.add_option( + '--no-flat-playlist', + action='store_false', dest='extract_flat', + help='Extract the videos of a playlist') + general.add_option( + '--mark-watched', + action='store_true', dest='mark_watched', default=False, + help='Mark videos watched (even with --simulate). 
Currently only supported for YouTube') + general.add_option( + '--no-mark-watched', + action='store_false', dest='mark_watched', + help='Do not mark videos watched (default)') + general.add_option( + '--no-colors', + action='store_true', dest='no_color', default=False, + help='Do not emit color codes in output') + general.add_option( + '--compat-options', + metavar='OPTS', dest='compat_opts', default=set(), type='str', + action='callback', callback=_set_from_options_callback, + callback_kwargs={ + 'allowed_values': { + 'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', + 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', + 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', + 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', + }, 'aliases': { + 'youtube-dl': ['-multistreams', 'all'], + 'youtube-dlc': ['-no-youtube-channel-redirect', '-no-live-chat', 'all'], + } + }, help=( + 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' + 'configurations by reverting some of the changes made in yt-dlp. ' + 'See "Differences in default behavior" for details')) + + network = optparse.OptionGroup(parser, 'Network Options') + network.add_option( + '--proxy', dest='proxy', + default=None, metavar='URL', + help=( + 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' + 'SOCKS proxy, specify a proper scheme. For example ' + 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' + 'for direct connection')) + network.add_option( + '--socket-timeout', + dest='socket_timeout', type=float, default=None, metavar='SECONDS', + help='Time to wait before giving up, in seconds') + network.add_option( + '--source-address', + metavar='IP', dest='source_address', default=None, + help='Client-side IP address to bind to', + ) + network.add_option( + '-4', '--force-ipv4', + action='store_const', const='0.0.0.0', dest='source_address', + help='Make all connections via IPv4', + ) + network.add_option( + '-6', '--force-ipv6', + action='store_const', const='::', dest='source_address', + help='Make all connections via IPv6', + ) + + geo = optparse.OptionGroup(parser, 'Geo-restriction') + geo.add_option( + '--geo-verification-proxy', + dest='geo_verification_proxy', default=None, metavar='URL', + help=( + 'Use this proxy to verify the IP address for some geo-restricted sites. 
'
+            'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading'))
+    geo.add_option(
+        '--cn-verification-proxy',
+        dest='cn_verification_proxy', default=None, metavar='URL',
+        help=optparse.SUPPRESS_HELP)
+    geo.add_option(
+        '--geo-bypass',
+        action='store_true', dest='geo_bypass', default=True,
+        help='Bypass geographic restriction via faking X-Forwarded-For HTTP header')
+    geo.add_option(
+        '--no-geo-bypass',
+        action='store_false', dest='geo_bypass', default=True,
+        help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
+    geo.add_option(
+        '--geo-bypass-country', metavar='CODE',
+        dest='geo_bypass_country', default=None,
+        help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-1 alpha-2 country code')
+    geo.add_option(
+        '--geo-bypass-ip-block', metavar='IP_BLOCK',
+        dest='geo_bypass_ip_block', default=None,
+        help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation')
+
+    selection = optparse.OptionGroup(parser, 'Video Selection')
+    selection.add_option(
+        '--playlist-start',
+        dest='playliststart', metavar='NUMBER', default=1, type=int,
+        help='Playlist video to start at (default is %default)')
+    selection.add_option(
+        '--playlist-end',
+        dest='playlistend', metavar='NUMBER', default=None, type=int,
+        help='Playlist video to end at (default is last)')
+    selection.add_option(
+        '--playlist-items',
+        dest='playlist_items', metavar='ITEM_SPEC', default=None,
+        help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify a range: "--playlist-items 1-3,7,10-13", which will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13')
+    selection.add_option(
+        '--match-title',
+        dest='matchtitle', metavar='REGEX',
+        help=optparse.SUPPRESS_HELP)
+    selection.add_option(
+        '--reject-title',
+        dest='rejecttitle', metavar='REGEX',
+        help=optparse.SUPPRESS_HELP)
+    selection.add_option(
+        '--max-downloads',
+        dest='max_downloads', metavar='NUMBER', type=int, default=None,
+        help='Abort after downloading NUMBER files')
+    selection.add_option(
+        '--min-filesize',
+        metavar='SIZE', dest='min_filesize', default=None,
+        help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)')
+    selection.add_option(
+        '--max-filesize',
+        metavar='SIZE', dest='max_filesize', default=None,
+        help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)')
+    selection.add_option(
+        '--date',
+        metavar='DATE', dest='date', default=None,
+        help=(
+            'Download only videos uploaded on this date. '
+            'The date can be "YYYYMMDD" or in the format '
+            '"(now|today)[+-][0-9](day|week|month|year)(s)?"'))
+    selection.add_option(
+        '--datebefore',
+        metavar='DATE', dest='datebefore', default=None,
+        help=(
+            'Download only videos uploaded on or before this date. '
+            'The date formats accepted are the same as --date'))
+    selection.add_option(
+        '--dateafter',
+        metavar='DATE', dest='dateafter', default=None,
+        help=(
+            'Download only videos uploaded on or after this date. '
+            'The date formats accepted are the same as --date'))
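Illustrative sketch (not part of the patch): the DATE values accepted by --date, --datebefore and --dateafter are parsed by date_from_str in yt_dlp.utils, and before/after bounds combine into a DateRange; the concrete dates below are made up:

    from yt_dlp.utils import DateRange, date_from_str

    date_from_str('20211231')      # absolute date in YYYYMMDD form
    date_from_str('today-1week')   # relative: (now|today)[+-][0-9](day|week|month|year)(s)?
    DateRange(start='20210101', end='20211231')   # inclusive on both ends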
+    selection.add_option(
+        '--min-views',
+        metavar='COUNT', dest='min_views', default=None, type=int,
+        help=optparse.SUPPRESS_HELP)
+    selection.add_option(
+        '--max-views',
+        metavar='COUNT', dest='max_views', default=None, type=int,
+        help=optparse.SUPPRESS_HELP)
+    selection.add_option(
+        '--match-filter',
+        metavar='FILTER', dest='match_filter', default=None,
+        help=(
+            'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
+            'number or a string using the operators defined in "Filtering formats". '
+            'You can also simply specify a field to match if the field is present '
+            'and "!field" to check if the field is not present. In addition, '
+            'Python style regular expression matching can be done using "~=", '
+            'and multiple filters can be checked with "&". '
+            'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter '
+            '"!is_live & like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" '
+            'matches only videos that are not live, have a like count of more than 100 '
+            '(or the like field is not available), and also have a description '
+            'that contains the phrase "cats & dogs" (ignoring case)'))
+    selection.add_option(
+        '--no-match-filter',
+        metavar='FILTER', dest='match_filter', action='store_const', const=None,
+        help='Do not use generic video filter (default)')
+    selection.add_option(
+        '--no-playlist',
+        action='store_true', dest='noplaylist', default=False,
+        help='Download only the video, if the URL refers to a video and a playlist')
+    selection.add_option(
+        '--yes-playlist',
+        action='store_false', dest='noplaylist',
+        help='Download the playlist, if the URL refers to a video and a playlist')
+    selection.add_option(
+        '--age-limit',
+        metavar='YEARS', dest='age_limit', default=None, type=int,
+        help='Download only videos suitable for the given age')
+    selection.add_option(
+        '--download-archive', metavar='FILE',
+        dest='download_archive',
+        help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it')
+    selection.add_option(
+        '--break-on-existing',
+        action='store_true', dest='break_on_existing', default=False,
+        help='Stop the download process when encountering a file that is in the archive')
+    selection.add_option(
+        '--break-on-reject',
+        action='store_true', dest='break_on_reject', default=False,
+        help='Stop the download process when encountering a file that has been filtered out')
+    selection.add_option(
+        '--skip-playlist-after-errors', metavar='N',
+        dest='skip_playlist_after_errors', default=None, type=int,
+        help='Number of allowed failures until the rest of the playlist is skipped')
+    selection.add_option(
+        '--no-download-archive',
+        dest='download_archive', action="store_const", const=None,
+        help='Do not use archive file (default)')
+    selection.add_option(
+        '--include-ads',
+        dest='include_ads', action='store_true',
+        help=optparse.SUPPRESS_HELP)
+    selection.add_option(
+        '--no-include-ads',
+        dest='include_ads', action='store_false',
+        help=optparse.SUPPRESS_HELP)
+
+    authentication = optparse.OptionGroup(parser, 'Authentication Options')
+    authentication.add_option(
+        '-u', '--username',
+        dest='username', metavar='USERNAME',
+        help='Login with this account ID')
+    authentication.add_option(
+        '-p', '--password',
+        dest='password', metavar='PASSWORD',
+        help='Account password. 
If this option is left out, yt-dlp will ask interactively') + authentication.add_option( + '-2', '--twofactor', + dest='twofactor', metavar='TWOFACTOR', + help='Two-factor authentication code') + authentication.add_option( + '-n', '--netrc', + action='store_true', dest='usenetrc', default=False, + help='Use .netrc authentication data') + authentication.add_option( + '--netrc-location', + dest='netrc_location', metavar='PATH', + help='Location of .netrc authentication data; either the path or its containing directory. Defaults to ~/.netrc') + authentication.add_option( + '--video-password', + dest='videopassword', metavar='PASSWORD', + help='Video password (vimeo, youku)') + authentication.add_option( + '--ap-mso', + dest='ap_mso', metavar='MSO', + help='Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs') + authentication.add_option( + '--ap-username', + dest='ap_username', metavar='USERNAME', + help='Multiple-system operator account login') + authentication.add_option( + '--ap-password', + dest='ap_password', metavar='PASSWORD', + help='Multiple-system operator account password. If this option is left out, yt-dlp will ask interactively') + authentication.add_option( + '--ap-list-mso', + action='store_true', dest='ap_list_mso', default=False, + help='List all supported multiple-system operators') + + video_format = optparse.OptionGroup(parser, 'Video Format Options') + video_format.add_option( + '-f', '--format', + action='store', dest='format', metavar='FORMAT', default=None, + help='Video format code, see "FORMAT SELECTION" for more details') + video_format.add_option( + '-S', '--format-sort', metavar='SORTORDER', + dest='format_sort', default=[], type='str', action='callback', + callback=_list_from_options_callback, callback_kwargs={'append': -1}, + help='Sort the formats by the fields given, see "Sorting Formats" for more details') + video_format.add_option( + '--format-sort-force', '--S-force', + action='store_true', dest='format_sort_force', metavar='FORMAT', default=False, + help=( + 'Force user specified sort order to have precedence over all fields, ' + 'see "Sorting Formats" for more details')) + video_format.add_option( + '--no-format-sort-force', + action='store_false', dest='format_sort_force', metavar='FORMAT', default=False, + help=( + 'Some fields have precedence over the user specified sort order (default), ' + 'see "Sorting Formats" for more details')) + video_format.add_option( + '--video-multistreams', + action='store_true', dest='allow_multiple_video_streams', default=None, + help='Allow multiple video streams to be merged into a single file') + video_format.add_option( + '--no-video-multistreams', + action='store_false', dest='allow_multiple_video_streams', + help='Only one video stream is downloaded for each output file (default)') + video_format.add_option( + '--audio-multistreams', + action='store_true', dest='allow_multiple_audio_streams', default=None, + help='Allow multiple audio streams to be merged into a single file') + video_format.add_option( + '--no-audio-multistreams', + action='store_false', dest='allow_multiple_audio_streams', + help='Only one audio stream is downloaded for each output file (default)') + video_format.add_option( + '--all-formats', + action='store_const', dest='format', const='all', + help=optparse.SUPPRESS_HELP) + video_format.add_option( + '--prefer-free-formats', + action='store_true', dest='prefer_free_formats', default=False, + help=( + 'Prefer video formats with free 
containers over non-free ones of same quality. ' + 'Use with "-S ext" to strictly prefer free containers irrespective of quality')) + video_format.add_option( + '--no-prefer-free-formats', + action='store_false', dest='prefer_free_formats', default=False, + help="Don't give any special preference to free containers (default)") + video_format.add_option( + '--check-formats', + action='store_true', dest='check_formats', default=None, + help='Check that the formats selected are actually downloadable') + video_format.add_option( + '--no-check-formats', + action='store_false', dest='check_formats', + help='Do not check that the formats selected are actually downloadable') + video_format.add_option( + '-F', '--list-formats', + action='store_true', dest='listformats', + help='List available formats of each video. Simulate unless --no-simulate is used') + video_format.add_option( + '--list-formats-as-table', + action='store_true', dest='listformats_table', default=True, + help=optparse.SUPPRESS_HELP) + video_format.add_option( + '--list-formats-old', '--no-list-formats-as-table', + action='store_false', dest='listformats_table', + help=optparse.SUPPRESS_HELP) + video_format.add_option( + '--merge-output-format', + action='store', dest='merge_output_format', metavar='FORMAT', default=None, + help=( + 'If a merge is required (e.g. bestvideo+bestaudio), ' + 'output to given container format. One of mkv, mp4, ogg, webm, flv. ' + 'Ignored if no merge is required')) + video_format.add_option( + '--allow-unplayable-formats', + action='store_true', dest='allow_unplayable_formats', default=False, + help=optparse.SUPPRESS_HELP) + video_format.add_option( + '--no-allow-unplayable-formats', + action='store_false', dest='allow_unplayable_formats', + help=optparse.SUPPRESS_HELP) + + subtitles = optparse.OptionGroup(parser, 'Subtitle Options') + subtitles.add_option( + '--write-subs', '--write-srt', + action='store_true', dest='writesubtitles', default=False, + help='Write subtitle file') + subtitles.add_option( + '--no-write-subs', '--no-write-srt', + action='store_false', dest='writesubtitles', + help='Do not write subtitle file (default)') + subtitles.add_option( + '--write-auto-subs', '--write-automatic-subs', + action='store_true', dest='writeautomaticsub', default=False, + help='Write automatically generated subtitle file (Alias: --write-automatic-subs)') + subtitles.add_option( + '--no-write-auto-subs', '--no-write-automatic-subs', + action='store_false', dest='writeautomaticsub', default=False, + help='Do not write auto-generated subtitles (default) (Alias: --no-write-automatic-subs)') + subtitles.add_option( + '--all-subs', + action='store_true', dest='allsubtitles', default=False, + help=optparse.SUPPRESS_HELP) + subtitles.add_option( + '--list-subs', + action='store_true', dest='listsubtitles', default=False, + help='List available subtitles of each video. Simulate unless --no-simulate is used') + subtitles.add_option( + '--sub-format', + action='store', dest='subtitlesformat', metavar='FORMAT', default='best', + help='Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"') + subtitles.add_option( + '--sub-langs', '--srt-langs', + action='callback', dest='subtitleslangs', metavar='LANGS', type='str', + default=[], callback=_list_from_options_callback, + help=( + 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs en.*,ja) ' + 'You can prefix the language code with a "-" to exempt it from the requested languages. 
(Eg: --sub-langs all,-live_chat) '
+            'Use --list-subs for a list of available language tags'))
+
+    downloader = optparse.OptionGroup(parser, 'Download Options')
+    downloader.add_option(
+        '-N', '--concurrent-fragments',
+        dest='concurrent_fragment_downloads', metavar='N', default=1, type=int,
+        help='Number of fragments of a dash/hlsnative video that should be downloaded concurrently (default is %default)')
+    downloader.add_option(
+        '-r', '--limit-rate', '--rate-limit',
+        dest='ratelimit', metavar='RATE',
+        help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)')
+    downloader.add_option(
+        '--throttled-rate',
+        dest='throttledratelimit', metavar='RATE',
+        help='Minimum download rate in bytes per second below which throttling is assumed and the video data is re-extracted (e.g. 100K)')
+    downloader.add_option(
+        '-R', '--retries',
+        dest='retries', metavar='RETRIES', default=10,
+        help='Number of retries (default is %default), or "infinite"')
+    downloader.add_option(
+        '--fragment-retries',
+        dest='fragment_retries', metavar='RETRIES', default=10,
+        help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)')
+    downloader.add_option(
+        '--skip-unavailable-fragments', '--no-abort-on-unavailable-fragment',
+        action='store_true', dest='skip_unavailable_fragments', default=True,
+        help='Skip unavailable fragments for DASH, hlsnative and ISM (default) (Alias: --no-abort-on-unavailable-fragment)')
+    downloader.add_option(
+        '--abort-on-unavailable-fragment', '--no-skip-unavailable-fragments',
+        action='store_false', dest='skip_unavailable_fragments',
+        help='Abort downloading if a fragment is unavailable (Alias: --no-skip-unavailable-fragments)')
+    downloader.add_option(
+        '--keep-fragments',
+        action='store_true', dest='keep_fragments', default=False,
+        help='Keep downloaded fragments on disk after downloading is finished')
+    downloader.add_option(
+        '--no-keep-fragments',
+        action='store_false', dest='keep_fragments',
+        help='Delete downloaded fragments after downloading is finished (default)')
+    downloader.add_option(
+        '--buffer-size',
+        dest='buffersize', metavar='SIZE', default='1024',
+        help='Size of download buffer (e.g. 1024 or 16K) (default is %default)')
+    downloader.add_option(
+        '--resize-buffer',
+        action='store_false', dest='noresizebuffer',
+        help='The buffer size is automatically resized from an initial value of --buffer-size (default)')
+    downloader.add_option(
+        '--no-resize-buffer',
+        action='store_true', dest='noresizebuffer', default=False,
+        help='Do not automatically adjust the buffer size')
+    downloader.add_option(
+        '--http-chunk-size',
+        dest='http_chunk_size', metavar='SIZE', default=None,
+        help=(
+            'Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). '
+            'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)'))
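Illustrative sketch (not part of the patch): the dests above map onto parameters of the embedding API, so the flags in this group have direct YoutubeDL equivalents; the URL and values below are hypothetical:

    from yt_dlp import YoutubeDL

    ydl_opts = {
        'concurrent_fragment_downloads': 4,  # -N 4
        'ratelimit': 4200000,                # --limit-rate 4.2M (bytes per second)
        'retries': 10,                       # -R 10
        'fragment_retries': 10,              # --fragment-retries 10
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])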
+    downloader.add_option(
+        '--test',
+        action='store_true', dest='test', default=False,
+        help=optparse.SUPPRESS_HELP)
+    downloader.add_option(
+        '--playlist-reverse',
+        action='store_true',
+        help='Download playlist videos in reverse order')
+    downloader.add_option(
+        '--no-playlist-reverse',
+        action='store_false', dest='playlist_reverse',
+        help='Download playlist videos in default order (default)')
+    downloader.add_option(
+        '--playlist-random',
+        action='store_true',
+        help='Download playlist videos in random order')
+    downloader.add_option(
+        '--xattr-set-filesize',
+        dest='xattr_set_filesize', action='store_true',
+        help='Set file xattribute ytdl.filesize with expected file size')
+    downloader.add_option(
+        '--hls-prefer-native',
+        dest='hls_prefer_native', action='store_true', default=None,
+        help=optparse.SUPPRESS_HELP)
+    downloader.add_option(
+        '--hls-prefer-ffmpeg',
+        dest='hls_prefer_native', action='store_false', default=None,
+        help=optparse.SUPPRESS_HELP)
+    downloader.add_option(
+        '--hls-use-mpegts',
+        dest='hls_use_mpegts', action='store_true', default=None,
+        help=(
+            'Use the mpegts container for HLS videos, '
+            'allowing some players to play the video while downloading, '
+            'and reducing the chance of file corruption if download is interrupted. '
+            'This is enabled by default for live streams'))
+    downloader.add_option(
+        '--no-hls-use-mpegts',
+        dest='hls_use_mpegts', action='store_false',
+        help=(
+            'Do not use the mpegts container for HLS videos. '
+            'This is default when not downloading live streams'))
+    downloader.add_option(
+        '--downloader', '--external-downloader',
+        dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str',
+        action='callback', callback=_dict_from_options_callback,
+        callback_kwargs={
+            'allowed_keys': 'http|ftp|m3u8|dash|rtsp|rtmp|mms',
+            'default_key': 'default',
+            'process': str.strip
+        }, help=(
+            'Name or path of the external downloader to use (optionally) prefixed by '
+            'the protocols (http, ftp, m3u8, dash, rtsp, rtmp, mms) to use it for. '
+            'Currently supports native, %s (Recommended: aria2c). '
+            'You can use this option multiple times to set different downloaders for different protocols. '
+            'For example, --downloader aria2c --downloader "dash,m3u8:native" will use '
+            'aria2c for http/ftp downloads, and the native downloader for dash/m3u8 downloads '
+            '(Alias: --external-downloader)' % ', '.join(list_external_downloaders())))
+    downloader.add_option(
+        '--downloader-args', '--external-downloader-args',
+        metavar='NAME:ARGS', dest='external_downloader_args', default={}, type='str',
+        action='callback', callback=_dict_from_options_callback,
+        callback_kwargs={
+            'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(list_external_downloaders()),
+            'default_key': 'default',
+            'process': compat_shlex_split
+        }, help=(
+            'Give these arguments to the external downloader. '
+            'Specify the downloader name and the arguments separated by a colon ":". '
+            'For ffmpeg, arguments can be passed to different positions using the same syntax as --postprocessor-args. 
' + 'You can use this option multiple times to give different arguments to different downloaders ' + '(Alias: --external-downloader-args)')) + + workarounds = optparse.OptionGroup(parser, 'Workarounds') + workarounds.add_option( + '--encoding', + dest='encoding', metavar='ENCODING', + help='Force the specified encoding (experimental)') + workarounds.add_option( + '--no-check-certificates', + action='store_true', dest='no_check_certificate', default=False, + help='Suppress HTTPS certificate validation') + workarounds.add_option( + '--prefer-insecure', '--prefer-unsecure', + action='store_true', dest='prefer_insecure', + help='Use an unencrypted connection to retrieve information about the video (Currently supported only for YouTube)') + workarounds.add_option( + '--user-agent', + metavar='UA', dest='user_agent', + help='Specify a custom user agent') + workarounds.add_option( + '--referer', + metavar='URL', dest='referer', default=None, + help='Specify a custom referer, use if the video access is restricted to one domain', + ) + workarounds.add_option( + '--add-header', + metavar='FIELD:VALUE', dest='headers', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={'multiple_keys': False}, + help='Specify a custom HTTP header and its value, separated by a colon ":". You can use this option multiple times', + ) + workarounds.add_option( + '--bidi-workaround', + dest='bidi_workaround', action='store_true', + help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') + workarounds.add_option( + '--sleep-requests', metavar='SECONDS', + dest='sleep_interval_requests', type=float, + help='Number of seconds to sleep between requests during data extraction') + workarounds.add_option( + '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', + dest='sleep_interval', type=float, + help=( + 'Number of seconds to sleep before each download. ' + 'This is the minimum time to sleep when used along with --max-sleep-interval ' + '(Alias: --min-sleep-interval)')) + workarounds.add_option( + '--max-sleep-interval', metavar='SECONDS', + dest='max_sleep_interval', type=float, + help='Maximum number of seconds to sleep. Can only be used along with --min-sleep-interval') + workarounds.add_option( + '--sleep-subtitles', metavar='SECONDS', + dest='sleep_interval_subtitles', default=0, type=int, + help='Number of seconds to sleep before each subtitle download') + + verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options') + verbosity.add_option( + '-q', '--quiet', + action='store_true', dest='quiet', default=False, + help='Activate quiet mode. If used with --verbose, print the log to stderr') + verbosity.add_option( + '--no-warnings', + dest='no_warnings', action='store_true', default=False, + help='Ignore warnings') + verbosity.add_option( + '-s', '--simulate', + action='store_true', dest='simulate', default=None, + help='Do not download the video and do not write anything to disk') + verbosity.add_option( + '--no-simulate', + action='store_false', dest='simulate', + help='Download the video even if printing/listing options are used') + verbosity.add_option( + '--ignore-no-formats-error', + action='store_true', dest='ignore_no_formats_error', default=False, + help=( + 'Ignore "No video formats" error. 
Useful for extracting metadata '
+            'even if the videos are not actually available for download (experimental)'))
+    verbosity.add_option(
+        '--no-ignore-no-formats-error',
+        action='store_false', dest='ignore_no_formats_error',
+        help='Throw error when no downloadable video formats are found (default)')
+    verbosity.add_option(
+        '--skip-download', '--no-download',
+        action='store_true', dest='skip_download', default=False,
+        help='Do not download the video but write all related files (Alias: --no-download)')
+    verbosity.add_option(
+        '-O', '--print',
+        metavar='TEMPLATE', action='append', dest='forceprint',
+        help=(
+            'Quiet, but print the given fields for each video. Simulate unless --no-simulate is used. '
+            'Either a field name or same syntax as the output template can be used'))
+    verbosity.add_option(
+        '-g', '--get-url',
+        action='store_true', dest='geturl', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '-e', '--get-title',
+        action='store_true', dest='gettitle', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--get-id',
+        action='store_true', dest='getid', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--get-thumbnail',
+        action='store_true', dest='getthumbnail', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--get-description',
+        action='store_true', dest='getdescription', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--get-duration',
+        action='store_true', dest='getduration', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--get-filename',
+        action='store_true', dest='getfilename', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--get-format',
+        action='store_true', dest='getformat', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '-j', '--dump-json',
+        action='store_true', dest='dumpjson', default=False,
+        help='Quiet, but print JSON information for each video. Simulate unless --no-simulate is used. See "OUTPUT TEMPLATE" for a description of available keys')
+    verbosity.add_option(
+        '-J', '--dump-single-json',
+        action='store_true', dest='dump_single_json', default=False,
+        help=(
+            'Quiet, but print JSON information for each URL or infojson passed. Simulate unless --no-simulate is used. '
+            'If the URL refers to a playlist, the whole playlist information is dumped in a single line'))
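Illustrative sketch (not part of the patch): -j/-J serialize the info dict that the embedding API returns, so the printing options above correspond roughly to extract_info() without a download; the URL is hypothetical:

    from yt_dlp import YoutubeDL

    with YoutubeDL({'quiet': True}) as ydl:
        info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
        print(info['title'], info.get('duration'))  # fields also usable with -O/--print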
+    verbosity.add_option(
+        '--print-json',
+        action='store_true', dest='print_json', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--force-write-archive', '--force-write-download-archive', '--force-download-archive',
+        action='store_true', dest='force_write_download_archive', default=False,
+        help=(
+            'Force download archive entries to be written as long as no errors occur, '
+            'even if -s or another simulation option is used (Alias: --force-download-archive)'))
+    verbosity.add_option(
+        '--newline',
+        action='store_true', dest='progress_with_newline', default=False,
+        help='Output progress bar as new lines')
+    verbosity.add_option(
+        '--no-progress',
+        action='store_true', dest='noprogress', default=None,
+        help='Do not print progress bar')
+    verbosity.add_option(
+        '--progress',
+        action='store_false', dest='noprogress',
+        help='Show progress bar, even if in quiet mode')
+    verbosity.add_option(
+        '--console-title',
+        action='store_true', dest='consoletitle', default=False,
+        help='Display progress in console titlebar')
+    verbosity.add_option(
+        '--progress-template',
+        metavar='[TYPES:]TEMPLATE', dest='progress_template', default={}, type='str',
+        action='callback', callback=_dict_from_options_callback,
+        callback_kwargs={
+            'allowed_keys': '(download|postprocess)(-title)?',
+            'default_key': 'download'
+        }, help=(
+            'Template for progress outputs, optionally prefixed with one of "download:" (default), '
+            '"download-title:" (the console title), "postprocess:", or "postprocess-title:". '
+            'The video\'s fields are accessible under the "info" key and '
+            'the progress attributes are accessible under the "progress" key. Eg: '
+            # TODO: Document the fields inside "progress"
+            '--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"'))
+    verbosity.add_option(
+        '-v', '--verbose',
+        action='store_true', dest='verbose', default=False,
+        help='Print various debugging information')
+    verbosity.add_option(
+        '--dump-pages', '--dump-intermediate-pages',
+        action='store_true', dest='dump_intermediate_pages', default=False,
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
+    verbosity.add_option(
+        '--write-pages',
+        action='store_true', dest='write_pages', default=False,
+        help='Write downloaded intermediary pages to files in the current directory to debug problems')
+    verbosity.add_option(
+        '--youtube-print-sig-code',
+        action='store_true', dest='youtube_print_sig_code', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--print-traffic', '--dump-headers',
+        dest='debug_printtraffic', action='store_true', default=False,
+        help='Display sent and read HTTP traffic')
+    verbosity.add_option(
+        '-C', '--call-home',
+        dest='call_home', action='store_true', default=False,
+        # help='[Broken] Contact the yt-dlp server for debugging')
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--no-call-home',
+        dest='call_home', action='store_false',
+        # help='Do not contact the yt-dlp server for debugging (default)')
+        help=optparse.SUPPRESS_HELP)
+
+    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
+    filesystem.add_option(
+        '-a', '--batch-file',
+        dest='batchfile', metavar='FILE',
+        help="File containing URLs to download ('-' for stdin), one URL per line. 
" + "Lines starting with '#', ';' or ']' are considered as comments and ignored") + filesystem.add_option( + '-P', '--paths', + metavar='[TYPES:]PATH', dest='paths', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': 'home|temp|%s' % '|'.join(OUTTMPL_TYPES.keys()), + 'default_key': 'home' + }, help=( + 'The paths where the files should be downloaded. ' + 'Specify the type of file and the path separated by a colon ":". ' + 'All the same types as --output are supported. ' + 'Additionally, you can also provide "home" (default) and "temp" paths. ' + 'All intermediary files are first downloaded to the temp path and ' + 'then the final files are moved over to the home path after download is finished. ' + 'This option is ignored if --output is an absolute path')) + filesystem.add_option( + '-o', '--output', + metavar='[TYPES:]TEMPLATE', dest='outtmpl', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': '|'.join(OUTTMPL_TYPES.keys()), + 'default_key': 'default' + }, help='Output filename template; see "OUTPUT TEMPLATE" for details') + filesystem.add_option( + '--output-na-placeholder', + dest='outtmpl_na_placeholder', metavar='TEXT', default='NA', + help=('Placeholder value for unavailable meta fields in output filename template (default: "%default")')) + filesystem.add_option( + '--autonumber-size', + dest='autonumber_size', metavar='NUMBER', type=int, + help=optparse.SUPPRESS_HELP) + filesystem.add_option( + '--autonumber-start', + dest='autonumber_start', metavar='NUMBER', default=1, type=int, + help=optparse.SUPPRESS_HELP) + filesystem.add_option( + '--restrict-filenames', + action='store_true', dest='restrictfilenames', default=False, + help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames') + filesystem.add_option( + '--no-restrict-filenames', + action='store_false', dest='restrictfilenames', + help='Allow Unicode characters, "&" and spaces in filenames (default)') + filesystem.add_option( + '--windows-filenames', + action='store_true', dest='windowsfilenames', default=False, + help='Force filenames to be windows compatible') + filesystem.add_option( + '--no-windows-filenames', + action='store_false', dest='windowsfilenames', + help='Make filenames windows compatible only if using windows (default)') + filesystem.add_option( + '--trim-filenames', '--trim-file-names', metavar='LENGTH', + dest='trim_file_name', default=0, type=int, + help='Limit the filename length (excluding extension) to the specified number of characters') + filesystem.add_option( + '-w', '--no-overwrites', + action='store_false', dest='overwrites', default=None, + help='Do not overwrite any files') + filesystem.add_option( + '--force-overwrites', '--yes-overwrites', + action='store_true', dest='overwrites', + help='Overwrite all video and metadata files. This option includes --no-continue') + filesystem.add_option( + '--no-force-overwrites', + action='store_const', dest='overwrites', const=None, + help='Do not overwrite the video, but overwrite related files (default)') + filesystem.add_option( + '-c', '--continue', + action='store_true', dest='continue_dl', default=True, + help='Resume partially downloaded files/fragments (default)') + filesystem.add_option( + '--no-continue', + action='store_false', dest='continue_dl', + help=( + 'Do not resume partially downloaded fragments. 
' + 'If the file is not fragmented, restart download of the entire file')) + filesystem.add_option( + '--part', + action='store_false', dest='nopart', default=False, + help='Use .part files instead of writing directly into output file (default)') + filesystem.add_option( + '--no-part', + action='store_true', dest='nopart', + help='Do not use .part files - write directly into output file') + filesystem.add_option( + '--mtime', + action='store_true', dest='updatetime', default=True, + help='Use the Last-modified header to set the file modification time (default)') + filesystem.add_option( + '--no-mtime', + action='store_false', dest='updatetime', + help='Do not use the Last-modified header to set the file modification time') + filesystem.add_option( + '--write-description', + action='store_true', dest='writedescription', default=False, + help='Write video description to a .description file') + filesystem.add_option( + '--no-write-description', + action='store_false', dest='writedescription', + help='Do not write video description (default)') + filesystem.add_option( + '--write-info-json', + action='store_true', dest='writeinfojson', default=False, + help='Write video metadata to a .info.json file (this may contain personal information)') + filesystem.add_option( + '--no-write-info-json', + action='store_false', dest='writeinfojson', + help='Do not write video metadata (default)') + filesystem.add_option( + '--write-annotations', + action='store_true', dest='writeannotations', default=False, + help=optparse.SUPPRESS_HELP) + filesystem.add_option( + '--no-write-annotations', + action='store_false', dest='writeannotations', + help=optparse.SUPPRESS_HELP) + filesystem.add_option( + '--write-playlist-metafiles', + action='store_true', dest='allow_playlist_files', default=None, + help=( + 'Write playlist metadata in addition to the video metadata ' + 'when using --write-info-json, --write-description etc. (default)')) + filesystem.add_option( + '--no-write-playlist-metafiles', + action='store_false', dest='allow_playlist_files', + help='Do not write playlist metadata when using --write-info-json, --write-description etc.') + filesystem.add_option( + '--clean-infojson', + action='store_true', dest='clean_infojson', default=None, + help=( + 'Remove some private fields such as filenames from the infojson. ' + 'Note that it could still contain some personal information (default)')) + filesystem.add_option( + '--no-clean-infojson', + action='store_false', dest='clean_infojson', + help='Write all fields to the infojson') + filesystem.add_option( + '--write-comments', '--get-comments', + action='store_true', dest='getcomments', default=False, + help=( + 'Retrieve video comments to be placed in the infojson. 
' + 'The comments are fetched even without this option if the extraction is known to be quick (Alias: --get-comments)')) + filesystem.add_option( + '--no-write-comments', '--no-get-comments', + action='store_false', dest='getcomments', + help='Do not retrieve video comments unless the extraction is known to be quick (Alias: --no-get-comments)') + filesystem.add_option( + '--load-info-json', '--load-info', + dest='load_info_filename', metavar='FILE', + help='JSON file containing the video information (created with the "--write-info-json" option)') + filesystem.add_option( + '--cookies', + dest='cookiefile', metavar='FILE', + help='File to read cookies from and dump cookie jar in') + filesystem.add_option( + '--no-cookies', + action='store_const', const=None, dest='cookiefile', metavar='FILE', + help='Do not read/dump cookies from/to file (default)') + filesystem.add_option( + '--cookies-from-browser', + dest='cookiesfrombrowser', metavar='BROWSER[:PROFILE]', + help=( + 'Load cookies from a user profile of the given web browser. ' + 'Currently supported browsers are: {}. ' + 'You can specify the user profile name or directory using ' + '"BROWSER:PROFILE_NAME" or "BROWSER:PROFILE_PATH". ' + 'If no profile is given, the most recently accessed one is used'.format( + ', '.join(sorted(SUPPORTED_BROWSERS))))) + filesystem.add_option( + '--no-cookies-from-browser', + action='store_const', const=None, dest='cookiesfrombrowser', + help='Do not load cookies from browser (default)') + filesystem.add_option( + '--cache-dir', dest='cachedir', default=None, metavar='DIR', + help='Location in the filesystem where youtube-dl can store some downloaded information (such as client ids and signatures) permanently. By default $XDG_CACHE_HOME/yt-dlp or ~/.cache/yt-dlp') + filesystem.add_option( + '--no-cache-dir', action='store_false', dest='cachedir', + help='Disable filesystem caching') + filesystem.add_option( + '--rm-cache-dir', + action='store_true', dest='rm_cachedir', + help='Delete all filesystem cache files') + + thumbnail = optparse.OptionGroup(parser, 'Thumbnail Options') + thumbnail.add_option( + '--write-thumbnail', + action='store_true', dest='writethumbnail', default=False, + help='Write thumbnail image to disk') + thumbnail.add_option( + '--no-write-thumbnail', + action='store_false', dest='writethumbnail', + help='Do not write thumbnail image to disk (default)') + thumbnail.add_option( + '--write-all-thumbnails', + action='store_true', dest='write_all_thumbnails', default=False, + help='Write all thumbnail image formats to disk') + thumbnail.add_option( + '--list-thumbnails', + action='store_true', dest='list_thumbnails', default=False, + help='List available thumbnails of each video. Simulate unless --no-simulate is used') + + link = optparse.OptionGroup(parser, 'Internet Shortcut Options') + link.add_option( + '--write-link', + action='store_true', dest='writelink', default=False, + help='Write an internet shortcut file, depending on the current platform (.url, .webloc or .desktop). The URL may be cached by the OS') + link.add_option( + '--write-url-link', + action='store_true', dest='writeurllink', default=False, + help='Write a .url Windows internet shortcut. 
The OS caches the URL based on the file path') + link.add_option( + '--write-webloc-link', + action='store_true', dest='writewebloclink', default=False, + help='Write a .webloc macOS internet shortcut') + link.add_option( + '--write-desktop-link', + action='store_true', dest='writedesktoplink', default=False, + help='Write a .desktop Linux internet shortcut') + + postproc = optparse.OptionGroup(parser, 'Post-Processing Options') + postproc.add_option( + '-x', '--extract-audio', + action='store_true', dest='extractaudio', default=False, + help='Convert video files to audio-only files (requires ffmpeg and ffprobe)') + postproc.add_option( + '--audio-format', metavar='FORMAT', dest='audioformat', default='best', + help=( + 'Specify audio format to convert the audio to when -x is used. Currently supported formats are: ' + 'best (default) or one of %s' % '|'.join(FFmpegExtractAudioPP.SUPPORTED_EXTS))) + postproc.add_option( + '--audio-quality', metavar='QUALITY', + dest='audioquality', default='5', + help='Specify ffmpeg audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)') + postproc.add_option( + '--remux-video', + metavar='FORMAT', dest='remuxvideo', default=None, + help=( + 'Remux the video into another container if necessary (currently supported: %s). ' + 'If target container does not support the video/audio codec, remuxing will fail. ' + 'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 ' + 'and anything else to mkv.' % '|'.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS))) + postproc.add_option( + '--recode-video', + metavar='FORMAT', dest='recodevideo', default=None, + help=( + 'Re-encode the video into another format if re-encoding is necessary. ' + 'The syntax and supported formats are the same as --remux-video')) + postproc.add_option( + '--postprocessor-args', '--ppa', + metavar='NAME:ARGS', dest='postprocessor_args', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': r'\w+(?:\+\w+)?', 'default_key': 'default-compat', + 'process': compat_shlex_split, + 'multiple_keys': False + }, help=( + 'Give these arguments to the postprocessors. ' + 'Specify the postprocessor/executable name and the arguments separated by a colon ":" ' + 'to give the argument to the specified postprocessor/executable. Supported PP are: ' + 'Merger, ModifyChapters, SplitChapters, ExtractAudio, VideoRemuxer, VideoConvertor, ' + 'Metadata, EmbedSubtitle, EmbedThumbnail, SubtitlesConvertor, ThumbnailsConvertor, ' + 'FixupStretched, FixupM4a, FixupM3u8, FixupTimestamp and FixupDuration. ' + 'The supported executables are: AtomicParsley, FFmpeg and FFprobe. ' + 'You can also specify "PP+EXE:ARGS" to give the arguments to the specified executable ' + 'only when being used by the specified postprocessor. Additionally, for ffmpeg/ffprobe, ' + '"_i"/"_o" can be appended to the prefix optionally followed by a number to pass the argument ' + 'before the specified input/output file. Eg: --ppa "Merger+ffmpeg_i1:-v quiet". ' + 'You can use this option multiple times to give different arguments to different ' + 'postprocessors. 
(Alias: --ppa)')) + postproc.add_option( + '-k', '--keep-video', + action='store_true', dest='keepvideo', default=False, + help='Keep the intermediate video file on disk after post-processing') + postproc.add_option( + '--no-keep-video', + action='store_false', dest='keepvideo', + help='Delete the intermediate video file after post-processing (default)') + postproc.add_option( + '--post-overwrites', + action='store_false', dest='nopostoverwrites', + help='Overwrite post-processed files (default)') + postproc.add_option( + '--no-post-overwrites', + action='store_true', dest='nopostoverwrites', default=False, + help='Do not overwrite post-processed files') + postproc.add_option( + '--embed-subs', + action='store_true', dest='embedsubtitles', default=False, + help='Embed subtitles in the video (only for mp4, webm and mkv videos)') + postproc.add_option( + '--no-embed-subs', + action='store_false', dest='embedsubtitles', + help='Do not embed subtitles (default)') + postproc.add_option( + '--embed-thumbnail', + action='store_true', dest='embedthumbnail', default=False, + help='Embed thumbnail in the video as cover art') + postproc.add_option( + '--no-embed-thumbnail', + action='store_false', dest='embedthumbnail', + help='Do not embed thumbnail (default)') + postproc.add_option( + '--embed-metadata', '--add-metadata', + action='store_true', dest='addmetadata', default=False, + help='Embed metadata to the video file. Also adds chapters to file unless --no-add-chapters is used (Alias: --add-metadata)') + postproc.add_option( + '--no-embed-metadata', '--no-add-metadata', + action='store_false', dest='addmetadata', + help='Do not add metadata to file (default) (Alias: --no-add-metadata)') + postproc.add_option( + '--embed-chapters', '--add-chapters', + action='store_true', dest='addchapters', default=None, + help='Add chapter markers to the video file (Alias: --add-chapters)') + postproc.add_option( + '--no-embed-chapters', '--no-add-chapters', + action='store_false', dest='addchapters', + help='Do not add chapter markers (default) (Alias: --no-add-chapters)') + postproc.add_option( + '--metadata-from-title', + metavar='FORMAT', dest='metafromtitle', + help=optparse.SUPPRESS_HELP) + postproc.add_option( + '--parse-metadata', + metavar='FROM:TO', dest='parse_metadata', action='append', + help=( + 'Parse additional metadata like title/artist from other fields; ' + 'see "MODIFYING METADATA" for details')) + postproc.add_option( + '--replace-in-metadata', + dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3, + help='Replace text in a metadata field using the given regex. This option can be used multiple times') + postproc.add_option( + '--xattrs', + action='store_true', dest='xattrs', default=False, + help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)') + postproc.add_option( + '--fixup', + metavar='POLICY', dest='fixup', default=None, + choices=('never', 'ignore', 'warn', 'detect_or_warn', 'force'), + help=( + 'Automatically correct known faults of the file. 
'
+            'One of never (do nothing), warn (only emit a warning), '
+            'detect_or_warn (the default; fix file if we can, warn otherwise), '
+            'force (try fixing even if file already exists)'))
+    postproc.add_option(
+        '--prefer-avconv', '--no-prefer-ffmpeg',
+        action='store_false', dest='prefer_ffmpeg',
+        help=optparse.SUPPRESS_HELP)
+    postproc.add_option(
+        '--prefer-ffmpeg', '--no-prefer-avconv',
+        action='store_true', dest='prefer_ffmpeg', default=True,
+        help=optparse.SUPPRESS_HELP)
+    postproc.add_option(
+        '--ffmpeg-location', '--avconv-location', metavar='PATH',
+        dest='ffmpeg_location',
+        help='Location of the ffmpeg binary; either the path to the binary or its containing directory')
+    postproc.add_option(
+        '--exec', metavar='CMD',
+        action='append', dest='exec_cmd',
+        help=(
+            'Execute a command on the file after downloading and post-processing. '
+            'Same syntax as the output template can be used to pass any field as arguments to the command. '
+            'An additional field "filepath" that contains the final path of the downloaded file is also available. '
+            'If no fields are passed, %(filepath)q is appended to the end of the command. '
+            'This option can be used multiple times'))
+    postproc.add_option(
+        '--no-exec',
+        action='store_const', dest='exec_cmd', const=[],
+        help='Remove any previously defined --exec')
+    postproc.add_option(
+        '--exec-before-download', metavar='CMD',
+        action='append', dest='exec_before_dl_cmd',
+        help=(
+            'Execute a command before the actual download. '
+            'The syntax is the same as --exec but "filepath" is not available. '
+            'This option can be used multiple times'))
+    postproc.add_option(
+        '--no-exec-before-download',
+        action='store_const', dest='exec_before_dl_cmd', const=[],
+        help='Remove any previously defined --exec-before-download')
+    postproc.add_option(
+        '--convert-subs', '--convert-sub', '--convert-subtitles',
+        metavar='FORMAT', dest='convertsubtitles', default=None,
+        help=(
+            'Convert the subtitles to another format (currently supported: %s) '
+            '(Alias: --convert-subtitles)' % '|'.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
+    postproc.add_option(
+        '--convert-thumbnails',
+        metavar='FORMAT', dest='convertthumbnails', default=None,
+        help=(
+            'Convert the thumbnails to another format '
+            '(currently supported: %s) ' % '|'.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
+    postproc.add_option(
+        '--split-chapters', '--split-tracks',
+        dest='split_chapters', action='store_true', default=False,
+        help=(
+            'Split video into multiple files based on internal chapters. '
+            'The "chapter:" prefix can be used with "--paths" and "--output" to '
+            'set the output filename for the split files. See "OUTPUT TEMPLATE" for details'))
+    postproc.add_option(
+        '--no-split-chapters', '--no-split-tracks',
+        dest='split_chapters', action='store_false',
+        help='Do not split video based on chapters (default)')
+    postproc.add_option(
+        '--remove-chapters',
+        metavar='REGEX', dest='remove_chapters', action='append',
+        help='Remove chapters whose title matches the given regular expression. This option can be used multiple times')
+    postproc.add_option(
+        '--no-remove-chapters', dest='remove_chapters', action='store_const', const=None,
+        help='Do not remove any chapters from the file (default)')
+    postproc.add_option(
+        '--force-keyframes-at-cuts',
+        action='store_true', dest='force_keyframes_at_cuts', default=False,
+        help=(
+            'Force keyframes around the chapters before removing/splitting them. '
+            'Requires a re-encode and thus is very slow, but the resulting video '
+            'may have fewer artifacts around the cuts'))
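Illustrative sketch (not part of the patch): in the embedding API, the switches of this group become entries in the 'postprocessors' parameter; a rough equivalent of `-x --audio-format mp3 --audio-quality 5` (URL hypothetical):

    from yt_dlp import YoutubeDL

    ydl_opts = {
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',  # -x
            'preferredcodec': 'mp3',      # --audio-format mp3
            'preferredquality': '5',      # --audio-quality 5
        }],
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])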
+    postproc.add_option(
+        '--no-force-keyframes-at-cuts',
+        action='store_false', dest='force_keyframes_at_cuts',
+        help='Do not force keyframes around the chapters when cutting/splitting (default)')
+    _postprocessor_opts_parser = lambda key, val='': (
+        *(item.split('=', 1) for item in (val.split(';') if val else [])),
+        ('key', remove_end(key, 'PP')))
+    postproc.add_option(
+        '--use-postprocessor',
+        metavar='NAME[:ARGS]', dest='add_postprocessors', default=[], type='str',
+        action='callback', callback=_list_from_options_callback,
+        callback_kwargs={
+            'delim': None,
+            'process': lambda val: dict(_postprocessor_opts_parser(*val.split(':', 1)))
+        }, help=(
+            'The (case sensitive) name of plugin postprocessors to be enabled, '
+            'and (optionally) arguments to be passed to it, separated by a colon ":". '
+            'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
+            'The "when" argument determines when the postprocessor is invoked. '
+            'It can be one of "pre_process" (after extraction), '
+            '"before_dl" (before video download), "post_process" (after video download; default) '
+            'or "after_move" (after moving files to their final locations). '
+            'This option can be used multiple times to add different postprocessors'))
+
+    sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=(
+        'Make chapter entries for, or remove various segments (sponsor, introductions, etc.) '
+        'from downloaded YouTube videos using the SponsorBlock API (https://sponsor.ajay.app)'))
+    sponsorblock.add_option(
+        '--sponsorblock-mark', metavar='CATS',
+        dest='sponsorblock_mark', default=set(), action='callback', type='str',
+        callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()},
+        help=(
+            'SponsorBlock categories to create chapters for, separated by commas. '
+            'Available categories are all, %s. You can prefix the category with a "-" to exempt it. '
+            'See https://wiki.sponsor.ajay.app/index.php/Segment_Categories for description of the categories. '
+            'Eg: --sponsorblock-mark all,-preview' % ', '.join(SponsorBlockPP.CATEGORIES.keys())))
+    sponsorblock.add_option(
+        '--sponsorblock-remove', metavar='CATS',
+        dest='sponsorblock_remove', default=set(), action='callback', type='str',
+        callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()},
+        help=(
+            'SponsorBlock categories to be removed from the video file, separated by commas. '
+            'If a category is present in both mark and remove, remove takes precedence. '
+            'The syntax and available categories are the same as for --sponsorblock-mark'))
+    sponsorblock.add_option(
+        '--sponsorblock-chapter-title', metavar='TEMPLATE',
+        default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title',
+        help=(
+            'The title template for SponsorBlock chapters created by --sponsorblock-mark. '
+            'The same syntax as the output template is used, but the only available fields are '
+            'start_time, end_time, category, categories, name, category_names. Defaults to "%default"'))
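Worked example (not part of the patch) of the two callback helpers used in this hunk; 'MyPluginPP' is a made-up plugin name:

    # --use-postprocessor "MyPluginPP:when=before_dl;foo=1" is stored as
    dict(_postprocessor_opts_parser('MyPluginPP', 'when=before_dl;foo=1'))
    # -> {'when': 'before_dl', 'foo': '1', 'key': 'MyPlugin'}

    # --sponsorblock-mark all,-preview goes through _set_from_options_callback:
    # values are handled in the order given, so 'all' first fills the set with
    # every known category and '-preview' then discards 'preview'.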
Defaults to "%default"')) + sponsorblock.add_option( + '--no-sponsorblock', default=False, + action='store_true', dest='no_sponsorblock', + help='Disable both --sponsorblock-mark and --sponsorblock-remove') + sponsorblock.add_option( + '--sponsorblock-api', metavar='URL', + default='https://sponsor.ajay.app', dest='sponsorblock_api', + help='SponsorBlock API location, defaults to %default') + + sponsorblock.add_option( + '--sponskrub', + action='store_true', dest='sponskrub', default=False, + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--no-sponskrub', + action='store_false', dest='sponskrub', + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--sponskrub-cut', default=False, + action='store_true', dest='sponskrub_cut', + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--no-sponskrub-cut', + action='store_false', dest='sponskrub_cut', + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--sponskrub-force', default=False, + action='store_true', dest='sponskrub_force', + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--no-sponskrub-force', + action='store_true', dest='sponskrub_force', + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--sponskrub-location', metavar='PATH', + dest='sponskrub_path', default='', + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( + '--sponskrub-args', dest='sponskrub_args', metavar='ARGS', + help=optparse.SUPPRESS_HELP) + + extractor = optparse.OptionGroup(parser, 'Extractor Options') + extractor.add_option( + '--extractor-retries', + dest='extractor_retries', metavar='RETRIES', default=3, + help='Number of retries for known extractor errors (default is %default), or "infinite"') + extractor.add_option( + '--allow-dynamic-mpd', '--no-ignore-dynamic-mpd', + action='store_true', dest='dynamic_mpd', default=True, + help='Process dynamic DASH manifests (default) (Alias: --no-ignore-dynamic-mpd)') + extractor.add_option( + '--ignore-dynamic-mpd', '--no-allow-dynamic-mpd', + action='store_false', dest='dynamic_mpd', + help='Do not process dynamic DASH manifests (Alias: --no-allow-dynamic-mpd)') + extractor.add_option( + '--hls-split-discontinuity', + dest='hls_split_discontinuity', action='store_true', default=False, + help='Split HLS playlists to different formats at discontinuities such as ad breaks' + ) + extractor.add_option( + '--no-hls-split-discontinuity', + dest='hls_split_discontinuity', action='store_false', + help='Do not split HLS playlists to different formats at discontinuities such as ad breaks (default)') + _extractor_arg_parser = lambda key, vals='': (key.strip().lower().replace('-', '_'), [val.strip() for val in vals.split(',')]) + extractor.add_option( + '--extractor-args', + metavar='KEY:ARGS', dest='extractor_args', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'multiple_keys': False, + 'process': lambda val: dict( + _extractor_arg_parser(*arg.split('=', 1)) for arg in val.split(';')) + }, help=( + 'Pass these arguments to the extractor. See "EXTRACTOR ARGUMENTS" for details. 
' + 'You can use this option multiple times to give arguments for different extractors')) + extractor.add_option( + '--youtube-include-dash-manifest', '--no-youtube-skip-dash-manifest', + action='store_true', dest='youtube_include_dash_manifest', default=True, + help=optparse.SUPPRESS_HELP) + extractor.add_option( + '--youtube-skip-dash-manifest', '--no-youtube-include-dash-manifest', + action='store_false', dest='youtube_include_dash_manifest', + help=optparse.SUPPRESS_HELP) + extractor.add_option( + '--youtube-include-hls-manifest', '--no-youtube-skip-hls-manifest', + action='store_true', dest='youtube_include_hls_manifest', default=True, + help=optparse.SUPPRESS_HELP) + extractor.add_option( + '--youtube-skip-hls-manifest', '--no-youtube-include-hls-manifest', + action='store_false', dest='youtube_include_hls_manifest', + help=optparse.SUPPRESS_HELP) + + parser.add_option_group(general) + parser.add_option_group(network) + parser.add_option_group(geo) + parser.add_option_group(selection) + parser.add_option_group(downloader) + parser.add_option_group(filesystem) + parser.add_option_group(thumbnail) + parser.add_option_group(link) + parser.add_option_group(verbosity) + parser.add_option_group(workarounds) + parser.add_option_group(video_format) + parser.add_option_group(subtitles) + parser.add_option_group(authentication) + parser.add_option_group(postproc) + parser.add_option_group(sponsorblock) + parser.add_option_group(extractor) + + if overrideArguments is not None: + opts, args = parser.parse_args(overrideArguments) + if opts.verbose: + write_string('[debug] Override config: ' + repr(overrideArguments) + '\n') + else: + def compat_conf(conf): + if sys.version_info < (3,): + return [a.decode(preferredencoding(), 'replace') for a in conf] + return conf + + configs = { + 'command-line': compat_conf(sys.argv[1:]), + 'custom': [], 'home': [], 'portable': [], 'user': [], 'system': []} + paths = {'command-line': False} + + def read_options(name, path, user=False): + ''' loads config files and returns ignoreconfig ''' + # Multiple package names can be given here + # Eg: ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for + # the configuration file of any of these three packages + for package in ('yt-dlp',): + if user: + config, current_path = _readUserConf(package, default=None) + else: + current_path = os.path.join(path, '%s.conf' % package) + config = _readOptions(current_path, default=None) + if config is not None: + configs[name], paths[name] = config, current_path + return parser.parse_args(config)[0].ignoreconfig + return False + + def get_configs(): + opts, _ = parser.parse_args(configs['command-line']) + if opts.config_location is not None: + location = compat_expanduser(opts.config_location) + if os.path.isdir(location): + location = os.path.join(location, 'yt-dlp.conf') + if not os.path.exists(location): + parser.error('config-location %s does not exist.' 
% location) + config = _readOptions(location, default=None) + if config: + configs['custom'], paths['custom'] = config, location + + if opts.ignoreconfig: + return + if parser.parse_args(configs['custom'])[0].ignoreconfig: + return + if read_options('portable', get_executable_path()): + return + opts, _ = parser.parse_args(configs['portable'] + configs['custom'] + configs['command-line']) + if read_options('home', expand_path(opts.paths.get('home', '')).strip()): + return + if read_options('system', '/etc'): + return + if read_options('user', None, user=True): + configs['system'], paths['system'] = [], None + + get_configs() + argv = configs['system'] + configs['user'] + configs['home'] + configs['portable'] + configs['custom'] + configs['command-line'] + opts, args = parser.parse_args(argv) + if opts.verbose: + for label in ('Command-line', 'Custom', 'Portable', 'Home', 'User', 'System'): + key = label.lower() + if paths.get(key): + write_string(f'[debug] {label} config file: {paths[key]}\n') + if paths.get(key) is not None: + write_string(f'[debug] {label} config: {_hide_login_info(configs[key])!r}\n') + + return parser, opts, args diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py new file mode 100644 index 000000000..07c87b76a --- /dev/null +++ b/yt_dlp/postprocessor/__init__.py @@ -0,0 +1,43 @@ +# flake8: noqa: F401 + +from ..utils import load_plugins + +from .embedthumbnail import EmbedThumbnailPP +from .exec import ExecPP, ExecAfterDownloadPP +from .ffmpeg import ( + FFmpegPostProcessor, + FFmpegEmbedSubtitlePP, + FFmpegExtractAudioPP, + FFmpegFixupDurationPP, + FFmpegFixupStretchedPP, + FFmpegFixupTimestampPP, + FFmpegFixupM3u8PP, + FFmpegFixupM4aPP, + FFmpegMergerPP, + FFmpegMetadataPP, + FFmpegSubtitlesConvertorPP, + FFmpegThumbnailsConvertorPP, + FFmpegSplitChaptersPP, + FFmpegVideoConvertorPP, + FFmpegVideoRemuxerPP, +) +from .metadataparser import ( + MetadataFromFieldPP, + MetadataFromTitlePP, + MetadataParserPP, +) +from .modify_chapters import ModifyChaptersPP +from .movefilesafterdownload import MoveFilesAfterDownloadPP +from .sponskrub import SponSkrubPP +from .sponsorblock import SponsorBlockPP +from .xattrpp import XAttrMetadataPP + +_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP', globals()) + + +def get_postprocessor(key): + return globals()[key + 'PP'] + + +__all__ = [name for name in globals().keys() if name.endswith('PP')] +__all__.append('FFmpegPostProcessor') diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py new file mode 100644 index 000000000..b36716743 --- /dev/null +++ b/yt_dlp/postprocessor/common.py @@ -0,0 +1,178 @@ +from __future__ import unicode_literals + +import copy +import functools +import os + +from ..compat import compat_str +from ..utils import ( + _configuration_args, + encodeFilename, + PostProcessingError, +) + + +class PostProcessorMetaClass(type): + @staticmethod + def run_wrapper(func): + @functools.wraps(func) + def run(self, info, *args, **kwargs): + info_copy = copy.deepcopy(self._copy_infodict(info)) + self._hook_progress({'status': 'started'}, info_copy) + ret = func(self, info, *args, **kwargs) + if ret is not None: + _, info = ret + self._hook_progress({'status': 'finished'}, info_copy) + return ret + return run + + def __new__(cls, name, bases, attrs): + if 'run' in attrs: + attrs['run'] = cls.run_wrapper(attrs['run']) + return type.__new__(cls, name, bases, attrs) + + +class PostProcessor(metaclass=PostProcessorMetaClass): + """Post Processor class. 
+ + PostProcessor objects can be added to downloaders with their + add_post_processor() method. When the downloader has finished a + successful download, it will take its internal chain of PostProcessors + and start calling the run() method on each one of them, first with + an initial argument and then with the returned value of the previous + PostProcessor. + + The chain will be stopped if one of them ever returns None or the end + of the chain is reached. + + PostProcessor objects follow a "mutual registration" process similar + to InfoExtractor objects. + + Optionally, a PostProcessor can use a list of additional command-line arguments + with self._configuration_args. + """ + + _downloader = None + + def __init__(self, downloader=None): + self._progress_hooks = [] + self.add_progress_hook(self.report_progress) + self.set_downloader(downloader) + self.PP_NAME = self.pp_key() + + @classmethod + def pp_key(cls): + name = cls.__name__[:-2] + return compat_str(name[6:]) if name[:6].lower() == 'ffmpeg' else name + + def to_screen(self, text, prefix=True, *args, **kwargs): + tag = '[%s] ' % self.PP_NAME if prefix else '' + if self._downloader: + return self._downloader.to_screen('%s%s' % (tag, text), *args, **kwargs) + + def report_warning(self, text, *args, **kwargs): + if self._downloader: + return self._downloader.report_warning(text, *args, **kwargs) + + def report_error(self, text, *args, **kwargs): + # Exists only for compatibility. Do not use + if self._downloader: + return self._downloader.report_error(text, *args, **kwargs) + + def write_debug(self, text, *args, **kwargs): + if self._downloader: + return self._downloader.write_debug(text, *args, **kwargs) + + def get_param(self, name, default=None, *args, **kwargs): + if self._downloader: + return self._downloader.params.get(name, default, *args, **kwargs) + return default + + def set_downloader(self, downloader): + """Sets the downloader for this PP.""" + self._downloader = downloader + for ph in getattr(downloader, '_postprocessor_hooks', []): + self.add_progress_hook(ph) + + def _copy_infodict(self, info_dict): + return getattr(self._downloader, '_copy_infodict', dict)(info_dict) + + @staticmethod + def _restrict_to(*, video=True, audio=True, images=True): + allowed = {'video': video, 'audio': audio, 'images': images} + + def decorator(func): + @functools.wraps(func) + def wrapper(self, info): + format_type = ( + 'video' if info.get('vcodec') != 'none' + else 'audio' if info.get('acodec') != 'none' + else 'images') + if allowed[format_type]: + return func(self, info) + else: + self.to_screen('Skipping %s' % format_type) + return [], info + return wrapper + return decorator + + def run(self, information): + """Run the PostProcessor. + + The "information" argument is a dictionary like the ones + composed by InfoExtractors. The only difference is that this + one has an extra field called "filepath" that points to the + downloaded file. + + This method returns a tuple: the first element is a list of the files + that can be deleted, and the second is the updated + information. + + In addition, this method may raise a PostProcessingError + exception if post processing fails. 
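+ + For example, an override that keeps the file and merely tags the info dict (a minimal sketch; 'my_field' is a hypothetical field used only for illustration) could be: + + def run(self, information): + information['my_field'] = 'example' # hypothetical field + return [], information # delete nothing, pass info along 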
+ """ + return [], information # by default, keep file and do nothing + + def try_utime(self, path, atime, mtime, errnote='Cannot update utime of file'): + try: + os.utime(encodeFilename(path), (atime, mtime)) + except Exception: + self.report_warning(errnote) + + def _configuration_args(self, exe, *args, **kwargs): + return _configuration_args( + self.pp_key(), self.get_param('postprocessor_args'), exe, *args, **kwargs) + + def _hook_progress(self, status, info_dict): + if not self._progress_hooks: + return + status.update({ + 'info_dict': info_dict, + 'postprocessor': self.pp_key(), + }) + for ph in self._progress_hooks: + ph(status) + + def add_progress_hook(self, ph): + # See YoutubeDl.py (search for postprocessor_hooks) for a description of this interface + self._progress_hooks.append(ph) + + def report_progress(self, s): + s['_default_template'] = '%(postprocessor)s %(status)s' % s + + progress_dict = s.copy() + progress_dict.pop('info_dict') + progress_dict = {'info': s['info_dict'], 'progress': progress_dict} + + progress_template = self.get_param('progress_template', {}) + tmpl = progress_template.get('postprocess') + if tmpl: + self._downloader.to_stdout(self._downloader.evaluate_outtmpl(tmpl, progress_dict)) + + self._downloader.to_console_title(self._downloader.evaluate_outtmpl( + progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s', + progress_dict)) + + +class AudioConversionError(PostProcessingError): + pass diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py new file mode 100644 index 000000000..3139a6338 --- /dev/null +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -0,0 +1,235 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import imghdr +import os +import subprocess +import re + +try: + from mutagen.flac import Picture, FLAC + from mutagen.mp4 import MP4, MP4Cover + from mutagen.oggopus import OggOpus + from mutagen.oggvorbis import OggVorbis + has_mutagen = True +except ImportError: + has_mutagen = False + +from .common import PostProcessor +from .ffmpeg import ( + FFmpegPostProcessor, + FFmpegThumbnailsConvertorPP, +) +from ..utils import ( + check_executable, + encodeArgument, + encodeFilename, + error_to_compat_str, + PostProcessingError, + prepend_extension, + process_communicate_or_kill, + shell_quote, +) + + +class EmbedThumbnailPPError(PostProcessingError): + pass + + +class EmbedThumbnailPP(FFmpegPostProcessor): + + def __init__(self, downloader=None, already_have_thumbnail=False): + FFmpegPostProcessor.__init__(self, downloader) + self._already_have_thumbnail = already_have_thumbnail + + def _get_thumbnail_resolution(self, filename, thumbnail_dict): + def guess(): + width, height = thumbnail_dict.get('width'), thumbnail_dict.get('height') + if width and height: + return width, height + + try: + size_regex = r',\s*(?P<w>\d+)x(?P<h>\d+)\s*[,\[]' + size_result = self.run_ffmpeg(filename, None, ['-hide_banner'], expected_retcodes=(1,)) + mobj = re.search(size_regex, size_result) + if mobj is None: + return guess() + except PostProcessingError as err: + self.report_warning('unable to find the thumbnail resolution; %s' % error_to_compat_str(err)) + return guess() + return int(mobj.group('w')), int(mobj.group('h')) + + def _report_run(self, exe, filename): + self.to_screen('%s: Adding thumbnail to "%s"' % (exe, filename)) + + @PostProcessor._restrict_to(images=False) + def run(self, info): + filename = info['filepath'] + temp_filename = prepend_extension(filename, 
'temp') + + if not info.get('thumbnails'): + self.to_screen('There aren\'t any thumbnails to embed') + return [], info + + idx = next((-i for i, t in enumerate(info['thumbnails'][::-1], 1) if t.get('filepath')), None) + if idx is None: + self.to_screen('There are no thumbnails on disk') + return [], info + thumbnail_filename = info['thumbnails'][idx]['filepath'] + if not os.path.exists(encodeFilename(thumbnail_filename)): + self.report_warning('Skipping embedding the thumbnail because the file is missing.') + return [], info + + # Correct extension for WebP file with wrong extension (see #25687, #25717) + convertor = FFmpegThumbnailsConvertorPP(self._downloader) + convertor.fixup_webp(info, idx) + + original_thumbnail = thumbnail_filename = info['thumbnails'][idx]['filepath'] + + # Convert unsupported thumbnail formats to PNG (see #25687, #25717) + # Original behavior was to convert to JPG, but since JPG is a lossy + # format, there will be some additional data loss. + # PNG, on the other hand, is lossless. + thumbnail_ext = os.path.splitext(thumbnail_filename)[1][1:] + if thumbnail_ext not in ('jpg', 'jpeg', 'png'): + thumbnail_filename = convertor.convert_thumbnail(thumbnail_filename, 'png') + thumbnail_ext = 'png' + + mtime = os.stat(encodeFilename(filename)).st_mtime + + success = True + if info['ext'] == 'mp3': + options = [ + '-c', 'copy', '-map', '0:0', '-map', '1:0', '-id3v2_version', '3', + '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (front)"'] + + self._report_run('ffmpeg', filename) + self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) + + elif info['ext'] in ['mkv', 'mka']: + options = ['-c', 'copy', '-map', '0', '-dn'] + + mimetype = 'image/%s' % ('png' if thumbnail_ext == 'png' else 'jpeg') + old_stream, new_stream = self.get_stream_number( + filename, ('tags', 'mimetype'), mimetype) + if old_stream is not None: + options.extend(['-map', '-0:%d' % old_stream]) + new_stream -= 1 + options.extend([ + '-attach', thumbnail_filename, + '-metadata:s:%d' % new_stream, 'mimetype=%s' % mimetype, + '-metadata:s:%d' % new_stream, 'filename=cover.%s' % thumbnail_ext]) + + self._report_run('ffmpeg', filename) + self.run_ffmpeg(filename, temp_filename, options) + + elif info['ext'] in ['m4a', 'mp4', 'mov']: + prefer_atomicparsley = 'embed-thumbnail-atomicparsley' in self.get_param('compat_opts', []) + # Method 1: Use mutagen + if not has_mutagen or prefer_atomicparsley: + success = False + else: + try: + self._report_run('mutagen', filename) + meta = MP4(filename) + # NOTE: the 'covr' atom is a non-standard MPEG-4 atom, + # Apple iTunes 'M4A' files include the 'moov.udta.meta.ilst' atom. 
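+ # (mutagen exposes this atom as the 'covr' tag: assigning a list of + # MP4Cover objects, as done below, is enough to embed the artwork)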
+ f = {'jpeg': MP4Cover.FORMAT_JPEG, 'png': MP4Cover.FORMAT_PNG}[imghdr.what(thumbnail_filename)] + with open(thumbnail_filename, 'rb') as thumbfile: + thumb_data = thumbfile.read() + meta.tags['covr'] = [MP4Cover(data=thumb_data, imageformat=f)] + meta.save() + temp_filename = filename + except Exception as err: + self.report_warning('unable to embed using mutagen; %s' % error_to_compat_str(err)) + success = False + + # Method 2: Use ffmpeg+ffprobe + if not success and not prefer_atomicparsley: + success = True + try: + options = ['-c', 'copy', '-map', '0', '-dn', '-map', '1'] + + old_stream, new_stream = self.get_stream_number( + filename, ('disposition', 'attached_pic'), 1) + if old_stream is not None: + options.extend(['-map', '-0:%d' % old_stream]) + new_stream -= 1 + options.extend(['-disposition:%s' % new_stream, 'attached_pic']) + + self._report_run('ffmpeg', filename) + self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) + except PostProcessingError as err: + self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err)) + success = False + + # Method 3: Use AtomicParsley + if not success: + success = True + atomicparsley = next(( + x for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + if atomicparsley is None: + raise EmbedThumbnailPPError('AtomicParsley was not found. Please install') + + cmd = [encodeFilename(atomicparsley, True), + encodeFilename(filename, True), + encodeArgument('--artwork'), + encodeFilename(thumbnail_filename, True), + encodeArgument('-o'), + encodeFilename(temp_filename, True)] + cmd += [encodeArgument(o) for o in self._configuration_args('AtomicParsley')] + + self._report_run('atomicparsley', filename) + self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd)) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process_communicate_or_kill(p) + if p.returncode != 0: + msg = stderr.decode('utf-8', 'replace').strip() + raise EmbedThumbnailPPError(msg) + # for formats that don't support thumbnails (like 3gp) AtomicParsley + # won't write to the temporary file + if b'No changes' in stdout: + self.report_warning('The file format doesn\'t support embedding a thumbnail') + success = False + + elif info['ext'] in ['ogg', 'opus', 'flac']: + if not has_mutagen: + raise EmbedThumbnailPPError('module mutagen was not found. 
Please install using `python -m pip install mutagen`') + + self._report_run('mutagen', filename) + f = {'opus': OggOpus, 'flac': FLAC, 'ogg': OggVorbis}[info['ext']](filename) + + pic = Picture() + pic.mime = 'image/%s' % imghdr.what(thumbnail_filename) + with open(thumbnail_filename, 'rb') as thumbfile: + pic.data = thumbfile.read() + pic.type = 3 # front cover + res = self._get_thumbnail_resolution(thumbnail_filename, info['thumbnails'][idx]) + if res is not None: + pic.width, pic.height = res + + if info['ext'] == 'flac': + f.add_picture(pic) + else: + # https://wiki.xiph.org/VorbisComment#METADATA_BLOCK_PICTURE + f['METADATA_BLOCK_PICTURE'] = base64.b64encode(pic.write()).decode('ascii') + f.save() + temp_filename = filename + + else: + raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/mov') + + if success and temp_filename != filename: + os.replace(temp_filename, filename) + + self.try_utime(filename, mtime, mtime) + + files_to_delete = [thumbnail_filename] + if self._already_have_thumbnail: + if original_thumbnail == thumbnail_filename: + files_to_delete = [] + elif original_thumbnail != thumbnail_filename: + files_to_delete.append(original_thumbnail) + return files_to_delete, info diff --git a/yt_dlp/postprocessor/exec.py b/yt_dlp/postprocessor/exec.py new file mode 100644 index 000000000..7a3cb4999 --- /dev/null +++ b/yt_dlp/postprocessor/exec.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import subprocess + +from .common import PostProcessor +from ..compat import compat_shlex_quote +from ..utils import ( + encodeArgument, + PostProcessingError, + variadic, +) + + +class ExecPP(PostProcessor): + + def __init__(self, downloader, exec_cmd): + PostProcessor.__init__(self, downloader) + self.exec_cmd = variadic(exec_cmd) + + def parse_cmd(self, cmd, info): + tmpl, tmpl_dict = self._downloader.prepare_outtmpl(cmd, info) + if tmpl_dict: # if there are no replacements, tmpl_dict = {} + return self._downloader.escape_outtmpl(tmpl) % tmpl_dict + + # If no replacements are found, replace {} for backward compatibility + if '{}' not in cmd: + cmd += ' {}' + return cmd.replace('{}', compat_shlex_quote( + info.get('filepath') or info['_filename'])) + + def run(self, info): + for tmpl in self.exec_cmd: + cmd = self.parse_cmd(tmpl, info) + self.to_screen('Executing command: %s' % cmd) + retCode = subprocess.call(encodeArgument(cmd), shell=True) + if retCode != 0: + raise PostProcessingError('Command returned error code %d' % retCode) + return [], info + + +class ExecAfterDownloadPP(ExecPP): # for backward compatibility + pass diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py new file mode 100644 index 000000000..e6aa2940a --- /dev/null +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -0,0 +1,1018 @@ +from __future__ import unicode_literals + +import io +import itertools +import os +import subprocess +import time +import re +import json + +from .common import AudioConversionError, PostProcessor + +from ..compat import compat_str, compat_numeric_types +from ..utils import ( + dfxp2srt, + encodeArgument, + encodeFilename, + float_or_none, + get_exe_version, + is_outdated_version, + ISO639Utils, + orderedSet, + PostProcessingError, + prepend_extension, + process_communicate_or_kill, + replace_extension, + shell_quote, + traverse_obj, + variadic, +) + + +EXT_TO_OUT_FORMATS = { + 'aac': 'adts', + 'flac': 'flac', + 'm4a': 'ipod', + 'mka': 'matroska', + 'mkv': 'matroska', + 'mpg': 'mpeg', + 'ogv': 'ogg', + 
'ts': 'mpegts', + 'wma': 'asf', + 'wmv': 'asf', +} +ACODECS = { + 'mp3': 'libmp3lame', + 'aac': 'aac', + 'flac': 'flac', + 'm4a': 'aac', + 'opus': 'libopus', + 'vorbis': 'libvorbis', + 'wav': None, +} + + +class FFmpegPostProcessorError(PostProcessingError): + pass + + +class FFmpegPostProcessor(PostProcessor): + def __init__(self, downloader=None): + PostProcessor.__init__(self, downloader) + self._determine_executables() + + def check_version(self): + if not self.available: + raise FFmpegPostProcessorError('ffmpeg not found. Please install or provide the path using --ffmpeg-location') + + required_version = '10-0' if self.basename == 'avconv' else '1.0' + if is_outdated_version( + self._versions[self.basename], required_version): + warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % ( + self.basename, self.basename, required_version) + self.report_warning(warning) + + @staticmethod + def get_versions(downloader=None): + return FFmpegPostProcessor(downloader)._versions + + def _determine_executables(self): + programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] + prefer_ffmpeg = True + + def get_ffmpeg_version(path): + ver = get_exe_version(path, args=['-version']) + if ver: + regexs = [ + r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] + r'n([0-9.]+)$', # Arch Linux + # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ + ] + for regex in regexs: + mobj = re.match(regex, ver) + if mobj: + ver = mobj.group(1) + return ver + + self.basename = None + self.probe_basename = None + + self._paths = None + self._versions = None + if self._downloader: + prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) + location = self.get_param('ffmpeg_location') + if location is not None: + if not os.path.exists(location): + self.report_warning( + 'ffmpeg-location %s does not exist! ' + 'Continuing without ffmpeg.' % (location)) + self._versions = {} + return + elif os.path.isdir(location): + dirname, basename = location, None + else: + basename = os.path.splitext(os.path.basename(location))[0] + basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') + dirname = os.path.dirname(os.path.abspath(location)) + if basename in ('ffmpeg', 'ffprobe'): + prefer_ffmpeg = True + + self._paths = dict( + (p, os.path.join(dirname, p)) for p in programs) + if basename: + self._paths[basename] = location + self._versions = dict( + (p, get_ffmpeg_version(self._paths[p])) for p in programs) + if self._versions is None: + self._versions = dict( + (p, get_ffmpeg_version(p)) for p in programs) + self._paths = dict((p, p) for p in programs) + + if prefer_ffmpeg is False: + prefs = ('avconv', 'ffmpeg') + else: + prefs = ('ffmpeg', 'avconv') + for p in prefs: + if self._versions[p]: + self.basename = p + break + + if prefer_ffmpeg is False: + prefs = ('avprobe', 'ffprobe') + else: + prefs = ('ffprobe', 'avprobe') + for p in prefs: + if self._versions[p]: + self.probe_basename = p + break + + @property + def available(self): + return self.basename is not None + + @property + def executable(self): + return self._paths[self.basename] + + @property + def probe_available(self): + return self.probe_basename is not None + + @property + def probe_executable(self): + return self._paths[self.probe_basename] + + def get_audio_codec(self, path): + if not self.probe_available and not self.available: + raise PostProcessingError('ffprobe and ffmpeg not found. 
Please install or provide the path using --ffmpeg-location') + try: + if self.probe_available: + cmd = [ + encodeFilename(self.probe_executable, True), + encodeArgument('-show_streams')] + else: + cmd = [ + encodeFilename(self.executable, True), + encodeArgument('-i')] + cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) + self.write_debug('%s command line: %s' % (self.basename, shell_quote(cmd))) + handle = subprocess.Popen( + cmd, stderr=subprocess.PIPE, + stdout=subprocess.PIPE, stdin=subprocess.PIPE) + stdout_data, stderr_data = process_communicate_or_kill(handle) + expected_ret = 0 if self.probe_available else 1 + if handle.wait() != expected_ret: + return None + except (IOError, OSError): + return None + output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore') + if self.probe_available: + audio_codec = None + for line in output.split('\n'): + if line.startswith('codec_name='): + audio_codec = line.split('=')[1].strip() + elif line.strip() == 'codec_type=audio' and audio_codec is not None: + return audio_codec + else: + # Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME + mobj = re.search( + r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)', + output) + if mobj: + return mobj.group(1) + return None + + def get_metadata_object(self, path, opts=[]): + if self.probe_basename != 'ffprobe': + if self.probe_available: + self.report_warning('Only ffprobe is supported for metadata extraction') + raise PostProcessingError('ffprobe not found. Please install or provide the path using --ffmpeg-location') + self.check_version() + + cmd = [ + encodeFilename(self.probe_executable, True), + encodeArgument('-hide_banner'), + encodeArgument('-show_format'), + encodeArgument('-show_streams'), + encodeArgument('-print_format'), + encodeArgument('json'), + ] + + cmd += opts + cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) + self.write_debug('ffprobe command line: %s' % shell_quote(cmd)) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + stdout, stderr = p.communicate() + return json.loads(stdout.decode('utf-8', 'replace')) + + def get_stream_number(self, path, keys, value): + streams = self.get_metadata_object(path)['streams'] + num = next( + (i for i, stream in enumerate(streams) if traverse_obj(stream, keys, casesense=False) == value), + None) + return num, len(streams) + + def _get_real_video_duration(self, info, fatal=True): + try: + if '_real_duration' not in info: + info['_real_duration'] = float_or_none( + traverse_obj(self.get_metadata_object(info['filepath']), ('format', 'duration'))) + if not info['_real_duration']: + raise PostProcessingError('ffprobe returned empty duration') + except PostProcessingError as e: + if fatal: + raise PostProcessingError(f'Unable to determine video duration; {e}') + return info.setdefault('_real_duration', None) + + def _duration_mismatch(self, d1, d2): + if not d1 or not d2: + return None + return abs(d1 - d2) > 1 + + def run_ffmpeg_multiple_files(self, input_paths, out_path, opts, **kwargs): + return self.real_run_ffmpeg( + [(path, []) for path in input_paths], + [(out_path, opts)], **kwargs) + + def real_run_ffmpeg(self, input_path_opts, output_path_opts, *, expected_retcodes=(0,)): + self.check_version() + + oldest_mtime = min( + os.stat(encodeFilename(path)).st_mtime for path, _ in input_path_opts if path) + + cmd = [encodeFilename(self.executable, True), encodeArgument('-y')] 
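+ # Note: '-y' makes ffmpeg overwrite the (temporary) output file without + # prompting, which is needed since it is run non-interactively here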
+ # avconv does not have repeat option + if self.basename == 'ffmpeg': + cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')] + + def make_args(file, args, name, number): + keys = ['_%s%d' % (name, number), '_%s' % name] + if name == 'o' and number == 1: + keys.append('') + args += self._configuration_args(self.basename, keys) + if name == 'i': + args.append('-i') + return ( + [encodeArgument(arg) for arg in args] + + [encodeFilename(self._ffmpeg_filename_argument(file), True)]) + + for arg_type, path_opts in (('i', input_path_opts), ('o', output_path_opts)): + cmd += itertools.chain.from_iterable( + make_args(path, list(opts), arg_type, i + 1) + for i, (path, opts) in enumerate(path_opts) if path) + + self.write_debug('ffmpeg command line: %s' % shell_quote(cmd)) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + stdout, stderr = process_communicate_or_kill(p) + if p.returncode not in variadic(expected_retcodes): + stderr = stderr.decode('utf-8', 'replace').strip() + self.write_debug(stderr) + raise FFmpegPostProcessorError(stderr.split('\n')[-1]) + for out_path, _ in output_path_opts: + if out_path: + self.try_utime(out_path, oldest_mtime, oldest_mtime) + return stderr.decode('utf-8', 'replace') + + def run_ffmpeg(self, path, out_path, opts, **kwargs): + return self.run_ffmpeg_multiple_files([path], out_path, opts, **kwargs) + + @staticmethod + def _ffmpeg_filename_argument(fn): + # Always use 'file:' because the filename may contain ':' (ffmpeg + # interprets that as a protocol) or can start with '-' (-- is broken in + # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) + # Also leave '-' intact in order not to break streaming to stdout. + if fn.startswith(('http://', 'https://')): + return fn + return 'file:' + fn if fn != '-' else fn + + @staticmethod + def _quote_for_ffmpeg(string): + # See https://ffmpeg.org/ffmpeg-utils.html#toc-Quoting-and-escaping + # A sequence of '' produces '\'''\''; + # final replace removes the empty '' between \' \'. + string = string.replace("'", r"'\''").replace("'''", "'") + # Handle potential ' at string boundaries. + string = string[1:] if string[0] == "'" else "'" + string + return string[:-1] if string[-1] == "'" else string + "'" + + def force_keyframes(self, filename, timestamps): + timestamps = orderedSet(timestamps) + if timestamps[0] == 0: + timestamps = timestamps[1:] + keyframe_file = prepend_extension(filename, 'keyframes.temp') + self.to_screen(f'Re-encoding "{filename}" with appropriate keyframes') + self.run_ffmpeg(filename, keyframe_file, ['-force_key_frames', ','.join( + f'{t:.6f}' for t in timestamps)]) + return keyframe_file + + def concat_files(self, in_files, out_file, concat_opts=None): + """ + Use concat demuxer to concatenate multiple files having identical streams. + + Only inpoint, outpoint, and duration concat options are supported. 
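+ + As a sketch (the filenames here are illustrative), the spec generated + by _concat_spec below looks like: + + ffconcat version 1.0 + file 'file:part1.mp4' + outpoint 10.5 + file 'file:part2.mp4' + 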
+ See https://ffmpeg.org/ffmpeg-formats.html#concat-1 for details + """ + concat_file = f'{out_file}.concat' + self.write_debug(f'Writing concat spec to {concat_file}') + with open(concat_file, 'wt', encoding='utf-8') as f: + f.writelines(self._concat_spec(in_files, concat_opts)) + + out_flags = ['-c', 'copy'] + if out_file.rpartition('.')[-1] in ('mp4', 'mov'): + # For some reason, '-c copy' is not enough to copy subtitles + out_flags.extend(['-c:s', 'mov_text', '-movflags', '+faststart']) + + try: + self.real_run_ffmpeg( + [(concat_file, ['-hide_banner', '-nostdin', '-f', 'concat', '-safe', '0'])], + [(out_file, out_flags)]) + finally: + os.remove(concat_file) + + @classmethod + def _concat_spec(cls, in_files, concat_opts=None): + if concat_opts is None: + concat_opts = [{}] * len(in_files) + yield 'ffconcat version 1.0\n' + for file, opts in zip(in_files, concat_opts): + yield f'file {cls._quote_for_ffmpeg(cls._ffmpeg_filename_argument(file))}\n' + # Iterate explicitly to yield the following directives in order, ignoring the rest. + for directive in 'inpoint', 'outpoint', 'duration': + if directive in opts: + yield f'{directive} {opts[directive]}\n' + + +class FFmpegExtractAudioPP(FFmpegPostProcessor): + COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma') + SUPPORTED_EXTS = ('best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav') + + def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False): + FFmpegPostProcessor.__init__(self, downloader) + self._preferredcodec = preferredcodec or 'best' + self._preferredquality = preferredquality + self._nopostoverwrites = nopostoverwrites + + def run_ffmpeg(self, path, out_path, codec, more_opts): + if codec is None: + acodec_opts = [] + else: + acodec_opts = ['-acodec', codec] + opts = ['-vn'] + acodec_opts + more_opts + try: + FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts) + except FFmpegPostProcessorError as err: + raise AudioConversionError(err.msg) + + @PostProcessor._restrict_to(images=False) + def run(self, information): + path = information['filepath'] + orig_ext = information['ext'] + + if self._preferredcodec == 'best' and orig_ext in self.COMMON_AUDIO_EXTS: + self.to_screen('Skipping audio extraction since the file is already in a common audio format') + return [], information + + filecodec = self.get_audio_codec(path) + if filecodec is None: + raise PostProcessingError('WARNING: unable to obtain file audio codec with ffprobe') + + more_opts = [] + if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'): + if filecodec == 'aac' and self._preferredcodec in ['m4a', 'best']: + # Lossless, but in another container + acodec = 'copy' + extension = 'm4a' + more_opts = ['-bsf:a', 'aac_adtstoasc'] + elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: + # Lossless if possible + acodec = 'copy' + extension = filecodec + if filecodec == 'aac': + more_opts = ['-f', 'adts'] + if filecodec == 'vorbis': + extension = 'ogg' + else: + # MP3 otherwise. 
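+ # (as implemented below, a preferredquality under 10 is treated as a VBR + # quality for '-q:a', anything else as an average bitrate for '-b:a')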
+ acodec = 'libmp3lame' + extension = 'mp3' + more_opts = [] + if self._preferredquality is not None: + if int(self._preferredquality) < 10: + more_opts += ['-q:a', self._preferredquality] + else: + more_opts += ['-b:a', self._preferredquality + 'k'] + else: + # We convert the audio (lossy if codec is lossy) + acodec = ACODECS[self._preferredcodec] + extension = self._preferredcodec + more_opts = [] + if self._preferredquality is not None: + # The opus codec doesn't support the -aq option + if int(self._preferredquality) < 10 and extension != 'opus': + more_opts += ['-q:a', self._preferredquality] + else: + more_opts += ['-b:a', self._preferredquality + 'k'] + if self._preferredcodec == 'aac': + more_opts += ['-f', 'adts'] + if self._preferredcodec == 'm4a': + more_opts += ['-bsf:a', 'aac_adtstoasc'] + if self._preferredcodec == 'vorbis': + extension = 'ogg' + if self._preferredcodec == 'wav': + extension = 'wav' + more_opts += ['-f', 'wav'] + + prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups + new_path = prefix + sep + extension + + information['filepath'] = new_path + information['ext'] = extension + + # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. + if (new_path == path + or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): + self.to_screen('Post-process file %s exists, skipping' % new_path) + return [], information + + try: + self.to_screen('Destination: ' + new_path) + self.run_ffmpeg(path, new_path, acodec, more_opts) + except AudioConversionError as e: + raise PostProcessingError( + 'audio conversion failed: ' + e.msg) + except Exception: + raise PostProcessingError('error running ' + self.basename) + + # Try to update the date time for extracted audio file. 
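+ # ('filetime' is normally set by the downloader from the Last-modified header)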
+ if information.get('filetime') is not None: + self.try_utime( + new_path, time.time(), information['filetime'], + errnote='Cannot update utime of audio file') + + return [path], information + + +class FFmpegVideoConvertorPP(FFmpegPostProcessor): + SUPPORTED_EXTS = ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mp3', 'mka', 'm4a', 'ogg', 'opus') + FORMAT_RE = re.compile(r'{0}(?:/{0})*$'.format(r'(?:\w+>)?(?:%s)' % '|'.join(SUPPORTED_EXTS))) + _ACTION = 'converting' + + def __init__(self, downloader=None, preferedformat=None): + super(FFmpegVideoConvertorPP, self).__init__(downloader) + self._preferedformats = preferedformat.lower().split('/') + + def _target_ext(self, source_ext): + for pair in self._preferedformats: + kv = pair.split('>') + if len(kv) == 1 or kv[0].strip() == source_ext: + return kv[-1].strip() + + @staticmethod + def _options(target_ext): + if target_ext == 'avi': + return ['-c:v', 'libxvid', '-vtag', 'XVID'] + return [] + + @PostProcessor._restrict_to(images=False) + def run(self, info): + filename, source_ext = info['filepath'], info['ext'].lower() + target_ext = self._target_ext(source_ext) + _skip_msg = ( + f'could not find a mapping for {source_ext}' if not target_ext + else f'already is in target format {source_ext}' if source_ext == target_ext + else None) + if _skip_msg: + self.to_screen(f'Not {self._ACTION} media file {filename!r}; {_skip_msg}') + return [], info + + outpath = replace_extension(filename, target_ext, source_ext) + self.to_screen(f'{self._ACTION.title()} video from {source_ext} to {target_ext}; Destination: {outpath}') + self.run_ffmpeg(filename, outpath, self._options(target_ext)) + + info['filepath'] = outpath + info['format'] = info['ext'] = target_ext + return [filename], info + + +class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP): + _ACTION = 'remuxing' + + @staticmethod + def _options(target_ext): + options = ['-c', 'copy', '-map', '0', '-dn'] + if target_ext in ['mp4', 'm4a', 'mov']: + options.extend(['-movflags', '+faststart']) + return options + + +class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): + def __init__(self, downloader=None, already_have_subtitle=False): + super(FFmpegEmbedSubtitlePP, self).__init__(downloader) + self._already_have_subtitle = already_have_subtitle + + @PostProcessor._restrict_to(images=False) + def run(self, information): + if information['ext'] not in ('mp4', 'webm', 'mkv'): + self.to_screen('Subtitles can only be embedded in mp4, webm or mkv files') + return [], information + subtitles = information.get('requested_subtitles') + if not subtitles: + self.to_screen('There aren\'t any subtitles to embed') + return [], information + + filename = information['filepath'] + if information.get('duration') and self._duration_mismatch( + self._get_real_video_duration(information, False), information['duration']): + self.to_screen(f'Skipping {self.pp_key()} since the real and expected durations mismatch') + return [], information + + ext = information['ext'] + sub_langs, sub_names, sub_filenames = [], [], [] + webm_vtt_warn = False + mp4_ass_warn = False + + for lang, sub_info in subtitles.items(): + if not os.path.exists(sub_info.get('filepath', '')): + self.report_warning(f'Skipping embedding {lang} subtitle because the file is missing') + continue + sub_ext = sub_info['ext'] + if sub_ext == 'json': + self.report_warning('JSON subtitles cannot be embedded') + elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + sub_langs.append(lang) + sub_names.append(sub_info.get('name')) + 
sub_filenames.append(sub_info['filepath']) + else: + if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': + webm_vtt_warn = True + self.report_warning('Only WebVTT subtitles can be embedded in webm files') + if not mp4_ass_warn and ext == 'mp4' and sub_ext == 'ass': + mp4_ass_warn = True + self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues') + + if not sub_langs: + return [], information + + input_files = [filename] + sub_filenames + + opts = [ + '-c', 'copy', '-map', '0', '-dn', + # Don't copy the existing subtitles, we may be running the + # postprocessor a second time + '-map', '-0:s', + # Don't copy Apple TV chapters track, bin_data (see #19042, #19024, + # https://trac.ffmpeg.org/ticket/6016) + '-map', '-0:d', + ] + if information['ext'] == 'mp4': + opts += ['-c:s', 'mov_text'] + for i, (lang, name) in enumerate(zip(sub_langs, sub_names)): + opts.extend(['-map', '%d:0' % (i + 1)]) + lang_code = ISO639Utils.short2long(lang) or lang + opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) + if name: + opts.extend(['-metadata:s:s:%d' % i, 'handler_name=%s' % name, + '-metadata:s:s:%d' % i, 'title=%s' % name]) + + temp_filename = prepend_extension(filename, 'temp') + self.to_screen('Embedding subtitles in "%s"' % filename) + self.run_ffmpeg_multiple_files(input_files, temp_filename, opts) + os.replace(temp_filename, filename) + + files_to_delete = [] if self._already_have_subtitle else sub_filenames + return files_to_delete, information + + +class FFmpegMetadataPP(FFmpegPostProcessor): + + def __init__(self, downloader, add_metadata=True, add_chapters=True): + FFmpegPostProcessor.__init__(self, downloader) + self._add_metadata = add_metadata + self._add_chapters = add_chapters + + @staticmethod + def _options(target_ext): + yield from ('-map', '0', '-dn') + if target_ext == 'm4a': + yield from ('-vn', '-acodec', 'copy') + else: + yield from ('-c', 'copy') + + @PostProcessor._restrict_to(images=False) + def run(self, info): + filename, metadata_filename = info['filepath'], None + options = [] + if self._add_chapters and info.get('chapters'): + metadata_filename = replace_extension(filename, 'meta') + options.extend(self._get_chapter_opts(info['chapters'], metadata_filename)) + if self._add_metadata: + options.extend(self._get_metadata_opts(info)) + + if not options: + self.to_screen('There isn\'t any metadata to add') + return [], info + + temp_filename = prepend_extension(filename, 'temp') + self.to_screen('Adding metadata to "%s"' % filename) + self.run_ffmpeg_multiple_files( + (filename, metadata_filename), temp_filename, + itertools.chain(self._options(info['ext']), *options)) + if metadata_filename: + os.remove(metadata_filename) + os.replace(temp_filename, filename) + return [], info + + @staticmethod + def _get_chapter_opts(chapters, metadata_filename): + with io.open(metadata_filename, 'wt', encoding='utf-8') as f: + def ffmpeg_escape(text): + return re.sub(r'([\\=;#\n])', r'\\\1', text) + + metadata_file_content = ';FFMETADATA1\n' + for chapter in chapters: + metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' + metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) + metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) + chapter_title = chapter.get('title') + if chapter_title: + metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) + f.write(metadata_file_content) + yield ('-map_metadata', '1') + + def _get_metadata_opts(self, info): + metadata = {} + + def add(meta_list, 
info_list=None): + if not meta_list: + return + for info_f in variadic(info_list or meta_list): + if isinstance(info.get(info_f), (compat_str, compat_numeric_types)): + for meta_f in variadic(meta_list): + metadata[meta_f] = info[info_f] + break + + # See [1-3] for some info on media metadata/metadata supported + # by ffmpeg. + # 1. https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/ + # 2. https://wiki.multimedia.cx/index.php/FFmpeg_Metadata + # 3. https://kodi.wiki/view/Video_file_tagging + + add('title', ('track', 'title')) + add('date', 'upload_date') + add(('description', 'synopsis'), 'description') + add(('purl', 'comment'), 'webpage_url') + add('track', 'track_number') + add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) + add('genre') + add('album') + add('album_artist') + add('disc', 'disc_number') + add('show', 'series') + add('season_number') + add('episode_id', ('episode', 'episode_id')) + add('episode_sort', 'episode_number') + + prefix = 'meta_' + for key in filter(lambda k: k.startswith(prefix), info.keys()): + add(key[len(prefix):], key) + + for name, value in metadata.items(): + yield ('-metadata', f'{name}={value}') + + stream_idx = 0 + for fmt in info.get('requested_formats') or []: + stream_count = 2 if 'none' not in (fmt.get('vcodec'), fmt.get('acodec')) else 1 + if fmt.get('language'): + lang = ISO639Utils.short2long(fmt['language']) or fmt['language'] + for i in range(stream_count): + yield ('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang) + stream_idx += stream_count + + if ('no-attach-info-json' not in self.get_param('compat_opts', []) + and '__infojson_filename' in info and info['ext'] in ('mkv', 'mka')): + old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json') + if old_stream is not None: + yield ('-map', '-0:%d' % old_stream) + new_stream -= 1 + + yield ('-attach', info['__infojson_filename'], + '-metadata:s:%d' % new_stream, 'mimetype=application/json') + + +class FFmpegMergerPP(FFmpegPostProcessor): + @PostProcessor._restrict_to(images=False) + def run(self, info): + filename = info['filepath'] + temp_filename = prepend_extension(filename, 'temp') + args = ['-c', 'copy'] + audio_streams = 0 + for (i, fmt) in enumerate(info['requested_formats']): + if fmt.get('acodec') != 'none': + args.extend(['-map', f'{i}:a:0']) + aac_fixup = fmt['protocol'].startswith('m3u8') and self.get_audio_codec(fmt['filepath']) == 'aac' + if aac_fixup: + args.extend([f'-bsf:a:{audio_streams}', 'aac_adtstoasc']) + audio_streams += 1 + if fmt.get('vcodec') != 'none': + args.extend(['-map', '%u:v:0' % (i)]) + self.to_screen('Merging formats into "%s"' % filename) + self.run_ffmpeg_multiple_files(info['__files_to_merge'], temp_filename, args) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + return info['__files_to_merge'], info + + def can_merge(self): + # TODO: figure out merge-capable ffmpeg version + if self.basename != 'avconv': + return True + + required_version = '10-0' + if is_outdated_version( + self._versions[self.basename], required_version): + warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, ' + 'yt-dlp will download single file media. 
' + 'Update %s to version %s or newer to fix this.') % ( + self.basename, self.basename, required_version) + self.report_warning(warning) + return False + return True + + +class FFmpegFixupPostProcessor(FFmpegPostProcessor): + def _fixup(self, msg, filename, options): + temp_filename = prepend_extension(filename, 'temp') + + self.to_screen(f'{msg} of "{filename}"') + self.run_ffmpeg(filename, temp_filename, options) + + os.replace(temp_filename, filename) + + +class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor): + @PostProcessor._restrict_to(images=False, audio=False) + def run(self, info): + stretched_ratio = info.get('stretched_ratio') + if stretched_ratio not in (None, 1): + self._fixup('Fixing aspect ratio', info['filepath'], [ + '-c', 'copy', '-map', '0', '-dn', '-aspect', '%f' % stretched_ratio]) + return [], info + + +class FFmpegFixupM4aPP(FFmpegFixupPostProcessor): + @PostProcessor._restrict_to(images=False, video=False) + def run(self, info): + if info.get('container') == 'm4a_dash': + self._fixup('Correcting container', info['filepath'], [ + '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4']) + return [], info + + +class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor): + @PostProcessor._restrict_to(images=False) + def run(self, info): + if self.get_audio_codec(info['filepath']) == 'aac': + self._fixup('Fixing malformed AAC bitstream', info['filepath'], [ + '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']) + return [], info + + +class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor): + + def __init__(self, downloader=None, trim=0.001): + # "trim" should be used when the video contains unintended packets + super(FFmpegFixupTimestampPP, self).__init__(downloader) + assert isinstance(trim, (int, float)) + self.trim = str(trim) + + @PostProcessor._restrict_to(images=False) + def run(self, info): + required_version = '4.4' + if is_outdated_version(self._versions[self.basename], required_version): + self.report_warning( + 'A re-encode is needed to fix timestamps in older versions of ffmpeg. 
' + f'Please install ffmpeg {required_version} or later to fixup without re-encoding') + opts = ['-vf', 'setpts=PTS-STARTPTS'] + else: + opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS'] + self._fixup('Fixing frame timestamp', info['filepath'], opts + ['-map', '0', '-dn', '-ss', self.trim]) + return [], info + + +class FFmpegFixupDurationPP(FFmpegFixupPostProcessor): + @PostProcessor._restrict_to(images=False) + def run(self, info): + self._fixup('Fixing video duration', info['filepath'], ['-c', 'copy', '-map', '0', '-dn']) + return [], info + + +class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): + SUPPORTED_EXTS = ('srt', 'vtt', 'ass', 'lrc') + + def __init__(self, downloader=None, format=None): + super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) + self.format = format + + def run(self, info): + subs = info.get('requested_subtitles') + new_ext = self.format + new_format = new_ext + if new_format == 'vtt': + new_format = 'webvtt' + if subs is None: + self.to_screen('There aren\'t any subtitles to convert') + return [], info + self.to_screen('Converting subtitles') + sub_filenames = [] + for lang, sub in subs.items(): + if not os.path.exists(sub.get('filepath', '')): + self.report_warning(f'Skipping converting {lang} subtitle because the file is missing') + continue + ext = sub['ext'] + if ext == new_ext: + self.to_screen('Subtitle file for %s is already in the requested format' % new_ext) + continue + elif ext == 'json': + self.to_screen( + 'You have requested to convert json subtitles into another format, ' + 'which is currently not possible') + continue + old_file = sub['filepath'] + sub_filenames.append(old_file) + new_file = replace_extension(old_file, new_ext) + + if ext in ('dfxp', 'ttml', 'tt'): + self.report_warning( + 'You have requested to convert dfxp (TTML) subtitles into another format, ' + 'which results in style information loss') + + dfxp_file = old_file + srt_file = replace_extension(old_file, 'srt') + + with open(dfxp_file, 'rb') as f: + srt_data = dfxp2srt(f.read()) + + with io.open(srt_file, 'wt', encoding='utf-8') as f: + f.write(srt_data) + old_file = srt_file + + subs[lang] = { + 'ext': 'srt', + 'data': srt_data, + 'filepath': srt_file, + } + + if new_ext == 'srt': + continue + else: + sub_filenames.append(srt_file) + + self.run_ffmpeg(old_file, new_file, ['-f', new_format]) + + with io.open(new_file, 'rt', encoding='utf-8') as f: + subs[lang] = { + 'ext': new_ext, + 'data': f.read(), + 'filepath': new_file, + } + + info['__files_to_move'][new_file] = replace_extension( + info['__files_to_move'][sub['filepath']], new_ext) + + return sub_filenames, info + + +class FFmpegSplitChaptersPP(FFmpegPostProcessor): + def __init__(self, downloader, force_keyframes=False): + FFmpegPostProcessor.__init__(self, downloader) + self._force_keyframes = force_keyframes + + def _prepare_filename(self, number, chapter, info): + info = info.copy() + info.update({ + 'section_number': number, + 'section_title': chapter.get('title'), + 'section_start': chapter.get('start_time'), + 'section_end': chapter.get('end_time'), + }) + return self._downloader.prepare_filename(info, 'chapter') + + def _ffmpeg_args_for_chapter(self, number, chapter, info): + destination = self._prepare_filename(number, chapter, info) + if not self._downloader._ensure_dir_exists(encodeFilename(destination)): + return + + chapter['filepath'] = destination + self.to_screen('Chapter %03d; Destination: %s' % (number, destination)) + return ( + destination, + ['-ss', compat_str(chapter['start_time']), + 
'-t', compat_str(chapter['end_time'] - chapter['start_time'])]) + + @PostProcessor._restrict_to(images=False) + def run(self, info): + chapters = info.get('chapters') or [] + if not chapters: + self.to_screen('Chapter information is unavailable') + return [], info + + in_file = info['filepath'] + if self._force_keyframes and len(chapters) > 1: + in_file = self.force_keyframes(in_file, (c['start_time'] for c in chapters)) + self.to_screen('Splitting video by chapters; %d chapters found' % len(chapters)) + for idx, chapter in enumerate(chapters): + destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info) + self.real_run_ffmpeg([(in_file, opts)], [(destination, ['-c', 'copy'])]) + if in_file != info['filepath']: + os.remove(in_file) + return [], info + + +class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): + SUPPORTED_EXTS = ('jpg', 'png') + + def __init__(self, downloader=None, format=None): + super(FFmpegThumbnailsConvertorPP, self).__init__(downloader) + self.format = format + + @staticmethod + def is_webp(path): + with open(encodeFilename(path), 'rb') as f: + b = f.read(12) + return b[0:4] == b'RIFF' and b[8:] == b'WEBP' + + def fixup_webp(self, info, idx=-1): + thumbnail_filename = info['thumbnails'][idx]['filepath'] + _, thumbnail_ext = os.path.splitext(thumbnail_filename) + if thumbnail_ext: + thumbnail_ext = thumbnail_ext[1:].lower() + if thumbnail_ext != 'webp' and self.is_webp(thumbnail_filename): + self.to_screen('Correcting thumbnail "%s" extension to webp' % thumbnail_filename) + webp_filename = replace_extension(thumbnail_filename, 'webp') + os.replace(thumbnail_filename, webp_filename) + info['thumbnails'][idx]['filepath'] = webp_filename + info['__files_to_move'][webp_filename] = replace_extension( + info['__files_to_move'].pop(thumbnail_filename), 'webp') + + @staticmethod + def _options(target_ext): + if target_ext == 'jpg': + return ['-bsf:v', 'mjpeg2jpeg'] + return [] + + def convert_thumbnail(self, thumbnail_filename, target_ext): + thumbnail_conv_filename = replace_extension(thumbnail_filename, target_ext) + + self.to_screen('Converting thumbnail "%s" to %s' % (thumbnail_filename, target_ext)) + self.real_run_ffmpeg( + [(thumbnail_filename, ['-f', 'image2', '-pattern_type', 'none'])], + [(thumbnail_conv_filename.replace('%', '%%'), self._options(target_ext))]) + return thumbnail_conv_filename + + def run(self, info): + files_to_delete = [] + has_thumbnail = False + + for idx, thumbnail_dict in enumerate(info['thumbnails']): + if 'filepath' not in thumbnail_dict: + continue + has_thumbnail = True + self.fixup_webp(info, idx) + original_thumbnail = thumbnail_dict['filepath'] + _, thumbnail_ext = os.path.splitext(original_thumbnail) + if thumbnail_ext: + thumbnail_ext = thumbnail_ext[1:].lower() + if thumbnail_ext == 'jpeg': + thumbnail_ext = 'jpg' + if thumbnail_ext == self.format: + self.to_screen('Thumbnail "%s" is already in the requested format' % original_thumbnail) + continue + thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, self.format) + files_to_delete.append(original_thumbnail) + info['__files_to_move'][thumbnail_dict['filepath']] = replace_extension( + info['__files_to_move'][original_thumbnail], self.format) + + if not has_thumbnail: + self.to_screen('There aren\'t any thumbnails to convert') + return files_to_delete, info diff --git a/yt_dlp/postprocessor/metadataparser.py b/yt_dlp/postprocessor/metadataparser.py new file mode 100644 index 000000000..96aac9beb --- /dev/null +++ 
b/yt_dlp/postprocessor/metadataparser.py @@ -0,0 +1,116 @@ +import re + +from enum import Enum + +from .common import PostProcessor + + +class MetadataParserPP(PostProcessor): + class Actions(Enum): + INTERPRET = 'interpretter' + REPLACE = 'replacer' + + def __init__(self, downloader, actions): + PostProcessor.__init__(self, downloader) + self._actions = [] + for f in actions: + action = f[0] + assert isinstance(action, self.Actions) + self._actions.append(getattr(self, action._value_)(*f[1:])) + + @classmethod + def validate_action(cls, action, *data): + ''' Each action can be: + (Actions.INTERPRET, from, to) OR + (Actions.REPLACE, field, search, replace) + ''' + if not isinstance(action, cls.Actions): + raise ValueError(f'{action!r} is not a valid action') + getattr(cls, action._value_)(cls, *data) + + @staticmethod + def field_to_template(tmpl): + if re.match(r'[a-zA-Z_]+$', tmpl): + return f'%({tmpl})s' + return tmpl + + @staticmethod + def format_to_regex(fmt): + r""" + Converts a string like + '%(title)s - %(artist)s' + to a regex like + '(?P<title>.+)\ \-\ (?P<artist>.+)' + """ + if not re.search(r'%\(\w+\)s', fmt): + return fmt + lastpos = 0 + regex = '' + # replace %(..)s with regex group and escape other string parts + for match in re.finditer(r'%\((\w+)\)s', fmt): + regex += re.escape(fmt[lastpos:match.start()]) + regex += rf'(?P<{match.group(1)}>.+)' + lastpos = match.end() + if lastpos < len(fmt): + regex += re.escape(fmt[lastpos:]) + return regex + + def run(self, info): + for f in self._actions: + f(info) + return [], info + + def interpretter(self, inp, out): + def f(info): + data_to_parse = self._downloader.evaluate_outtmpl(template, info) + self.write_debug(f'Searching for {out_re.pattern!r} in {template!r}') + match = out_re.search(data_to_parse) + if match is None: + self.report_warning(f'Could not interpret {inp!r} as {out!r}') + return + for attribute, value in match.groupdict().items(): + info[attribute] = value + self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA')) + + template = self.field_to_template(inp) + out_re = re.compile(self.format_to_regex(out)) + return f + + def replacer(self, field, search, replace): + def f(info): + val = info.get(field) + if val is None: + self.report_warning(f'Video does not have a {field}') + return + elif not isinstance(val, str): + self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}') + return + self.write_debug(f'Replacing all {search!r} in {field} with {replace!r}') + info[field], n = search_re.subn(replace, val) + if n: + self.to_screen(f'Changed {field} to: {info[field]}') + else: + self.to_screen(f'Did not find {search!r} in {field}') + + search_re = re.compile(search) + return f + + +class MetadataFromFieldPP(MetadataParserPP): + @classmethod + def to_action(cls, f): + match = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f) + if match is None: + raise ValueError(f'it should be FROM:TO, not {f!r}') + return ( + cls.Actions.INTERPRET, + match.group('in').replace('\\:', ':'), + match.group('out')) + + def __init__(self, downloader, formats): + MetadataParserPP.__init__(self, downloader, [self.to_action(f) for f in formats]) + + +class MetadataFromTitlePP(MetadataParserPP): # for backward compatibility + def __init__(self, downloader, titleformat): + MetadataParserPP.__init__(self, downloader, [(self.Actions.INTERPRET, 'title', titleformat)]) diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py new file 
mode 100644 index 000000000..72a705fc5 --- /dev/null +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -0,0 +1,327 @@ +import copy +import heapq +import os + +from .common import PostProcessor +from .ffmpeg import ( + FFmpegPostProcessor, + FFmpegSubtitlesConvertorPP +) +from .sponsorblock import SponsorBlockPP +from ..utils import ( + orderedSet, + PostProcessingError, + prepend_extension, +) + + +_TINY_CHAPTER_DURATION = 1 +DEFAULT_SPONSORBLOCK_CHAPTER_TITLE = '[SponsorBlock]: %(category_names)l' + + +class ModifyChaptersPP(FFmpegPostProcessor): + def __init__(self, downloader, remove_chapters_patterns=None, remove_sponsor_segments=None, + sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False): + FFmpegPostProcessor.__init__(self, downloader) + self._remove_chapters_patterns = set(remove_chapters_patterns or []) + self._remove_sponsor_segments = set(remove_sponsor_segments or []) + self._sponsorblock_chapter_title = sponsorblock_chapter_title + self._force_keyframes = force_keyframes + + @PostProcessor._restrict_to(images=False) + def run(self, info): + chapters, sponsor_chapters = self._mark_chapters_to_remove( + info.get('chapters') or [], info.get('sponsorblock_chapters') or []) + if not chapters and not sponsor_chapters: + return [], info + + real_duration = self._get_real_video_duration(info) + if not chapters: + chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}] + + info['chapters'], cuts = self._remove_marked_arrange_sponsors(chapters + sponsor_chapters) + if not cuts: + return [], info + + if self._duration_mismatch(real_duration, info.get('duration')): + if not self._duration_mismatch(real_duration, info['chapters'][-1]['end_time']): + self.to_screen(f'Skipping {self.pp_key()} since the video appears to be already cut') + return [], info + if not info.get('__real_download'): + raise PostProcessingError('Cannot cut video since the real and expected durations mismatch. 
' + 'Different chapters may have already been removed') + else: + self.write_debug('Expected and actual durations mismatch') + + concat_opts = self._make_concat_opts(cuts, real_duration) + + def remove_chapters(file, is_sub): + return file, self.remove_chapters(file, cuts, concat_opts, self._force_keyframes and not is_sub) + + in_out_files = [remove_chapters(info['filepath'], False)] + in_out_files.extend(remove_chapters(in_file, True) for in_file in self._get_supported_subs(info)) + + # Renaming should only happen after all files are processed + files_to_remove = [] + for in_file, out_file in in_out_files: + uncut_file = prepend_extension(in_file, 'uncut') + os.replace(in_file, uncut_file) + os.replace(out_file, in_file) + files_to_remove.append(uncut_file) + + info['_real_duration'] = info['chapters'][-1]['end_time'] + return files_to_remove, info + + def _mark_chapters_to_remove(self, chapters, sponsor_chapters): + if self._remove_chapters_patterns: + warn_no_chapter_to_remove = True + if not chapters: + self.to_screen('Chapter information is unavailable') + warn_no_chapter_to_remove = False + for c in chapters: + if any(regex.search(c['title']) for regex in self._remove_chapters_patterns): + c['remove'] = True + warn_no_chapter_to_remove = False + if warn_no_chapter_to_remove: + self.to_screen('There are no chapters matching the regex') + + if self._remove_sponsor_segments: + warn_no_chapter_to_remove = True + if not sponsor_chapters: + self.to_screen('SponsorBlock information is unavailable') + warn_no_chapter_to_remove = False + for c in sponsor_chapters: + if c['category'] in self._remove_sponsor_segments: + c['remove'] = True + warn_no_chapter_to_remove = False + if warn_no_chapter_to_remove: + self.to_screen('There are no matching SponsorBlock chapters') + + return chapters, sponsor_chapters + + def _get_supported_subs(self, info): + for sub in (info.get('requested_subtitles') or {}).values(): + sub_file = sub.get('filepath') + # The file might have been removed by --embed-subs + if not sub_file or not os.path.exists(sub_file): + continue + ext = sub['ext'] + if ext not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS: + self.report_warning(f'Cannot remove chapters from external {ext} subtitles; "{sub_file}" is now out of sync') + continue + # TODO: create __real_download for subs? + yield sub_file + + def _remove_marked_arrange_sponsors(self, chapters): + # Store cuts separately, since adjacent and overlapping cuts must be merged. + cuts = [] + + def append_cut(c): + assert 'remove' in c + last_to_cut = cuts[-1] if cuts else None + if last_to_cut and last_to_cut['end_time'] >= c['start_time']: + last_to_cut['end_time'] = max(last_to_cut['end_time'], c['end_time']) + else: + cuts.append(c) + return len(cuts) - 1 + + def excess_duration(c): + # Cuts that are completely within the chapter reduce chapters' duration. + # Since cuts can overlap, excess duration may be less than the sum of cuts' durations. + # To avoid that, chapter stores the index to the first cut within the chapter, + # instead of storing excess duration. append_cut ensures that subsequent cuts (if any) + # will be merged with previous ones (if necessary).
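# [Editor's note: illustrative comment, not part of this commit. A worked
# example of the merge-then-walk idea described above, using the same
# 'start_time'/'end_time' values as the surrounding code. Suppose cuts are
# appended in order (0, 5), (4, 9), (20, 25): append_cut merges the first
# two since 5 >= 4, leaving cuts == [(0, 9), (20, 25)]. For a chapter
# spanning (2, 22), excess_duration then accumulates
# min(9, 22) - max(0, 2) == 7 from the first cut and
# min(25, 22) - max(20, 2) == 2 from the second, i.e. 9 seconds of excess.]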
+ cut_idx, excess = c.pop('cut_idx', len(cuts)), 0 + while cut_idx < len(cuts): + cut = cuts[cut_idx] + if cut['start_time'] >= c['end_time']: + break + if cut['end_time'] > c['start_time']: + excess += min(cut['end_time'], c['end_time']) + excess -= max(cut['start_time'], c['start_time']) + cut_idx += 1 + return excess + + new_chapters = [] + + def append_chapter(c): + assert 'remove' not in c + length = c['end_time'] - c['start_time'] - excess_duration(c) + # Chapter is completely covered by cuts or sponsors. + if length <= 0: + return + start = new_chapters[-1]['end_time'] if new_chapters else 0 + c.update(start_time=start, end_time=start + length) + new_chapters.append(c) + + # Turn into a priority queue, index is a tie breaker. + # Plain stack sorted by start_time is not enough: after splitting the chapter, + # the part returned to the stack is not guaranteed to have start_time + # less than or equal to that of the stack's head. + chapters = [(c['start_time'], i, c) for i, c in enumerate(chapters)] + heapq.heapify(chapters) + + _, cur_i, cur_chapter = heapq.heappop(chapters) + while chapters: + _, i, c = heapq.heappop(chapters) + # Non-overlapping chapters or cuts can be appended directly. However, + # adjacent non-overlapping cuts must be merged, which is handled by append_cut. + if cur_chapter['end_time'] <= c['start_time']: + (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter) + cur_i, cur_chapter = i, c + continue + + # Eight possibilities for overlapping chapters: (cut, cut), (cut, sponsor), + # (cut, normal), (sponsor, cut), (normal, cut), (sponsor, sponsor), + # (sponsor, normal), and (normal, sponsor). There is no (normal, normal): + # normal chapters are assumed not to overlap. + if 'remove' in cur_chapter: + # (cut, cut): adjust end_time. + if 'remove' in c: + cur_chapter['end_time'] = max(cur_chapter['end_time'], c['end_time']) + # (cut, sponsor/normal): chop the beginning of the later chapter + # (if it's not completely hidden by the cut). Push to the priority queue + # to restore sorting by start_time: with beginning chopped, c may actually + # start later than the remaining chapters from the queue. + elif cur_chapter['end_time'] < c['end_time']: + c['start_time'] = cur_chapter['end_time'] + c['_was_cut'] = True + heapq.heappush(chapters, (c['start_time'], i, c)) + # (sponsor/normal, cut). + elif 'remove' in c: + cur_chapter['_was_cut'] = True + # Chop the end of the current chapter if the cut is not contained within it. + # Chopping the end doesn't break start_time sorting, no PQ push is necessary. + if cur_chapter['end_time'] <= c['end_time']: + cur_chapter['end_time'] = c['start_time'] + append_chapter(cur_chapter) + cur_i, cur_chapter = i, c + continue + # Current chapter contains the cut within it. If the current chapter is + # a sponsor chapter, check whether the categories before and after the cut differ. + if '_categories' in cur_chapter: + after_c = dict(cur_chapter, start_time=c['end_time'], _categories=[]) + cur_cats = [] + for cat_start_end in cur_chapter['_categories']: + if cat_start_end[1] < c['start_time']: + cur_cats.append(cat_start_end) + if cat_start_end[2] > c['end_time']: + after_c['_categories'].append(cat_start_end) + cur_chapter['_categories'] = cur_cats + if cur_chapter['_categories'] != after_c['_categories']: + # Categories before and after the cut differ: push the after part to PQ.
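# [Editor's note: illustrative comment, not part of this commit. Concretely,
# for a sponsor cur_chapter spanning (10, 60) containing a cut c spanning
# (20, 30): each (cat, start, end) triple whose start < 20 stays attached to
# the truncated cur_chapter, each triple whose end > 30 also goes to after_c
# (a long category can appear in both), and after_c re-enters the queue keyed
# on its new start_time of 30. This re-push of chopped remainders is why the
# sweep needs a heap rather than a pre-sorted stack.]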
+ heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c)) + cur_chapter['end_time'] = c['start_time'] + append_chapter(cur_chapter) + cur_i, cur_chapter = i, c + continue + # Either sponsor categories before and after the cut are the same or + # we're dealing with a normal chapter. Just register an outstanding cut: + # subsequent append_chapter will reduce the duration. + cur_chapter.setdefault('cut_idx', append_cut(c)) + # (sponsor, normal): if a normal chapter is not completely overlapped, + # chop the beginning of it and push it to PQ. + elif '_categories' in cur_chapter and '_categories' not in c: + if cur_chapter['end_time'] < c['end_time']: + c['start_time'] = cur_chapter['end_time'] + c['_was_cut'] = True + heapq.heappush(chapters, (c['start_time'], i, c)) + # (normal, sponsor) and (sponsor, sponsor) + else: + assert '_categories' in c + cur_chapter['_was_cut'] = True + c['_was_cut'] = True + # Push the part after the sponsor to PQ. + if cur_chapter['end_time'] > c['end_time']: + # deepcopy to make categories in after_c and cur_chapter/c refer to different lists. + after_c = dict(copy.deepcopy(cur_chapter), start_time=c['end_time']) + heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c)) + # Push the part after the overlap to PQ. + elif c['end_time'] > cur_chapter['end_time']: + after_cur = dict(copy.deepcopy(c), start_time=cur_chapter['end_time']) + heapq.heappush(chapters, (after_cur['start_time'], cur_i, after_cur)) + c['end_time'] = cur_chapter['end_time'] + # (sponsor, sponsor): merge categories in the overlap. + if '_categories' in cur_chapter: + c['_categories'] = cur_chapter['_categories'] + c['_categories'] + # Inherit the cuts that the current chapter has accumulated within it. + if 'cut_idx' in cur_chapter: + c['cut_idx'] = cur_chapter['cut_idx'] + cur_chapter['end_time'] = c['start_time'] + append_chapter(cur_chapter) + cur_i, cur_chapter = i, c + (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter) + return self._remove_tiny_rename_sponsors(new_chapters), cuts + + def _remove_tiny_rename_sponsors(self, chapters): + new_chapters = [] + for i, c in enumerate(chapters): + # Merge with the previous/next if the chapter is tiny. + # Only tiny chapters resulting from a cut can be skipped. + # Chapters that were already tiny in the original list will be preserved. + if (('_was_cut' in c or '_categories' in c) + and c['end_time'] - c['start_time'] < _TINY_CHAPTER_DURATION): + if not new_chapters: + # Prepend tiny chapter to the next one if possible. + if i < len(chapters) - 1: + chapters[i + 1]['start_time'] = c['start_time'] + continue + else: + old_c = new_chapters[-1] + if i < len(chapters) - 1: + next_c = chapters[i + 1] + # Not a typo: key names in old_c and next_c are really different. + prev_is_sponsor = 'categories' in old_c + next_is_sponsor = '_categories' in next_c + # Preferentially prepend tiny normals to normals and sponsors to sponsors. 
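# [Editor's note: illustrative comment, not part of this commit. The check
# below reads: a tiny normal chapter ('_categories' absent) sandwiched
# between a sponsor and a normal, or a tiny sponsor sandwiched between a
# normal and a sponsor, is prepended to the *next* chapter so that like
# merges with like; in every other case the fall-through absorbs the tiny
# chapter into the previous one by advancing old_c['end_time'].]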
+ if (('_categories' not in c and prev_is_sponsor and not next_is_sponsor) + or ('_categories' in c and not prev_is_sponsor and next_is_sponsor)): + next_c['start_time'] = c['start_time'] + continue + old_c['end_time'] = c['end_time'] + continue + + c.pop('_was_cut', None) + cats = c.pop('_categories', None) + if cats: + category = min(cats, key=lambda c: c[2] - c[1])[0] + cats = orderedSet(x[0] for x in cats) + c.update({ + 'category': category, + 'categories': cats, + 'name': SponsorBlockPP.CATEGORIES[category], + 'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats] + }) + c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c) + # Merge identically named sponsors. + if (new_chapters and 'categories' in new_chapters[-1] + and new_chapters[-1]['title'] == c['title']): + new_chapters[-1]['end_time'] = c['end_time'] + continue + new_chapters.append(c) + return new_chapters + + def remove_chapters(self, filename, ranges_to_cut, concat_opts, force_keyframes=False): + in_file = filename + out_file = prepend_extension(in_file, 'temp') + if force_keyframes: + in_file = self.force_keyframes(in_file, (t for c in ranges_to_cut for t in (c['start_time'], c['end_time']))) + self.to_screen(f'Removing chapters from {filename}') + self.concat_files([in_file] * len(concat_opts), out_file, concat_opts) + if in_file != filename: + os.remove(in_file) + return out_file + + @staticmethod + def _make_concat_opts(chapters_to_remove, duration): + opts = [{}] + for s in chapters_to_remove: + # Do not create 0 duration chunk at the beginning. + if s['start_time'] == 0: + opts[-1]['inpoint'] = f'{s["end_time"]:.6f}' + continue + opts[-1]['outpoint'] = f'{s["start_time"]:.6f}' + # Do not create 0 duration chunk at the end. + if s['end_time'] != duration: + opts.append({'inpoint': f'{s["end_time"]:.6f}'}) + return opts diff --git a/yt_dlp/postprocessor/movefilesafterdownload.py b/yt_dlp/postprocessor/movefilesafterdownload.py new file mode 100644 index 000000000..1064a8cb8 --- /dev/null +++ b/yt_dlp/postprocessor/movefilesafterdownload.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals +import os +import shutil + +from .common import PostProcessor +from ..utils import ( + decodeFilename, + encodeFilename, + make_dir, + PostProcessingError, +) + + +class MoveFilesAfterDownloadPP(PostProcessor): + + def __init__(self, downloader=None, downloaded=True): + PostProcessor.__init__(self, downloader) + self._downloaded = downloaded + + @classmethod + def pp_key(cls): + return 'MoveFiles' + + def run(self, info): + dl_path, dl_name = os.path.split(encodeFilename(info['filepath'])) + finaldir = info.get('__finaldir', dl_path) + finalpath = os.path.join(finaldir, dl_name) + if self._downloaded: + info['__files_to_move'][info['filepath']] = decodeFilename(finalpath) + + make_newfilename = lambda old: decodeFilename(os.path.join(finaldir, os.path.basename(encodeFilename(old)))) + for oldfile, newfile in info['__files_to_move'].items(): + if not newfile: + newfile = make_newfilename(oldfile) + if os.path.abspath(encodeFilename(oldfile)) == os.path.abspath(encodeFilename(newfile)): + continue + if not os.path.exists(encodeFilename(oldfile)): + self.report_warning('File "%s" cannot be found' % oldfile) + continue + if os.path.exists(encodeFilename(newfile)): + if self.get_param('overwrites', True): + self.report_warning('Replacing existing file "%s"' % newfile) + os.remove(encodeFilename(newfile)) + else: + self.report_warning( + 'Cannot move file "%s" out of temporary directory since 
"%s" already exists. ' + % (oldfile, newfile)) + continue + make_dir(newfile, PostProcessingError) + self.to_screen('Moving file "%s" to "%s"' % (oldfile, newfile)) + shutil.move(oldfile, newfile) # os.rename cannot move between volumes + + info['filepath'] = finalpath + return [], info diff --git a/yt_dlp/postprocessor/sponskrub.py b/yt_dlp/postprocessor/sponskrub.py new file mode 100644 index 000000000..932555a0e --- /dev/null +++ b/yt_dlp/postprocessor/sponskrub.py @@ -0,0 +1,96 @@ +from __future__ import unicode_literals +import os +import subprocess + +from .common import PostProcessor +from ..compat import compat_shlex_split +from ..utils import ( + check_executable, + cli_option, + encodeArgument, + encodeFilename, + shell_quote, + str_or_none, + PostProcessingError, + prepend_extension, + process_communicate_or_kill, +) + + +# Deprecated in favor of the native implementation +class SponSkrubPP(PostProcessor): + _temp_ext = 'spons' + _exe_name = 'sponskrub' + + def __init__(self, downloader, path='', args=None, ignoreerror=False, cut=False, force=False): + PostProcessor.__init__(self, downloader) + self.force = force + self.cutout = cut + self.args = str_or_none(args) or '' # For backward compatibility + self.path = self.get_exe(path) + + if not ignoreerror and self.path is None: + if path: + raise PostProcessingError('sponskrub not found in "%s"' % path) + else: + raise PostProcessingError('sponskrub not found. Please install or provide the path using --sponskrub-path') + + def get_exe(self, path=''): + if not path or not check_executable(path, ['-h']): + path = os.path.join(path, self._exe_name) + if not check_executable(path, ['-h']): + return None + return path + + @PostProcessor._restrict_to(images=False) + def run(self, information): + if self.path is None: + return [], information + + filename = information['filepath'] + if not os.path.exists(encodeFilename(filename)): # no download + return [], information + + if information['extractor_key'].lower() != 'youtube': + self.to_screen('Skipping sponskrub since it is not a YouTube video') + return [], information + if self.cutout and not self.force and not information.get('__real_download', False): + self.report_warning( + 'Skipping sponskrub since the video was already downloaded. 
' + 'Use --sponskrub-force to run sponskrub anyway') + return [], information + + self.to_screen('Trying to %s sponsor sections' % ('remove' if self.cutout else 'mark')) + if self.cutout: + self.report_warning('Cutting out sponsor segments will cause the subtitles to go out of sync.') + if not information.get('__real_download', False): + self.report_warning('If sponskrub is run multiple times, unintended parts of the video could be cut out.') + + temp_filename = prepend_extension(filename, self._temp_ext) + if os.path.exists(encodeFilename(temp_filename)): + os.remove(encodeFilename(temp_filename)) + + cmd = [self.path] + if not self.cutout: + cmd += ['-chapter'] + cmd += cli_option(self._downloader.params, '-proxy', 'proxy') + cmd += compat_shlex_split(self.args) # For backward compatibility + cmd += self._configuration_args(self._exe_name, use_compat=False) + cmd += ['--', information['id'], filename, temp_filename] + cmd = [encodeArgument(i) for i in cmd] + + self.write_debug('sponskrub command line: %s' % shell_quote(cmd)) + pipe = None if self.get_param('verbose') else subprocess.PIPE + p = subprocess.Popen(cmd, stdout=pipe) + stdout = process_communicate_or_kill(p)[0] + + if p.returncode == 0: + os.replace(temp_filename, filename) + self.to_screen('Sponsor sections have been %s' % ('removed' if self.cutout else 'marked')) + elif p.returncode == 3: + self.to_screen('No segments in the SponsorBlock database') + else: + msg = stdout.decode('utf-8', 'replace').strip() if stdout else '' + msg = msg.split('\n')[0 if msg.lower().startswith('unrecognised') else -1] + raise PostProcessingError(msg if msg else 'sponskrub failed with error code %s' % p.returncode) + return [], information diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py new file mode 100644 index 000000000..7265a9de7 --- /dev/null +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -0,0 +1,96 @@ +import json +import re +from hashlib import sha256 + +from .ffmpeg import FFmpegPostProcessor +from ..compat import compat_urllib_parse_urlencode, compat_HTTPError +from ..utils import PostProcessingError, network_exceptions, sanitized_Request + + +class SponsorBlockPP(FFmpegPostProcessor): + + EXTRACTORS = { + 'Youtube': 'YouTube', + } + CATEGORIES = { + 'sponsor': 'Sponsor', + 'intro': 'Intermission/Intro Animation', + 'outro': 'Endcards/Credits', + 'selfpromo': 'Unpaid/Self Promotion', + 'interaction': 'Interaction Reminder', + 'preview': 'Preview/Recap', + 'music_offtopic': 'Non-Music Section' + } + + def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): + FFmpegPostProcessor.__init__(self, downloader) + self._categories = tuple(categories or self.CATEGORIES.keys()) + self._API_URL = api if re.match('^https?://', api) else 'https://' + api + + def run(self, info): + extractor = info['extractor_key'] + if extractor not in self.EXTRACTORS: + self.to_screen(f'SponsorBlock is not supported for {extractor}') + return [], info + + info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration']) + return [], info + + def _get_sponsor_chapters(self, info, duration): + segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']]) + + def duration_filter(s): + start_end = s['segment'] + # Ignore milliseconds difference at the start. + if start_end[0] <= 1: + start_end[0] = 0 + # Ignore milliseconds difference at the end. + # Never allow the segment to exceed the video. 
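# [Editor's note: illustrative comment, not part of this commit. A worked
# example of the 1-second tolerance: for a 300.0s video, a reported segment
# [0.4, 299.2] is snapped to [0, 300.0], since both ends lie within 1s of
# the video boundaries, while a segment whose videoDuration field deviates
# from the real duration by more than 1s is rejected by the return check
# below.]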
+ if duration and duration - start_end[1] <= 1: + start_end[1] = duration + # SponsorBlock duration may be absent or it may deviate from the real one. + return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1 + + duration_match = [s for s in segments if duration_filter(s)] + if len(duration_match) != len(segments): + self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video') + + def to_chapter(s): + (start, end), cat = s['segment'], s['category'] + return { + 'start_time': start, + 'end_time': end, + 'category': cat, + 'title': self.CATEGORIES[cat], + '_categories': [(cat, start, end)] + } + + sponsor_chapters = [to_chapter(s) for s in duration_match] + if not sponsor_chapters: + self.to_screen('No segments were found in the SponsorBlock database') + else: + self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database') + return sponsor_chapters + + def _get_sponsor_segments(self, video_id, service): + hash = sha256(video_id.encode('ascii')).hexdigest() + # SponsorBlock API recommends using first 4 hash characters. + url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({ + 'service': service, + 'categories': json.dumps(self._categories), + }) + for d in self._get_json(url): + if d['videoID'] == video_id: + return d['segments'] + return [] + + def _get_json(self, url): + self.write_debug(f'SponsorBlock query: {url}') + try: + rsp = self._downloader.urlopen(sanitized_Request(url)) + except network_exceptions as e: + if isinstance(e, compat_HTTPError) and e.code == 404: + return [] + raise PostProcessingError(f'Unable to communicate with SponsorBlock API - {e}') + + return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) diff --git a/yt_dlp/postprocessor/xattrpp.py b/yt_dlp/postprocessor/xattrpp.py new file mode 100644 index 000000000..93acd6d13 --- /dev/null +++ b/yt_dlp/postprocessor/xattrpp.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +from .common import PostProcessor +from ..compat import compat_os_name +from ..utils import ( + hyphenate_date, + write_xattr, + PostProcessingError, + XAttrMetadataError, + XAttrUnavailableError, +) + + +class XAttrMetadataPP(PostProcessor): + # + # More info about extended attributes for media: + # http://freedesktop.org/wiki/CommonExtendedAttributes/ + # http://www.freedesktop.org/wiki/PhreedomDraft/ + # http://dublincore.org/documents/usageguide/elements.shtml + # + # TODO: + # * capture youtube keywords and put them in 'user.dublincore.subject' (comma-separated) + # * figure out which xattrs can be used for 'duration', 'thumbnail', 'resolution' + # + + def run(self, info): + """ Set extended attributes on downloaded file (if xattr support is found). 
""" + + # Write the metadata to the file's xattrs + self.to_screen('Writing metadata to file\'s xattrs') + + filename = info['filepath'] + + try: + xattr_mapping = { + 'user.xdg.referrer.url': 'webpage_url', + # 'user.xdg.comment': 'description', + 'user.dublincore.title': 'title', + 'user.dublincore.date': 'upload_date', + 'user.dublincore.description': 'description', + 'user.dublincore.contributor': 'uploader', + 'user.dublincore.format': 'format', + } + + num_written = 0 + for xattrname, infoname in xattr_mapping.items(): + + value = info.get(infoname) + + if value: + if infoname == 'upload_date': + value = hyphenate_date(value) + + byte_value = value.encode('utf-8') + write_xattr(filename, xattrname, byte_value) + num_written += 1 + + return [], info + + except XAttrUnavailableError as e: + raise PostProcessingError(str(e)) + + except XAttrMetadataError as e: + if e.reason == 'NO_SPACE': + self.report_warning( + 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' + + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) + elif e.reason == 'VALUE_TOO_LONG': + self.report_warning( + 'Unable to write extended attributes due to too long values.') + else: + msg = 'This filesystem doesn\'t support extended attributes. ' + if compat_os_name == 'nt': + msg += 'You need to use NTFS.' + else: + msg += '(You may have to enable them in your /etc/fstab)' + raise PostProcessingError(str(e)) + return [], info diff --git a/youtube_dl/socks.py b/yt_dlp/socks.py index 5d4adbe72..5d4adbe72 100644 --- a/youtube_dl/socks.py +++ b/yt_dlp/socks.py diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py new file mode 100644 index 000000000..1c213aa44 --- /dev/null +++ b/yt_dlp/utils.py @@ -0,0 +1,6471 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +from __future__ import unicode_literals + +import base64 +import binascii +import calendar +import codecs +import collections +import contextlib +import ctypes +import datetime +import email.utils +import email.header +import errno +import functools +import gzip +import hashlib +import hmac +import importlib.util +import io +import itertools +import json +import locale +import math +import operator +import os +import platform +import random +import re +import socket +import ssl +import subprocess +import sys +import tempfile +import time +import traceback +import xml.etree.ElementTree +import zlib + +from .compat import ( + compat_HTMLParseError, + compat_HTMLParser, + compat_HTTPError, + compat_basestring, + compat_chr, + compat_cookiejar, + compat_ctypes_WINFUNCTYPE, + compat_etree_fromstring, + compat_expanduser, + compat_html_entities, + compat_html_entities_html5, + compat_http_client, + compat_integer_types, + compat_numeric_types, + compat_kwargs, + compat_os_name, + compat_parse_qs, + compat_shlex_quote, + compat_str, + compat_struct_pack, + compat_struct_unpack, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_parse_urlencode, + compat_urllib_parse_urlparse, + compat_urllib_parse_urlunparse, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, + compat_urllib_parse_unquote_plus, + compat_urllib_request, + compat_urlparse, + compat_xpath, +) + +from .socks import ( + ProxyType, + sockssocket, +) + + +def register_socks_protocols(): + # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in urlparse.uses_netloc are not handled correctly + for scheme in ('socks', 'socks4', 'socks4a', 
'socks5'): + if scheme not in compat_urlparse.uses_netloc: + compat_urlparse.uses_netloc.append(scheme) + + +# This is not clearly defined otherwise +compiled_regex_type = type(re.compile('')) + + +def random_user_agent(): + _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36' + _CHROME_VERSIONS = ( + '74.0.3729.129', + '76.0.3780.3', + '76.0.3780.2', + '74.0.3729.128', + '76.0.3780.1', + '76.0.3780.0', + '75.0.3770.15', + '74.0.3729.127', + '74.0.3729.126', + '76.0.3779.1', + '76.0.3779.0', + '75.0.3770.14', + '74.0.3729.125', + '76.0.3778.1', + '76.0.3778.0', + '75.0.3770.13', + '74.0.3729.124', + '74.0.3729.123', + '73.0.3683.121', + '76.0.3777.1', + '76.0.3777.0', + '75.0.3770.12', + '74.0.3729.122', + '76.0.3776.4', + '75.0.3770.11', + '74.0.3729.121', + '76.0.3776.3', + '76.0.3776.2', + '73.0.3683.120', + '74.0.3729.120', + '74.0.3729.119', + '74.0.3729.118', + '76.0.3776.1', + '76.0.3776.0', + '76.0.3775.5', + '75.0.3770.10', + '74.0.3729.117', + '76.0.3775.4', + '76.0.3775.3', + '74.0.3729.116', + '75.0.3770.9', + '76.0.3775.2', + '76.0.3775.1', + '76.0.3775.0', + '75.0.3770.8', + '74.0.3729.115', + '74.0.3729.114', + '76.0.3774.1', + '76.0.3774.0', + '75.0.3770.7', + '74.0.3729.113', + '74.0.3729.112', + '74.0.3729.111', + '76.0.3773.1', + '76.0.3773.0', + '75.0.3770.6', + '74.0.3729.110', + '74.0.3729.109', + '76.0.3772.1', + '76.0.3772.0', + '75.0.3770.5', + '74.0.3729.108', + '74.0.3729.107', + '76.0.3771.1', + '76.0.3771.0', + '75.0.3770.4', + '74.0.3729.106', + '74.0.3729.105', + '75.0.3770.3', + '74.0.3729.104', + '74.0.3729.103', + '74.0.3729.102', + '75.0.3770.2', + '74.0.3729.101', + '75.0.3770.1', + '75.0.3770.0', + '74.0.3729.100', + '75.0.3769.5', + '75.0.3769.4', + '74.0.3729.99', + '75.0.3769.3', + '75.0.3769.2', + '75.0.3768.6', + '74.0.3729.98', + '75.0.3769.1', + '75.0.3769.0', + '74.0.3729.97', + '73.0.3683.119', + '73.0.3683.118', + '74.0.3729.96', + '75.0.3768.5', + '75.0.3768.4', + '75.0.3768.3', + '75.0.3768.2', + '74.0.3729.95', + '74.0.3729.94', + '75.0.3768.1', + '75.0.3768.0', + '74.0.3729.93', + '74.0.3729.92', + '73.0.3683.117', + '74.0.3729.91', + '75.0.3766.3', + '74.0.3729.90', + '75.0.3767.2', + '75.0.3767.1', + '75.0.3767.0', + '74.0.3729.89', + '73.0.3683.116', + '75.0.3766.2', + '74.0.3729.88', + '75.0.3766.1', + '75.0.3766.0', + '74.0.3729.87', + '73.0.3683.115', + '74.0.3729.86', + '75.0.3765.1', + '75.0.3765.0', + '74.0.3729.85', + '73.0.3683.114', + '74.0.3729.84', + '75.0.3764.1', + '75.0.3764.0', + '74.0.3729.83', + '73.0.3683.113', + '75.0.3763.2', + '75.0.3761.4', + '74.0.3729.82', + '75.0.3763.1', + '75.0.3763.0', + '74.0.3729.81', + '73.0.3683.112', + '75.0.3762.1', + '75.0.3762.0', + '74.0.3729.80', + '75.0.3761.3', + '74.0.3729.79', + '73.0.3683.111', + '75.0.3761.2', + '74.0.3729.78', + '74.0.3729.77', + '75.0.3761.1', + '75.0.3761.0', + '73.0.3683.110', + '74.0.3729.76', + '74.0.3729.75', + '75.0.3760.0', + '74.0.3729.74', + '75.0.3759.8', + '75.0.3759.7', + '75.0.3759.6', + '74.0.3729.73', + '75.0.3759.5', + '74.0.3729.72', + '73.0.3683.109', + '75.0.3759.4', + '75.0.3759.3', + '74.0.3729.71', + '75.0.3759.2', + '74.0.3729.70', + '73.0.3683.108', + '74.0.3729.69', + '75.0.3759.1', + '75.0.3759.0', + '74.0.3729.68', + '73.0.3683.107', + '74.0.3729.67', + '75.0.3758.1', + '75.0.3758.0', + '74.0.3729.66', + '73.0.3683.106', + '74.0.3729.65', + '75.0.3757.1', + '75.0.3757.0', + '74.0.3729.64', + '73.0.3683.105', + '74.0.3729.63', + '75.0.3756.1', + '75.0.3756.0', + 
'74.0.3729.62', + '73.0.3683.104', + '75.0.3755.3', + '75.0.3755.2', + '73.0.3683.103', + '75.0.3755.1', + '75.0.3755.0', + '74.0.3729.61', + '73.0.3683.102', + '74.0.3729.60', + '75.0.3754.2', + '74.0.3729.59', + '75.0.3753.4', + '74.0.3729.58', + '75.0.3754.1', + '75.0.3754.0', + '74.0.3729.57', + '73.0.3683.101', + '75.0.3753.3', + '75.0.3752.2', + '75.0.3753.2', + '74.0.3729.56', + '75.0.3753.1', + '75.0.3753.0', + '74.0.3729.55', + '73.0.3683.100', + '74.0.3729.54', + '75.0.3752.1', + '75.0.3752.0', + '74.0.3729.53', + '73.0.3683.99', + '74.0.3729.52', + '75.0.3751.1', + '75.0.3751.0', + '74.0.3729.51', + '73.0.3683.98', + '74.0.3729.50', + '75.0.3750.0', + '74.0.3729.49', + '74.0.3729.48', + '74.0.3729.47', + '75.0.3749.3', + '74.0.3729.46', + '73.0.3683.97', + '75.0.3749.2', + '74.0.3729.45', + '75.0.3749.1', + '75.0.3749.0', + '74.0.3729.44', + '73.0.3683.96', + '74.0.3729.43', + '74.0.3729.42', + '75.0.3748.1', + '75.0.3748.0', + '74.0.3729.41', + '75.0.3747.1', + '73.0.3683.95', + '75.0.3746.4', + '74.0.3729.40', + '74.0.3729.39', + '75.0.3747.0', + '75.0.3746.3', + '75.0.3746.2', + '74.0.3729.38', + '75.0.3746.1', + '75.0.3746.0', + '74.0.3729.37', + '73.0.3683.94', + '75.0.3745.5', + '75.0.3745.4', + '75.0.3745.3', + '75.0.3745.2', + '74.0.3729.36', + '75.0.3745.1', + '75.0.3745.0', + '75.0.3744.2', + '74.0.3729.35', + '73.0.3683.93', + '74.0.3729.34', + '75.0.3744.1', + '75.0.3744.0', + '74.0.3729.33', + '73.0.3683.92', + '74.0.3729.32', + '74.0.3729.31', + '73.0.3683.91', + '75.0.3741.2', + '75.0.3740.5', + '74.0.3729.30', + '75.0.3741.1', + '75.0.3741.0', + '74.0.3729.29', + '75.0.3740.4', + '73.0.3683.90', + '74.0.3729.28', + '75.0.3740.3', + '73.0.3683.89', + '75.0.3740.2', + '74.0.3729.27', + '75.0.3740.1', + '75.0.3740.0', + '74.0.3729.26', + '73.0.3683.88', + '73.0.3683.87', + '74.0.3729.25', + '75.0.3739.1', + '75.0.3739.0', + '73.0.3683.86', + '74.0.3729.24', + '73.0.3683.85', + '75.0.3738.4', + '75.0.3738.3', + '75.0.3738.2', + '75.0.3738.1', + '75.0.3738.0', + '74.0.3729.23', + '73.0.3683.84', + '74.0.3729.22', + '74.0.3729.21', + '75.0.3737.1', + '75.0.3737.0', + '74.0.3729.20', + '73.0.3683.83', + '74.0.3729.19', + '75.0.3736.1', + '75.0.3736.0', + '74.0.3729.18', + '73.0.3683.82', + '74.0.3729.17', + '75.0.3735.1', + '75.0.3735.0', + '74.0.3729.16', + '73.0.3683.81', + '75.0.3734.1', + '75.0.3734.0', + '74.0.3729.15', + '73.0.3683.80', + '74.0.3729.14', + '75.0.3733.1', + '75.0.3733.0', + '75.0.3732.1', + '74.0.3729.13', + '74.0.3729.12', + '73.0.3683.79', + '74.0.3729.11', + '75.0.3732.0', + '74.0.3729.10', + '73.0.3683.78', + '74.0.3729.9', + '74.0.3729.8', + '74.0.3729.7', + '75.0.3731.3', + '75.0.3731.2', + '75.0.3731.0', + '74.0.3729.6', + '73.0.3683.77', + '73.0.3683.76', + '75.0.3730.5', + '75.0.3730.4', + '73.0.3683.75', + '74.0.3729.5', + '73.0.3683.74', + '75.0.3730.3', + '75.0.3730.2', + '74.0.3729.4', + '73.0.3683.73', + '73.0.3683.72', + '75.0.3730.1', + '75.0.3730.0', + '74.0.3729.3', + '73.0.3683.71', + '74.0.3729.2', + '73.0.3683.70', + '74.0.3729.1', + '74.0.3729.0', + '74.0.3726.4', + '73.0.3683.69', + '74.0.3726.3', + '74.0.3728.0', + '74.0.3726.2', + '73.0.3683.68', + '74.0.3726.1', + '74.0.3726.0', + '74.0.3725.4', + '73.0.3683.67', + '73.0.3683.66', + '74.0.3725.3', + '74.0.3725.2', + '74.0.3725.1', + '74.0.3724.8', + '74.0.3725.0', + '73.0.3683.65', + '74.0.3724.7', + '74.0.3724.6', + '74.0.3724.5', + '74.0.3724.4', + '74.0.3724.3', + '74.0.3724.2', + '74.0.3724.1', + '74.0.3724.0', + '73.0.3683.64', + '74.0.3723.1', + '74.0.3723.0', + 
'73.0.3683.63', + '74.0.3722.1', + '74.0.3722.0', + '73.0.3683.62', + '74.0.3718.9', + '74.0.3702.3', + '74.0.3721.3', + '74.0.3721.2', + '74.0.3721.1', + '74.0.3721.0', + '74.0.3720.6', + '73.0.3683.61', + '72.0.3626.122', + '73.0.3683.60', + '74.0.3720.5', + '72.0.3626.121', + '74.0.3718.8', + '74.0.3720.4', + '74.0.3720.3', + '74.0.3718.7', + '74.0.3720.2', + '74.0.3720.1', + '74.0.3720.0', + '74.0.3718.6', + '74.0.3719.5', + '73.0.3683.59', + '74.0.3718.5', + '74.0.3718.4', + '74.0.3719.4', + '74.0.3719.3', + '74.0.3719.2', + '74.0.3719.1', + '73.0.3683.58', + '74.0.3719.0', + '73.0.3683.57', + '73.0.3683.56', + '74.0.3718.3', + '73.0.3683.55', + '74.0.3718.2', + '74.0.3718.1', + '74.0.3718.0', + '73.0.3683.54', + '74.0.3717.2', + '73.0.3683.53', + '74.0.3717.1', + '74.0.3717.0', + '73.0.3683.52', + '74.0.3716.1', + '74.0.3716.0', + '73.0.3683.51', + '74.0.3715.1', + '74.0.3715.0', + '73.0.3683.50', + '74.0.3711.2', + '74.0.3714.2', + '74.0.3713.3', + '74.0.3714.1', + '74.0.3714.0', + '73.0.3683.49', + '74.0.3713.1', + '74.0.3713.0', + '72.0.3626.120', + '73.0.3683.48', + '74.0.3712.2', + '74.0.3712.1', + '74.0.3712.0', + '73.0.3683.47', + '72.0.3626.119', + '73.0.3683.46', + '74.0.3710.2', + '72.0.3626.118', + '74.0.3711.1', + '74.0.3711.0', + '73.0.3683.45', + '72.0.3626.117', + '74.0.3710.1', + '74.0.3710.0', + '73.0.3683.44', + '72.0.3626.116', + '74.0.3709.1', + '74.0.3709.0', + '74.0.3704.9', + '73.0.3683.43', + '72.0.3626.115', + '74.0.3704.8', + '74.0.3704.7', + '74.0.3708.0', + '74.0.3706.7', + '74.0.3704.6', + '73.0.3683.42', + '72.0.3626.114', + '74.0.3706.6', + '72.0.3626.113', + '74.0.3704.5', + '74.0.3706.5', + '74.0.3706.4', + '74.0.3706.3', + '74.0.3706.2', + '74.0.3706.1', + '74.0.3706.0', + '73.0.3683.41', + '72.0.3626.112', + '74.0.3705.1', + '74.0.3705.0', + '73.0.3683.40', + '72.0.3626.111', + '73.0.3683.39', + '74.0.3704.4', + '73.0.3683.38', + '74.0.3704.3', + '74.0.3704.2', + '74.0.3704.1', + '74.0.3704.0', + '73.0.3683.37', + '72.0.3626.110', + '72.0.3626.109', + '74.0.3703.3', + '74.0.3703.2', + '73.0.3683.36', + '74.0.3703.1', + '74.0.3703.0', + '73.0.3683.35', + '72.0.3626.108', + '74.0.3702.2', + '74.0.3699.3', + '74.0.3702.1', + '74.0.3702.0', + '73.0.3683.34', + '72.0.3626.107', + '73.0.3683.33', + '74.0.3701.1', + '74.0.3701.0', + '73.0.3683.32', + '73.0.3683.31', + '72.0.3626.105', + '74.0.3700.1', + '74.0.3700.0', + '73.0.3683.29', + '72.0.3626.103', + '74.0.3699.2', + '74.0.3699.1', + '74.0.3699.0', + '73.0.3683.28', + '72.0.3626.102', + '73.0.3683.27', + '73.0.3683.26', + '74.0.3698.0', + '74.0.3696.2', + '72.0.3626.101', + '73.0.3683.25', + '74.0.3696.1', + '74.0.3696.0', + '74.0.3694.8', + '72.0.3626.100', + '74.0.3694.7', + '74.0.3694.6', + '74.0.3694.5', + '74.0.3694.4', + '72.0.3626.99', + '72.0.3626.98', + '74.0.3694.3', + '73.0.3683.24', + '72.0.3626.97', + '72.0.3626.96', + '72.0.3626.95', + '73.0.3683.23', + '72.0.3626.94', + '73.0.3683.22', + '73.0.3683.21', + '72.0.3626.93', + '74.0.3694.2', + '72.0.3626.92', + '74.0.3694.1', + '74.0.3694.0', + '74.0.3693.6', + '73.0.3683.20', + '72.0.3626.91', + '74.0.3693.5', + '74.0.3693.4', + '74.0.3693.3', + '74.0.3693.2', + '73.0.3683.19', + '74.0.3693.1', + '74.0.3693.0', + '73.0.3683.18', + '72.0.3626.90', + '74.0.3692.1', + '74.0.3692.0', + '73.0.3683.17', + '72.0.3626.89', + '74.0.3687.3', + '74.0.3691.1', + '74.0.3691.0', + '73.0.3683.16', + '72.0.3626.88', + '72.0.3626.87', + '73.0.3683.15', + '74.0.3690.1', + '74.0.3690.0', + '73.0.3683.14', + '72.0.3626.86', + '73.0.3683.13', + 
'73.0.3683.12', + '74.0.3689.1', + '74.0.3689.0', + '73.0.3683.11', + '72.0.3626.85', + '73.0.3683.10', + '72.0.3626.84', + '73.0.3683.9', + '74.0.3688.1', + '74.0.3688.0', + '73.0.3683.8', + '72.0.3626.83', + '74.0.3687.2', + '74.0.3687.1', + '74.0.3687.0', + '73.0.3683.7', + '72.0.3626.82', + '74.0.3686.4', + '72.0.3626.81', + '74.0.3686.3', + '74.0.3686.2', + '74.0.3686.1', + '74.0.3686.0', + '73.0.3683.6', + '72.0.3626.80', + '74.0.3685.1', + '74.0.3685.0', + '73.0.3683.5', + '72.0.3626.79', + '74.0.3684.1', + '74.0.3684.0', + '73.0.3683.4', + '72.0.3626.78', + '72.0.3626.77', + '73.0.3683.3', + '73.0.3683.2', + '72.0.3626.76', + '73.0.3683.1', + '73.0.3683.0', + '72.0.3626.75', + '71.0.3578.141', + '73.0.3682.1', + '73.0.3682.0', + '72.0.3626.74', + '71.0.3578.140', + '73.0.3681.4', + '73.0.3681.3', + '73.0.3681.2', + '73.0.3681.1', + '73.0.3681.0', + '72.0.3626.73', + '71.0.3578.139', + '72.0.3626.72', + '72.0.3626.71', + '73.0.3680.1', + '73.0.3680.0', + '72.0.3626.70', + '71.0.3578.138', + '73.0.3678.2', + '73.0.3679.1', + '73.0.3679.0', + '72.0.3626.69', + '71.0.3578.137', + '73.0.3678.1', + '73.0.3678.0', + '71.0.3578.136', + '73.0.3677.1', + '73.0.3677.0', + '72.0.3626.68', + '72.0.3626.67', + '71.0.3578.135', + '73.0.3676.1', + '73.0.3676.0', + '73.0.3674.2', + '72.0.3626.66', + '71.0.3578.134', + '73.0.3674.1', + '73.0.3674.0', + '72.0.3626.65', + '71.0.3578.133', + '73.0.3673.2', + '73.0.3673.1', + '73.0.3673.0', + '72.0.3626.64', + '71.0.3578.132', + '72.0.3626.63', + '72.0.3626.62', + '72.0.3626.61', + '72.0.3626.60', + '73.0.3672.1', + '73.0.3672.0', + '72.0.3626.59', + '71.0.3578.131', + '73.0.3671.3', + '73.0.3671.2', + '73.0.3671.1', + '73.0.3671.0', + '72.0.3626.58', + '71.0.3578.130', + '73.0.3670.1', + '73.0.3670.0', + '72.0.3626.57', + '71.0.3578.129', + '73.0.3669.1', + '73.0.3669.0', + '72.0.3626.56', + '71.0.3578.128', + '73.0.3668.2', + '73.0.3668.1', + '73.0.3668.0', + '72.0.3626.55', + '71.0.3578.127', + '73.0.3667.2', + '73.0.3667.1', + '73.0.3667.0', + '72.0.3626.54', + '71.0.3578.126', + '73.0.3666.1', + '73.0.3666.0', + '72.0.3626.53', + '71.0.3578.125', + '73.0.3665.4', + '73.0.3665.3', + '72.0.3626.52', + '73.0.3665.2', + '73.0.3664.4', + '73.0.3665.1', + '73.0.3665.0', + '72.0.3626.51', + '71.0.3578.124', + '72.0.3626.50', + '73.0.3664.3', + '73.0.3664.2', + '73.0.3664.1', + '73.0.3664.0', + '73.0.3663.2', + '72.0.3626.49', + '71.0.3578.123', + '73.0.3663.1', + '73.0.3663.0', + '72.0.3626.48', + '71.0.3578.122', + '73.0.3662.1', + '73.0.3662.0', + '72.0.3626.47', + '71.0.3578.121', + '73.0.3661.1', + '72.0.3626.46', + '73.0.3661.0', + '72.0.3626.45', + '71.0.3578.120', + '73.0.3660.2', + '73.0.3660.1', + '73.0.3660.0', + '72.0.3626.44', + '71.0.3578.119', + '73.0.3659.1', + '73.0.3659.0', + '72.0.3626.43', + '71.0.3578.118', + '73.0.3658.1', + '73.0.3658.0', + '72.0.3626.42', + '71.0.3578.117', + '73.0.3657.1', + '73.0.3657.0', + '72.0.3626.41', + '71.0.3578.116', + '73.0.3656.1', + '73.0.3656.0', + '72.0.3626.40', + '71.0.3578.115', + '73.0.3655.1', + '73.0.3655.0', + '72.0.3626.39', + '71.0.3578.114', + '73.0.3654.1', + '73.0.3654.0', + '72.0.3626.38', + '71.0.3578.113', + '73.0.3653.1', + '73.0.3653.0', + '72.0.3626.37', + '71.0.3578.112', + '73.0.3652.1', + '73.0.3652.0', + '72.0.3626.36', + '71.0.3578.111', + '73.0.3651.1', + '73.0.3651.0', + '72.0.3626.35', + '71.0.3578.110', + '73.0.3650.1', + '73.0.3650.0', + '72.0.3626.34', + '71.0.3578.109', + '73.0.3649.1', + '73.0.3649.0', + '72.0.3626.33', + '71.0.3578.108', + '73.0.3648.2', + 
'73.0.3648.1', + '73.0.3648.0', + '72.0.3626.32', + '71.0.3578.107', + '73.0.3647.2', + '73.0.3647.1', + '73.0.3647.0', + '72.0.3626.31', + '71.0.3578.106', + '73.0.3635.3', + '73.0.3646.2', + '73.0.3646.1', + '73.0.3646.0', + '72.0.3626.30', + '71.0.3578.105', + '72.0.3626.29', + '73.0.3645.2', + '73.0.3645.1', + '73.0.3645.0', + '72.0.3626.28', + '71.0.3578.104', + '72.0.3626.27', + '72.0.3626.26', + '72.0.3626.25', + '72.0.3626.24', + '73.0.3644.0', + '73.0.3643.2', + '72.0.3626.23', + '71.0.3578.103', + '73.0.3643.1', + '73.0.3643.0', + '72.0.3626.22', + '71.0.3578.102', + '73.0.3642.1', + '73.0.3642.0', + '72.0.3626.21', + '71.0.3578.101', + '73.0.3641.1', + '73.0.3641.0', + '72.0.3626.20', + '71.0.3578.100', + '72.0.3626.19', + '73.0.3640.1', + '73.0.3640.0', + '72.0.3626.18', + '73.0.3639.1', + '71.0.3578.99', + '73.0.3639.0', + '72.0.3626.17', + '73.0.3638.2', + '72.0.3626.16', + '73.0.3638.1', + '73.0.3638.0', + '72.0.3626.15', + '71.0.3578.98', + '73.0.3635.2', + '71.0.3578.97', + '73.0.3637.1', + '73.0.3637.0', + '72.0.3626.14', + '71.0.3578.96', + '71.0.3578.95', + '72.0.3626.13', + '71.0.3578.94', + '73.0.3636.2', + '71.0.3578.93', + '73.0.3636.1', + '73.0.3636.0', + '72.0.3626.12', + '71.0.3578.92', + '73.0.3635.1', + '73.0.3635.0', + '72.0.3626.11', + '71.0.3578.91', + '73.0.3634.2', + '73.0.3634.1', + '73.0.3634.0', + '72.0.3626.10', + '71.0.3578.90', + '71.0.3578.89', + '73.0.3633.2', + '73.0.3633.1', + '73.0.3633.0', + '72.0.3610.4', + '72.0.3626.9', + '71.0.3578.88', + '73.0.3632.5', + '73.0.3632.4', + '73.0.3632.3', + '73.0.3632.2', + '73.0.3632.1', + '73.0.3632.0', + '72.0.3626.8', + '71.0.3578.87', + '73.0.3631.2', + '73.0.3631.1', + '73.0.3631.0', + '72.0.3626.7', + '71.0.3578.86', + '72.0.3626.6', + '73.0.3630.1', + '73.0.3630.0', + '72.0.3626.5', + '71.0.3578.85', + '72.0.3626.4', + '73.0.3628.3', + '73.0.3628.2', + '73.0.3629.1', + '73.0.3629.0', + '72.0.3626.3', + '71.0.3578.84', + '73.0.3628.1', + '73.0.3628.0', + '71.0.3578.83', + '73.0.3627.1', + '73.0.3627.0', + '72.0.3626.2', + '71.0.3578.82', + '71.0.3578.81', + '71.0.3578.80', + '72.0.3626.1', + '72.0.3626.0', + '71.0.3578.79', + '70.0.3538.124', + '71.0.3578.78', + '72.0.3623.4', + '72.0.3625.2', + '72.0.3625.1', + '72.0.3625.0', + '71.0.3578.77', + '70.0.3538.123', + '72.0.3624.4', + '72.0.3624.3', + '72.0.3624.2', + '71.0.3578.76', + '72.0.3624.1', + '72.0.3624.0', + '72.0.3623.3', + '71.0.3578.75', + '70.0.3538.122', + '71.0.3578.74', + '72.0.3623.2', + '72.0.3610.3', + '72.0.3623.1', + '72.0.3623.0', + '72.0.3622.3', + '72.0.3622.2', + '71.0.3578.73', + '70.0.3538.121', + '72.0.3622.1', + '72.0.3622.0', + '71.0.3578.72', + '70.0.3538.120', + '72.0.3621.1', + '72.0.3621.0', + '71.0.3578.71', + '70.0.3538.119', + '72.0.3620.1', + '72.0.3620.0', + '71.0.3578.70', + '70.0.3538.118', + '71.0.3578.69', + '72.0.3619.1', + '72.0.3619.0', + '71.0.3578.68', + '70.0.3538.117', + '71.0.3578.67', + '72.0.3618.1', + '72.0.3618.0', + '71.0.3578.66', + '70.0.3538.116', + '72.0.3617.1', + '72.0.3617.0', + '71.0.3578.65', + '70.0.3538.115', + '72.0.3602.3', + '71.0.3578.64', + '72.0.3616.1', + '72.0.3616.0', + '71.0.3578.63', + '70.0.3538.114', + '71.0.3578.62', + '72.0.3615.1', + '72.0.3615.0', + '71.0.3578.61', + '70.0.3538.113', + '72.0.3614.1', + '72.0.3614.0', + '71.0.3578.60', + '70.0.3538.112', + '72.0.3613.1', + '72.0.3613.0', + '71.0.3578.59', + '70.0.3538.111', + '72.0.3612.2', + '72.0.3612.1', + '72.0.3612.0', + '70.0.3538.110', + '71.0.3578.58', + '70.0.3538.109', + '72.0.3611.2', + '72.0.3611.1', + 
'72.0.3611.0', + '71.0.3578.57', + '70.0.3538.108', + '72.0.3610.2', + '71.0.3578.56', + '71.0.3578.55', + '72.0.3610.1', + '72.0.3610.0', + '71.0.3578.54', + '70.0.3538.107', + '71.0.3578.53', + '72.0.3609.3', + '71.0.3578.52', + '72.0.3609.2', + '71.0.3578.51', + '72.0.3608.5', + '72.0.3609.1', + '72.0.3609.0', + '71.0.3578.50', + '70.0.3538.106', + '72.0.3608.4', + '72.0.3608.3', + '72.0.3608.2', + '71.0.3578.49', + '72.0.3608.1', + '72.0.3608.0', + '70.0.3538.105', + '71.0.3578.48', + '72.0.3607.1', + '72.0.3607.0', + '71.0.3578.47', + '70.0.3538.104', + '72.0.3606.2', + '72.0.3606.1', + '72.0.3606.0', + '71.0.3578.46', + '70.0.3538.103', + '70.0.3538.102', + '72.0.3605.3', + '72.0.3605.2', + '72.0.3605.1', + '72.0.3605.0', + '71.0.3578.45', + '70.0.3538.101', + '71.0.3578.44', + '71.0.3578.43', + '70.0.3538.100', + '70.0.3538.99', + '71.0.3578.42', + '72.0.3604.1', + '72.0.3604.0', + '71.0.3578.41', + '70.0.3538.98', + '71.0.3578.40', + '72.0.3603.2', + '72.0.3603.1', + '72.0.3603.0', + '71.0.3578.39', + '70.0.3538.97', + '72.0.3602.2', + '71.0.3578.38', + '71.0.3578.37', + '72.0.3602.1', + '72.0.3602.0', + '71.0.3578.36', + '70.0.3538.96', + '72.0.3601.1', + '72.0.3601.0', + '71.0.3578.35', + '70.0.3538.95', + '72.0.3600.1', + '72.0.3600.0', + '71.0.3578.34', + '70.0.3538.94', + '72.0.3599.3', + '72.0.3599.2', + '72.0.3599.1', + '72.0.3599.0', + '71.0.3578.33', + '70.0.3538.93', + '72.0.3598.1', + '72.0.3598.0', + '71.0.3578.32', + '70.0.3538.87', + '72.0.3597.1', + '72.0.3597.0', + '72.0.3596.2', + '71.0.3578.31', + '70.0.3538.86', + '71.0.3578.30', + '71.0.3578.29', + '72.0.3596.1', + '72.0.3596.0', + '71.0.3578.28', + '70.0.3538.85', + '72.0.3595.2', + '72.0.3591.3', + '72.0.3595.1', + '72.0.3595.0', + '71.0.3578.27', + '70.0.3538.84', + '72.0.3594.1', + '72.0.3594.0', + '71.0.3578.26', + '70.0.3538.83', + '72.0.3593.2', + '72.0.3593.1', + '72.0.3593.0', + '71.0.3578.25', + '70.0.3538.82', + '72.0.3589.3', + '72.0.3592.2', + '72.0.3592.1', + '72.0.3592.0', + '71.0.3578.24', + '72.0.3589.2', + '70.0.3538.81', + '70.0.3538.80', + '72.0.3591.2', + '72.0.3591.1', + '72.0.3591.0', + '71.0.3578.23', + '70.0.3538.79', + '71.0.3578.22', + '72.0.3590.1', + '72.0.3590.0', + '71.0.3578.21', + '70.0.3538.78', + '70.0.3538.77', + '72.0.3589.1', + '72.0.3589.0', + '71.0.3578.20', + '70.0.3538.76', + '71.0.3578.19', + '70.0.3538.75', + '72.0.3588.1', + '72.0.3588.0', + '71.0.3578.18', + '70.0.3538.74', + '72.0.3586.2', + '72.0.3587.0', + '71.0.3578.17', + '70.0.3538.73', + '72.0.3586.1', + '72.0.3586.0', + '71.0.3578.16', + '70.0.3538.72', + '72.0.3585.1', + '72.0.3585.0', + '71.0.3578.15', + '70.0.3538.71', + '71.0.3578.14', + '72.0.3584.1', + '72.0.3584.0', + '71.0.3578.13', + '70.0.3538.70', + '72.0.3583.2', + '71.0.3578.12', + '72.0.3583.1', + '72.0.3583.0', + '71.0.3578.11', + '70.0.3538.69', + '71.0.3578.10', + '72.0.3582.0', + '72.0.3581.4', + '71.0.3578.9', + '70.0.3538.67', + '72.0.3581.3', + '72.0.3581.2', + '72.0.3581.1', + '72.0.3581.0', + '71.0.3578.8', + '70.0.3538.66', + '72.0.3580.1', + '72.0.3580.0', + '71.0.3578.7', + '70.0.3538.65', + '71.0.3578.6', + '72.0.3579.1', + '72.0.3579.0', + '71.0.3578.5', + '70.0.3538.64', + '71.0.3578.4', + '71.0.3578.3', + '71.0.3578.2', + '71.0.3578.1', + '71.0.3578.0', + '70.0.3538.63', + '69.0.3497.128', + '70.0.3538.62', + '70.0.3538.61', + '70.0.3538.60', + '70.0.3538.59', + '71.0.3577.1', + '71.0.3577.0', + '70.0.3538.58', + '69.0.3497.127', + '71.0.3576.2', + '71.0.3576.1', + '71.0.3576.0', + '70.0.3538.57', + '70.0.3538.56', + 
'71.0.3575.2', + '70.0.3538.55', + '69.0.3497.126', + '70.0.3538.54', + '71.0.3575.1', + '71.0.3575.0', + '71.0.3574.1', + '71.0.3574.0', + '70.0.3538.53', + '69.0.3497.125', + '70.0.3538.52', + '71.0.3573.1', + '71.0.3573.0', + '70.0.3538.51', + '69.0.3497.124', + '71.0.3572.1', + '71.0.3572.0', + '70.0.3538.50', + '69.0.3497.123', + '71.0.3571.2', + '70.0.3538.49', + '69.0.3497.122', + '71.0.3571.1', + '71.0.3571.0', + '70.0.3538.48', + '69.0.3497.121', + '71.0.3570.1', + '71.0.3570.0', + '70.0.3538.47', + '69.0.3497.120', + '71.0.3568.2', + '71.0.3569.1', + '71.0.3569.0', + '70.0.3538.46', + '69.0.3497.119', + '70.0.3538.45', + '71.0.3568.1', + '71.0.3568.0', + '70.0.3538.44', + '69.0.3497.118', + '70.0.3538.43', + '70.0.3538.42', + '71.0.3567.1', + '71.0.3567.0', + '70.0.3538.41', + '69.0.3497.117', + '71.0.3566.1', + '71.0.3566.0', + '70.0.3538.40', + '69.0.3497.116', + '71.0.3565.1', + '71.0.3565.0', + '70.0.3538.39', + '69.0.3497.115', + '71.0.3564.1', + '71.0.3564.0', + '70.0.3538.38', + '69.0.3497.114', + '71.0.3563.0', + '71.0.3562.2', + '70.0.3538.37', + '69.0.3497.113', + '70.0.3538.36', + '70.0.3538.35', + '71.0.3562.1', + '71.0.3562.0', + '70.0.3538.34', + '69.0.3497.112', + '70.0.3538.33', + '71.0.3561.1', + '71.0.3561.0', + '70.0.3538.32', + '69.0.3497.111', + '71.0.3559.6', + '71.0.3560.1', + '71.0.3560.0', + '71.0.3559.5', + '71.0.3559.4', + '70.0.3538.31', + '69.0.3497.110', + '71.0.3559.3', + '70.0.3538.30', + '69.0.3497.109', + '71.0.3559.2', + '71.0.3559.1', + '71.0.3559.0', + '70.0.3538.29', + '69.0.3497.108', + '71.0.3558.2', + '71.0.3558.1', + '71.0.3558.0', + '70.0.3538.28', + '69.0.3497.107', + '71.0.3557.2', + '71.0.3557.1', + '71.0.3557.0', + '70.0.3538.27', + '69.0.3497.106', + '71.0.3554.4', + '70.0.3538.26', + '71.0.3556.1', + '71.0.3556.0', + '70.0.3538.25', + '71.0.3554.3', + '69.0.3497.105', + '71.0.3554.2', + '70.0.3538.24', + '69.0.3497.104', + '71.0.3555.2', + '70.0.3538.23', + '71.0.3555.1', + '71.0.3555.0', + '70.0.3538.22', + '69.0.3497.103', + '71.0.3554.1', + '71.0.3554.0', + '70.0.3538.21', + '69.0.3497.102', + '71.0.3553.3', + '70.0.3538.20', + '69.0.3497.101', + '71.0.3553.2', + '69.0.3497.100', + '71.0.3553.1', + '71.0.3553.0', + '70.0.3538.19', + '69.0.3497.99', + '69.0.3497.98', + '69.0.3497.97', + '71.0.3552.6', + '71.0.3552.5', + '71.0.3552.4', + '71.0.3552.3', + '71.0.3552.2', + '71.0.3552.1', + '71.0.3552.0', + '70.0.3538.18', + '69.0.3497.96', + '71.0.3551.3', + '71.0.3551.2', + '71.0.3551.1', + '71.0.3551.0', + '70.0.3538.17', + '69.0.3497.95', + '71.0.3550.3', + '71.0.3550.2', + '71.0.3550.1', + '71.0.3550.0', + '70.0.3538.16', + '69.0.3497.94', + '71.0.3549.1', + '71.0.3549.0', + '70.0.3538.15', + '69.0.3497.93', + '69.0.3497.92', + '71.0.3548.1', + '71.0.3548.0', + '70.0.3538.14', + '69.0.3497.91', + '71.0.3547.1', + '71.0.3547.0', + '70.0.3538.13', + '69.0.3497.90', + '71.0.3546.2', + '69.0.3497.89', + '71.0.3546.1', + '71.0.3546.0', + '70.0.3538.12', + '69.0.3497.88', + '71.0.3545.4', + '71.0.3545.3', + '71.0.3545.2', + '71.0.3545.1', + '71.0.3545.0', + '70.0.3538.11', + '69.0.3497.87', + '71.0.3544.5', + '71.0.3544.4', + '71.0.3544.3', + '71.0.3544.2', + '71.0.3544.1', + '71.0.3544.0', + '69.0.3497.86', + '70.0.3538.10', + '69.0.3497.85', + '70.0.3538.9', + '69.0.3497.84', + '71.0.3543.4', + '70.0.3538.8', + '71.0.3543.3', + '71.0.3543.2', + '71.0.3543.1', + '71.0.3543.0', + '70.0.3538.7', + '69.0.3497.83', + '71.0.3542.2', + '71.0.3542.1', + '71.0.3542.0', + '70.0.3538.6', + '69.0.3497.82', + '69.0.3497.81', + '71.0.3541.1', + 
'71.0.3541.0', + '70.0.3538.5', + '69.0.3497.80', + '71.0.3540.1', + '71.0.3540.0', + '70.0.3538.4', + '69.0.3497.79', + '70.0.3538.3', + '71.0.3539.1', + '71.0.3539.0', + '69.0.3497.78', + '68.0.3440.134', + '69.0.3497.77', + '70.0.3538.2', + '70.0.3538.1', + '70.0.3538.0', + '69.0.3497.76', + '68.0.3440.133', + '69.0.3497.75', + '70.0.3537.2', + '70.0.3537.1', + '70.0.3537.0', + '69.0.3497.74', + '68.0.3440.132', + '70.0.3536.0', + '70.0.3535.5', + '70.0.3535.4', + '70.0.3535.3', + '69.0.3497.73', + '68.0.3440.131', + '70.0.3532.8', + '70.0.3532.7', + '69.0.3497.72', + '69.0.3497.71', + '70.0.3535.2', + '70.0.3535.1', + '70.0.3535.0', + '69.0.3497.70', + '68.0.3440.130', + '69.0.3497.69', + '68.0.3440.129', + '70.0.3534.4', + '70.0.3534.3', + '70.0.3534.2', + '70.0.3534.1', + '70.0.3534.0', + '69.0.3497.68', + '68.0.3440.128', + '70.0.3533.2', + '70.0.3533.1', + '70.0.3533.0', + '69.0.3497.67', + '68.0.3440.127', + '70.0.3532.6', + '70.0.3532.5', + '70.0.3532.4', + '69.0.3497.66', + '68.0.3440.126', + '70.0.3532.3', + '70.0.3532.2', + '70.0.3532.1', + '69.0.3497.60', + '69.0.3497.65', + '69.0.3497.64', + '70.0.3532.0', + '70.0.3531.0', + '70.0.3530.4', + '70.0.3530.3', + '70.0.3530.2', + '69.0.3497.58', + '68.0.3440.125', + '69.0.3497.57', + '69.0.3497.56', + '69.0.3497.55', + '69.0.3497.54', + '70.0.3530.1', + '70.0.3530.0', + '69.0.3497.53', + '68.0.3440.124', + '69.0.3497.52', + '70.0.3529.3', + '70.0.3529.2', + '70.0.3529.1', + '70.0.3529.0', + '69.0.3497.51', + '70.0.3528.4', + '68.0.3440.123', + '70.0.3528.3', + '70.0.3528.2', + '70.0.3528.1', + '70.0.3528.0', + '69.0.3497.50', + '68.0.3440.122', + '70.0.3527.1', + '70.0.3527.0', + '69.0.3497.49', + '68.0.3440.121', + '70.0.3526.1', + '70.0.3526.0', + '68.0.3440.120', + '69.0.3497.48', + '69.0.3497.47', + '68.0.3440.119', + '68.0.3440.118', + '70.0.3525.5', + '70.0.3525.4', + '70.0.3525.3', + '68.0.3440.117', + '69.0.3497.46', + '70.0.3525.2', + '70.0.3525.1', + '70.0.3525.0', + '69.0.3497.45', + '68.0.3440.116', + '70.0.3524.4', + '70.0.3524.3', + '69.0.3497.44', + '70.0.3524.2', + '70.0.3524.1', + '70.0.3524.0', + '70.0.3523.2', + '69.0.3497.43', + '68.0.3440.115', + '70.0.3505.9', + '69.0.3497.42', + '70.0.3505.8', + '70.0.3523.1', + '70.0.3523.0', + '69.0.3497.41', + '68.0.3440.114', + '70.0.3505.7', + '69.0.3497.40', + '70.0.3522.1', + '70.0.3522.0', + '70.0.3521.2', + '69.0.3497.39', + '68.0.3440.113', + '70.0.3505.6', + '70.0.3521.1', + '70.0.3521.0', + '69.0.3497.38', + '68.0.3440.112', + '70.0.3520.1', + '70.0.3520.0', + '69.0.3497.37', + '68.0.3440.111', + '70.0.3519.3', + '70.0.3519.2', + '70.0.3519.1', + '70.0.3519.0', + '69.0.3497.36', + '68.0.3440.110', + '70.0.3518.1', + '70.0.3518.0', + '69.0.3497.35', + '69.0.3497.34', + '68.0.3440.109', + '70.0.3517.1', + '70.0.3517.0', + '69.0.3497.33', + '68.0.3440.108', + '69.0.3497.32', + '70.0.3516.3', + '70.0.3516.2', + '70.0.3516.1', + '70.0.3516.0', + '69.0.3497.31', + '68.0.3440.107', + '70.0.3515.4', + '68.0.3440.106', + '70.0.3515.3', + '70.0.3515.2', + '70.0.3515.1', + '70.0.3515.0', + '69.0.3497.30', + '68.0.3440.105', + '68.0.3440.104', + '70.0.3514.2', + '70.0.3514.1', + '70.0.3514.0', + '69.0.3497.29', + '68.0.3440.103', + '70.0.3513.1', + '70.0.3513.0', + '69.0.3497.28', + ) + return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) + + +std_headers = { + 'User-Agent': random_user_agent(), + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 
'Accept-Language': 'en-us,en;q=0.5', +} + + +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + +NO_DEFAULT = object() + +ENGLISH_MONTH_NAMES = [ + 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December'] + +MONTH_NAMES = { + 'en': ENGLISH_MONTH_NAMES, + 'fr': [ + 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', + 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], +} + +KNOWN_EXTENSIONS = ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil') + +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) + +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %drd %Y', + '%B %dth %Y', + '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %drd %Y', + '%b %dth %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %drd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y.%m.%d.', + '%Y/%m/%d', + '%Y/%m/%d %H:%M', + '%Y/%m/%d %H:%M:%S', + '%Y%m%d%H%M', + '%Y%m%d%H%M%S', + '%Y-%m-%d %H:%M', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%Y-%m-%d %H:%M:%S:%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', + '%b %d %Y at %H:%M', + '%b %d %Y at %H:%M:%S', + '%B %d %Y at %H:%M', + '%B %d %Y at %H:%M:%S', + '%H:%M %d-%b-%Y', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + +PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' + + +def preferredencoding(): + """Get preferred encoding. + + Returns the best encoding scheme for the system, based on + locale.getpreferredencoding() and some further tweaks. 
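+ + For example, under a typical en_US.UTF-8 Linux locale this returns + 'UTF-8' (an illustrative value); if the locale lookup or the test + encode fails, 'UTF-8' is used as a fallback.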
+ """ + try: + pref = locale.getpreferredencoding() + 'TEST'.encode(pref) + except Exception: + pref = 'UTF-8' + + return pref + + +def write_json_file(obj, fn): + """ Encode obj as JSON and write it to fn, atomically if possible """ + + fn = encodeFilename(fn) + if sys.version_info < (3, 0) and sys.platform != 'win32': + encoding = get_filesystem_encoding() + # os.path.basename returns a bytes object, but NamedTemporaryFile + # will fail if the filename contains non ascii characters unless we + # use a unicode object + path_basename = lambda f: os.path.basename(fn).decode(encoding) + # the same for os.path.dirname + path_dirname = lambda f: os.path.dirname(fn).decode(encoding) + else: + path_basename = os.path.basename + path_dirname = os.path.dirname + + args = { + 'suffix': '.tmp', + 'prefix': path_basename(fn) + '.', + 'dir': path_dirname(fn), + 'delete': False, + } + + # In Python 2.x, json.dump expects a bytestream. + # In Python 3.x, it writes to a character stream + if sys.version_info < (3, 0): + args['mode'] = 'wb' + else: + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) + + try: + with tf: + json.dump(obj, tf) + if sys.platform == 'win32': + # Need to remove existing file on Windows, else os.rename raises + # WindowsError or FileExistsError. + try: + os.unlink(fn) + except OSError: + pass + try: + mask = os.umask(0) + os.umask(mask) + os.chmod(tf.name, 0o666 & ~mask) + except OSError: + pass + os.rename(tf.name, fn) + except Exception: + try: + os.remove(tf.name) + except OSError: + pass + raise + + +if sys.version_info >= (2, 7): + def find_xpath_attr(node, xpath, key, val=None): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z_-]+$', key) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) + return node.find(expr) +else: + def find_xpath_attr(node, xpath, key, val=None): + for f in node.findall(compat_xpath(xpath)): + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: + return f + return None + +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter + + +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + + +def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + def _find_xpath(xpath): + return node.find(compat_xpath(xpath)) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break + + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element %s' % name) + else: + return None + return n + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + n = xpath_element(node, xpath, name, fatal=fatal, default=default) + if n is None or n == default: + return n + if n.text is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element\'s text %s' % name) + else: + return None + return n.text + + +def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): + n = find_xpath_attr(node, xpath, key) + if n is None: + if default is not 
NO_DEFAULT: + return default + elif fatal: + name = '%s[@%s]' % (xpath, key) if name is None else name + raise ExtractorError('Could not find XML attribute %s' % name) + else: + return None + return n.attrib[key] + + +def get_element_by_id(id, html): + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute('id', id, html) + + +def get_element_by_class(class_name, html): + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_elements_by_attribute(attribute, value, html, escape_value=True): + """Return the content of the tag with the specified attribute in the passed HTML document""" + + value = re.escape(value) if escape_value else value + + retlist = [] + for m in re.finditer(r'''(?xs) + <([a-zA-Z0-9:._-]+) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s+%s=['"]?%s['"]? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s*> + (?P<content>.*?) + </\1> + ''' % (re.escape(attribute), value), html): + res = m.group('content') + + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] + + retlist.append(unescapeHTML(res)) + + return retlist + + +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + + def __init__(self): + self.attrs = {} + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + + +def extract_attributes(html_element): + """Given a string for an HTML element such as + <el + a="foo" B="bar" c="&98;az" d=boz + empty= noval entity="&amp;" + sq='"' dq="'" + > + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + try: + parser.feed(html_element) + parser.close() + # Older Python may throw HTMLParseError in case of malformed HTML + except compat_HTMLParseError: + pass + return parser.attrs + + +def clean_html(html): + """Clean an HTML snippet into a readable string""" + + if html is None: # Convenience for sanitizing descriptions etc. + return html + + # Newline vs <br /> + html = html.replace('\n', ' ') + html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + # Strip html tags + html = re.sub('<.*?>', '', html) + # Replace html entities + html = unescapeHTML(html) + return html.strip() + + +def sanitize_open(filename, open_mode): + """Try to open the given filename, and slightly tweak it if this fails. + + Attempts to open the given filename.
If this fails, it tries to change + the filename slightly, step by step, until it's either able to open it + or it fails and raises a final exception, like the standard open() + function. + + It returns the tuple (stream, definitive_file_name). + """ + try: + if filename == '-': + if sys.platform == 'win32': + import msvcrt + msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) + return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) + stream = open(encodeFilename(filename), open_mode) + return (stream, filename) + except (IOError, OSError) as err: + if err.errno in (errno.EACCES,): + raise + + # In case of error, try to remove win32 forbidden chars + alt_filename = sanitize_path(filename) + if alt_filename == filename: + raise + else: + # An exception here should be caught in the caller + stream = open(encodeFilename(alt_filename), open_mode) + return (stream, alt_filename) + + +def timeconvert(timestr): + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + + +def sanitize_filename(s, restricted=False, is_id=False): + """Sanitizes a string so it could be used as part of a filename. + If restricted is set, use a stricter subset of allowed characters. + Set is_id if this is not an arbitrary string, but an ID that should be kept + if possible. + """ + def replace_insane(char): + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] + elif not restricted and char == '\n': + return ' ' + elif char == '?' or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '_-' if restricted else ' -' + elif char in '\\/|*<>': + return '_' + if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): + return '_' + if restricted and ord(char) > 127: + return '_' + return char + + if s == '': + return '' + # Handle timestamps + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) + result = ''.join(map(replace_insane, s)) + if not is_id: + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] + result = result.lstrip('.') + if not result: + result = '_' + return result + + +def sanitize_path(s, force=False): + """Sanitizes and normalizes path on Windows""" + if sys.platform == 'win32': + force = False + drive_or_unc, _ = os.path.splitdrive(s) + if sys.version_info < (2, 7) and not drive_or_unc: + drive_or_unc, _ = os.path.splitunc(s) + elif force: + drive_or_unc = '' + else: + return s + + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) + if drive_or_unc: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) + for path_part in norm_path] + if drive_or_unc: + sanitized_path.insert(0, drive_or_unc + os.path.sep) + elif force and s[0] == os.path.sep: + sanitized_path.insert(0, os.path.sep) + return os.path.join(*sanitized_path) + + +def sanitize_url(url): + # Prepend protocol-less URLs with `http:` scheme in order to mitigate + # the number of unwanted failures due to missing protocol + if url.startswith('//'): + return 'http:%s' % url + # Fix some common 
typos seen so far + COMMON_TYPOS = ( + # https://github.com/ytdl-org/youtube-dl/issues/15649 + (r'^httpss://', r'https://'), + # https://bx1.be/lives/direct-tv/ + (r'^rmtp([es]?)://', r'rtmp\1://'), + ) + for mistake, fixup in COMMON_TYPOS: + if re.match(mistake, url): + return re.sub(mistake, fixup, url) + return url + + +def extract_basic_auth(url): + parts = compat_urlparse.urlsplit(url) + if parts.username is None: + return url, None + url = compat_urlparse.urlunsplit(parts._replace(netloc=( + parts.hostname if parts.port is None + else '%s:%d' % (parts.hostname, parts.port)))) + auth_payload = base64.b64encode( + ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8')) + return url, 'Basic ' + auth_payload.decode('utf-8') + + +def sanitized_Request(url, *args, **kwargs): + url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) + if auth_header is not None: + headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) + headers['Authorization'] = auth_header + return compat_urllib_request.Request(url, *args, **kwargs) + + +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) + + +def orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + + +def _htmlentity_transform(entity_with_semicolon): + """Transforms an HTML entity to a character.""" + entity = entity_with_semicolon[:-1] + + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + # TODO: HTML5 allows entities without a semicolon. For example, + # '&Eacute;ric' should be decoded as 'Éric'. + if entity_with_semicolon in compat_html_entities_html5: + return compat_html_entities_html5[entity_with_semicolon] + + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith('x'): + base = 16 + numstr = '0%s' % numstr + else: + base = 10 + # See https://github.com/ytdl-org/youtube-dl/issues/7518 + try: + return compat_chr(int(numstr, base)) + except ValueError: + pass + + # Unknown entity in name, return its literal representation + return '&%s;' % entity + + +def unescapeHTML(s): + if s is None: + return None + assert type(s) == compat_str + + return re.sub( + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + + +def escapeHTML(text): + return ( + text + .replace('&', '&amp;') + .replace('<', '&lt;') + .replace('>', '&gt;') + .replace('"', '&quot;') + .replace("'", '&#39;') + ) + + +def process_communicate_or_kill(p, *args, **kwargs): + try: + return p.communicate(*args, **kwargs) + except BaseException: # Including KeyboardInterrupt + p.kill() + p.wait() + raise + + +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + +def encodeFilename(s, for_subprocess=False): + """ + @param s The name of the file + """ + + assert type(s) == compat_str + + # Python 3 has a Unicode API + if sys.version_info >= (3, 0): + return s + + # Pass '' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.)
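+ # (On such systems the string is returned unchanged so that the native + # Unicode file APIs are used; the encode below only applies to the + # remaining Python 2 cases.)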
+ if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + return s + + # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible + if sys.platform.startswith('java'): + return s + + return s.encode(get_subprocess_encoding(), 'ignore') + + +def decodeFilename(b, for_subprocess=False): + + if sys.version_info >= (3, 0): + return b + + if not isinstance(b, bytes): + return b + + return b.decode(get_subprocess_encoding(), 'ignore') + + +def encodeArgument(s): + if not isinstance(s, compat_str): + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + s = s.decode('ascii') + return encodeFilename(s, True) + + +def decodeArgument(b): + return decodeFilename(b, True) + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, compat_str) + return optval + + +def formatSeconds(secs, delim=':', msec=False): + if secs > 3600: + ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60) + elif secs > 60: + ret = '%d%s%02d' % (secs // 60, delim, secs % 60) + else: + ret = '%d' % secs + # Scale the fractional part of secs up to milliseconds for the %03d field + return '%s.%03d' % (ret, (secs % 1) * 1000) if msec else ret + + +def _ssl_load_windows_store_certs(ssl_context, storename): + # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py + try: + certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) + if encoding == 'x509_asn' and ( + trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)] + except PermissionError: + return + for cert in certs: + try: + ssl_context.load_verify_locations(cadata=cert) + except ssl.SSLError: + pass + + +def make_HTTPS_handler(params, **kwargs): + opts_check_certificate = not params.get('nocheckcertificate') + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context.check_hostname = opts_check_certificate + context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE + if opts_check_certificate: + try: + context.load_default_certs() + # Work around the issue in load_default_certs when there are bad certificates. See: + # https://github.com/yt-dlp/yt-dlp/issues/1060, + # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 + except ssl.SSLError: + # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 + if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): + # Create a new context to discard any certificates that were already loaded + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED + for storename in ('CA', 'ROOT'): + _ssl_load_windows_store_certs(context, storename) + context.set_default_verify_paths() + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + + +def bug_reports_message(before=';'): + if ytdl_is_updateable(): + update_cmd = 'type doas pacman -Sy hypervideo to update' + else: + update_cmd = 'see https://git.conocimientoslibres.ga/software/hypervideo.git/about/#how-do-i-update-hypervideo' + msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .' + msg += ' Make sure you are using the latest version; %s.' % update_cmd
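+ # The assembled message reads roughly: 'Please report this issue on + # https://github.com/yt-dlp/yt-dlp . Make sure you are using the latest + # version; ... Be sure to call yt-dlp with the --verbose flag ...' + # (exact wording depends on the install).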
+ msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.' + + before = before.rstrip() + if not before or before.endswith(('.', '!', '?')): + msg = msg[0].title() + msg[1:] + + return (before + ' ' if before else '') + msg + + +class YoutubeDLError(Exception): + """Base exception for YoutubeDL errors.""" + pass + + +network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] +if hasattr(ssl, 'CertificateError'): + network_exceptions.append(ssl.CertificateError) +network_exceptions = tuple(network_exceptions) + + +class ExtractorError(YoutubeDLError): + """Error during info extraction.""" + + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None): + """ tb, if given, is the original traceback (so that it can be printed out). + If expected is set, this is a normal error message and most likely not a bug in yt-dlp. + """ + if sys.exc_info()[0] in network_exceptions: + expected = True + + self.msg = str(msg) + self.traceback = tb + self.expected = expected + self.cause = cause + self.video_id = video_id + self.ie = ie + self.exc_info = sys.exc_info() # preserve original exception + + super(ExtractorError, self).__init__(''.join(( + format_field(ie, template='[%s] '), + format_field(video_id, template='%s: '), + self.msg, + format_field(cause, template=' (caused by %r)'), + '' if expected else bug_reports_message()))) + + def format_traceback(self): + if self.traceback is None: + return None + return ''.join(traceback.format_tb(self.traceback)) + + +class UnsupportedError(ExtractorError): + def __init__(self, url): + super(UnsupportedError, self).__init__( + 'Unsupported URL: %s' % url, expected=True) + self.url = url + + +class RegexNotFoundError(ExtractorError): + """Error when a regex didn't match""" + pass + + +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. + + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. + """ + + def __init__(self, msg, countries=None): + super(GeoRestrictedError, self).__init__(msg, expected=True) + self.msg = msg + self.countries = countries + + +class DownloadError(YoutubeDLError): + """Download Error exception. + + This exception may be thrown by FileDownloader objects if they are not + configured to continue on errors. They will contain the appropriate + error message. + """ + + def __init__(self, msg, exc_info=None): + """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ + super(DownloadError, self).__init__(msg) + self.exc_info = exc_info + + +class EntryNotInPlaylist(YoutubeDLError): + """Entry not in playlist exception. + + This exception will be thrown by YoutubeDL when a requested entry + is not found in the playlist info_dict + """ + pass + + +class SameFileError(YoutubeDLError): + """Same File exception. + + This exception will be thrown by FileDownloader objects if they detect + multiple files would have to be downloaded to the same file on disk. + """ + pass + + +class PostProcessingError(YoutubeDLError): + """Post Processing exception. + + This exception may be raised by PostProcessor's .run() method to + indicate an error in the postprocessing task. + """ + + def __init__(self, msg): + super(PostProcessingError, self).__init__(msg) + self.msg = msg + + +class ExistingVideoReached(YoutubeDLError): + """ --break-on-existing triggered """ + pass + + +class RejectedVideoReached(YoutubeDLError): + """ --break-on-reject triggered """ + pass + + +class ThrottledDownload(YoutubeDLError): + """ Download speed below --throttled-rate. """ + pass + + +class MaxDownloadsReached(YoutubeDLError): + """ --max-downloads limit has been reached. """ + pass + + +class UnavailableVideoError(YoutubeDLError): + """Unavailable Format exception. + + This exception will be thrown when a video is requested + in a format that is not available for that video. + """ + pass + + +class ContentTooShortError(YoutubeDLError): + """Content Too Short exception. + + This exception may be raised by FileDownloader objects when a file they + download is too small for what the server announced first, indicating + the connection was probably interrupted. + """ + + def __init__(self, downloaded, expected): + super(ContentTooShortError, self).__init__( + 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) + ) + # Both in bytes + self.downloaded = downloaded + self.expected = expected + + +class XAttrMetadataError(YoutubeDLError): + def __init__(self, code=None, msg='Unknown error'): + super(XAttrMetadataError, self).__init__(msg) + self.code = code + self.msg = msg + + # Parsing code and msg + if (self.code in (errno.ENOSPC, errno.EDQUOT) + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): + self.reason = 'NO_SPACE' + elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: + self.reason = 'VALUE_TOO_LONG' + else: + self.reason = 'NOT_SUPPORTED' + + +class XAttrUnavailableError(YoutubeDLError): + pass + + +def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting + # expected HTTP responses to meet HTTP/1.0 or later (see also + # https://github.com/ytdl-org/youtube-dl/issues/6727) + if sys.version_info < (3, 0): + kwargs['strict'] = True + hc = http_class(*args, **compat_kwargs(kwargs)) + source_address = ydl_handler._params.get('source_address') + + if source_address is not None: + # This is to work around _create_connection() from socket where it will try all + # address data from getaddrinfo() including IPv6. This filters the result from + # getaddrinfo() based on the source_address value. + # This is based on the cpython socket.create_connection() function. + # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 + def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + host, port = address + err = None + addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + af = socket.AF_INET if '.'
in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in addrs if addr[0] == af] + if addrs and not ip_addrs: + ip_version = 'v4' if af == socket.AF_INET else 'v6' + raise socket.error( + "No remote IP%s addresses available for connect, can't use '%s' as source address" + % (ip_version, source_address[0])) + for res in ip_addrs: + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + sock.bind(source_address) + sock.connect(sa) + err = None # Explicitly break reference cycle + return sock + except socket.error as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise socket.error('getaddrinfo returns an empty list') + if hasattr(hc, '_create_connection'): + hc._create_connection = _create_connection + sa = (source_address, 0) + if hasattr(hc, 'source_address'): # Python 2.7+ + hc.source_address = sa + else: # Python 2.6 + def _hc_connect(self, *args, **kwargs): + sock = _create_connection( + (self.host, self.port), self.timeout, sa) + if is_https: + self.sock = ssl.wrap_socket( + sock, self.key_file, self.cert_file, + ssl_version=ssl.PROTOCOL_TLSv1) + else: + self.sock = sock + hc.connect = functools.partial(_hc_connect, hc) + + return hc + + +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): + """Handler for HTTP requests and responses. + + This class, when installed with an OpenerDirector, automatically adds + the standard headers to every HTTP request and handles gzipped and + deflated responses from web servers. If compression is to be avoided in + a particular request, the original request in the program code only has + to include the HTTP header "Youtubedl-no-compression", which will be + removed before making the real request. + + Part of this code was copied from: + + http://techknack.net/python-urllib2-handlers/ + + Andrew Rowls, the author of that code, agreed to release it to the + public domain. + """ + + def __init__(self, params, *args, **kwargs): + compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs) + self._params = params + + def http_open(self, req): + conn_class = compat_http_client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + return self.do_open(functools.partial( + _create_http_connection, self, conn_class, False), + req) + + @staticmethod + def deflate(data): + if not data: + return data + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. 
http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req = update_Request(req, url=url_escaped) + + for h, v in std_headers.items(): + # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 + # The dict keys are capitalized because of this bug by urllib + if h.capitalize() not in req.headers: + req.add_header(h, v) + + req.headers = handle_youtubedl_headers(req.headers) + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + + return req + + def http_response(self, req, resp): + old_resp = resp + # gzip + if resp.headers.get('Content-encoding', '') == 'gzip': + content = resp.read() + gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') + try: + uncompressed = io.BytesIO(gz.read()) + except IOError as original_ioerror: + # There may be junk at the end of the file + # See http://stackoverflow.com/q/4928560/35070 for details + for i in range(1, 1024): + try: + gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') + uncompressed = io.BytesIO(gz.read()) + except IOError: + continue + break + else: + raise original_ioerror + resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + del resp.headers['Content-encoding'] + # deflate + if resp.headers.get('Content-encoding', '') == 'deflate': + gz = io.BytesIO(self.deflate(resp.read())) + resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + del resp.headers['Content-encoding'] + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/ytdl-org/youtube-dl/issues/6457).
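+ # For example, a raw 'Location: http://example.com/fü' header (an + # illustrative URL) would be re-encoded below to + # 'http://example.com/f%C3%BC' before the redirect is followed.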
+ if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + else: + location = location.decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + if sys.version_info < (3, 0): + location_escaped = location_escaped.encode('utf-8') + resp.headers['Location'] = location_escaped + return resp + + https_request = http_request + https_response = http_response + + +def make_socks_conn_class(base_class, socks_proxy): + assert issubclass(base_class, ( + compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + + url_components = compat_urlparse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A + + def unquote_if_non_empty(s): + if not s: + return s + return compat_urllib_parse_unquote_plus(s) + + proxy_args = ( + socks_type, + url_components.hostname, url_components.port or 1080, + True, # Remote DNS + unquote_if_non_empty(url_components.username), + unquote_if_non_empty(url_components.password), + ) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(*proxy_args) + if type(self.timeout) in (int, float): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, compat_http_client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + +class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): + def __init__(self, params, https_conn_class=None, *args, **kwargs): + compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) + self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection + self._params = params + + def https_open(self, req): + kwargs = {} + conn_class = self._https_conn_class + + if hasattr(self, '_context'): # python > 2.6 + kwargs['context'] = self._context + if hasattr(self, '_check_hostname'): # python 3.x + kwargs['check_hostname'] = self._check_hostname + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + return self.do_open(functools.partial( + _create_http_connection, self, conn_class, True), + req, **kwargs) + + +class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ + _HTTPONLY_PREFIX = '#HttpOnly_' + _ENTRY_LEN = 7 + _HEADER = '''# Netscape HTTP Cookie File +# This file is generated by yt-dlp. Do not edit. + +''' + _CookieFileEntry = collections.namedtuple( + 'CookieFileEntry', + ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """ + Save cookies to a file. + + Most of the code is taken from CPython 3.8 and slightly adapted + to support cookie files with UTF-8 in both python 2 and 3. 
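+ + Each cookie is written as one TAB-separated Netscape-format line, + e.g. (illustrative values): + + .example.com	TRUE	/	FALSE	0	SID	abc123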
+ """ + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + + # Store session cookies with `expires` set to 0 instead of an empty + # string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + + with io.open(filename, 'w', encoding='utf-8') as f: + f.write(self._HEADER) + now = time.time() + for cookie in self: + if not ignore_discard and cookie.discard: + continue + if not ignore_expires and cookie.is_expired(now): + continue + if cookie.secure: + secure = 'TRUE' + else: + secure = 'FALSE' + if cookie.domain.startswith('.'): + initial_dot = 'TRUE' + else: + initial_dot = 'FALSE' + if cookie.expires is not None: + expires = compat_str(cookie.expires) + else: + expires = '' + if cookie.value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name = '' + value = cookie.name + else: + name = cookie.name + value = cookie.value + f.write( + '\t'.join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, value]) + '\n') + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + + def prepare_line(line): + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + # comments and empty lines are fine + if line.startswith('#') or not line.strip(): + return line + cookie_list = line.split('\t') + if len(cookie_list) != self._ENTRY_LEN: + raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list)) + cookie = self._CookieFileEntry(*cookie_list) + if cookie.expires_at and not cookie.expires_at.isdigit(): + raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) + return line + + cf = io.StringIO() + with io.open(filename, encoding='utf-8') as f: + for line in f: + try: + cf.write(prepare_line(line)) + except compat_cookiejar.LoadError as e: + write_string( + 'WARNING: skipping cookie file entry due to %s: %r\n' + % (e, line), sys.stderr) + continue + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + +class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + # Python 2 will choke on next HTTP request in row if there are non-ASCII + # characters in Set-Cookie HTTP header of last response (see + # https://github.com/ytdl-org/youtube-dl/issues/6769). 
+ # In order to at least prevent crashing we will percent encode Set-Cookie + # header before HTTPCookieProcessor starts processing it. + # if sys.version_info < (3, 0) and response.headers: + # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): + # set_cookie = response.headers.get(set_cookie_header) + # if set_cookie: + # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") + # if set_cookie != set_cookie_escaped: + # del response.headers[set_cookie_header] + # response.headers[set_cookie_header] = set_cookie_escaped + return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = compat_urllib_request.HTTPCookieProcessor.http_request + https_response = http_response + + +class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler solves two issues: + - ensures redirect URL is always unicode under python 2 + - introduces support for experimental HTTP response status code + 308 Permanent Redirect [2] used by some sites [3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 + 3. https://github.com/ytdl-org/youtube-dl/issues/28768 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise compat_HTTPError(req.full_url, code, msg, headers, fp) + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + if sys.version_info[0] < 3: + newurl = compat_str(newurl) + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + # NB: don't use dict comprehension for python 2.6 compatibility + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request( + newurl, headers=newheaders, origin_req_host=req.origin_req_host, + unverifiable=True) + + +def extract_timezone(date_str): + m = re.search( + r'''(?x) + ^.{8,}? 
# >=8 char non-TZ prefix, if present + (?P<tz>Z| # just the UTC Z, or + (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or + (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits + [ ]? # optional space + (?P<sign>\+|-) # +/- + (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm + $) + ''', date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + +def parse_iso8601(date_str, delimiter='T', timezone=None): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + date_str = re.sub(r'\.[0-9]+', '', date_str) + + if timezone is None: + timezone, date_str = extract_timezone(date_str) + + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + + +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + +def unified_strdate(date_str, day_first=True): + """Return a string with the date in the format YYYYMMDD""" + + if date_str is None: + return None + upload_date = None + # Replace commas + date_str = date_str.replace(',', ' ') + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) + + for expression in date_formats(day_first): + try: + upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') + except ValueError: + pass + if upload_date is None: + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + try: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + except ValueError: + pass + if upload_date is not None: + return compat_str(upload_date) + + +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = re.sub(r'[,|]', '', date_str) + + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + # Remove unrecognized timezones from ISO 8601 alike timestamps + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + if m: + date_str = date_str[:-len(m.group('tz'))] + + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + + for expression in date_formats(day_first): + try: + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple) + pm_delta * 3600 + + +def determine_ext(url, default_ext='unknown_video'): + if url is None or '.' 
not in url: + return default_ext + guess = url.partition('?')[0].rpartition('.')[2] + if re.match(r'^[A-Za-z0-9]+$', guess): + return guess + # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download + elif guess.rstrip('/') in KNOWN_EXTENSIONS: + return guess.rstrip('/') + else: + return default_ext + + +def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): + return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) + + +def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): + """ + Return a datetime object from a string in the format YYYYMMDD or + (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? + + format: string date format used to return datetime object from + precision: round the time portion of a datetime object. + auto|microsecond|second|minute|hour|day. + auto: round to the unit provided in date_str (if applicable). + """ + auto_precision = False + if precision == 'auto': + auto_precision = True + precision = 'microsecond' + today = datetime_round(datetime.datetime.now(), precision) + if date_str in ('now', 'today'): + return today + if date_str == 'yesterday': + return today - datetime.timedelta(days=1) + match = re.match( + r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?', + date_str) + if match is not None: + start_time = datetime_from_str(match.group('start'), precision, format) + time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1) + unit = match.group('unit') + if unit == 'month' or unit == 'year': + new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time) + unit = 'day' + else: + if unit == 'week': + unit = 'day' + time *= 7 + delta = datetime.timedelta(**{unit + 's': time}) + new_date = start_time + delta + if auto_precision: + return datetime_round(new_date, unit) + return new_date + + return datetime_round(datetime.datetime.strptime(date_str, format), precision) + + +def date_from_str(date_str, format='%Y%m%d'): + """ + Return a datetime object from a string in the format YYYYMMDD or + (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? 
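+ e.g. '20210101', 'now-1week' or 'today-14days' (illustrative inputs)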
+ + format: string date format used to return datetime object from + """ + return datetime_from_str(date_str, precision='microsecond', format=format).date() + + +def datetime_add_months(dt, months): + """Increment/Decrement a datetime object by months.""" + month = dt.month + months - 1 + year = dt.year + month // 12 + month = month % 12 + 1 + day = min(dt.day, calendar.monthrange(year, month)[1]) + return dt.replace(year, month, day) + + +def datetime_round(dt, precision='day'): + """ + Round a datetime object's time to a specific precision + """ + if precision == 'microsecond': + return dt + + unit_seconds = { + 'day': 86400, + 'hour': 3600, + 'minute': 60, + 'second': 1, + } + roundto = lambda x, n: ((x + n / 2) // n) * n + timestamp = calendar.timegm(dt.timetuple()) + return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision])) + + +def hyphenate_date(date_str): + """ + Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format""" + match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str) + if match is not None: + return '-'.join(match.groups()) + else: + return date_str + + +class DateRange(object): + """Represents a time interval between two dates""" + + def __init__(self, start=None, end=None): + """start and end must be strings in the format accepted by date""" + if start is not None: + self.start = date_from_str(start) + else: + self.start = datetime.datetime.min.date() + if end is not None: + self.end = date_from_str(end) + else: + self.end = datetime.datetime.max.date() + if self.start > self.end: + raise ValueError('Date range: "%s" , the start date must be before the end date' % self) + + @classmethod + def day(cls, day): + """Returns a range that only contains the given day""" + return cls(day, day) + + def __contains__(self, date): + """Check if the date is in the range""" + if not isinstance(date, datetime.date): + date = date_from_str(date) + return self.start <= date <= self.end + + def __str__(self): + return '%s - %s' % (self.start.isoformat(), self.end.isoformat()) + + +def platform_name(): + """ Returns the platform name as a compat_str """ + res = platform.platform() + if isinstance(res, bytes): + res = res.decode(preferredencoding()) + + assert isinstance(res, compat_str) + return res + + +def get_windows_version(): + ''' Get Windows version. None if it's not running on Windows ''' + if compat_os_name == 'nt': + return version_tuple(platform.win32_ver()[1]) + else: + return None + + +def _windows_write_string(s, out): + """ Returns True if the string was written using special methods, + False if it has yet to be written out.""" + # Adapted from http://stackoverflow.com/a/3259271/35070 + + import ctypes + import ctypes.wintypes + + WIN_OUTPUT_IDS = { + 1: -11, + 2: -12, + } + + try: + fileno = out.fileno() + except AttributeError: + # If the output stream doesn't have a fileno, it's virtual + return False + except io.UnsupportedOperation: + # Some strange Windows pseudo files? 
+ return False + if fileno not in WIN_OUTPUT_IDS: + return False + + GetStdHandle = compat_ctypes_WINFUNCTYPE( + ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( + ('GetStdHandle', ctypes.windll.kernel32)) + h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) + + WriteConsoleW = compat_ctypes_WINFUNCTYPE( + ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, + ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), + ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) + written = ctypes.wintypes.DWORD(0) + + GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) + FILE_TYPE_CHAR = 0x0002 + FILE_TYPE_REMOTE = 0x8000 + GetConsoleMode = compat_ctypes_WINFUNCTYPE( + ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, + ctypes.POINTER(ctypes.wintypes.DWORD))( + ('GetConsoleMode', ctypes.windll.kernel32)) + INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value + + def not_a_console(handle): + if handle == INVALID_HANDLE_VALUE or handle is None: + return True + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR + or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + + if not_a_console(h): + return False + + def next_nonbmp_pos(s): + try: + return next(i for i, c in enumerate(s) if ord(c) > 0xffff) + except StopIteration: + return len(s) + + while s: + count = min(next_nonbmp_pos(s), 1024) + + ret = WriteConsoleW( + h, s, count if count else 2, ctypes.byref(written), None) + if ret == 0: + raise OSError('Failed to write string') + if not count: # We just wrote a non-BMP character + assert written.value == 2 + s = s[1:] + else: + assert written.value > 0 + s = s[written.value:] + return True + + +def write_string(s, out=None, encoding=None): + if out is None: + out = sys.stderr + assert type(s) == compat_str + + if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'): + if _windows_write_string(s, out): + return + + if ('b' in getattr(out, 'mode', '') + or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr + byt = s.encode(encoding or preferredencoding(), 'ignore') + out.write(byt) + elif hasattr(out, 'buffer'): + enc = encoding or getattr(out, 'encoding', None) or preferredencoding() + byt = s.encode(enc, 'ignore') + out.buffer.write(byt) + else: + out.write(s) + out.flush() + + +def bytes_to_intlist(bs): + if not bs: + return [] + if isinstance(bs[0], int): # Python 3 + return list(bs) + else: + return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): + if not xs: + return b'' + return compat_struct_pack('%dB' % len(xs), *xs) + + +# Cross-platform file locking +if sys.platform == 'win32': + import ctypes.wintypes + import msvcrt + + class OVERLAPPED(ctypes.Structure): + _fields_ = [ + ('Internal', ctypes.wintypes.LPVOID), + ('InternalHigh', ctypes.wintypes.LPVOID), + ('Offset', ctypes.wintypes.DWORD), + ('OffsetHigh', ctypes.wintypes.DWORD), + ('hEvent', ctypes.wintypes.HANDLE), + ] + + kernel32 = ctypes.windll.kernel32 + LockFileEx = kernel32.LockFileEx + LockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwFlags + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + LockFileEx.restype = ctypes.wintypes.BOOL + UnlockFileEx = kernel32.UnlockFileEx + UnlockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwReserved + 
ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + UnlockFileEx.restype = ctypes.wintypes.BOOL + whole_low = 0xffffffff + whole_high = 0x7fffffff + + def _lock_file(f, exclusive): + overlapped = OVERLAPPED() + overlapped.Offset = 0 + overlapped.OffsetHigh = 0 + overlapped.hEvent = 0 + f._lock_file_overlapped_p = ctypes.pointer(overlapped) + handle = msvcrt.get_osfhandle(f.fileno()) + if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0, + whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Locking file failed: %r' % ctypes.FormatError()) + + def _unlock_file(f): + assert f._lock_file_overlapped_p + handle = msvcrt.get_osfhandle(f.fileno()) + if not UnlockFileEx(handle, 0, + whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) + +else: + # Some platforms, such as Jython, are missing fcntl + try: + import fcntl + + def _lock_file(f, exclusive): + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + + def _unlock_file(f): + fcntl.flock(f, fcntl.LOCK_UN) + except ImportError: + UNSUPPORTED_MSG = 'file locking is not supported on this platform' + + def _lock_file(f, exclusive): + raise IOError(UNSUPPORTED_MSG) + + def _unlock_file(f): + raise IOError(UNSUPPORTED_MSG) + + +class locked_file(object): + def __init__(self, filename, mode, encoding=None): + assert mode in ['r', 'a', 'w'] + self.f = io.open(filename, mode, encoding=encoding) + self.mode = mode + + def __enter__(self): + exclusive = self.mode != 'r' + try: + _lock_file(self.f, exclusive) + except IOError: + self.f.close() + raise + return self + + def __exit__(self, etype, value, traceback): + try: + _unlock_file(self.f) + finally: + self.f.close() + + def __iter__(self): + return iter(self.f) + + def write(self, *args): + return self.f.write(*args) + + def read(self, *args): + return self.f.read(*args) + + +def get_filesystem_encoding(): + encoding = sys.getfilesystemencoding() + return encoding if encoding is not None else 'utf-8' + + +def shell_quote(args): + quoted_args = [] + encoding = get_filesystem_encoding() + for a in args: + if isinstance(a, bytes): + # We may get a filename encoded with 'encodeFilename' + a = a.decode(encoding) + quoted_args.append(compat_shlex_quote(a)) + return ' '.join(quoted_args) + + +def smuggle_url(url, data): + """ Pass additional data in a URL for internal use.
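+ + For example (illustrative values), smuggle_url('http://example.com/v/1', + {'referer': 'http://example.com'}) appends the data as a JSON payload in + the URL fragment ('#__youtubedl_smuggle=...'), and unsmuggle_url() splits + it back into the original URL and dict.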
""" + + url, idata = unsmuggle_url(url, {}) + data.update(idata) + sdata = compat_urllib_parse_urlencode( + {'__youtubedl_smuggle': json.dumps(data)}) + return url + '#' + sdata + + +def unsmuggle_url(smug_url, default=None): + if '#__youtubedl_smuggle' not in smug_url: + return smug_url, default + url, _, sdata = smug_url.rpartition('#') + jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0] + data = json.loads(jsond) + return url, data + + +def format_bytes(bytes): + if bytes is None: + return 'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return '%.2f%s' % (converted, suffix) + + +def lookup_unit_table(unit_table, s): + units_re = '|'.join(re.escape(u) for u in unit_table) + m = re.match( + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) + if not m: + return None + num_str = m.group('num').replace(',', '.') + mult = unit_table[m.group('unit')] + return int(float(num_str) * mult) + + +def parse_filesize(s): + if s is None: + return None + + # The lower-case forms are of course incorrect and unofficial, + # but we support those too + _UNIT_TABLE = { + 'B': 1, + 'b': 1, + 'bytes': 1, + 'KiB': 1024, + 'KB': 1000, + 'kB': 1024, + 'Kb': 1000, + 'kb': 1000, + 'kilobytes': 1000, + 'kibibytes': 1024, + 'MiB': 1024 ** 2, + 'MB': 1000 ** 2, + 'mB': 1024 ** 2, + 'Mb': 1000 ** 2, + 'mb': 1000 ** 2, + 'megabytes': 1000 ** 2, + 'mebibytes': 1024 ** 2, + 'GiB': 1024 ** 3, + 'GB': 1000 ** 3, + 'gB': 1024 ** 3, + 'Gb': 1000 ** 3, + 'gb': 1000 ** 3, + 'gigabytes': 1000 ** 3, + 'gibibytes': 1024 ** 3, + 'TiB': 1024 ** 4, + 'TB': 1000 ** 4, + 'tB': 1024 ** 4, + 'Tb': 1000 ** 4, + 'tb': 1000 ** 4, + 'terabytes': 1000 ** 4, + 'tebibytes': 1024 ** 4, + 'PiB': 1024 ** 5, + 'PB': 1000 ** 5, + 'pB': 1024 ** 5, + 'Pb': 1000 ** 5, + 'pb': 1000 ** 5, + 'petabytes': 1000 ** 5, + 'pebibytes': 1024 ** 5, + 'EiB': 1024 ** 6, + 'EB': 1000 ** 6, + 'eB': 1024 ** 6, + 'Eb': 1000 ** 6, + 'eb': 1000 ** 6, + 'exabytes': 1000 ** 6, + 'exbibytes': 1024 ** 6, + 'ZiB': 1024 ** 7, + 'ZB': 1000 ** 7, + 'zB': 1024 ** 7, + 'Zb': 1000 ** 7, + 'zb': 1000 ** 7, + 'zettabytes': 1000 ** 7, + 'zebibytes': 1024 ** 7, + 'YiB': 1024 ** 8, + 'YB': 1000 ** 8, + 'yB': 1024 ** 8, + 'Yb': 1000 ** 8, + 'yb': 1000 ** 8, + 'yottabytes': 1000 ** 8, + 'yobibytes': 1024 ** 8, + } + + return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_count(s): + if s is None: + return None + + s = s.strip() + + if re.match(r'^[\d,.]+$', s): + return str_to_int(s) + + _UNIT_TABLE = { + 'k': 1000, + 'K': 1000, + 'm': 1000 ** 2, + 'M': 1000 ** 2, + 'kk': 1000 ** 2, + 'KK': 1000 ** 2, + } + + return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_resolution(s): + if s is None: + return {} + + mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) + if mobj: + return { + 'width': int(mobj.group('w')), + 'height': int(mobj.group('h')), + } + + mobj = re.search(r'\b(\d+)[pPiI]\b', s) + if mobj: + return {'height': int(mobj.group(1))} + + mobj = re.search(r'\b([48])[kK]\b', s) + if mobj: + return {'height': int(mobj.group(1)) * 540} + + return {} + + +def parse_bitrate(s): + if not isinstance(s, compat_str): + return + mobj = re.search(r'\b(\d+)\s*kbps', s) + if mobj: + return int(mobj.group(1)) + + +def month_by_name(name, lang='en'): + """ Return the number of a month by (locale-independently) English name """ + + 
month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
+
+    try:
+        return month_names.index(name) + 1
+    except ValueError:
+        return None
+
+
+def month_by_abbreviation(abbrev):
+    """ Return the number of a month by its (locale-independent) English abbreviation """
+
+    try:
+        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
+    except ValueError:
+        return None
+
+
+def fix_xml_ampersands(xml_str):
+    """Replace all the '&' by '&amp;' in XML"""
+    return re.sub(
+        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+        '&amp;',
+        xml_str)
+
+
+def setproctitle(title):
+    assert isinstance(title, compat_str)
+
+    # ctypes in Jython is not complete
+    # http://bugs.jython.org/issue2148
+    if sys.platform.startswith('java'):
+        return
+
+    try:
+        libc = ctypes.cdll.LoadLibrary('libc.so.6')
+    except OSError:
+        return
+    except TypeError:
+        # LoadLibrary in Windows Python 2.7.13 only expects
+        # a bytestring, but since unicode_literals turns
+        # every string into a unicode string, it fails.
+        return
+    title_bytes = title.encode('utf-8')
+    buf = ctypes.create_string_buffer(len(title_bytes))
+    buf.value = title_bytes
+    try:
+        libc.prctl(15, buf, 0, 0, 0)
+    except AttributeError:
+        return  # Strange libc, just skip this
+
+
+def remove_start(s, start):
+    return s[len(start):] if s is not None and s.startswith(start) else s
+
+
+def remove_end(s, end):
+    return s[:-len(end)] if s is not None and s.endswith(end) else s
+
+
+def remove_quotes(s):
+    if s is None or len(s) < 2:
+        return s
+    for quote in ('"', "'", ):
+        if s[0] == quote and s[-1] == quote:
+            return s[1:-1]
+    return s
+
+
+def get_domain(url):
+    domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
+    return domain.group('domain') if domain else None
+
+
+def url_basename(url):
+    path = compat_urlparse.urlparse(url).path
+    return path.strip('/').split('/')[-1]
+
+
+def base_url(url):
+    return re.match(r'https?://[^?#&]+/', url).group()
+
+
+def urljoin(base, path):
+    if isinstance(path, bytes):
+        path = path.decode('utf-8')
+    if not isinstance(path, compat_str) or not path:
+        return None
+    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
+        return path
+    if isinstance(base, bytes):
+        base = base.decode('utf-8')
+    if not isinstance(base, compat_str) or not re.match(
+            r'^(?:https?:)?//', base):
+        return None
+    return compat_urlparse.urljoin(base, path)
+
+
+class HEADRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'HEAD'
+
+
+class PUTRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
+def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
+    if get_attr:
+        if v is not None:
+            v = getattr(v, get_attr, None)
+    if v == '':
+        v = None
+    if v is None:
+        return default
+    try:
+        return int(v) * invscale // scale
+    except (ValueError, TypeError):
+        return default
+
+
+def str_or_none(v, default=None):
+    return default if v is None else compat_str(v)
+
+
+def str_to_int(int_str):
+    """ A more relaxed version of int_or_none """
+    if isinstance(int_str, compat_integer_types):
+        return int_str
+    elif isinstance(int_str, compat_str):
+        int_str = re.sub(r'[,\.\+]', '', int_str)
+        return int_or_none(int_str)
+
+
+def float_or_none(v, scale=1, invscale=1, default=None):
+    if v is None:
+        return default
+    try:
+        return float(v) * invscale / scale
+    except (ValueError, TypeError):
+        return default
+
+
+def bool_or_none(v, default=None):
+    return v if isinstance(v, bool) else default
+
+
+def strip_or_none(v, default=None):
+    return
v.strip() if isinstance(v, compat_str) else default + + +def url_or_none(url): + if not url or not isinstance(url, compat_str): + return None + url = url.strip() + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + + +def strftime_or_none(timestamp, date_format, default=None): + datetime_object = None + try: + if isinstance(timestamp, compat_numeric_types): # unix timestamp + datetime_object = datetime.datetime.utcfromtimestamp(timestamp) + elif isinstance(timestamp, compat_str): # assume YYYYMMDD + datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') + return datetime_object.strftime(date_format) + except (ValueError, TypeError, AttributeError): + return default + + +def parse_duration(s): + if not isinstance(s, compat_basestring): + return None + + s = s.strip() + + days, hours, mins, secs, ms = [None] * 5 + m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match( + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? + (?: + (?P<days>[0-9]+)\s*d(?:ays?)?\s* + )? + T)? + (?: + (?P<hours>[0-9]+)\s*h(?:ours?)?\s* + )? + (?: + (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s* + )? + (?: + (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* + )?Z?$''', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) + if m: + hours, mins = m.groups() + else: + return None + + duration = 0 + if secs: + duration += float(secs) + if mins: + duration += float(mins) * 60 + if hours: + duration += float(hours) * 60 * 60 + if days: + duration += float(days) * 24 * 60 * 60 + if ms: + duration += float(ms) + return duration + + +def prepend_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return ( + '{0}.{1}{2}'.format(name, ext, real_ext) + if not expected_real_ext or real_ext[1:] == expected_real_ext + else '{0}.{1}'.format(filename, ext)) + + +def replace_extension(filename, ext, expected_real_ext=None): + name, real_ext = os.path.splitext(filename) + return '{0}.{1}'.format( + name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, + ext) + + +def check_executable(exe, args=[]): + """ Checks if the given binary is installed somewhere in PATH, and returns its name. + args can be a list of arguments for a short output (like -version) """ + try: + process_communicate_or_kill(subprocess.Popen( + [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)) + except OSError: + return False + return exe + + +def get_exe_version(exe, args=['--version'], + version_re=None, unrecognized='present'): + """ Returns the version of the specified executable, + or False if the executable is not present """ + try: + # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers + # SIGTTOU if yt-dlp is run in the background. 
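+        # Illustrative behaviour of these helpers (version strings are
+        # hypothetical and depend on the installed binary):
+        #     detect_exe_version('ffmpeg version 4.4 Copyright ...') -> '4.4'
+        #     detect_exe_version('no version string here')           -> 'present'
+        #     get_exe_version('ffmpeg')  -> '4.4', or False if not installed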
+ # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 + out, _ = process_communicate_or_kill(subprocess.Popen( + [encodeArgument(exe)] + args, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT)) + except OSError: + return False + if isinstance(out, bytes): # Python 2.x + out = out.decode('ascii', 'ignore') + return detect_exe_version(out, version_re, unrecognized) + + +def detect_exe_version(output, version_re=None, unrecognized='present'): + assert isinstance(output, compat_str) + if version_re is None: + version_re = r'version\s+([-0-9._a-zA-Z]+)' + m = re.search(version_re, output) + if m: + return m.group(1) + else: + return unrecognized + + +class LazyList(collections.abc.Sequence): + ''' Lazy immutable list from an iterable + Note that slices of a LazyList are lists and not LazyList''' + + class IndexError(IndexError): + pass + + def __init__(self, iterable): + self.__iterable = iter(iterable) + self.__cache = [] + self.__reversed = False + + def __iter__(self): + if self.__reversed: + # We need to consume the entire iterable to iterate in reverse + yield from self.exhaust() + return + yield from self.__cache + for item in self.__iterable: + self.__cache.append(item) + yield item + + def __exhaust(self): + self.__cache.extend(self.__iterable) + return self.__cache + + def exhaust(self): + ''' Evaluate the entire iterable ''' + return self.__exhaust()[::-1 if self.__reversed else 1] + + @staticmethod + def __reverse_index(x): + return None if x is None else -(x + 1) + + def __getitem__(self, idx): + if isinstance(idx, slice): + if self.__reversed: + idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1)) + start, stop, step = idx.start, idx.stop, idx.step or 1 + elif isinstance(idx, int): + if self.__reversed: + idx = self.__reverse_index(idx) + start, stop, step = idx, idx, 0 + else: + raise TypeError('indices must be integers or slices') + if ((start or 0) < 0 or (stop or 0) < 0 + or (start is None and step < 0) + or (stop is None and step > 0)): + # We need to consume the entire iterable to be able to slice from the end + # Obviously, never use this with infinite iterables + self.__exhaust() + try: + return self.__cache[idx] + except IndexError as e: + raise self.IndexError(e) from e + n = max(start or 0, stop or 0) - len(self.__cache) + 1 + if n > 0: + self.__cache.extend(itertools.islice(self.__iterable, n)) + try: + return self.__cache[idx] + except IndexError as e: + raise self.IndexError(e) from e + + def __bool__(self): + try: + self[-1] if self.__reversed else self[0] + except self.IndexError: + return False + return True + + def __len__(self): + self.__exhaust() + return len(self.__cache) + + def reverse(self): + self.__reversed = not self.__reversed + return self + + def __repr__(self): + # repr and str should mimic a list. 
So we exhaust the iterable + return repr(self.exhaust()) + + def __str__(self): + return repr(self.exhaust()) + + +class PagedList: + def __len__(self): + # This is only useful for tests + return len(self.getslice()) + + def __init__(self, pagefunc, pagesize, use_cache=True): + self._pagefunc = pagefunc + self._pagesize = pagesize + self._use_cache = use_cache + self._cache = {} + + def getpage(self, pagenum): + page_results = self._cache.get(pagenum) or list(self._pagefunc(pagenum)) + if self._use_cache: + self._cache[pagenum] = page_results + return page_results + + def getslice(self, start=0, end=None): + return list(self._getslice(start, end)) + + def _getslice(self, start, end): + raise NotImplementedError('This method must be implemented by subclasses') + + def __getitem__(self, idx): + # NOTE: cache must be enabled if this is used + if not isinstance(idx, int) or idx < 0: + raise TypeError('indices must be non-negative integers') + entries = self.getslice(idx, idx + 1) + return entries[0] if entries else None + + +class OnDemandPagedList(PagedList): + def _getslice(self, start, end): + for pagenum in itertools.count(start // self._pagesize): + firstid = pagenum * self._pagesize + nextfirstid = pagenum * self._pagesize + self._pagesize + if start >= nextfirstid: + continue + + startv = ( + start % self._pagesize + if firstid <= start < nextfirstid + else 0) + endv = ( + ((end - 1) % self._pagesize) + 1 + if (end is not None and firstid <= end <= nextfirstid) + else None) + + page_results = self.getpage(pagenum) + if startv != 0 or endv is not None: + page_results = page_results[startv:endv] + yield from page_results + + # A little optimization - if current page is not "full", ie. does + # not contain page_size videos then we can assume that this page + # is the last one - there are no more ids on further pages - + # i.e. no need to query again. 
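+            # Worked example (illustrative): with pagesize=5 and start=7, the
+            # loop starts at pagenum=1 (firstid=5, nextfirstid=10) and startv=2.
+            # If that page yields only 3 raw entries, the slice keeps 1, and
+            # 1 + 2 < 5, so it is treated as the last page and we stop here.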
+ if len(page_results) + startv < self._pagesize: + break + + # If we got the whole page, but the next page is not interesting, + # break out early as well + if end == nextfirstid: + break + + +class InAdvancePagedList(PagedList): + def __init__(self, pagefunc, pagecount, pagesize): + self._pagecount = pagecount + PagedList.__init__(self, pagefunc, pagesize, True) + + def _getslice(self, start, end): + start_page = start // self._pagesize + end_page = ( + self._pagecount if end is None else (end // self._pagesize + 1)) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page_results = self.getpage(pagenum) + if skip_elems: + page_results = page_results[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page_results) < only_more: + only_more -= len(page_results) + else: + yield from page_results[:only_more] + break + yield from page_results + + +def uppercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\U[0-9a-fA-F]{8}', + lambda m: unicode_escape(m.group(0))[0], + s) + + +def lowercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') + return re.sub( + r'\\u[0-9a-fA-F]{4}', + lambda m: unicode_escape(m.group(0))[0], + s) + + +def escape_rfc3986(s): + """Escape non-ASCII characters as suggested by RFC 3986""" + if sys.version_info < (3, 0) and isinstance(s, compat_str): + s = s.encode('utf-8') + return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + + +def escape_url(url): + """Escape URL as suggested by RFC 3986""" + url_parsed = compat_urllib_parse_urlparse(url) + return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), + path=escape_rfc3986(url_parsed.path), + params=escape_rfc3986(url_parsed.params), + query=escape_rfc3986(url_parsed.query), + fragment=escape_rfc3986(url_parsed.fragment) + ).geturl() + + +def parse_qs(url): + return compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + +def read_batch_urls(batch_fd): + def fixup(url): + if not isinstance(url, compat_str): + url = url.decode('utf-8', 'replace') + BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff') + for bom in BOM_UTF8: + if url.startswith(bom): + url = url[len(bom):] + url = url.lstrip() + if not url or url.startswith(('#', ';', ']')): + return False + # "#" cannot be stripped out since it is part of the URI + # However, it can be safely stipped out if follwing a whitespace + return re.split(r'\s#', url, 1)[0].rstrip() + + with contextlib.closing(batch_fd) as fd: + return [url for url in map(fixup, fd) if url] + + +def urlencode_postdata(*args, **kargs): + return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') + + +def update_url_query(url, query): + if not query: + return url + parsed_url = compat_urlparse.urlparse(url) + qs = compat_parse_qs(parsed_url.query) + qs.update(query) + return compat_urlparse.urlunparse(parsed_url._replace( + query=compat_urllib_parse_urlencode(qs, True))) + + +def update_Request(req, url=None, data=None, headers={}, query={}): + req_headers = req.headers.copy() + req_headers.update(headers) + req_data = data or req.data + req_url = update_url_query(url or req.get_full_url(), query) + req_get_method = req.get_method() + if req_get_method == 'HEAD': + req_type = HEADRequest + elif req_get_method == 'PUT': + req_type = PUTRequest + else: + req_type = compat_urllib_request.Request + new_req = req_type( + req_url, data=req_data, headers=req_headers, + 
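+        # Illustrative: because new_req is rebuilt through req_type, HEAD/PUT
+        # verbs survive the update, e.g. (hypothetical URL)
+        #     update_Request(HEADRequest('https://example.com/a'),
+        #                    query={'x': '1'}).get_method() == 'HEAD'
+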
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + if hasattr(req, 'timeout'): + new_req.timeout = req.timeout + return new_req + + +def _multipart_encode_impl(data, boundary): + content_type = 'multipart/form-data; boundary=%s' % boundary + + out = b'' + for k, v in data.items(): + out += b'--' + boundary.encode('ascii') + b'\r\n' + if isinstance(k, compat_str): + k = k.encode('utf-8') + if isinstance(v, compat_str): + v = v.encode('utf-8') + # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 + # suggests sending UTF-8 directly. Firefox sends UTF-8, too + content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' + if boundary.encode('ascii') in content: + raise ValueError('Boundary overlaps with data') + out += content + + out += b'--' + boundary.encode('ascii') + b'--\r\n' + + return out, content_type + + +def multipart_encode(data, boundary=None): + ''' + Encode a dict to RFC 7578-compliant form-data + + data: + A dict where keys and values can be either Unicode or bytes-like + objects. + boundary: + If specified a Unicode object, it's used as the boundary. Otherwise + a random boundary is generated. + + Reference: https://tools.ietf.org/html/rfc7578 + ''' + has_specified_boundary = boundary is not None + + while True: + if boundary is None: + boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + + try: + out, content_type = _multipart_encode_impl(data, boundary) + break + except ValueError: + if has_specified_boundary: + raise + boundary = None + + return out, content_type + + +def dict_get(d, key_or_keys, default=None, skip_false_values=True): + if isinstance(key_or_keys, (list, tuple)): + for key in key_or_keys: + if key not in d or d[key] is None or skip_false_values and not d[key]: + continue + return d[key] + return default + return d.get(key_or_keys, default) + + +def try_get(src, getter, expected_type=None): + for get in variadic(getter): + try: + v = get(src) + except (AttributeError, KeyError, TypeError, IndexError): + pass + else: + if expected_type is None or isinstance(v, expected_type): + return v + + +def merge_dicts(*dicts): + merged = {} + for a_dict in dicts: + for k, v in a_dict.items(): + if v is None: + continue + if (k not in merged + or (isinstance(v, compat_str) and v + and isinstance(merged[k], compat_str) + and not merged[k])): + merged[k] = v + return merged + + +def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): + return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) + + +US_RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, +} + + +TV_PARENTAL_GUIDELINES = { + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, +} + + +def parse_age_limit(s): + if type(s) == int: + return s if 0 <= s <= 21 else None + if not isinstance(s, compat_basestring): + return None + m = re.match(r'^(?P<age>\d{1,2})\+?$', s) + if m: + return int(m.group('age')) + s = s.upper() + if s in US_RATINGS: + return US_RATINGS[s] + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) + if m: + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] + return None + + +def strip_jsonp(code): + return re.sub( + r'''(?sx)^ + (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) + (?:\s*&&\s*(?P=func_name))? + \s*\(\s*(?P<callback_data>.*)\);? 
+ \s*?(?://[^\n]*)*$''', + r'\g<callback_data>', code) + + +def js_to_json(code, vars={}): + # vars is a dict of var, val pairs to substitute + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' + SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + INTEGER_TABLE = ( + (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), + (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + ) + + def fix_kv(m): + v = m.group(0) + if v in ('true', 'false', 'null'): + return v + elif v in ('undefined', 'void 0'): + return 'null' + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': + return "" + + if v[0] in ("'", '"'): + v = re.sub(r'(?s)\\.|"', lambda m: { + '"': '\\"', + "\\'": "'", + '\\\n': '', + '\\x': '\\u00', + }.get(m.group(0), m.group(0)), v[1:-1]) + else: + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return '"%d":' % i if v.endswith(':') else '%d' % i + + if v in vars: + return vars[v] + + return '"%s"' % v + + return re.sub(r'''(?sx) + "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| + {comment}|,(?={skip}[\]}}])| + void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*| + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| + [0-9]+(?={skip}:)| + !+ + '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) + + +def qualities(quality_ids): + """ Get a numeric quality value out of a list of possible values """ + def q(qid): + try: + return quality_ids.index(qid) + except ValueError: + return -1 + return q + + +DEFAULT_OUTTMPL = { + 'default': '%(title)s [%(id)s].%(ext)s', + 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s', +} +OUTTMPL_TYPES = { + 'chapter': None, + 'subtitle': None, + 'thumbnail': None, + 'description': 'description', + 'annotation': 'annotations.xml', + 'infojson': 'info.json', + 'pl_thumbnail': None, + 'pl_description': 'description', + 'pl_infojson': 'info.json', +} + +# As of [1] format syntax is: +# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type +# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting +STR_FORMAT_RE_TMPL = r'''(?x) + (?<!%)(?P<prefix>(?:%%)*) + % + (?P<has_key>\((?P<key>{0})\))? + (?P<format> + (?P<conversion>[#0\-+ ]+)? + (?P<min_width>\d+)? + (?P<precision>\.\d+)? + (?P<len_mod>[hlL])? # unused in python + {1} # conversion type + ) +''' + + +STR_FORMAT_TYPES = 'diouxXeEfFgGcrs' + + +def limit_length(s, length): + """ Add ellipses to overly long strings """ + if s is None: + return None + ELLIPSES = '...' 
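+    # e.g. (illustrative): limit_length('abcdefghij', 5) == 'ab...'
+    #                      limit_length('short', 10)     == 'short'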
+ if len(s) > length: + return s[:length - len(ELLIPSES)] + ELLIPSES + return s + + +def version_tuple(v): + return tuple(int(e) for e in re.split(r'[-.]', v)) + + +def is_outdated_version(version, limit, assume_new=True): + if not version: + return not assume_new + try: + return version_tuple(version) < version_tuple(limit) + except ValueError: + return not assume_new + + +def ytdl_is_updateable(): + """ Returns if yt-dlp can be updated with -U """ + + from .update import is_non_updateable + + return not is_non_updateable() + + +def args_to_str(args): + # Get a short string representation for a subprocess command + return ' '.join(compat_shlex_quote(a) for a in args) + + +def error_to_compat_str(err): + err_str = str(err) + # On python 2 error byte string must be decoded with proper + # encoding rather than ascii + if sys.version_info[0] < 3: + err_str = err_str.decode(preferredencoding()) + return err_str + + +def mimetype2ext(mt): + if mt is None: + return None + + mt, _, params = mt.partition(';') + mt = mt.strip() + + FULL_MAP = { + 'audio/mp4': 'm4a', + # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as + # it's the most popular one + 'audio/mpeg': 'mp3', + 'audio/x-wav': 'wav', + 'audio/wav': 'wav', + 'audio/wave': 'wav', + } + + ext = FULL_MAP.get(mt) + if ext is not None: + return ext + + SUBTYPE_MAP = { + '3gpp': '3gp', + 'smptett+xml': 'tt', + 'ttaf+xml': 'dfxp', + 'ttml+xml': 'ttml', + 'x-flv': 'flv', + 'x-mp4-fragmented': 'mp4', + 'x-ms-sami': 'sami', + 'x-ms-wmv': 'wmv', + 'mpegurl': 'm3u8', + 'x-mpegurl': 'm3u8', + 'vnd.apple.mpegurl': 'm3u8', + 'dash+xml': 'mpd', + 'f4m+xml': 'f4m', + 'hds+xml': 'f4m', + 'vnd.ms-sstr+xml': 'ism', + 'quicktime': 'mov', + 'mp2t': 'ts', + 'x-wav': 'wav', + 'filmstrip+json': 'fs', + 'svg+xml': 'svg', + } + + _, _, subtype = mt.rpartition('/') + ext = SUBTYPE_MAP.get(subtype.lower()) + if ext is not None: + return ext + + SUFFIX_MAP = { + 'json': 'json', + 'xml': 'xml', + 'zip': 'zip', + 'gzip': 'gz', + } + + _, _, suffix = subtype.partition('+') + ext = SUFFIX_MAP.get(suffix) + if ext is not None: + return ext + + return subtype.replace('+', '.') + + +def parse_codecs(codecs_str): + # http://tools.ietf.org/html/rfc6381 + if not codecs_str: + return {} + split_codecs = list(filter(None, map( + str.strip, codecs_str.strip().strip(',').split(',')))) + vcodec, acodec = None, None + for full_codec in split_codecs: + codec = full_codec.split('.')[0] + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora', 'dvh1', 'dvhe'): + if not vcodec: + vcodec = full_codec + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): + if not acodec: + acodec = full_codec + else: + write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) + if not vcodec and not acodec: + if len(split_codecs) == 2: + return { + 'vcodec': split_codecs[0], + 'acodec': split_codecs[1], + } + else: + return { + 'vcodec': vcodec or 'none', + 'acodec': acodec or 'none', + } + return {} + + +def urlhandle_detect_ext(url_handle): + getheader = url_handle.headers.get + + cd = getheader('Content-Disposition') + if cd: + m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) + if m: + e = determine_ext(m.group('filename'), default_ext=None) + if e: + return e + + return mimetype2ext(getheader('Content-Type')) + + +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, 
base64.b64encode(data).decode('ascii')) + + +def age_restricted(content_limit, age_limit): + """ Returns True iff the content should be blocked """ + + if age_limit is None: # No limit set + return False + if content_limit is None: + return False # Content available for everyone + return age_limit < content_limit + + +def is_html(first_bytes): + """ Detect whether a file contains HTML by examining its first bytes. """ + + BOMS = [ + (b'\xef\xbb\xbf', 'utf-8'), + (b'\x00\x00\xfe\xff', 'utf-32-be'), + (b'\xff\xfe\x00\x00', 'utf-32-le'), + (b'\xff\xfe', 'utf-16-le'), + (b'\xfe\xff', 'utf-16-be'), + ] + for bom, enc in BOMS: + if first_bytes.startswith(bom): + s = first_bytes[len(bom):].decode(enc, 'replace') + break + else: + s = first_bytes.decode('utf-8', 'replace') + + return re.match(r'^\s*<', s) + + +def determine_protocol(info_dict): + protocol = info_dict.get('protocol') + if protocol is not None: + return protocol + + url = info_dict['url'] + if url.startswith('rtmp'): + return 'rtmp' + elif url.startswith('mms'): + return 'mms' + elif url.startswith('rtsp'): + return 'rtsp' + + ext = determine_ext(url) + if ext == 'm3u8': + return 'm3u8' + elif ext == 'f4m': + return 'f4m' + + return compat_urllib_parse_urlparse(url).scheme + + +def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False): + """ Render a list of rows, each as a list of values """ + + def get_max_lens(table): + return [max(len(compat_str(v)) for v in col) for col in zip(*table)] + + def filter_using_list(row, filterArray): + return [col for (take, col) in zip(filterArray, row) if take] + + if hideEmpty: + max_lens = get_max_lens(data) + header_row = filter_using_list(header_row, max_lens) + data = [filter_using_list(row, max_lens) for row in data] + + table = [header_row] + data + max_lens = get_max_lens(table) + if delim: + table = [header_row] + [['-' * ml for ml in max_lens]] + data + format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s' + return '\n'.join(format_str % tuple(row) for row in table) + + +def _match_one(filter_part, dct, incomplete): + # TODO: Generalize code with YoutubeDL._build_format_filter + STRING_OPERATORS = { + '*=': operator.contains, + '^=': lambda attr, value: attr.startswith(value), + '$=': lambda attr, value: attr.endswith(value), + '~=': lambda attr, value: re.search(value, attr), + } + COMPARISON_OPERATORS = { + **STRING_OPERATORS, + '<=': operator.le, # "<=" must be defined above "<" + '<': operator.lt, + '>=': operator.ge, + '>': operator.gt, + '=': operator.eq, + } + + operator_rex = re.compile(r'''(?x)\s* + (?P<key>[a-z_]+) + \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* + (?: + (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)| + (?P<strval>.+?) 
+        )
+        \s*$
+        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        m = m.groupdict()
+        unnegated_op = COMPARISON_OPERATORS[m['op']]
+        if m['negation']:
+            op = lambda attr, value: not unnegated_op(attr, value)
+        else:
+            op = unnegated_op
+        comparison_value = m['quotedstrval'] or m['strval']
+        if m['quote']:
+            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
+        actual_value = dct.get(m['key'])
+        numeric_comparison = None
+        if isinstance(actual_value, compat_numeric_types):
+            # If the original field is a string and the matching comparison value
+            # is a number, we should respect the origin of the original field
+            # and process the comparison value as a string (see
+            # https://github.com/ytdl-org/youtube-dl/issues/11082)
+            try:
+                numeric_comparison = int(comparison_value)
+            except ValueError:
+                numeric_comparison = parse_filesize(comparison_value)
+                if numeric_comparison is None:
+                    numeric_comparison = parse_filesize(f'{comparison_value}B')
+                if numeric_comparison is None:
+                    numeric_comparison = parse_duration(comparison_value)
+        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
+            raise ValueError('Operator %s only supports string values!' % m['op'])
+        if actual_value is None:
+            return incomplete or m['none_inclusive']
+        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
+
+    UNARY_OPERATORS = {
+        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
+        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<op>%s)\s*(?P<key>[a-z_]+)
+        \s*$
+        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = UNARY_OPERATORS[m.group('op')]
+        actual_value = dct.get(m.group('key'))
+        if incomplete and actual_value is None:
+            return True
+        return op(actual_value)
+
+    raise ValueError('Invalid filter part %r' % filter_part)
+
+
+def match_str(filter_str, dct, incomplete=False):
+    """ Filter a dictionary with a simple string syntax. Returns True (i.e. the
+    dictionary passes the filter) or False. When incomplete, all conditions
+    pass on missing fields.
+    """
+    return all(
+        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
+        for filter_part in re.split(r'(?<!\\)&', filter_str))
+
+
+def match_filter_func(filter_str):
+    def _match_func(info_dict, *args, **kwargs):
+        if match_str(filter_str, info_dict, *args, **kwargs):
+            return None
+        else:
+            video_title = info_dict.get('title', info_dict.get('id', 'video'))
+            return '%s does not pass filter %s, skipping ..'
% (video_title, filter_str) + return _match_func + + +def parse_dfxp_time_expr(time_expr): + if not time_expr: + return + + mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr) + if mobj: + return float(mobj.group('time_offset')) + + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) + if mobj: + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) + + +def srt_subtitles_timecode(seconds): + return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) + + +def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' + LEGACY_NAMESPACES = ( + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', + ]), + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', + ]), + ) + + SUPPORTED_STYLING = [ + 'color', + 'fontFamily', + 'fontSize', + 'fontStyle', + 'fontWeight', + 'textDecoration' + ] + + _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', + 'ttml': 'http://www.w3.org/ns/ttml', + 'tts': 'http://www.w3.org/ns/ttml#styling', + }) + + styles = {} + default_style = {} + + class TTMLPElementParser(object): + _out = '' + _unclosed_elements = [] + _applied_styles = [] + + def start(self, tag, attrib): + if tag in (_x('ttml:br'), 'br'): + self._out += '\n' + else: + unclosed_elements = [] + style = {} + element_style_id = attrib.get('style') + if default_style: + style.update(default_style) + if element_style_id: + style.update(styles.get(element_style_id, {})) + for prop in SUPPORTED_STYLING: + prop_val = attrib.get(_x('tts:' + prop)) + if prop_val: + style[prop] = prop_val + if style: + font = '' + for k, v in sorted(style.items()): + if self._applied_styles and self._applied_styles[-1].get(k) == v: + continue + if k == 'color': + font += ' color="%s"' % v + elif k == 'fontSize': + font += ' size="%s"' % v + elif k == 'fontFamily': + font += ' face="%s"' % v + elif k == 'fontWeight' and v == 'bold': + self._out += '<b>' + unclosed_elements.append('b') + elif k == 'fontStyle' and v == 'italic': + self._out += '<i>' + unclosed_elements.append('i') + elif k == 'textDecoration' and v == 'underline': + self._out += '<u>' + unclosed_elements.append('u') + if font: + self._out += '<font' + font + '>' + unclosed_elements.append('font') + applied_style = {} + if self._applied_styles: + applied_style.update(self._applied_styles[-1]) + applied_style.update(style) + self._applied_styles.append(applied_style) + self._unclosed_elements.append(unclosed_elements) + + def end(self, tag): + if tag not in (_x('ttml:br'), 'br'): + unclosed_elements = self._unclosed_elements.pop() + for element in reversed(unclosed_elements): + self._out += '</%s>' % element + if unclosed_elements and self._applied_styles: + self._applied_styles.pop() + + def data(self, data): + self._out += data + + def close(self): + return self._out.strip() + + def parse_node(node): + target = TTMLPElementParser() + parser = xml.etree.ElementTree.XMLParser(target=target) + parser.feed(xml.etree.ElementTree.tostring(node)) + return parser.close() + + for k, v in LEGACY_NAMESPACES: + for ns in v: + dfxp_data = dfxp_data.replace(ns, k) + + dfxp = compat_etree_fromstring(dfxp_data) + out = [] + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') + + if not paras: + 
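+        # (Neither the TTML-namespaced nor the plain <p> lookup above found any
+        # paragraph nodes, so this input is not usable DFXP/TTML.)
+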
raise ValueError('Invalid dfxp/TTML subtitle') + + repeat = False + while True: + for style in dfxp.findall(_x('.//ttml:style')): + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue + parent_style_id = style.get('style') + if parent_style_id: + if parent_style_id not in styles: + repeat = True + continue + styles[style_id] = styles[parent_style_id].copy() + for prop in SUPPORTED_STYLING: + prop_val = style.get(_x('tts:' + prop)) + if prop_val: + styles.setdefault(style_id, {})[prop] = prop_val + if repeat: + repeat = False + else: + break + + for p in ('body', 'div'): + ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) + if ele is None: + continue + style = styles.get(ele.get('style')) + if not style: + continue + default_style.update(style) + + for para, index in zip(paras, itertools.count(1)): + begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) + end_time = parse_dfxp_time_expr(para.attrib.get('end')) + dur = parse_dfxp_time_expr(para.attrib.get('dur')) + if begin_time is None: + continue + if not end_time: + if not dur: + continue + end_time = begin_time + dur + out.append('%d\n%s --> %s\n%s\n\n' % ( + index, + srt_subtitles_timecode(begin_time), + srt_subtitles_timecode(end_time), + parse_node(para))) + + return ''.join(out) + + +def cli_option(params, command_option, param): + param = params.get(param) + if param: + param = compat_str(param) + return [command_option, param] if param is not None else [] + + +def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): + param = params.get(param) + if param is None: + return [] + assert isinstance(param, bool) + if separator: + return [command_option + separator + (true_value if param else false_value)] + return [command_option, true_value if param else false_value] + + +def cli_valueless_option(params, command_option, param, expected_value=True): + param = params.get(param) + return [command_option] if param == expected_value else [] + + +def cli_configuration_args(argdict, keys, default=[], use_compat=True): + if isinstance(argdict, (list, tuple)): # for backward compatibility + if use_compat: + return argdict + else: + argdict = None + if argdict is None: + return default + assert isinstance(argdict, dict) + + assert isinstance(keys, (list, tuple)) + for key_list in keys: + arg_list = list(filter( + lambda x: x is not None, + [argdict.get(key.lower()) for key in variadic(key_list)])) + if arg_list: + return [arg for args in arg_list for arg in args] + return default + + +def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True): + main_key, exe = main_key.lower(), exe.lower() + root_key = exe if main_key == exe else f'{main_key}+{exe}' + keys = [f'{root_key}{k}' for k in (keys or [''])] + if root_key in keys: + if main_key != exe: + keys.append((main_key, exe)) + keys.append('default') + else: + use_compat = False + return cli_configuration_args(argdict, keys, default, use_compat) + + +class ISO639Utils(object): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 
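+        # Illustrative round-trips: ISO639Utils.short2long('fr') == 'fra' and
+        # ISO639Utils.long2short('fra') == 'fr'. Deprecated codes such as 'iw'
+        # (below) still map forward ('iw' -> 'heb') but reverse to the modern
+        # two-letter code.
+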
'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'iw': 'heb', # Replaced by he in 1989 revision + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'in': 'ind', # Replaced by id in 1989 revision + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'ji': 'yid', # Replaced by yi in 1989 revision + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + @classmethod + def short2long(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + @classmethod + def long2short(cls, code): + """Convert language code from ISO 639-2/T to ISO 639-1""" + for short_name, long_name in cls._lang_map.items(): + if long_name == code: + return short_name + + +class ISO3166Utils(object): + # From http://data.okfn.org/data/core/country-list + _country_map = { + 'AF': 'Afghanistan', + 'AX': 'Åland Islands', + 'AL': 'Albania', + 'DZ': 'Algeria', + 'AS': 'American Samoa', + 'AD': 'Andorra', + 'AO': 'Angola', + 'AI': 'Anguilla', + 'AQ': 'Antarctica', + 'AG': 'Antigua and Barbuda', + 'AR': 'Argentina', + 'AM': 'Armenia', + 'AW': 'Aruba', + 'AU': 'Australia', + 'AT': 'Austria', + 'AZ': 'Azerbaijan', + 'BS': 'Bahamas', + 'BH': 'Bahrain', + 'BD': 'Bangladesh', + 'BB': 'Barbados', + 'BY': 'Belarus', + 'BE': 'Belgium', + 'BZ': 'Belize', + 'BJ': 'Benin', + 'BM': 'Bermuda', + 'BT': 'Bhutan', + 'BO': 'Bolivia, Plurinational State of', + 'BQ': 'Bonaire, Sint Eustatius and 
Saba', + 'BA': 'Bosnia and Herzegovina', + 'BW': 'Botswana', + 'BV': 'Bouvet Island', + 'BR': 'Brazil', + 'IO': 'British Indian Ocean Territory', + 'BN': 'Brunei Darussalam', + 'BG': 'Bulgaria', + 'BF': 'Burkina Faso', + 'BI': 'Burundi', + 'KH': 'Cambodia', + 'CM': 'Cameroon', + 'CA': 'Canada', + 'CV': 'Cape Verde', + 'KY': 'Cayman Islands', + 'CF': 'Central African Republic', + 'TD': 'Chad', + 'CL': 'Chile', + 'CN': 'China', + 'CX': 'Christmas Island', + 'CC': 'Cocos (Keeling) Islands', + 'CO': 'Colombia', + 'KM': 'Comoros', + 'CG': 'Congo', + 'CD': 'Congo, the Democratic Republic of the', + 'CK': 'Cook Islands', + 'CR': 'Costa Rica', + 'CI': 'Côte d\'Ivoire', + 'HR': 'Croatia', + 'CU': 'Cuba', + 'CW': 'Curaçao', + 'CY': 'Cyprus', + 'CZ': 'Czech Republic', + 'DK': 'Denmark', + 'DJ': 'Djibouti', + 'DM': 'Dominica', + 'DO': 'Dominican Republic', + 'EC': 'Ecuador', + 'EG': 'Egypt', + 'SV': 'El Salvador', + 'GQ': 'Equatorial Guinea', + 'ER': 'Eritrea', + 'EE': 'Estonia', + 'ET': 'Ethiopia', + 'FK': 'Falkland Islands (Malvinas)', + 'FO': 'Faroe Islands', + 'FJ': 'Fiji', + 'FI': 'Finland', + 'FR': 'France', + 'GF': 'French Guiana', + 'PF': 'French Polynesia', + 'TF': 'French Southern Territories', + 'GA': 'Gabon', + 'GM': 'Gambia', + 'GE': 'Georgia', + 'DE': 'Germany', + 'GH': 'Ghana', + 'GI': 'Gibraltar', + 'GR': 'Greece', + 'GL': 'Greenland', + 'GD': 'Grenada', + 'GP': 'Guadeloupe', + 'GU': 'Guam', + 'GT': 'Guatemala', + 'GG': 'Guernsey', + 'GN': 'Guinea', + 'GW': 'Guinea-Bissau', + 'GY': 'Guyana', + 'HT': 'Haiti', + 'HM': 'Heard Island and McDonald Islands', + 'VA': 'Holy See (Vatican City State)', + 'HN': 'Honduras', + 'HK': 'Hong Kong', + 'HU': 'Hungary', + 'IS': 'Iceland', + 'IN': 'India', + 'ID': 'Indonesia', + 'IR': 'Iran, Islamic Republic of', + 'IQ': 'Iraq', + 'IE': 'Ireland', + 'IM': 'Isle of Man', + 'IL': 'Israel', + 'IT': 'Italy', + 'JM': 'Jamaica', + 'JP': 'Japan', + 'JE': 'Jersey', + 'JO': 'Jordan', + 'KZ': 'Kazakhstan', + 'KE': 'Kenya', + 'KI': 'Kiribati', + 'KP': 'Korea, Democratic People\'s Republic of', + 'KR': 'Korea, Republic of', + 'KW': 'Kuwait', + 'KG': 'Kyrgyzstan', + 'LA': 'Lao People\'s Democratic Republic', + 'LV': 'Latvia', + 'LB': 'Lebanon', + 'LS': 'Lesotho', + 'LR': 'Liberia', + 'LY': 'Libya', + 'LI': 'Liechtenstein', + 'LT': 'Lithuania', + 'LU': 'Luxembourg', + 'MO': 'Macao', + 'MK': 'Macedonia, the Former Yugoslav Republic of', + 'MG': 'Madagascar', + 'MW': 'Malawi', + 'MY': 'Malaysia', + 'MV': 'Maldives', + 'ML': 'Mali', + 'MT': 'Malta', + 'MH': 'Marshall Islands', + 'MQ': 'Martinique', + 'MR': 'Mauritania', + 'MU': 'Mauritius', + 'YT': 'Mayotte', + 'MX': 'Mexico', + 'FM': 'Micronesia, Federated States of', + 'MD': 'Moldova, Republic of', + 'MC': 'Monaco', + 'MN': 'Mongolia', + 'ME': 'Montenegro', + 'MS': 'Montserrat', + 'MA': 'Morocco', + 'MZ': 'Mozambique', + 'MM': 'Myanmar', + 'NA': 'Namibia', + 'NR': 'Nauru', + 'NP': 'Nepal', + 'NL': 'Netherlands', + 'NC': 'New Caledonia', + 'NZ': 'New Zealand', + 'NI': 'Nicaragua', + 'NE': 'Niger', + 'NG': 'Nigeria', + 'NU': 'Niue', + 'NF': 'Norfolk Island', + 'MP': 'Northern Mariana Islands', + 'NO': 'Norway', + 'OM': 'Oman', + 'PK': 'Pakistan', + 'PW': 'Palau', + 'PS': 'Palestine, State of', + 'PA': 'Panama', + 'PG': 'Papua New Guinea', + 'PY': 'Paraguay', + 'PE': 'Peru', + 'PH': 'Philippines', + 'PN': 'Pitcairn', + 'PL': 'Poland', + 'PT': 'Portugal', + 'PR': 'Puerto Rico', + 'QA': 'Qatar', + 'RE': 'Réunion', + 'RO': 'Romania', + 'RU': 'Russian Federation', + 'RW': 'Rwanda', + 'BL': 'Saint Barthélemy', + 'SH': 'Saint 
Helena, Ascension and Tristan da Cunha', + 'KN': 'Saint Kitts and Nevis', + 'LC': 'Saint Lucia', + 'MF': 'Saint Martin (French part)', + 'PM': 'Saint Pierre and Miquelon', + 'VC': 'Saint Vincent and the Grenadines', + 'WS': 'Samoa', + 'SM': 'San Marino', + 'ST': 'Sao Tome and Principe', + 'SA': 'Saudi Arabia', + 'SN': 'Senegal', + 'RS': 'Serbia', + 'SC': 'Seychelles', + 'SL': 'Sierra Leone', + 'SG': 'Singapore', + 'SX': 'Sint Maarten (Dutch part)', + 'SK': 'Slovakia', + 'SI': 'Slovenia', + 'SB': 'Solomon Islands', + 'SO': 'Somalia', + 'ZA': 'South Africa', + 'GS': 'South Georgia and the South Sandwich Islands', + 'SS': 'South Sudan', + 'ES': 'Spain', + 'LK': 'Sri Lanka', + 'SD': 'Sudan', + 'SR': 'Suriname', + 'SJ': 'Svalbard and Jan Mayen', + 'SZ': 'Swaziland', + 'SE': 'Sweden', + 'CH': 'Switzerland', + 'SY': 'Syrian Arab Republic', + 'TW': 'Taiwan, Province of China', + 'TJ': 'Tajikistan', + 'TZ': 'Tanzania, United Republic of', + 'TH': 'Thailand', + 'TL': 'Timor-Leste', + 'TG': 'Togo', + 'TK': 'Tokelau', + 'TO': 'Tonga', + 'TT': 'Trinidad and Tobago', + 'TN': 'Tunisia', + 'TR': 'Turkey', + 'TM': 'Turkmenistan', + 'TC': 'Turks and Caicos Islands', + 'TV': 'Tuvalu', + 'UG': 'Uganda', + 'UA': 'Ukraine', + 'AE': 'United Arab Emirates', + 'GB': 'United Kingdom', + 'US': 'United States', + 'UM': 'United States Minor Outlying Islands', + 'UY': 'Uruguay', + 'UZ': 'Uzbekistan', + 'VU': 'Vanuatu', + 'VE': 'Venezuela, Bolivarian Republic of', + 'VN': 'Viet Nam', + 'VG': 'Virgin Islands, British', + 'VI': 'Virgin Islands, U.S.', + 'WF': 'Wallis and Futuna', + 'EH': 'Western Sahara', + 'YE': 'Yemen', + 'ZM': 'Zambia', + 'ZW': 'Zimbabwe', + } + + @classmethod + def short2full(cls, code): + """Convert an ISO 3166-2 country code to the corresponding full name""" + return cls._country_map.get(code.upper()) + + +class GeoUtils(object): + # Major IPv4 address blocks per country + _country_ip_map = { + 'AD': '46.172.224.0/19', + 'AE': '94.200.0.0/13', + 'AF': '149.54.0.0/17', + 'AG': '209.59.64.0/18', + 'AI': '204.14.248.0/21', + 'AL': '46.99.0.0/16', + 'AM': '46.70.0.0/15', + 'AO': '105.168.0.0/13', + 'AP': '182.50.184.0/21', + 'AQ': '23.154.160.0/24', + 'AR': '181.0.0.0/12', + 'AS': '202.70.112.0/20', + 'AT': '77.116.0.0/14', + 'AU': '1.128.0.0/11', + 'AW': '181.41.0.0/18', + 'AX': '185.217.4.0/22', + 'AZ': '5.197.0.0/16', + 'BA': '31.176.128.0/17', + 'BB': '65.48.128.0/17', + 'BD': '114.130.0.0/16', + 'BE': '57.0.0.0/8', + 'BF': '102.178.0.0/15', + 'BG': '95.42.0.0/15', + 'BH': '37.131.0.0/17', + 'BI': '154.117.192.0/18', + 'BJ': '137.255.0.0/16', + 'BL': '185.212.72.0/23', + 'BM': '196.12.64.0/18', + 'BN': '156.31.0.0/16', + 'BO': '161.56.0.0/16', + 'BQ': '161.0.80.0/20', + 'BR': '191.128.0.0/12', + 'BS': '24.51.64.0/18', + 'BT': '119.2.96.0/19', + 'BW': '168.167.0.0/16', + 'BY': '178.120.0.0/13', + 'BZ': '179.42.192.0/18', + 'CA': '99.224.0.0/11', + 'CD': '41.243.0.0/16', + 'CF': '197.242.176.0/21', + 'CG': '160.113.0.0/16', + 'CH': '85.0.0.0/13', + 'CI': '102.136.0.0/14', + 'CK': '202.65.32.0/19', + 'CL': '152.172.0.0/14', + 'CM': '102.244.0.0/14', + 'CN': '36.128.0.0/10', + 'CO': '181.240.0.0/12', + 'CR': '201.192.0.0/12', + 'CU': '152.206.0.0/15', + 'CV': '165.90.96.0/19', + 'CW': '190.88.128.0/17', + 'CY': '31.153.0.0/16', + 'CZ': '88.100.0.0/14', + 'DE': '53.0.0.0/8', + 'DJ': '197.241.0.0/17', + 'DK': '87.48.0.0/12', + 'DM': '192.243.48.0/20', + 'DO': '152.166.0.0/15', + 'DZ': '41.96.0.0/12', + 'EC': '186.68.0.0/15', + 'EE': '90.190.0.0/15', + 'EG': '156.160.0.0/11', + 'ER': '196.200.96.0/20', + 
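+        # Illustrative use of this table via random_ipv4() below:
+        #     GeoUtils.random_ipv4('ES')          -> e.g. '88.13.37.1'
+        #                                            (a random address in 88.0.0.0/11)
+        #     GeoUtils.random_ipv4('10.0.0.0/8')  -> a random address in that block
+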
'ES': '88.0.0.0/11', + 'ET': '196.188.0.0/14', + 'EU': '2.16.0.0/13', + 'FI': '91.152.0.0/13', + 'FJ': '144.120.0.0/16', + 'FK': '80.73.208.0/21', + 'FM': '119.252.112.0/20', + 'FO': '88.85.32.0/19', + 'FR': '90.0.0.0/9', + 'GA': '41.158.0.0/15', + 'GB': '25.0.0.0/8', + 'GD': '74.122.88.0/21', + 'GE': '31.146.0.0/16', + 'GF': '161.22.64.0/18', + 'GG': '62.68.160.0/19', + 'GH': '154.160.0.0/12', + 'GI': '95.164.0.0/16', + 'GL': '88.83.0.0/19', + 'GM': '160.182.0.0/15', + 'GN': '197.149.192.0/18', + 'GP': '104.250.0.0/19', + 'GQ': '105.235.224.0/20', + 'GR': '94.64.0.0/13', + 'GT': '168.234.0.0/16', + 'GU': '168.123.0.0/16', + 'GW': '197.214.80.0/20', + 'GY': '181.41.64.0/18', + 'HK': '113.252.0.0/14', + 'HN': '181.210.0.0/16', + 'HR': '93.136.0.0/13', + 'HT': '148.102.128.0/17', + 'HU': '84.0.0.0/14', + 'ID': '39.192.0.0/10', + 'IE': '87.32.0.0/12', + 'IL': '79.176.0.0/13', + 'IM': '5.62.80.0/20', + 'IN': '117.192.0.0/10', + 'IO': '203.83.48.0/21', + 'IQ': '37.236.0.0/14', + 'IR': '2.176.0.0/12', + 'IS': '82.221.0.0/16', + 'IT': '79.0.0.0/10', + 'JE': '87.244.64.0/18', + 'JM': '72.27.0.0/17', + 'JO': '176.29.0.0/16', + 'JP': '133.0.0.0/8', + 'KE': '105.48.0.0/12', + 'KG': '158.181.128.0/17', + 'KH': '36.37.128.0/17', + 'KI': '103.25.140.0/22', + 'KM': '197.255.224.0/20', + 'KN': '198.167.192.0/19', + 'KP': '175.45.176.0/22', + 'KR': '175.192.0.0/10', + 'KW': '37.36.0.0/14', + 'KY': '64.96.0.0/15', + 'KZ': '2.72.0.0/13', + 'LA': '115.84.64.0/18', + 'LB': '178.135.0.0/16', + 'LC': '24.92.144.0/20', + 'LI': '82.117.0.0/19', + 'LK': '112.134.0.0/15', + 'LR': '102.183.0.0/16', + 'LS': '129.232.0.0/17', + 'LT': '78.56.0.0/13', + 'LU': '188.42.0.0/16', + 'LV': '46.109.0.0/16', + 'LY': '41.252.0.0/14', + 'MA': '105.128.0.0/11', + 'MC': '88.209.64.0/18', + 'MD': '37.246.0.0/16', + 'ME': '178.175.0.0/17', + 'MF': '74.112.232.0/21', + 'MG': '154.126.0.0/17', + 'MH': '117.103.88.0/21', + 'MK': '77.28.0.0/15', + 'ML': '154.118.128.0/18', + 'MM': '37.111.0.0/17', + 'MN': '49.0.128.0/17', + 'MO': '60.246.0.0/16', + 'MP': '202.88.64.0/20', + 'MQ': '109.203.224.0/19', + 'MR': '41.188.64.0/18', + 'MS': '208.90.112.0/22', + 'MT': '46.11.0.0/16', + 'MU': '105.16.0.0/12', + 'MV': '27.114.128.0/18', + 'MW': '102.70.0.0/15', + 'MX': '187.192.0.0/11', + 'MY': '175.136.0.0/13', + 'MZ': '197.218.0.0/15', + 'NA': '41.182.0.0/16', + 'NC': '101.101.0.0/18', + 'NE': '197.214.0.0/18', + 'NF': '203.17.240.0/22', + 'NG': '105.112.0.0/12', + 'NI': '186.76.0.0/15', + 'NL': '145.96.0.0/11', + 'NO': '84.208.0.0/13', + 'NP': '36.252.0.0/15', + 'NR': '203.98.224.0/19', + 'NU': '49.156.48.0/22', + 'NZ': '49.224.0.0/14', + 'OM': '5.36.0.0/15', + 'PA': '186.72.0.0/15', + 'PE': '186.160.0.0/14', + 'PF': '123.50.64.0/18', + 'PG': '124.240.192.0/19', + 'PH': '49.144.0.0/13', + 'PK': '39.32.0.0/11', + 'PL': '83.0.0.0/11', + 'PM': '70.36.0.0/20', + 'PR': '66.50.0.0/16', + 'PS': '188.161.0.0/16', + 'PT': '85.240.0.0/13', + 'PW': '202.124.224.0/20', + 'PY': '181.120.0.0/14', + 'QA': '37.210.0.0/15', + 'RE': '102.35.0.0/16', + 'RO': '79.112.0.0/13', + 'RS': '93.86.0.0/15', + 'RU': '5.136.0.0/13', + 'RW': '41.186.0.0/16', + 'SA': '188.48.0.0/13', + 'SB': '202.1.160.0/19', + 'SC': '154.192.0.0/11', + 'SD': '102.120.0.0/13', + 'SE': '78.64.0.0/12', + 'SG': '8.128.0.0/10', + 'SI': '188.196.0.0/14', + 'SK': '78.98.0.0/15', + 'SL': '102.143.0.0/17', + 'SM': '89.186.32.0/19', + 'SN': '41.82.0.0/15', + 'SO': '154.115.192.0/18', + 'SR': '186.179.128.0/17', + 'SS': '105.235.208.0/21', + 'ST': '197.159.160.0/19', + 'SV': '168.243.0.0/16', + 'SX': 
'190.102.0.0/20', + 'SY': '5.0.0.0/16', + 'SZ': '41.84.224.0/19', + 'TC': '65.255.48.0/20', + 'TD': '154.68.128.0/19', + 'TG': '196.168.0.0/14', + 'TH': '171.96.0.0/13', + 'TJ': '85.9.128.0/18', + 'TK': '27.96.24.0/21', + 'TL': '180.189.160.0/20', + 'TM': '95.85.96.0/19', + 'TN': '197.0.0.0/11', + 'TO': '175.176.144.0/21', + 'TR': '78.160.0.0/11', + 'TT': '186.44.0.0/15', + 'TV': '202.2.96.0/19', + 'TW': '120.96.0.0/11', + 'TZ': '156.156.0.0/14', + 'UA': '37.52.0.0/14', + 'UG': '102.80.0.0/13', + 'US': '6.0.0.0/8', + 'UY': '167.56.0.0/13', + 'UZ': '84.54.64.0/18', + 'VA': '212.77.0.0/19', + 'VC': '207.191.240.0/21', + 'VE': '186.88.0.0/13', + 'VG': '66.81.192.0/20', + 'VI': '146.226.0.0/16', + 'VN': '14.160.0.0/11', + 'VU': '202.80.32.0/20', + 'WF': '117.20.32.0/21', + 'WS': '202.4.32.0/19', + 'YE': '134.35.0.0/16', + 'YT': '41.242.116.0/22', + 'ZA': '41.0.0.0/11', + 'ZM': '102.144.0.0/13', + 'ZW': '102.177.192.0/18', + } + + @classmethod + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block + addr, preflen = block.split('/') + addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] + addr_max = addr_min | (0xffffffff >> int(preflen)) + return compat_str(socket.inet_ntoa( + compat_struct_pack('!L', random.randint(addr_min, addr_max)))) + + +class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): + def __init__(self, proxies=None): + # Set default handlers + for type in ('http', 'https'): + setattr(self, '%s_open' % type, + lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: + meth(r, proxy, type)) + compat_urllib_request.ProxyHandler.__init__(self, proxies) + + def proxy_open(self, req, proxy, type): + req_proxy = req.headers.get('Ytdl-request-proxy') + if req_proxy is not None: + proxy = req_proxy + del req.headers['Ytdl-request-proxy'] + + if proxy == '__noproxy__': + return None # No Proxy + if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + req.add_header('Ytdl-socks-proxy', proxy) + # yt-dlp's http/https handlers do wrapping the socket with socks + return None + return compat_urllib_request.ProxyHandler.proxy_open( + self, req, proxy, type) + + +# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is +# released into Public Domain +# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 + +def long_to_bytes(n, blocksize=0): + """long_to_bytes(n:long, blocksize:int) : string + Convert a long integer to a byte string. + + If optional blocksize is given and greater than zero, pad the front of the + byte string with binary zeros so that the length is a multiple of + blocksize. + """ + # after much testing, this algorithm was deemed to be the fastest + s = b'' + n = int(n) + while n > 0: + s = compat_struct_pack('>I', n & 0xffffffff) + s + n = n >> 32 + # strip off leading zeros + for i in range(len(s)): + if s[i] != b'\000'[0]: + break + else: + # only happens when n == 0 + s = b'\000' + i = 0 + s = s[i:] + # add back some pad bytes. this could be done more efficiently w.r.t. the + # de-padding being done above, but sigh... + if blocksize > 0 and len(s) % blocksize: + s = (blocksize - len(s) % blocksize) * b'\000' + s + return s + + +def bytes_to_long(s): + """bytes_to_long(string) : long + Convert a byte string to a long integer. + + This is (essentially) the inverse of long_to_bytes(). 
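+
+    Illustrative values:
+        bytes_to_long(b'\x00\x01') == 1
+        bytes_to_long(b'\x01\x00') == 256
+    and, in the other direction, long_to_bytes(256, blocksize=2) == b'\x01\x00'.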
+ """ + acc = 0 + length = len(s) + if length % 4: + extra = (4 - length % 4) + s = b'\000' * extra + s + length = length + extra + for i in range(0, length, 4): + acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] + return acc + + +def ohdave_rsa_encrypt(data, exponent, modulus): + ''' + Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ + + Input: + data: data to encrypt, bytes-like object + exponent, modulus: parameter e and N of RSA algorithm, both integer + Output: hex string of encrypted data + + Limitation: supports one block encryption only + ''' + + payload = int(binascii.hexlify(data[::-1]), 16) + encrypted = pow(payload, exponent, modulus) + return '%x' % encrypted + + +def pkcs1pad(data, length): + """ + Padding input data with PKCS#1 scheme + + @param {int[]} data input data + @param {int} length target length + @returns {int[]} padded data + """ + if len(data) > length - 11: + raise ValueError('Input data too long for PKCS#1 padding') + + pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] + return [0, 2] + pseudo_random + [0] + data + + +def encode_base_n(num, n, table=None): + FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + if not table: + table = FULL_TABLE[:n] + + if n > len(table): + raise ValueError('base %d exceeds table length %d' % (n, len(table))) + + if num == 0: + return table[0] + + ret = '' + while num: + ret = table[num % n] + ret + num = num // n + return ret + + +def decode_packed_codes(code): + mobj = re.search(PACKED_CODES_RE, code) + obfuscated_code, base, count, symbols = mobj.groups() + base = int(base) + count = int(count) + symbols = symbols.split('|') + symbol_table = {} + + while count: + count -= 1 + base_n_count = encode_base_n(count, base) + symbol_table[base_n_count] = symbols[count] or base_n_count + + return re.sub( + r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], + obfuscated_code) + + +def caesar(s, alphabet, shift): + if shift == 0: + return s + l = len(alphabet) + return ''.join( + alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c + for c in s) + + +def rot47(s): + return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) + + +def parse_m3u8_attributes(attrib): + info = {} + for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): + if val.startswith('"'): + val = val[1:-1] + info[key] = val + return info + + +def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n + + +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise IOError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] 
+def write_xattr(path, key, value):
+    # This mess below finds the best xattr tool for the job
+    try:
+        # try the pyxattr module...
+        import xattr
+
+        if hasattr(xattr, 'set'):  # pyxattr
+            # Unicode arguments are not supported in python-pyxattr until
+            # version 0.5.0
+            # See https://github.com/ytdl-org/youtube-dl/issues/5498
+            pyxattr_required_version = '0.5.0'
+            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
+                # TODO: fallback to CLI tools
+                raise XAttrUnavailableError(
+                    'python-pyxattr is detected but is too old. '
+                    'yt-dlp requires %s or above while your version is %s. '
+                    'Falling back to other xattr implementations' % (
+                        pyxattr_required_version, xattr.__version__))
+
+            setxattr = xattr.set
+        else:  # xattr
+            setxattr = xattr.setxattr
+
+        try:
+            setxattr(path, key, value)
+        except EnvironmentError as e:
+            raise XAttrMetadataError(e.errno, e.strerror)
+
+    except ImportError:
+        if compat_os_name == 'nt':
+            # Write xattrs to NTFS Alternate Data Streams:
+            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
+            assert ':' not in key
+            assert os.path.exists(path)
+
+            ads_fn = path + ':' + key
+            try:
+                with open(ads_fn, 'wb') as f:
+                    f.write(value)
+            except EnvironmentError as e:
+                raise XAttrMetadataError(e.errno, e.strerror)
+        else:
+            user_has_setfattr = check_executable('setfattr', ['--version'])
+            user_has_xattr = check_executable('xattr', ['-h'])
+
+            if user_has_setfattr or user_has_xattr:
+
+                value = value.decode('utf-8')
+                if user_has_setfattr:
+                    executable = 'setfattr'
+                    opts = ['-n', key, '-v', value]
+                elif user_has_xattr:
+                    executable = 'xattr'
+                    opts = ['-w', key, value]
+
+                cmd = ([encodeFilename(executable, True)]
+                       + [encodeArgument(o) for o in opts]
+                       + [encodeFilename(path, True)])
+
+                try:
+                    p = subprocess.Popen(
+                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+                except EnvironmentError as e:
+                    raise XAttrMetadataError(e.errno, e.strerror)
+                stdout, stderr = process_communicate_or_kill(p)
+                stderr = stderr.decode('utf-8', 'replace')
+                if p.returncode != 0:
+                    raise XAttrMetadataError(p.returncode, stderr)
+
+            else:
+                # On Unix, but couldn't find pyxattr, setfattr, or xattr.
+                if sys.platform.startswith('linux'):
+                    raise XAttrUnavailableError(
+                        "Couldn't find a tool to set the xattrs. "
+                        "Install either the python 'pyxattr' or 'xattr' "
+                        "modules, or the GNU 'attr' package "
+                        "(which contains the 'setfattr' tool).")
+                else:
+                    raise XAttrUnavailableError(
+                        "Couldn't find a tool to set the xattrs. "
+                        "Install either the python 'xattr' module, "
+                        "or the 'xattr' binary.")
+
+
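[Editor's note, not part of the patch: typical call shape, hypothetical key/file names. The value must be bytes; on Linux, user attributes live in the 'user.' namespace.]

    write_xattr('video.mp4', 'user.xdg.origin.url',
                'https://example.com/video'.encode('utf-8'))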
+def random_birthday(year_field, month_field, day_field):
+    start_date = datetime.date(1950, 1, 1)
+    end_date = datetime.date(1995, 12, 31)
+    offset = random.randint(0, (end_date - start_date).days)
+    random_date = start_date + datetime.timedelta(offset)
+    return {
+        year_field: str(random_date.year),
+        month_field: str(random_date.month),
+        day_field: str(random_date.day),
+    }
+
+
+# Templates for internet shortcut files, which are plain text files.
+DOT_URL_LINK_TEMPLATE = '''
+[InternetShortcut]
+URL=%(url)s
+'''.lstrip()
+
+DOT_WEBLOC_LINK_TEMPLATE = '''
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+\t<key>URL</key>
+\t<string>%(url)s</string>
+</dict>
+</plist>
+'''.lstrip()
+
+DOT_DESKTOP_LINK_TEMPLATE = '''
+[Desktop Entry]
+Encoding=UTF-8
+Name=%(filename)s
+Type=Link
+URL=%(url)s
+Icon=text-html
+'''.lstrip()
+
+
+def iri_to_uri(iri):
+    """
+    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
+
+    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
+    """
+
+    iri_parts = compat_urllib_parse_urlparse(iri)
+
+    if '[' in iri_parts.netloc:
+        raise ValueError('IPv6 URIs are not yet supported.')
+        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
+
+    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
+
+    net_location = ''
+    if iri_parts.username:
+        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
+        if iri_parts.password is not None:
+            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
+        net_location += '@'
+
+    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
+    # The 'idna' encoding produces ASCII text.
+    if iri_parts.port is not None and iri_parts.port != 80:
+        net_location += ':' + str(iri_parts.port)
+
+    return compat_urllib_parse_urlunparse(
+        (iri_parts.scheme,
+            net_location,
+
+            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
+
+            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
+            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
+
+            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
+            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
+
+            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
+
+    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
+
+
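[Editor's note, not part of the patch: a Unicode path is UTF-8 percent-encoded while the host goes through IDNA; already-encoded sequences are left untouched.]

    print(iri_to_uri('https://ru.wikipedia.org/wiki/Википедия'))
    # -> https://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F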
+def to_high_limit_path(path):
+    if sys.platform in ['win32', 'cygwin']:
+        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
+        return r'\\?\ '.rstrip() + os.path.abspath(path)
+
+    return path
+
+
+def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
+    if field is None:
+        val = obj if obj is not None else default
+    else:
+        val = obj.get(field, default)
+    if func and val not in ignore:
+        val = func(val)
+    return template % val if val not in ignore else default
+
+
+def clean_podcast_url(url):
+    return re.sub(r'''(?x)
+        (?:
+            (?:
+                chtbl\.com/track|
+                media\.blubrry\.com|  # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
+                play\.podtrac\.com
+            )/[^/]+|
+            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}|  # http://analytics.podtrac.com/how-to-measure
+            flex\.acast\.com|
+            pd(?:
+                cn\.co|  # https://podcorn.com/analytics-prefix/
+                st\.fm  # https://podsights.com/docs/
+            )/e
+        )/''', '', url)
+
+
+_HEX_TABLE = '0123456789abcdef'
+
+
+def random_uuidv4():
+    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
+
+
+def make_dir(path, to_screen=None):
+    try:
+        dn = os.path.dirname(path)
+        if dn and not os.path.exists(dn):
+            os.makedirs(dn)
+        return True
+    except (OSError, IOError) as err:
+        if callable(to_screen):  # was `callable(to_screen) is not None`, which is always True
+            to_screen('unable to create directory ' + error_to_compat_str(err))
+        return False
+
+
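[Editor's note, not part of the patch: format_field above applies the template only when the value is not in `ignore`, so optional metadata degrades to an empty string.]

    assert format_field({'width': 1920}, 'width', '%dpx') == '1920px'
    assert format_field({'width': None}, 'width', '%dpx') == ''     # ignored value
    assert format_field('direct', template='[%s]') == '[direct]'    # field=None uses obj itself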
+def get_executable_path():
+    from zipimport import zipimporter
+    if hasattr(sys, 'frozen'):  # Running from PyInstaller
+        path = os.path.dirname(sys.executable)
+    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
+        path = os.path.join(os.path.dirname(__file__), '../..')
+    else:
+        path = os.path.join(os.path.dirname(__file__), '..')
+    return os.path.abspath(path)
+
+
+def load_plugins(name, suffix, namespace):
+    classes = {}
+    try:
+        plugins_spec = importlib.util.spec_from_file_location(
+            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
+        plugins = importlib.util.module_from_spec(plugins_spec)
+        sys.modules[plugins_spec.name] = plugins
+        plugins_spec.loader.exec_module(plugins)
+        for name in dir(plugins):
+            if name in namespace:
+                continue
+            if not name.endswith(suffix):
+                continue
+            klass = getattr(plugins, name)
+            classes[name] = namespace[name] = klass
+    except FileNotFoundError:
+        pass
+    return classes
+
+
+def traverse_obj(
+        obj, *path_list, default=None, expected_type=None, get_all=True,
+        casesense=True, is_user_input=False, traverse_string=False):
+    ''' Traverse nested list/dict/tuple
+    @param path_list        A list of paths which are checked one by one.
+                            Each path is a list of keys where each key is a string,
+                            a function, a tuple of strings or "...".
+                            When a function is given, it takes the key as argument and
+                            returns whether the key matches or not. When a tuple is given,
+                            all the keys given in the tuple are traversed, and
+                            "..." traverses all the keys in the object
+    @param default          Default value to return
+    @param expected_type    Only accept final value of this type (Can also be any callable)
+    @param get_all          Return all the values obtained from a path or only the first one
+    @param casesense        Whether to consider dictionary keys as case sensitive
+    @param is_user_input    Whether the keys are generated from user input. If True,
+                            strings are converted to int/slice if necessary
+    @param traverse_string  Whether to traverse inside strings. If True, any
+                            non-compatible object will also be converted into a string
+    # TODO: Write tests
+    '''
+    if not casesense:
+        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
+        path_list = (map(_lower, variadic(path)) for path in path_list)
+
+    def _traverse_obj(obj, path, _current_depth=0):
+        nonlocal depth
+        if obj is None:
+            return None
+        path = tuple(variadic(path))
+        for i, key in enumerate(path):
+            if isinstance(key, (list, tuple)):
+                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
+                key = ...
+            if key is ...:
+                obj = (obj.values() if isinstance(obj, dict)
+                       else obj if isinstance(obj, (list, tuple, LazyList))
+                       else str(obj) if traverse_string else [])
+                _current_depth += 1
+                depth = max(depth, _current_depth)
+                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
+            elif callable(key):
+                if isinstance(obj, (list, tuple, LazyList)):
+                    obj = enumerate(obj)
+                elif isinstance(obj, dict):
+                    obj = obj.items()
+                else:
+                    if not traverse_string:
+                        return None
+                    obj = str(obj)
+                _current_depth += 1
+                depth = max(depth, _current_depth)
+                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
+            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
+                obj = (obj.get(key) if casesense or (key in obj)
+                       else next((v for k, v in obj.items() if _lower(k) == key), None))
+            else:
+                if is_user_input:
+                    key = (int_or_none(key) if ':' not in key
+                           else slice(*map(int_or_none, key.split(':'))))
+                    if key == slice(None):
+                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
+                if not isinstance(key, (int, slice)):
+                    return None
+                if not isinstance(obj, (list, tuple, LazyList)):
+                    if not traverse_string:
+                        return None
+                    obj = str(obj)
+                try:
+                    obj = obj[key]
+                except IndexError:
+                    return None
+        return obj
+
+    if isinstance(expected_type, type):
+        type_test = lambda val: val if isinstance(val, expected_type) else None
+    elif expected_type is not None:
+        type_test = expected_type
+    else:
+        type_test = lambda val: val
+
+    for path in path_list:
+        depth = 0
+        val = _traverse_obj(obj, path)
+        if val is not None:
+            if depth:
+                for _ in range(depth - 1):
+                    val = itertools.chain.from_iterable(v for v in val if v is not None)
+                val = [v for v in map(type_test, val) if v is not None]
+                if val:
+                    return val if get_all else val[0]
+            else:
+                val = type_test(val)
+                if val is not None:
+                    return val
+    return default
+
+
+def traverse_dict(dictn, keys, casesense=True):
+    ''' For backward compatibility. Do not use '''
+    return traverse_obj(dictn, keys, casesense=casesense,
+                        is_user_input=True, traverse_string=True)
+
+
+def variadic(x, allowed_types=(str, bytes)):
+    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
+
+
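[Editor's note, not part of the patch: illustrative traversals, not taken from the test suite. The "..." key fans out over every element; multiple paths act as fallbacks.]

    data = {'formats': [{'url': 'https://a/'}, {'height': 720, 'url': 'https://b/'}]}
    assert traverse_obj(data, ('formats', ..., 'url')) == ['https://a/', 'https://b/']
    assert traverse_obj(data, ('formats', 0, 'height'), ('formats', 1, 'height')) == 720
    assert traverse_obj(data, ('formats', 5, 'url'), default='none') == 'none'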
+# create a JSON Web Signature (jws) with HS256 algorithm
+# the resulting format is in JWS Compact Serialization
+# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
+# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
+def jwt_encode_hs256(payload_data, key, headers={}):
+    header_data = {
+        'alg': 'HS256',
+        'typ': 'JWT',
+    }
+    if headers:
+        header_data.update(headers)
+    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
+    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
+    h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
+    signature_b64 = base64.b64encode(h.digest())
+    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
+    return token
+
+
+def supports_terminal_sequences(stream):
+    if compat_os_name == 'nt':
+        if get_windows_version() < (10, 0, 10586):
+            return False
+    elif not os.getenv('TERM'):
+        return False
+    try:
+        return stream.isatty()
+    except BaseException:
+        return False
+
+
+TERMINAL_SEQUENCES = {
+    'DOWN': '\n',
+    'UP': '\x1b[A',
+    'ERASE_LINE': '\x1b[K',
+    'RED': '\033[0;31m',
+    'YELLOW': '\033[0;33m',
+    'BLUE': '\033[0;34m',
+    'RESET_STYLE': '\033[0m',
+}
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
new file mode 100644
index 000000000..83b6fea9f
--- /dev/null
+++ b/yt_dlp/version.py
@@ -0,0 +1,3 @@
+from __future__ import unicode_literals
+
+__version__ = '2021.10.10'
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py
new file mode 100644
index 000000000..cd936e7e5
--- /dev/null
+++ b/yt_dlp/webvtt.py
@@ -0,0 +1,402 @@
+# coding: utf-8
+from __future__ import unicode_literals, print_function, division
+
+"""
+A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
+to be able to assemble a single stand-alone subtitle file, suitably adjusting
+timestamps on the way, while everything else is passed through unmodified.
+
+Regular expressions based on the W3C WebVTT specification
+<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
+in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
+"""
+
+import re
+import io
+from .utils import int_or_none
+from .compat import (
+    compat_str as str,
+    compat_Pattern,
+    compat_Match,
+)
+
+
+class _MatchParser(object):
+    """
+    An object that maintains the current parsing position and allows
+    conveniently advancing it as syntax elements are successfully parsed.
+    """
+
+    def __init__(self, string):
+        self._data = string
+        self._pos = 0
+
+    def match(self, r):
+        if isinstance(r, compat_Pattern):
+            return r.match(self._data, self._pos)
+        if isinstance(r, str):
+            if self._data.startswith(r, self._pos):
+                return len(r)
+            return None
+        raise ValueError(r)
+
+    def advance(self, by):
+        if by is None:
+            amt = 0
+        elif isinstance(by, compat_Match):
+            amt = len(by.group(0))
+        elif isinstance(by, str):
+            amt = len(by)
+        elif isinstance(by, int):
+            amt = by
+        else:
+            raise ValueError(by)
+        self._pos += amt
+        return by
+
+    def consume(self, r):
+        return self.advance(self.match(r))
+
+    def child(self):
+        return _MatchChildParser(self)
+
+
+class _MatchChildParser(_MatchParser):
+    """
+    A child parser state, which advances through the same data as
+    its parent, but has an independent position. This is useful when
+    advancing through syntax elements we might later want to backtrack
+    from.
+    """
+
+    def __init__(self, parent):
+        super(_MatchChildParser, self).__init__(parent._data)
+        self.__parent = parent
+        self._pos = parent._pos
+
+    def commit(self):
+        """
+        Advance the parent state to the current position of this child state.
+        """
+        self.__parent._pos = self._pos
+        return self.__parent
+
+
+class ParseError(Exception):
+    def __init__(self, parser):
+        super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
+            parser._pos, parser._data[parser._pos:parser._pos + 20]
+        ))
+
+
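[Editor's note, not part of the patch: a toy run showing how the parser primitives compose.]

    p = _MatchParser('WEBVTT\n\n')
    assert p.consume('WEBVTT') == 6       # plain strings match literally
    assert p.consume(re.compile(r'\n+'))  # patterns return a match object
    child = p.child()                     # independent position for backtracking
    child.commit()                        # folds the child position back into p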
+# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
+# prescribes that hours must be *2 or more* digits, timestamps with a single
+# digit for the hour part have been seen in the wild.
+# See https://github.com/yt-dlp/yt-dlp/issues/921
+_REGEX_TS = re.compile(r'''(?x)
+    (?:([0-9]{1,}):)?
+    ([0-9]{2}):
+    ([0-9]{2})\.
+    ([0-9]{3})?
+''')
+_REGEX_EOF = re.compile(r'\Z')
+_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
+_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
+
+
+def _parse_ts(ts):
+    """
+    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
+    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
+    """
+
+    h, min, s, ms = ts.groups()
+    return 90 * (
+        int(h or 0) * 3600000 +  # noqa: W504,E221,E222
+        int(min) * 60000 +       # noqa: W504,E221,E222
+        int(s) * 1000 +          # noqa: W504,E221,E222
+        int(ms)                  # noqa: W504,E221,E222
+    )
+
+
+def _format_ts(ts):
+    """
+    Convert an MPEG PES timestamp into a WebVTT timestamp.
+    This will lose sub-millisecond precision.
+    """
+    msec = int((ts + 45) // 90)
+    secs, msec = divmod(msec, 1000)
+    mins, secs = divmod(secs, 60)
+    hrs, mins = divmod(mins, 60)
+    return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec)
+
+
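[Editor's note, not part of the patch: a hand-computed round trip through the two helpers.]

    m = _REGEX_TS.match('01:02:03.456')
    ticks = _parse_ts(m)          # 90 kHz ticks
    assert ticks == 90 * 3723456  # 1h 2m 3s 456ms, expressed in milliseconds
    assert _format_ts(ticks) == '01:02:03.456'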
+class Block(object):
+    """
+    An abstract WebVTT block.
+    """
+
+    def __init__(self, **kwargs):
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+    @classmethod
+    def parse(cls, parser):
+        m = parser.match(cls._REGEX)
+        if not m:
+            return None
+        parser.advance(m)
+        return cls(raw=m.group(0))
+
+    def write_into(self, stream):
+        stream.write(self.raw)
+
+
+class HeaderBlock(Block):
+    """
+    A WebVTT block that may only appear in the header part of the file,
+    i.e. before any cue blocks.
+    """
+
+    pass
+
+
+class Magic(HeaderBlock):
+    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
+
+    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
+    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
+    # doesn’t specify the exact grammar nor where in the WebVTT
+    # syntax it should be placed; the below has been devised based
+    # on usage in the wild
+    #
+    # And strictly speaking, the presence of this extension violates
+    # the W3C WebVTT spec. Oh well.
+
+    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
+    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
+    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
+    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
+
+    @classmethod
+    def __parse_tsmap(cls, parser):
+        parser = parser.child()
+
+        while True:
+            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
+            if m:
+                m = parser.consume(_REGEX_TS)
+                if m is None:
+                    raise ParseError(parser)
+                local = _parse_ts(m)
+                if local is None:
+                    raise ParseError(parser)
+            else:
+                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
+                if m:
+                    mpegts = int_or_none(m.group(1))
+                    if mpegts is None:
+                        raise ParseError(parser)
+                else:
+                    raise ParseError(parser)
+            if parser.consume(cls._REGEX_TSMAP_SEP):
+                continue
+            if parser.consume(_REGEX_NL):
+                break
+            raise ParseError(parser)
+
+        parser.commit()
+        return local, mpegts
+
+    @classmethod
+    def parse(cls, parser):
+        parser = parser.child()
+
+        m = parser.consume(cls._REGEX)
+        if not m:
+            raise ParseError(parser)
+
+        extra = m.group(1)
+        local, mpegts = None, None
+        if parser.consume(cls._REGEX_TSMAP):
+            local, mpegts = cls.__parse_tsmap(parser)
+        if not parser.consume(_REGEX_NL):
+            raise ParseError(parser)
+        parser.commit()
+        return cls(extra=extra, mpegts=mpegts, local=local)
+
+    def write_into(self, stream):
+        stream.write('WEBVTT')
+        if self.extra is not None:
+            stream.write(self.extra)
+        stream.write('\n')
+        if self.local or self.mpegts:
+            stream.write('X-TIMESTAMP-MAP=LOCAL:')
+            stream.write(_format_ts(self.local if self.local is not None else 0))
+            stream.write(',MPEGTS:')
+            stream.write(str(self.mpegts if self.mpegts is not None else 0))
+            stream.write('\n')
+        stream.write('\n')
+
+
+class StyleBlock(HeaderBlock):
+    _REGEX = re.compile(r'''(?x)
+        STYLE[\ \t]*(?:\r\n|[\r\n])
+        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
+        (?:\r\n|[\r\n])
+    ''')
+
+
+class RegionBlock(HeaderBlock):
+    _REGEX = re.compile(r'''(?x)
+        REGION[\ \t]*
+        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
+        (?:\r\n|[\r\n])
+    ''')
+
+
+class CommentBlock(Block):
+    _REGEX = re.compile(r'''(?x)
+        NOTE(?:\r\n|[\ \t\r\n])
+        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
+        (?:\r\n|[\r\n])
+    ''')
+
+
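[Editor's note, not part of the patch: parsing a header carrying the X-TIMESTAMP-MAP extension that the Magic class above handles — a toy example.]

    header = 'WEBVTT\nX-TIMESTAMP-MAP=LOCAL:00:00:00.000,MPEGTS:900000\n\n'
    magic = Magic.parse(_MatchParser(header))
    assert (magic.local, magic.mpegts) == (0, 900000)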
+ """ + + _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])') + _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+') + _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)') + _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?') + + @classmethod + def parse(cls, parser): + parser = parser.child() + + id = None + m = parser.consume(cls._REGEX_ID) + if m: + id = m.group(1) + + m0 = parser.consume(_REGEX_TS) + if not m0: + return None + if not parser.consume(cls._REGEX_ARROW): + return None + m1 = parser.consume(_REGEX_TS) + if not m1: + return None + m2 = parser.consume(cls._REGEX_SETTINGS) + if not parser.consume(_REGEX_NL): + return None + + start = _parse_ts(m0) + end = _parse_ts(m1) + settings = m2.group(1) if m2 is not None else None + + text = io.StringIO() + while True: + m = parser.consume(cls._REGEX_PAYLOAD) + if not m: + break + text.write(m.group(0)) + + parser.commit() + return cls( + id=id, + start=start, end=end, settings=settings, + text=text.getvalue() + ) + + def write_into(self, stream): + if self.id is not None: + stream.write(self.id) + stream.write('\n') + stream.write(_format_ts(self.start)) + stream.write(' --> ') + stream.write(_format_ts(self.end)) + if self.settings is not None: + stream.write(' ') + stream.write(self.settings) + stream.write('\n') + stream.write(self.text) + stream.write('\n') + + @property + def as_json(self): + return { + 'id': self.id, + 'start': self.start, + 'end': self.end, + 'text': self.text, + 'settings': self.settings, + } + + def __eq__(self, other): + return self.as_json == other.as_json + + @classmethod + def from_json(cls, json): + return cls( + id=json['id'], + start=json['start'], + end=json['end'], + text=json['text'], + settings=json['settings'] + ) + + def hinges(self, other): + if self.text != other.text: + return False + if self.settings != other.settings: + return False + return self.start <= self.end == other.start <= other.end + + +def parse_fragment(frag_content): + """ + A generator that yields (partially) parsed WebVTT blocks when given + a bytes object containing the raw contents of a WebVTT file. + """ + + parser = _MatchParser(frag_content.decode('utf-8')) + + yield Magic.parse(parser) + + while not parser.match(_REGEX_EOF): + if parser.consume(_REGEX_BLANK): + continue + + block = RegionBlock.parse(parser) + if block: + yield block + continue + block = StyleBlock.parse(parser) + if block: + yield block + continue + block = CommentBlock.parse(parser) + if block: + yield block # XXX: or skip + continue + + break + + while not parser.match(_REGEX_EOF): + if parser.consume(_REGEX_BLANK): + continue + + block = CommentBlock.parse(parser) + if block: + yield block # XXX: or skip + continue + block = CueBlock.parse(parser) + if block: + yield block + continue + + raise ParseError(parser) |